diff --git "a/training_log.json" "b/training_log.json" deleted file mode 100644--- "a/training_log.json" +++ /dev/null @@ -1,15511 +0,0 @@ -[ - { - "loss": 0.0, - "grad_norm": 0.0015333890914916992, - "learning_rate": 0.0, - "num_tokens": 1216.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0943000316619873, - "rewards/env_reward/std": 0.0, - "reward": 2.5943000316619873, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 5.4836273193359375e-06, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0005, - "step": 1 - }, - { - "loss": 0.0, - "grad_norm": 0.001904299482703209, - "learning_rate": 1.0000000000000002e-06, - "num_tokens": 2452.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.024800000712275505, - "rewards/env_reward/std": 0.0, - "reward": 1.4751999378204346, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 5.801518909720471e-06, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.001, - "step": 2 - }, - { - "loss": 0.0, - "grad_norm": 5.467747211456299, - "learning_rate": 2.0000000000000003e-06, - "num_tokens": 3668.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.04742499813437462, - "rewards/env_reward/std": 0.0726500004529953, - "reward": 1.5474250316619873, - "reward_std": 0.07264995574951172, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 2.7855238235119373e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0015, - "step": 3 - }, - { - "loss": 0.0001, - "grad_norm": NaN, - "learning_rate": 3e-06, - "num_tokens": 4918.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.27184998989105225, - "rewards/env_reward/std": 0.30637651681900024, - "reward": 1.2281500101089478, - "reward_std": 0.30637648701667786, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.0009263694344099349, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.002, - "step": 4 - }, - { - "loss": 0.0, - "grad_norm": NaN, - "learning_rate": 3e-06, - "num_tokens": 6134.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0914749950170517, - "rewards/env_reward/std": 0.2446500062942505, - "reward": 1.591475009918213, - "reward_std": 0.2446500062942505, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 8.985400313576974e-06, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0025, - "step": 5 - }, - { - "loss": 0.0, - "grad_norm": 5.823563575744629, - "learning_rate": 3e-06, - "num_tokens": 7346.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.055275000631809235, - "rewards/env_reward/std": 0.05144999921321869, - "reward": 1.4447250366210938, - "reward_std": 0.05145001411437988, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 2.5783977740445607e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.003, - "step": 6 - }, - { - "loss": 0.0, - "grad_norm": 3.515963554382324, - "learning_rate": 4.000000000000001e-06, - "num_tokens": 8558.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.02980000153183937, - "rewards/env_reward/std": 0.025125950574874878, - "reward": 1.5297999382019043, - "reward_std": 0.025125989690423012, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 2.0618240043290825e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0035, - "step": 7 - }, - { - "loss": 0.0, - "grad_norm": 0.0012212995206937194, - "learning_rate": 5e-06, - "num_tokens": 9770.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.017500000074505806, - "rewards/env_reward/std": 0.0, - "reward": 1.5175000429153442, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 2.9007594548602356e-06, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.004, - "step": 8 - }, - { - "loss": 0.0, - "grad_norm": 4.4884257316589355, - "learning_rate": 6e-06, - "num_tokens": 10982.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.3139750063419342, - "rewards/env_reward/std": 0.15238404273986816, - "reward": 1.1860249042510986, - "reward_std": 0.15238407254219055, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 3.519654396200167e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0045, - "step": 9 - }, - { - "loss": 0.0, - "grad_norm": NaN, - "learning_rate": 7.000000000000001e-06, - "num_tokens": 12199.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.40117499232292175, - "rewards/env_reward/std": 0.4204060137271881, - "reward": 1.9011750221252441, - "reward_std": 0.4204059839248657, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.00046923011541366577, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.005, - "step": 10 - }, - { - "loss": 0.0, - "grad_norm": 5.395345687866211, - "learning_rate": 7.000000000000001e-06, - "num_tokens": 13411.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.26269999146461487, - "rewards/env_reward/std": 0.17459072172641754, - "reward": 1.762700080871582, - "reward_std": 0.17459072172641754, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 4.217028799757827e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0055, - "step": 11 - }, - { - "loss": 0.0002, - "grad_norm": 24.1549015045166, - "learning_rate": 8.000000000000001e-06, - "num_tokens": 14624.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.05982499569654465, - "rewards/env_reward/std": 0.05187172815203667, - "reward": 1.5598249435424805, - "reward_std": 0.05187166854739189, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.00168648362159729, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.006, - "step": 12 - }, - { - "loss": 0.0, - "grad_norm": 3.7908565998077393, - "learning_rate": 9e-06, - "num_tokens": 15840.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.20547500252723694, - "rewards/env_reward/std": 0.05324999988079071, - "reward": 1.7054749727249146, - "reward_std": 0.0532500334084034, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 7.70886759937639e-06, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0065, - "step": 13 - }, - { - "loss": 0.0, - "grad_norm": 0.00359143503010273, - "learning_rate": 1e-05, - "num_tokens": 17052.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.792900025844574, - "rewards/env_reward/std": 0.0, - "reward": 2.2929000854492188, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 1.2079874977644067e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.007, - "step": 14 - }, - { - "loss": 0.0, - "grad_norm": 3.261981248855591, - "learning_rate": 1.1000000000000001e-05, - "num_tokens": 18264.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.016324998810887337, - "rewards/env_reward/std": 0.06768739223480225, - "reward": 1.5163249969482422, - "reward_std": 0.06768736243247986, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 7.071098025335232e-05, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0075, - "step": 15 - }, - { - "loss": 0.0002, - "grad_norm": 22.216018676757812, - "learning_rate": 1.2e-05, - "num_tokens": 19477.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.06070000305771828, - "rewards/env_reward/std": 0.10127566754817963, - "reward": 1.4393000602722168, - "reward_std": 0.10127566009759903, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.0017704889178276062, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.008, - "step": 16 - }, - { - "loss": 0.0017, - "grad_norm": 22.440473556518555, - "learning_rate": 1.3000000000000001e-05, - "num_tokens": 20690.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.702750027179718, - "rewards/env_reward/std": 1.534355640411377, - "reward": 2.2027499675750732, - "reward_std": 1.534355640411377, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.01707853004336357, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0085, - "step": 17 - }, - { - "loss": 0.0001, - "grad_norm": 3.5670650005340576, - "learning_rate": 1.4000000000000001e-05, - "num_tokens": 21902.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3511750102043152, - "rewards/env_reward/std": 0.2421124279499054, - "reward": 1.85117506980896, - "reward_std": 0.24211247265338898, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.0013786455432036604, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.009, - "step": 18 - }, - { - "loss": 0.0, - "grad_norm": 0.023953210562467575, - "learning_rate": 1.5e-05, - "num_tokens": 23114.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.021400000900030136, - "rewards/env_reward/std": 0.0, - "reward": 1.521399974822998, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.0004576047358568758, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0095, - "step": 19 - }, - { - "loss": 0.0516, - "grad_norm": 13.440083503723145, - "learning_rate": 1.6000000000000003e-05, - "num_tokens": 24331.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5774250030517578, - "rewards/env_reward/std": 0.35871684551239014, - "reward": 2.077425003051758, - "reward_std": 0.3587168753147125, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5155713111162186, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.01, - "step": 20 - }, - { - "loss": 0.0846, - "grad_norm": 8.484152793884277, - "learning_rate": 1.7000000000000003e-05, - "num_tokens": 25548.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.0176750048995018, - "rewards/env_reward/std": 0.13360247015953064, - "reward": 1.4823249578475952, - "reward_std": 0.13360245525836945, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8458694666624069, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0105, - "step": 21 - }, - { - "loss": 0.0675, - "grad_norm": 9.707590103149414, - "learning_rate": 1.8e-05, - "num_tokens": 26765.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.017500000074505806, - "rewards/env_reward/std": 0.060531098395586014, - "reward": 1.5174999237060547, - "reward_std": 0.06053108721971512, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6749855130910873, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.011, - "step": 22 - }, - { - "loss": 0.0617, - "grad_norm": 3.4466989040374756, - "learning_rate": 1.9e-05, - "num_tokens": 27978.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.29785001277923584, - "rewards/env_reward/std": 0.25378555059432983, - "reward": 1.2021499872207642, - "reward_std": 0.25378552079200745, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6174105107784271, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0115, - "step": 23 - }, - { - "loss": 0.0338, - "grad_norm": 39.08412170410156, - "learning_rate": 2e-05, - "num_tokens": 29196.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1463249921798706, - "rewards/env_reward/std": 0.3274500072002411, - "reward": 1.3536750078201294, - "reward_std": 0.3274500072002411, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.3376213669835124, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.012, - "step": 24 - }, - { - "loss": 0.0151, - "grad_norm": 20.261240005493164, - "learning_rate": 2.1e-05, - "num_tokens": 30410.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.9611750245094299, - "rewards/env_reward/std": 0.5365340113639832, - "reward": 2.461174964904785, - "reward_std": 0.5365338921546936, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.1506122574210167, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0125, - "step": 25 - }, - { - "loss": 0.0145, - "grad_norm": 11.482789993286133, - "learning_rate": 2.2000000000000003e-05, - "num_tokens": 31624.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.14649999141693115, - "rewards/env_reward/std": 0.06558185815811157, - "reward": 1.6464999914169312, - "reward_std": 0.06558185070753098, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.14546086639165878, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.013, - "step": 26 - }, - { - "loss": 0.0124, - "grad_norm": 14.635064125061035, - "learning_rate": 2.3000000000000003e-05, - "num_tokens": 32842.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5030750036239624, - "rewards/env_reward/std": 0.7913716435432434, - "reward": 2.003074884414673, - "reward_std": 0.7913715243339539, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.12426742166280746, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0135, - "step": 27 - }, - { - "loss": 0.0039, - "grad_norm": 16.629444122314453, - "learning_rate": 2.4e-05, - "num_tokens": 34055.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0267250537872314, - "rewards/env_reward/std": 0.4732673764228821, - "reward": 2.5267250537872314, - "reward_std": 0.47326740622520447, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.039139650762081146, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.014, - "step": 28 - }, - { - "loss": 0.0045, - "grad_norm": 26.910884857177734, - "learning_rate": 2.5e-05, - "num_tokens": 35273.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.5891250371932983, - "rewards/env_reward/std": 1.5938708782196045, - "reward": 3.089125156402588, - "reward_std": 1.5938708782196045, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.04533430188894272, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0145, - "step": 29 - }, - { - "loss": 0.0012, - "grad_norm": 22.792469024658203, - "learning_rate": 2.6000000000000002e-05, - "num_tokens": 36490.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.793999969959259, - "rewards/env_reward/std": 1.4766161441802979, - "reward": 2.2939999103546143, - "reward_std": 1.4766160249710083, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.0121101513504982, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.015, - "step": 30 - }, - { - "loss": 0.0044, - "grad_norm": 19.31297492980957, - "learning_rate": 2.7000000000000002e-05, - "num_tokens": 37707.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.05525000020861626, - "rewards/env_reward/std": 0.05696369707584381, - "reward": 1.5552499294281006, - "reward_std": 0.056963708251714706, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.04407123476266861, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0155, - "step": 31 - }, - { - "loss": 0.0139, - "grad_norm": 9.066012382507324, - "learning_rate": 2.8000000000000003e-05, - "num_tokens": 38920.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.021675001829862595, - "rewards/env_reward/std": 0.15254653990268707, - "reward": 1.5216751098632812, - "reward_std": 0.15254652500152588, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.1389305256307125, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.016, - "step": 32 - }, - { - "loss": 0.0196, - "grad_norm": 21.37767791748047, - "learning_rate": 2.9e-05, - "num_tokens": 40134.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6998249888420105, - "rewards/env_reward/std": 0.11420779675245285, - "reward": 2.1998250484466553, - "reward_std": 0.11420782655477524, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.1958363577723503, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0165, - "step": 33 - }, - { - "loss": 0.0086, - "grad_norm": 6.699766635894775, - "learning_rate": 3e-05, - "num_tokens": 41350.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.06562499701976776, - "rewards/env_reward/std": 0.1474488228559494, - "reward": 1.4343750476837158, - "reward_std": 0.1474488228559494, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.08599535003304482, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.017, - "step": 34 - }, - { - "loss": 0.0004, - "grad_norm": 4.889353275299072, - "learning_rate": 3.1e-05, - "num_tokens": 42562.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03232499957084656, - "rewards/env_reward/std": 0.020850002765655518, - "reward": 1.532325029373169, - "reward_std": 0.02085002325475216, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.0037796597498527262, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0175, - "step": 35 - }, - { - "loss": 0.1285, - "grad_norm": 15.511324882507324, - "learning_rate": 3.2000000000000005e-05, - "num_tokens": 43775.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3314250111579895, - "rewards/env_reward/std": 0.17125000059604645, - "reward": 1.8314249515533447, - "reward_std": 0.17124998569488525, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.2848919034004211, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.018, - "step": 36 - }, - { - "loss": 0.0175, - "grad_norm": 5.048687934875488, - "learning_rate": 3.3e-05, - "num_tokens": 44991.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.0533749982714653, - "rewards/env_reward/std": 0.06385000050067902, - "reward": 1.446624994277954, - "reward_std": 0.06384996324777603, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.1752433218061924, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0185, - "step": 37 - }, - { - "loss": 0.0024, - "grad_norm": 4.65332555770874, - "learning_rate": 3.4000000000000007e-05, - "num_tokens": 46203.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03635000064969063, - "rewards/env_reward/std": 0.02292080596089363, - "reward": 1.5363500118255615, - "reward_std": 0.022920822724699974, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.023557812673971057, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.019, - "step": 38 - }, - { - "loss": 0.0679, - "grad_norm": 48.29575729370117, - "learning_rate": 3.5e-05, - "num_tokens": 47420.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6989249587059021, - "rewards/env_reward/std": 0.649150013923645, - "reward": 2.198925018310547, - "reward_std": 0.649150013923645, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6788432970643044, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0195, - "step": 39 - }, - { - "loss": 0.0063, - "grad_norm": 0.36195874214172363, - "learning_rate": 3.6e-05, - "num_tokens": 48640.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.41749998927116394, - "rewards/env_reward/std": 0.0, - "reward": 1.9175000190734863, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.06268331408500671, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.02, - "step": 40 - }, - { - "loss": 0.0269, - "grad_norm": 13.39460563659668, - "learning_rate": 3.7e-05, - "num_tokens": 49857.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8564249873161316, - "rewards/env_reward/std": 1.4304662942886353, - "reward": 2.3564250469207764, - "reward_std": 1.4304662942886353, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.26936937868595123, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0205, - "step": 41 - }, - { - "loss": 0.0187, - "grad_norm": 30.891935348510742, - "learning_rate": 3.8e-05, - "num_tokens": 51071.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.38007497787475586, - "rewards/env_reward/std": 0.18915000557899475, - "reward": 1.1199250221252441, - "reward_std": 0.18914997577667236, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.1872375439852476, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.021, - "step": 42 - }, - { - "loss": 0.0032, - "grad_norm": 3.8051974773406982, - "learning_rate": 3.9000000000000006e-05, - "num_tokens": 52291.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.5476000308990479, - "rewards/env_reward/std": 1.6770870685577393, - "reward": 3.047600030899048, - "reward_std": 1.6770870685577393, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.031939536333084106, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0215, - "step": 43 - }, - { - "loss": 0.0046, - "grad_norm": 0.1436671018600464, - "learning_rate": 4e-05, - "num_tokens": 53503.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.09260000288486481, - "rewards/env_reward/std": 0.0, - "reward": 1.4074000120162964, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.04641978070139885, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.022, - "step": 44 - }, - { - "loss": 0.0088, - "grad_norm": 3.5930678844451904, - "learning_rate": 4.1e-05, - "num_tokens": 54723.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11879999935626984, - "rewards/env_reward/std": 0.19900000095367432, - "reward": 1.6187999248504639, - "reward_std": 0.19900000095367432, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.08768663927912712, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0225, - "step": 45 - }, - { - "loss": 0.0091, - "grad_norm": 21.19375228881836, - "learning_rate": 4.2e-05, - "num_tokens": 55940.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.9104750156402588, - "rewards/env_reward/std": 1.393201231956482, - "reward": 2.410475015640259, - "reward_std": 1.393201231956482, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.09141915291547775, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.023, - "step": 46 - }, - { - "loss": 0.0113, - "grad_norm": 7.426270008087158, - "learning_rate": 4.3e-05, - "num_tokens": 57157.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19827499985694885, - "rewards/env_reward/std": 0.44344282150268555, - "reward": 1.6982749700546265, - "reward_std": 0.44344279170036316, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.11329534649848938, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0235, - "step": 47 - }, - { - "loss": 0.0096, - "grad_norm": 23.390466690063477, - "learning_rate": 4.4000000000000006e-05, - "num_tokens": 58372.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4764999747276306, - "rewards/env_reward/std": 0.07039998471736908, - "reward": 1.9765000343322754, - "reward_std": 0.07039991766214371, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.09569882601499557, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.024, - "step": 48 - }, - { - "loss": 0.0228, - "grad_norm": 13.507621765136719, - "learning_rate": 4.5e-05, - "num_tokens": 59585.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19177499413490295, - "rewards/env_reward/std": 0.6475414037704468, - "reward": 1.6917749643325806, - "reward_std": 0.647541344165802, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.22822999954223633, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0245, - "step": 49 - }, - { - "loss": 0.0147, - "grad_norm": 16.81793212890625, - "learning_rate": 4.600000000000001e-05, - "num_tokens": 60803.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08905000984668732, - "rewards/env_reward/std": 0.642199695110321, - "reward": 1.589050054550171, - "reward_std": 0.6421996355056763, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.1465737447142601, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.025, - "step": 50 - }, - { - "loss": 0.011, - "grad_norm": 13.575445175170898, - "learning_rate": 4.7e-05, - "num_tokens": 62051.0, - "completions/mean_length": 4.0, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0112500190734863, - "rewards/env_reward/std": 1.2719687223434448, - "reward": 2.5112500190734863, - "reward_std": 1.2719687223434448, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.11031558783724904, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0255, - "step": 51 - }, - { - "loss": 0.0137, - "grad_norm": 23.81420135498047, - "learning_rate": 4.8e-05, - "num_tokens": 63269.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11865000426769257, - "rewards/env_reward/std": 0.04601481929421425, - "reward": 1.618649959564209, - "reward_std": 0.04601481184363365, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.13692589104175568, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.026, - "step": 52 - }, - { - "loss": 0.0209, - "grad_norm": 23.225088119506836, - "learning_rate": 4.9e-05, - "num_tokens": 64483.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3612000048160553, - "rewards/env_reward/std": 0.09145228564739227, - "reward": 1.8612000942230225, - "reward_std": 0.09145224839448929, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.20900648832321167, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0265, - "step": 53 - }, - { - "loss": 0.0341, - "grad_norm": 28.92061996459961, - "learning_rate": 5e-05, - "num_tokens": 65701.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5038999915122986, - "rewards/env_reward/std": 0.05588749051094055, - "reward": 2.0039000511169434, - "reward_std": 0.05588748678565025, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.34077559411525726, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.027, - "step": 54 - }, - { - "loss": 0.0357, - "grad_norm": 22.28899383544922, - "learning_rate": 4.9888888888888894e-05, - "num_tokens": 66916.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.45639997720718384, - "rewards/env_reward/std": 0.09699998795986176, - "reward": 1.956399917602539, - "reward_std": 0.09700000286102295, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3573853522539139, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0275, - "step": 55 - }, - { - "loss": 0.0179, - "grad_norm": 0.27123039960861206, - "learning_rate": 4.977777777777778e-05, - "num_tokens": 68132.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.05339999869465828, - "rewards/env_reward/std": 0.0, - "reward": 1.5534000396728516, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.17926177382469177, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.028, - "step": 56 - }, - { - "loss": 0.0523, - "grad_norm": 17.443599700927734, - "learning_rate": 4.966666666666667e-05, - "num_tokens": 69350.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10307499766349792, - "rewards/env_reward/std": 0.03798661008477211, - "reward": 1.6030750274658203, - "reward_std": 0.03798658773303032, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5231126323342323, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0285, - "step": 57 - }, - { - "loss": 0.049, - "grad_norm": 18.03072738647461, - "learning_rate": 4.955555555555556e-05, - "num_tokens": 70573.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.35887500643730164, - "rewards/env_reward/std": 0.41454997658729553, - "reward": 1.858875036239624, - "reward_std": 0.41454997658729553, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4901750087738037, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.029, - "step": 58 - }, - { - "loss": 0.1065, - "grad_norm": 27.605003356933594, - "learning_rate": 4.9444444444444446e-05, - "num_tokens": 71790.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08367499709129333, - "rewards/env_reward/std": 0.04114999994635582, - "reward": 1.5836749076843262, - "reward_std": 0.041150014847517014, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0653182864189148, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0295, - "step": 59 - }, - { - "loss": 0.109, - "grad_norm": 25.35848045349121, - "learning_rate": 4.933333333333334e-05, - "num_tokens": 73007.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19287501275539398, - "rewards/env_reward/std": 0.05770479142665863, - "reward": 1.6928750276565552, - "reward_std": 0.05770481005311012, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0902290344238281, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.03, - "step": 60 - }, - { - "loss": 0.102, - "grad_norm": 18.48727798461914, - "learning_rate": 4.922222222222222e-05, - "num_tokens": 74225.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.23907500505447388, - "rewards/env_reward/std": 0.8462180495262146, - "reward": 1.2609249353408813, - "reward_std": 0.8462179899215698, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 1.0198184018954635, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0305, - "step": 61 - }, - { - "loss": 0.0864, - "grad_norm": 19.874589920043945, - "learning_rate": 4.9111111111111114e-05, - "num_tokens": 75443.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2953000068664551, - "rewards/env_reward/std": 0.2296384572982788, - "reward": 1.795300006866455, - "reward_std": 0.2296384572982788, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8640272691845894, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.031, - "step": 62 - }, - { - "loss": 0.0702, - "grad_norm": 19.802614212036133, - "learning_rate": 4.9e-05, - "num_tokens": 76681.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.56659996509552, - "rewards/env_reward/std": 1.6551477909088135, - "reward": 3.0666000843048096, - "reward_std": 1.655147671699524, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7016429305076599, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0315, - "step": 63 - }, - { - "loss": 0.0658, - "grad_norm": 29.09418296813965, - "learning_rate": 4.888888888888889e-05, - "num_tokens": 77895.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.09319999814033508, - "rewards/env_reward/std": 0.07863511145114899, - "reward": 1.4068000316619873, - "reward_std": 0.078635074198246, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6582788825035095, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.032, - "step": 64 - }, - { - "loss": 0.0473, - "grad_norm": 19.2071475982666, - "learning_rate": 4.8777777777777775e-05, - "num_tokens": 79118.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.16110000014305115, - "rewards/env_reward/std": 0.16779999434947968, - "reward": 1.661099910736084, - "reward_std": 0.16780002415180206, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4729412868618965, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0325, - "step": 65 - }, - { - "loss": 0.0064, - "grad_norm": 2.748170852661133, - "learning_rate": 4.866666666666667e-05, - "num_tokens": 80342.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.250649929046631, - "rewards/env_reward/std": 1.4378999471664429, - "reward": 3.750649929046631, - "reward_std": 1.4378999471664429, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.06360267847776413, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.033, - "step": 66 - }, - { - "loss": 0.045, - "grad_norm": 3.3013949394226074, - "learning_rate": 4.855555555555556e-05, - "num_tokens": 81562.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.36434999108314514, - "rewards/env_reward/std": 0.12464992702007294, - "reward": 1.8643500804901123, - "reward_std": 0.12464995682239532, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.45019371435046196, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0335, - "step": 67 - }, - { - "loss": 0.0816, - "grad_norm": 5.779470920562744, - "learning_rate": 4.844444444444445e-05, - "num_tokens": 82779.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2705250084400177, - "rewards/env_reward/std": 0.2717834413051605, - "reward": 1.7705249786376953, - "reward_std": 0.2717834413051605, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.815668910741806, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.034, - "step": 68 - }, - { - "loss": 0.0289, - "grad_norm": 26.5799617767334, - "learning_rate": 4.8333333333333334e-05, - "num_tokens": 83993.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4679499864578247, - "rewards/env_reward/std": 0.042666174471378326, - "reward": 1.9679499864578247, - "reward_std": 0.042666174471378326, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2885260581970215, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0345, - "step": 69 - }, - { - "loss": 0.014, - "grad_norm": 21.3876895904541, - "learning_rate": 4.8222222222222225e-05, - "num_tokens": 85208.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.292875051498413, - "rewards/env_reward/std": 1.4142500162124634, - "reward": 3.792875051498413, - "reward_std": 1.4142500162124634, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.13965822756290436, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.035, - "step": 70 - }, - { - "loss": 0.0306, - "grad_norm": 24.042043685913086, - "learning_rate": 4.811111111111111e-05, - "num_tokens": 86422.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.2664499282836914, - "rewards/env_reward/std": 1.5692957639694214, - "reward": 2.7664499282836914, - "reward_std": 1.5692955255508423, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3056928962469101, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0355, - "step": 71 - }, - { - "loss": 0.0218, - "grad_norm": 3.963625907897949, - "learning_rate": 4.8e-05, - "num_tokens": 87638.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.47062501311302185, - "rewards/env_reward/std": 0.12355001270771027, - "reward": 1.9706251621246338, - "reward_std": 0.12355005741119385, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.21846546977758408, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.036, - "step": 72 - }, - { - "loss": 0.0292, - "grad_norm": 20.61098861694336, - "learning_rate": 4.7888888888888886e-05, - "num_tokens": 88853.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12492500245571136, - "rewards/env_reward/std": 0.15604999661445618, - "reward": 1.6249250173568726, - "reward_std": 0.1560499668121338, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.29186780005693436, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0365, - "step": 73 - }, - { - "loss": 0.0662, - "grad_norm": 14.41640567779541, - "learning_rate": 4.7777777777777784e-05, - "num_tokens": 90072.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4067000150680542, - "rewards/env_reward/std": 0.14239999651908875, - "reward": 1.9067000150680542, - "reward_std": 0.14240002632141113, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6624188795685768, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.037, - "step": 74 - }, - { - "loss": 0.0778, - "grad_norm": 6.708349227905273, - "learning_rate": 4.766666666666667e-05, - "num_tokens": 91286.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 0.25, - "rewards/format_valid/std": 1.5, - "rewards/action_legal/mean": 0.125, - "rewards/action_legal/std": 0.75, - "rewards/env_reward/mean": -0.6858749985694885, - "rewards/env_reward/std": 1.5449819564819336, - "reward": -0.3108749985694885, - "reward_std": 3.793658494949341, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7776949852705002, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0375, - "step": 75 - }, - { - "loss": 0.1064, - "grad_norm": 2.824847936630249, - "learning_rate": 4.755555555555556e-05, - "num_tokens": 92501.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17409999668598175, - "rewards/env_reward/std": 0.07380000501871109, - "reward": 1.6740999221801758, - "reward_std": 0.0737999677658081, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0637851729989052, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.038, - "step": 76 - }, - { - "loss": 0.1088, - "grad_norm": NaN, - "learning_rate": 4.7444444444444445e-05, - "num_tokens": 93720.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.524150013923645, - "rewards/env_reward/std": 0.03130000829696655, - "reward": 2.0241501331329346, - "reward_std": 0.03129998967051506, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.08802729845047, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0385, - "step": 77 - }, - { - "loss": 0.2122, - "grad_norm": 56.26820373535156, - "learning_rate": 4.7444444444444445e-05, - "num_tokens": 94938.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1987999975681305, - "rewards/env_reward/std": 0.04688084498047829, - "reward": 1.6988000869750977, - "reward_std": 0.04688084498047829, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.122272402048111, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.039, - "step": 78 - }, - { - "loss": 0.0738, - "grad_norm": 4.279953479766846, - "learning_rate": 4.7333333333333336e-05, - "num_tokens": 96157.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10649999976158142, - "rewards/env_reward/std": 0.03020000271499157, - "reward": 1.6065000295639038, - "reward_std": 0.03020000457763672, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7380976751446724, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0395, - "step": 79 - }, - { - "loss": 0.0759, - "grad_norm": 0.21008452773094177, - "learning_rate": 4.722222222222222e-05, - "num_tokens": 97373.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.2621999979019165, - "rewards/env_reward/std": 0.0, - "reward": 1.2378000020980835, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.7594332098960876, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.04, - "step": 80 - }, - { - "loss": 0.0536, - "grad_norm": 0.1332825869321823, - "learning_rate": 4.711111111111111e-05, - "num_tokens": 98589.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.07129999995231628, - "rewards/env_reward/std": 0.0, - "reward": 1.4286999702453613, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.5356340408325195, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0405, - "step": 81 - }, - { - "loss": 0.0905, - "grad_norm": 32.49578857421875, - "learning_rate": 4.7e-05, - "num_tokens": 99808.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08722500503063202, - "rewards/env_reward/std": 0.03164999932050705, - "reward": 1.5872249603271484, - "reward_std": 0.031649984419345856, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9049255102872849, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.041, - "step": 82 - }, - { - "loss": 0.0264, - "grad_norm": 0.06416141986846924, - "learning_rate": 4.6888888888888895e-05, - "num_tokens": 101024.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.6410000324249268, - "rewards/env_reward/std": 0.0, - "reward": 4.140999794006348, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2637370824813843, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0415, - "step": 83 - }, - { - "loss": 0.1094, - "grad_norm": 36.0538444519043, - "learning_rate": 4.677777777777778e-05, - "num_tokens": 102242.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.010550007224082947, - "rewards/env_reward/std": 0.5143613815307617, - "reward": 1.5105500221252441, - "reward_std": 0.5143613815307617, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0936070084571838, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.042, - "step": 84 - }, - { - "loss": 0.0578, - "grad_norm": 12.624850273132324, - "learning_rate": 4.666666666666667e-05, - "num_tokens": 103457.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.09380000084638596, - "rewards/env_reward/std": 0.04500000178813934, - "reward": 1.4061999320983887, - "reward_std": 0.04499995708465576, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5784699767827988, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0425, - "step": 85 - }, - { - "loss": 0.0414, - "grad_norm": 0.054093338549137115, - "learning_rate": 4.6555555555555556e-05, - "num_tokens": 104681.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12860000133514404, - "rewards/env_reward/std": 0.0, - "reward": 1.628600001335144, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.41362982988357544, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.043, - "step": 86 - }, - { - "loss": 0.0287, - "grad_norm": 0.052269965410232544, - "learning_rate": 4.644444444444445e-05, - "num_tokens": 105901.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.21299999952316284, - "rewards/env_reward/std": 0.0, - "reward": 1.7130000591278076, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.28686797618865967, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0435, - "step": 87 - }, - { - "loss": 0.0305, - "grad_norm": 0.059413399547338486, - "learning_rate": 4.633333333333333e-05, - "num_tokens": 107121.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11779999732971191, - "rewards/env_reward/std": 0.0, - "reward": 1.617799997329712, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3047557473182678, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.044, - "step": 88 - }, - { - "loss": 0.0364, - "grad_norm": 13.943113327026367, - "learning_rate": 4.6222222222222224e-05, - "num_tokens": 108340.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06332500278949738, - "rewards/env_reward/std": 0.16214999556541443, - "reward": 1.5633249282836914, - "reward_std": 0.16214998066425323, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3640783429145813, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0445, - "step": 89 - }, - { - "loss": 0.0191, - "grad_norm": 0.0353960245847702, - "learning_rate": 4.6111111111111115e-05, - "num_tokens": 109556.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.28200000524520874, - "rewards/env_reward/std": 0.0, - "reward": 1.7820000648498535, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.19095450639724731, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.045, - "step": 90 - }, - { - "loss": 0.0612, - "grad_norm": 25.147192001342773, - "learning_rate": 4.600000000000001e-05, - "num_tokens": 110802.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.296999990940094, - "rewards/env_reward/std": 0.3095349669456482, - "reward": 1.2030000686645508, - "reward_std": 0.3095349371433258, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.6122921258211136, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0455, - "step": 91 - }, - { - "loss": 0.0685, - "grad_norm": 0.17055261135101318, - "learning_rate": 4.588888888888889e-05, - "num_tokens": 112022.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1607999950647354, - "rewards/env_reward/std": 0.0, - "reward": 1.3392000198364258, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.6851677894592285, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.046, - "step": 92 - }, - { - "loss": 0.0863, - "grad_norm": 13.699186325073242, - "learning_rate": 4.577777777777778e-05, - "num_tokens": 113237.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2336750030517578, - "rewards/env_reward/std": 0.3572499752044678, - "reward": 1.7336750030517578, - "reward_std": 0.3572499752044678, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8626954779028893, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0465, - "step": 93 - }, - { - "loss": 0.0274, - "grad_norm": 0.07039373368024826, - "learning_rate": 4.566666666666667e-05, - "num_tokens": 114457.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.6287999153137207, - "rewards/env_reward/std": 0.0, - "reward": 4.128799915313721, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2735567092895508, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.047, - "step": 94 - }, - { - "loss": 0.061, - "grad_norm": 4.729802131652832, - "learning_rate": 4.555555555555556e-05, - "num_tokens": 115673.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0939750000834465, - "rewards/env_reward/std": 0.024350004270672798, - "reward": 1.5939749479293823, - "reward_std": 0.024349967017769814, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6097765266895294, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0475, - "step": 95 - }, - { - "loss": 0.0207, - "grad_norm": 0.062468525022268295, - "learning_rate": 4.5444444444444444e-05, - "num_tokens": 116917.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0908999964594841, - "rewards/env_reward/std": 0.0, - "reward": 1.59089994430542, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.20688144862651825, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.048, - "step": 96 - }, - { - "loss": 0.0827, - "grad_norm": 6.658811569213867, - "learning_rate": 4.5333333333333335e-05, - "num_tokens": 118131.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.163100004196167, - "rewards/env_reward/std": 0.018706146627664566, - "reward": 1.336899995803833, - "reward_std": 0.018706224858760834, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8271568417549133, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0485, - "step": 97 - }, - { - "loss": 0.0433, - "grad_norm": 0.15103508532047272, - "learning_rate": 4.522222222222223e-05, - "num_tokens": 119351.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12250000238418579, - "rewards/env_reward/std": 0.0, - "reward": 1.622499942779541, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4332352876663208, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.049, - "step": 98 - }, - { - "loss": 0.0089, - "grad_norm": 0.036531127989292145, - "learning_rate": 4.511111111111112e-05, - "num_tokens": 120571.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.08936262130737305, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0495, - "step": 99 - }, - { - "loss": 0.0212, - "grad_norm": 0.08901583403348923, - "learning_rate": 4.5e-05, - "num_tokens": 121791.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5421000123023987, - "rewards/env_reward/std": 0.0, - "reward": 2.042099952697754, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2119981050491333, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.05, - "step": 100 - }, - { - "loss": 0.2432, - "grad_norm": 66.02021026611328, - "learning_rate": 4.4888888888888894e-05, - "num_tokens": 123008.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5019000172615051, - "rewards/env_reward/std": 0.054399993270635605, - "reward": 2.0018999576568604, - "reward_std": 0.054399967193603516, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.4316359385848045, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0505, - "step": 101 - }, - { - "loss": 0.0245, - "grad_norm": 2.3296234607696533, - "learning_rate": 4.477777777777778e-05, - "num_tokens": 124224.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2127251625061035, - "rewards/env_reward/std": 1.4261500835418701, - "reward": 3.7127251625061035, - "reward_std": 1.4261500835418701, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.24477409571409225, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.051, - "step": 102 - }, - { - "loss": 0.0983, - "grad_norm": 9.382343292236328, - "learning_rate": 4.466666666666667e-05, - "num_tokens": 125443.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.573775053024292, - "rewards/env_reward/std": 0.14553172886371613, - "reward": 2.073775053024292, - "reward_std": 0.1455318182706833, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9832261502742767, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0515, - "step": 103 - }, - { - "loss": 0.114, - "grad_norm": 2.782944679260254, - "learning_rate": 4.4555555555555555e-05, - "num_tokens": 126662.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5668249726295471, - "rewards/env_reward/std": 0.27695000171661377, - "reward": 2.0668249130249023, - "reward_std": 0.2769499123096466, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1402944773435593, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.052, - "step": 104 - }, - { - "loss": 0.0581, - "grad_norm": 7.829272270202637, - "learning_rate": 4.4444444444444447e-05, - "num_tokens": 127877.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.1490999460220337, - "rewards/env_reward/std": 0.5705999732017517, - "reward": 2.6491000652313232, - "reward_std": 0.5706000328063965, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.580941379070282, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0525, - "step": 105 - }, - { - "loss": 0.0701, - "grad_norm": 27.5728759765625, - "learning_rate": 4.433333333333334e-05, - "num_tokens": 129096.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2233249992132187, - "rewards/env_reward/std": 0.05404999852180481, - "reward": 1.7233250141143799, - "reward_std": 0.05404997244477272, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7010266333818436, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.053, - "step": 106 - }, - { - "loss": 0.0109, - "grad_norm": 2.1162190437316895, - "learning_rate": 4.422222222222222e-05, - "num_tokens": 130316.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06030000001192093, - "rewards/env_reward/std": 0.026399999856948853, - "reward": 1.5602998733520508, - "reward_std": 0.026400011032819748, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.10948272794485092, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0535, - "step": 107 - }, - { - "loss": 0.0407, - "grad_norm": 11.704363822937012, - "learning_rate": 4.4111111111111114e-05, - "num_tokens": 131535.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12099999934434891, - "rewards/env_reward/std": 0.03739999607205391, - "reward": 1.6209999322891235, - "reward_std": 0.03739996626973152, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4071061462163925, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.054, - "step": 108 - }, - { - "loss": 0.0236, - "grad_norm": 0.09328442811965942, - "learning_rate": 4.4000000000000006e-05, - "num_tokens": 132751.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.6695001125335693, - "rewards/env_reward/std": 0.0, - "reward": 4.169500350952148, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.23597905039787292, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0545, - "step": 109 - }, - { - "loss": 0.0301, - "grad_norm": 0.0933489054441452, - "learning_rate": 4.388888888888889e-05, - "num_tokens": 133975.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12200000137090683, - "rewards/env_reward/std": 0.0, - "reward": 1.621999979019165, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.30128175020217896, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.055, - "step": 110 - }, - { - "loss": 0.0356, - "grad_norm": 7.820048809051514, - "learning_rate": 4.377777777777778e-05, - "num_tokens": 135194.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10917499661445618, - "rewards/env_reward/std": 0.04259517788887024, - "reward": 1.6091749668121338, - "reward_std": 0.04259520396590233, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3561866581439972, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0555, - "step": 111 - }, - { - "loss": 0.0152, - "grad_norm": 0.07931513339281082, - "learning_rate": 4.3666666666666666e-05, - "num_tokens": 136442.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5921000242233276, - "rewards/env_reward/std": 0.0, - "reward": 2.092100143432617, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.15175144374370575, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.056, - "step": 112 - }, - { - "loss": 0.0653, - "grad_norm": 9.840944290161133, - "learning_rate": 4.355555555555556e-05, - "num_tokens": 137657.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.1259751319885254, - "rewards/env_reward/std": 1.2256500720977783, - "reward": 3.6259751319885254, - "reward_std": 1.2256500720977783, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6528940796852112, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0565, - "step": 113 - }, - { - "loss": 0.0145, - "grad_norm": 0.17490524053573608, - "learning_rate": 4.344444444444445e-05, - "num_tokens": 138869.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.25099998712539673, - "rewards/env_reward/std": 0.0, - "reward": 1.750999927520752, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.14506137371063232, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.057, - "step": 114 - }, - { - "loss": 0.0443, - "grad_norm": 2.580396890640259, - "learning_rate": 4.3333333333333334e-05, - "num_tokens": 140089.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09184999763965607, - "rewards/env_reward/std": 0.029099998995661736, - "reward": 1.5918500423431396, - "reward_std": 0.029099982231855392, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.44321155548095703, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0575, - "step": 115 - }, - { - "loss": 0.0254, - "grad_norm": 0.03984629362821579, - "learning_rate": 4.3222222222222226e-05, - "num_tokens": 141309.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5141000151634216, - "rewards/env_reward/std": 0.0, - "reward": 2.0141000747680664, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.25392112135887146, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.058, - "step": 116 - }, - { - "loss": 0.0429, - "grad_norm": 0.04275592789053917, - "learning_rate": 4.311111111111111e-05, - "num_tokens": 142533.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1080000028014183, - "rewards/env_reward/std": 0.0, - "reward": 1.6080000400543213, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4292468726634979, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0585, - "step": 117 - }, - { - "loss": 0.0406, - "grad_norm": 0.03664204105734825, - "learning_rate": 4.3e-05, - "num_tokens": 143753.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09380000084638596, - "rewards/env_reward/std": 0.0, - "reward": 1.5937999486923218, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4057614803314209, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.059, - "step": 118 - }, - { - "loss": 0.0157, - "grad_norm": 0.020087143406271935, - "learning_rate": 4.2888888888888886e-05, - "num_tokens": 144993.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.705399990081787, - "rewards/env_reward/std": 0.0, - "reward": 4.205399990081787, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.15697520971298218, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0595, - "step": 119 - }, - { - "loss": 0.0447, - "grad_norm": 0.04034416750073433, - "learning_rate": 4.277777777777778e-05, - "num_tokens": 146209.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4903999865055084, - "rewards/env_reward/std": 0.0, - "reward": 1.990399956703186, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4468948245048523, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.06, - "step": 120 - }, - { - "loss": 0.0335, - "grad_norm": 0.026327671483159065, - "learning_rate": 4.266666666666667e-05, - "num_tokens": 147429.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4788999855518341, - "rewards/env_reward/std": 0.0, - "reward": 1.9788999557495117, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.334785133600235, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0605, - "step": 121 - }, - { - "loss": 0.0422, - "grad_norm": 0.02646907977759838, - "learning_rate": 4.255555555555556e-05, - "num_tokens": 148649.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08839999884366989, - "rewards/env_reward/std": 0.0, - "reward": 1.5884000062942505, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.422196626663208, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.061, - "step": 122 - }, - { - "loss": 0.5017, - "grad_norm": NaN, - "learning_rate": 4.2444444444444445e-05, - "num_tokens": 149864.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0749250054359436, - "rewards/env_reward/std": 0.0864500030875206, - "reward": 1.5749249458312988, - "reward_std": 0.08644998073577881, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 5.016612961888313, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0615, - "step": 123 - }, - { - "loss": 0.0299, - "grad_norm": 0.03146585077047348, - "learning_rate": 4.2444444444444445e-05, - "num_tokens": 151080.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.3415000438690186, - "rewards/env_reward/std": 0.0, - "reward": 3.8415000438690186, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2987973093986511, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.062, - "step": 124 - }, - { - "loss": 0.0243, - "grad_norm": 0.020708182826638222, - "learning_rate": 4.233333333333334e-05, - "num_tokens": 152300.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.16120000183582306, - "rewards/env_reward/std": 0.0, - "reward": 1.6612000465393066, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.24254640936851501, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0625, - "step": 125 - }, - { - "loss": 0.0815, - "grad_norm": 0.03530203923583031, - "learning_rate": 4.222222222222222e-05, - "num_tokens": 153524.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.22169999778270721, - "rewards/env_reward/std": 0.0, - "reward": 1.2783000469207764, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.8152614235877991, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.063, - "step": 126 - }, - { - "loss": 0.5948, - "grad_norm": NaN, - "learning_rate": 4.211111111111111e-05, - "num_tokens": 154743.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.15600000321865082, - "rewards/env_reward/std": 0.0, - "reward": 1.656000018119812, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 5.947892501950264, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0635, - "step": 127 - }, - { - "loss": 0.0188, - "grad_norm": 0.011080100201070309, - "learning_rate": 4.211111111111111e-05, - "num_tokens": 155959.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.18799710273742676, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.064, - "step": 128 - }, - { - "loss": 0.031, - "grad_norm": 0.02280164510011673, - "learning_rate": 4.2e-05, - "num_tokens": 157179.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.31020915508270264, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0645, - "step": 129 - }, - { - "loss": 0.0566, - "grad_norm": 0.06029680743813515, - "learning_rate": 4.188888888888889e-05, - "num_tokens": 158395.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.07129999995231628, - "rewards/env_reward/std": 0.0, - "reward": 1.4286999702453613, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.5657824277877808, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.065, - "step": 130 - }, - { - "loss": 1.6834, - "grad_norm": 317.1106262207031, - "learning_rate": 4.177777777777778e-05, - "num_tokens": 159641.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.45752501487731934, - "rewards/env_reward/std": 0.028950003907084465, - "reward": 1.9575250148773193, - "reward_std": 0.028950056061148643, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 16.833803206682205, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0655, - "step": 131 - }, - { - "loss": 1.0988, - "grad_norm": NaN, - "learning_rate": 4.166666666666667e-05, - "num_tokens": 160860.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.12665000557899475, - "rewards/env_reward/std": 0.35850000381469727, - "reward": 1.3733500242233276, - "reward_std": 0.35850000381469727, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 10.988276034593582, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.066, - "step": 132 - }, - { - "loss": 1.4756, - "grad_norm": 285.3789978027344, - "learning_rate": 4.166666666666667e-05, - "num_tokens": 162077.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3772500157356262, - "rewards/env_reward/std": 0.2277943193912506, - "reward": 1.8772499561309814, - "reward_std": 0.22779428958892822, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 14.75564831495285, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0665, - "step": 133 - }, - { - "loss": 0.012, - "grad_norm": 0.01531064510345459, - "learning_rate": 4.155555555555556e-05, - "num_tokens": 163297.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.119660884141922, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.067, - "step": 134 - }, - { - "loss": 0.0384, - "grad_norm": 0.042683765292167664, - "learning_rate": 4.144444444444445e-05, - "num_tokens": 164517.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.38960000872612, - "rewards/env_reward/std": 0.0, - "reward": 1.8896000385284424, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3839707374572754, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0675, - "step": 135 - }, - { - "loss": 0.0353, - "grad_norm": 0.027836784720420837, - "learning_rate": 4.133333333333333e-05, - "num_tokens": 165737.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.593999981880188, - "rewards/env_reward/std": 0.0, - "reward": 2.0939998626708984, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3534739017486572, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.068, - "step": 136 - }, - { - "loss": 0.3368, - "grad_norm": 24.851154327392578, - "learning_rate": 4.1222222222222224e-05, - "num_tokens": 166960.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.200200080871582, - "rewards/env_reward/std": 1.5382000207901, - "reward": 3.700200080871582, - "reward_std": 1.5382001399993896, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 3.368258073925972, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0685, - "step": 137 - }, - { - "loss": 0.046, - "grad_norm": 0.05512086674571037, - "learning_rate": 4.111111111111111e-05, - "num_tokens": 168176.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.74180006980896, - "rewards/env_reward/std": 0.0, - "reward": 4.241800308227539, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.45961225032806396, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.069, - "step": 138 - }, - { - "loss": 0.0351, - "grad_norm": 0.0317252017557621, - "learning_rate": 4.1e-05, - "num_tokens": 169396.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.27810001373291, - "rewards/env_reward/std": 0.0, - "reward": 3.77810001373291, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3511863350868225, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0695, - "step": 139 - }, - { - "loss": 0.1634, - "grad_norm": 9.790648460388184, - "learning_rate": 4.088888888888889e-05, - "num_tokens": 170639.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13592499494552612, - "rewards/env_reward/std": 0.07474999874830246, - "reward": 1.635925054550171, - "reward_std": 0.07474998384714127, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.633728265762329, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.07, - "step": 140 - }, - { - "loss": 0.0719, - "grad_norm": 10.20471477508545, - "learning_rate": 4.0777777777777783e-05, - "num_tokens": 171858.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11779999732971191, - "rewards/env_reward/std": 0.0, - "reward": 1.617799997329712, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.7185821086168289, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0705, - "step": 141 - }, - { - "loss": 0.0609, - "grad_norm": 0.04811082035303116, - "learning_rate": 4.066666666666667e-05, - "num_tokens": 173074.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.01860000006854534, - "rewards/env_reward/std": 0.0, - "reward": 1.5185999870300293, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.6092149615287781, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.071, - "step": 142 - }, - { - "loss": 0.0715, - "grad_norm": 10.69751262664795, - "learning_rate": 4.055555555555556e-05, - "num_tokens": 174289.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.9459749460220337, - "rewards/env_reward/std": 1.3590500354766846, - "reward": 3.445974826812744, - "reward_std": 1.3590497970581055, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7153288573026657, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0715, - "step": 143 - }, - { - "loss": 0.0376, - "grad_norm": 0.038950882852077484, - "learning_rate": 4.0444444444444444e-05, - "num_tokens": 175505.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.43720000982284546, - "rewards/env_reward/std": 0.0, - "reward": 1.9372000694274902, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.37605082988739014, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.072, - "step": 144 - }, - { - "loss": 0.0852, - "grad_norm": 21.775800704956055, - "learning_rate": 4.0333333333333336e-05, - "num_tokens": 176724.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13734997808933258, - "rewards/env_reward/std": 0.5797000527381897, - "reward": 1.6373498439788818, - "reward_std": 0.5796998739242554, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8518322631716728, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0725, - "step": 145 - }, - { - "loss": 0.0286, - "grad_norm": 0.04734448343515396, - "learning_rate": 4.022222222222222e-05, - "num_tokens": 177944.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.21299999952316284, - "rewards/env_reward/std": 0.0, - "reward": 1.7130000591278076, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2863689064979553, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.073, - "step": 146 - }, - { - "loss": 0.0756, - "grad_norm": 18.95853042602539, - "learning_rate": 4.011111111111111e-05, - "num_tokens": 179159.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.12667499482631683, - "rewards/env_reward/std": 0.16345000267028809, - "reward": 1.373324990272522, - "reward_std": 0.16345000267028809, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7564148157835007, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0735, - "step": 147 - }, - { - "loss": 0.0258, - "grad_norm": 0.031110836192965508, - "learning_rate": 4e-05, - "num_tokens": 180375.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.9486000537872314, - "rewards/env_reward/std": 0.0, - "reward": 4.448599815368652, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2578928470611572, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.074, - "step": 148 - }, - { - "loss": 0.0471, - "grad_norm": 16.788101196289062, - "learning_rate": 3.9888888888888895e-05, - "num_tokens": 181594.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4031500220298767, - "rewards/env_reward/std": 0.4043000340461731, - "reward": 1.903149962425232, - "reward_std": 0.4043000638484955, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4707563817501068, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0745, - "step": 149 - }, - { - "loss": 0.0301, - "grad_norm": 0.05600623041391373, - "learning_rate": 3.977777777777778e-05, - "num_tokens": 182814.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11779999732971191, - "rewards/env_reward/std": 0.0, - "reward": 1.617799997329712, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.30064845085144043, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.075, - "step": 150 - }, - { - "loss": 0.0347, - "grad_norm": 15.009771347045898, - "learning_rate": 3.966666666666667e-05, - "num_tokens": 184033.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.44749999046325684, - "rewards/env_reward/std": 0.054399993270635605, - "reward": 1.9474999904632568, - "reward_std": 0.054399967193603516, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.34688687324523926, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0755, - "step": 151 - }, - { - "loss": 0.026, - "grad_norm": 14.952474594116211, - "learning_rate": 3.9555555555555556e-05, - "num_tokens": 185252.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06487499922513962, - "rewards/env_reward/std": 0.027149999514222145, - "reward": 1.5648750066757202, - "reward_std": 0.027149956673383713, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.25970758497714996, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.076, - "step": 152 - }, - { - "loss": 0.1091, - "grad_norm": 15.009089469909668, - "learning_rate": 3.944444444444445e-05, - "num_tokens": 186471.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1564750075340271, - "rewards/env_reward/std": 0.2308499962091446, - "reward": 1.3435250520706177, - "reward_std": 0.2308499962091446, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0911710932850838, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0765, - "step": 153 - }, - { - "loss": 0.0086, - "grad_norm": 0.06202450767159462, - "learning_rate": 3.933333333333333e-05, - "num_tokens": 187719.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.16519999504089355, - "rewards/env_reward/std": 0.0, - "reward": 1.6651999950408936, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.08641213178634644, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.077, - "step": 154 - }, - { - "loss": 0.0239, - "grad_norm": 0.06951211392879486, - "learning_rate": 3.922222222222223e-05, - "num_tokens": 188939.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08739999681711197, - "rewards/env_reward/std": 0.0, - "reward": 1.587399959564209, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.23888960480690002, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0775, - "step": 155 - }, - { - "loss": 0.0192, - "grad_norm": 0.0887007862329483, - "learning_rate": 3.9111111111111115e-05, - "num_tokens": 190155.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.19150155782699585, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.078, - "step": 156 - }, - { - "loss": 0.0499, - "grad_norm": 17.98279571533203, - "learning_rate": 3.9000000000000006e-05, - "num_tokens": 191374.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11112499237060547, - "rewards/env_reward/std": 0.03234999626874924, - "reward": 1.6111249923706055, - "reward_std": 0.032350022345781326, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4987874999642372, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0785, - "step": 157 - }, - { - "loss": 0.0095, - "grad_norm": 0.2604275941848755, - "learning_rate": 3.888888888888889e-05, - "num_tokens": 192586.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11249999701976776, - "rewards/env_reward/std": 0.0, - "reward": 1.6124999523162842, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.09546057879924774, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.079, - "step": 158 - }, - { - "loss": 0.0635, - "grad_norm": 7.868729114532471, - "learning_rate": 3.877777777777778e-05, - "num_tokens": 193805.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1739250123500824, - "rewards/env_reward/std": 0.12384999543428421, - "reward": 1.6739249229431152, - "reward_std": 0.12384998798370361, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.635490357875824, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0795, - "step": 159 - }, - { - "loss": 0.0751, - "grad_norm": 8.487398147583008, - "learning_rate": 3.866666666666667e-05, - "num_tokens": 195019.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6370000243186951, - "rewards/env_reward/std": 0.4801245331764221, - "reward": 2.13700008392334, - "reward_std": 0.4801245927810669, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.751496285200119, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.08, - "step": 160 - }, - { - "loss": 0.0911, - "grad_norm": 5.337852954864502, - "learning_rate": 3.855555555555556e-05, - "num_tokens": 196237.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.36442500352859497, - "rewards/env_reward/std": 0.24691858887672424, - "reward": 1.8644250631332397, - "reward_std": 0.24691854417324066, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9107427895069122, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0805, - "step": 161 - }, - { - "loss": 0.0027, - "grad_norm": 3.4733355045318604, - "learning_rate": 3.844444444444444e-05, - "num_tokens": 197449.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.49985000491142273, - "rewards/env_reward/std": 0.7634302973747253, - "reward": 1.9998500347137451, - "reward_std": 0.7634302377700806, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.027130742906592786, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.081, - "step": 162 - }, - { - "loss": 0.1171, - "grad_norm": 11.301681518554688, - "learning_rate": 3.8333333333333334e-05, - "num_tokens": 198671.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3044999837875366, - "rewards/env_reward/std": 0.2737325131893158, - "reward": 1.8045001029968262, - "reward_std": 0.27373257279396057, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1712135076522827, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0815, - "step": 163 - }, - { - "loss": 0.0945, - "grad_norm": 4.037586212158203, - "learning_rate": 3.8222222222222226e-05, - "num_tokens": 199885.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.31325000524520874, - "rewards/env_reward/std": 0.40956011414527893, - "reward": 1.813249945640564, - "reward_std": 0.40956008434295654, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9451059624552727, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.082, - "step": 164 - }, - { - "loss": 0.1085, - "grad_norm": 6.316772937774658, - "learning_rate": 3.811111111111112e-05, - "num_tokens": 201098.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.448199987411499, - "rewards/env_reward/std": 0.2181999832391739, - "reward": 1.9482001066207886, - "reward_std": 0.21819999814033508, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0854874588549137, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0825, - "step": 165 - }, - { - "loss": 0.1035, - "grad_norm": 5.041471004486084, - "learning_rate": 3.8e-05, - "num_tokens": 202340.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.14730000495910645, - "rewards/env_reward/std": 0.5735984444618225, - "reward": 1.6473000049591064, - "reward_std": 0.5735983848571777, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0352106094360352, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.083, - "step": 166 - }, - { - "loss": 0.1103, - "grad_norm": 9.183646202087402, - "learning_rate": 3.7888888888888894e-05, - "num_tokens": 203553.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6046750545501709, - "rewards/env_reward/std": 1.2963500022888184, - "reward": 2.104675054550171, - "reward_std": 1.2963500022888184, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.103184774518013, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0835, - "step": 167 - }, - { - "loss": 0.0571, - "grad_norm": 7.774601459503174, - "learning_rate": 3.777777777777778e-05, - "num_tokens": 204772.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.536175012588501, - "rewards/env_reward/std": 0.05065000429749489, - "reward": 2.036175012588501, - "reward_std": 0.050650037825107574, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5715078748762608, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.084, - "step": 168 - }, - { - "loss": 0.0006, - "grad_norm": 0.020840495824813843, - "learning_rate": 3.766666666666667e-05, - "num_tokens": 205984.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.38589999079704285, - "rewards/env_reward/std": 0.0, - "reward": 1.8859000205993652, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.0058740577660501, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0845, - "step": 169 - }, - { - "loss": 0.1056, - "grad_norm": 9.08940601348877, - "learning_rate": 3.7555555555555554e-05, - "num_tokens": 207201.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13997501134872437, - "rewards/env_reward/std": 0.24175000190734863, - "reward": 1.6399749517440796, - "reward_std": 0.24174998700618744, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0561846308410168, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.085, - "step": 170 - }, - { - "loss": 0.1105, - "grad_norm": 9.16537857055664, - "learning_rate": 3.7444444444444446e-05, - "num_tokens": 208418.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.16380000114440918, - "rewards/env_reward/std": 0.25220000743865967, - "reward": 1.6638000011444092, - "reward_std": 0.2521999776363373, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1045464426279068, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0855, - "step": 171 - }, - { - "loss": 0.0097, - "grad_norm": 2.578529119491577, - "learning_rate": 3.733333333333334e-05, - "num_tokens": 209630.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.30320000648498535, - "rewards/env_reward/std": 0.09060001373291016, - "reward": 1.8032000064849854, - "reward_std": 0.09060001373291016, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.09665492875501513, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.086, - "step": 172 - }, - { - "loss": 0.0034, - "grad_norm": 3.320836305618286, - "learning_rate": 3.722222222222222e-05, - "num_tokens": 210846.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19130000472068787, - "rewards/env_reward/std": 0.04619999974966049, - "reward": 1.6913000345230103, - "reward_std": 0.046199996024370193, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.03439147397875786, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0865, - "step": 173 - }, - { - "loss": 0.0757, - "grad_norm": 9.950850486755371, - "learning_rate": 3.7111111111111113e-05, - "num_tokens": 212060.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4722000360488892, - "rewards/env_reward/std": 1.7641515731811523, - "reward": 2.9721999168395996, - "reward_std": 1.7641514539718628, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7566990703344345, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.087, - "step": 174 - }, - { - "loss": 0.0018, - "grad_norm": 3.615767478942871, - "learning_rate": 3.7e-05, - "num_tokens": 213272.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.18514999747276306, - "rewards/env_reward/std": 0.20029999315738678, - "reward": 1.3148499727249146, - "reward_std": 0.20029997825622559, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.018229197012260556, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0875, - "step": 175 - }, - { - "loss": 0.0004, - "grad_norm": 0.01711435616016388, - "learning_rate": 3.688888888888889e-05, - "num_tokens": 214484.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.04910000041127205, - "rewards/env_reward/std": 0.0, - "reward": 1.4509000778198242, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.003533522365614772, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.088, - "step": 176 - }, - { - "loss": 0.1084, - "grad_norm": 3.395838737487793, - "learning_rate": 3.677777777777778e-05, - "num_tokens": 215701.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.49125000834465027, - "rewards/env_reward/std": 0.42663922905921936, - "reward": 1.9912500381469727, - "reward_std": 0.42663925886154175, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0841109305620193, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0885, - "step": 177 - }, - { - "loss": 0.1782, - "grad_norm": 3.345154047012329, - "learning_rate": 3.6666666666666666e-05, - "num_tokens": 216917.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.07502499967813492, - "rewards/env_reward/std": 0.2228500097990036, - "reward": 1.4249749183654785, - "reward_std": 0.22284995019435883, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 1.782272845506668, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.089, - "step": 178 - }, - { - "loss": 0.148, - "grad_norm": 10.321990013122559, - "learning_rate": 3.655555555555556e-05, - "num_tokens": 218130.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8711000084877014, - "rewards/env_reward/std": 1.4336090087890625, - "reward": 2.3710999488830566, - "reward_std": 1.4336090087890625, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.4800196141004562, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0895, - "step": 179 - }, - { - "loss": 0.0863, - "grad_norm": 8.806055068969727, - "learning_rate": 3.644444444444445e-05, - "num_tokens": 219343.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7229999899864197, - "rewards/env_reward/std": 1.5180000066757202, - "reward": 2.2230000495910645, - "reward_std": 1.5180000066757202, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8633862249553204, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.09, - "step": 180 - }, - { - "loss": 0.1071, - "grad_norm": 8.860750198364258, - "learning_rate": 3.633333333333333e-05, - "num_tokens": 220556.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7496999502182007, - "rewards/env_reward/std": 1.5017735958099365, - "reward": 2.2497000694274902, - "reward_std": 1.501773476600647, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0707316398620605, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0905, - "step": 181 - }, - { - "loss": 0.0593, - "grad_norm": 9.149721145629883, - "learning_rate": 3.6222222222222225e-05, - "num_tokens": 221774.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.2278499603271484, - "rewards/env_reward/std": 1.6160610914230347, - "reward": 2.7278499603271484, - "reward_std": 1.6160610914230347, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5934920236468315, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.091, - "step": 182 - }, - { - "loss": 0.0992, - "grad_norm": 5.919835090637207, - "learning_rate": 3.611111111111111e-05, - "num_tokens": 222995.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.25929999351501465, - "rewards/env_reward/std": 0.2668980360031128, - "reward": 1.7592999935150146, - "reward_std": 0.2668980360031128, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9921397641301155, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0915, - "step": 183 - }, - { - "loss": 0.0752, - "grad_norm": 8.201157569885254, - "learning_rate": 3.6e-05, - "num_tokens": 224212.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6631500124931335, - "rewards/env_reward/std": 1.2741000652313232, - "reward": 2.1631500720977783, - "reward_std": 1.2740998268127441, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7523262202739716, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.092, - "step": 184 - }, - { - "loss": 0.0011, - "grad_norm": 2.42414927482605, - "learning_rate": 3.5888888888888886e-05, - "num_tokens": 225424.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.016649994999170303, - "rewards/env_reward/std": 0.1965000033378601, - "reward": 1.5166499614715576, - "reward_std": 0.1964999884366989, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.0112856529885903, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0925, - "step": 185 - }, - { - "loss": 0.0532, - "grad_norm": 9.443381309509277, - "learning_rate": 3.577777777777778e-05, - "num_tokens": 226643.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11549999564886093, - "rewards/env_reward/std": 0.009599998593330383, - "reward": 1.6154999732971191, - "reward_std": 0.009600004181265831, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5324121937155724, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.093, - "step": 186 - }, - { - "loss": 0.0581, - "grad_norm": 4.851520538330078, - "learning_rate": 3.566666666666667e-05, - "num_tokens": 227857.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2538999915122986, - "rewards/env_reward/std": 0.20722638070583344, - "reward": 1.7539000511169434, - "reward_std": 0.2072264403104782, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5809661895036697, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0935, - "step": 187 - }, - { - "loss": 0.0899, - "grad_norm": 8.911284446716309, - "learning_rate": 3.555555555555556e-05, - "num_tokens": 229075.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09845000505447388, - "rewards/env_reward/std": 0.061433784663677216, - "reward": 1.598449945449829, - "reward_std": 0.061433833092451096, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8985702842473984, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.094, - "step": 188 - }, - { - "loss": 0.0026, - "grad_norm": 7.717309951782227, - "learning_rate": 3.5444444444444445e-05, - "num_tokens": 230317.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6454499959945679, - "rewards/env_reward/std": 0.48630210757255554, - "reward": 2.1454498767852783, - "reward_std": 0.4863020181655884, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.02618086338043213, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0945, - "step": 189 - }, - { - "loss": 0.0833, - "grad_norm": 3.0679268836975098, - "learning_rate": 3.5333333333333336e-05, - "num_tokens": 231534.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.28885000944137573, - "rewards/env_reward/std": 0.4101759195327759, - "reward": 1.7888500690460205, - "reward_std": 0.4101759195327759, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8325570225715637, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.095, - "step": 190 - }, - { - "loss": 0.1207, - "grad_norm": 6.385967254638672, - "learning_rate": 3.522222222222222e-05, - "num_tokens": 232747.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6820499897003174, - "rewards/env_reward/std": 0.5525884628295898, - "reward": 2.1820499897003174, - "reward_std": 0.5525885224342346, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.2066741809248924, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0955, - "step": 191 - }, - { - "loss": 0.0654, - "grad_norm": 7.2965192794799805, - "learning_rate": 3.511111111111111e-05, - "num_tokens": 233961.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03577499836683273, - "rewards/env_reward/std": 0.09620396047830582, - "reward": 1.5357749462127686, - "reward_std": 0.09620393812656403, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6539988666772842, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.096, - "step": 192 - }, - { - "loss": 0.035, - "grad_norm": 9.981371879577637, - "learning_rate": 3.5e-05, - "num_tokens": 235184.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.225299835205078, - "rewards/env_reward/std": 1.3604000806808472, - "reward": 3.725299835205078, - "reward_std": 1.3603999614715576, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3499831482768059, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0965, - "step": 193 - }, - { - "loss": 0.0672, - "grad_norm": 8.588502883911133, - "learning_rate": 3.4888888888888895e-05, - "num_tokens": 236401.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6177250146865845, - "rewards/env_reward/std": 1.22762131690979, - "reward": 2.117724895477295, - "reward_std": 1.22762131690979, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6720898002386093, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.097, - "step": 194 - }, - { - "loss": 0.0034, - "grad_norm": 3.138629674911499, - "learning_rate": 3.477777777777778e-05, - "num_tokens": 237650.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 0.25, - "rewards/format_valid/std": 1.5, - "rewards/action_legal/mean": 0.125, - "rewards/action_legal/std": 0.75, - "rewards/env_reward/mean": -0.2300250232219696, - "rewards/env_reward/std": 1.8820770978927612, - "reward": 0.14497500658035278, - "reward_std": 4.112740993499756, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.03350692242383957, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0975, - "step": 195 - }, - { - "loss": 0.0167, - "grad_norm": 6.246030330657959, - "learning_rate": 3.466666666666667e-05, - "num_tokens": 238865.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3423500061035156, - "rewards/env_reward/std": 0.4099000096321106, - "reward": 1.8423500061035156, - "reward_std": 0.4099000096321106, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.16748806089162827, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.098, - "step": 196 - }, - { - "loss": 0.0006, - "grad_norm": 0.06681685894727707, - "learning_rate": 3.4555555555555556e-05, - "num_tokens": 240081.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2660999894142151, - "rewards/env_reward/std": 0.0, - "reward": 1.7660999298095703, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.005917628761380911, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0985, - "step": 197 - }, - { - "loss": 0.009, - "grad_norm": 2.279668092727661, - "learning_rate": 3.444444444444445e-05, - "num_tokens": 241293.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8304250240325928, - "rewards/env_reward/std": 0.6611500382423401, - "reward": 2.3304250240325928, - "reward_std": 0.6611500382423401, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.09011396765708923, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.099, - "step": 198 - }, - { - "loss": 0.0256, - "grad_norm": 13.054587364196777, - "learning_rate": 3.433333333333333e-05, - "num_tokens": 242512.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12422500550746918, - "rewards/env_reward/std": 0.0044499970972537994, - "reward": 1.6242250204086304, - "reward_std": 0.004450043197721243, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2554884999990463, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.0995, - "step": 199 - }, - { - "loss": 0.0225, - "grad_norm": 8.720185279846191, - "learning_rate": 3.4222222222222224e-05, - "num_tokens": 243734.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.3869000673294067, - "rewards/env_reward/std": 1.5606932640075684, - "reward": 2.886899948120117, - "reward_std": 1.5606932640075684, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2250380516052246, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1, - "step": 200 - }, - { - "loss": 0.0472, - "grad_norm": 10.663931846618652, - "learning_rate": 3.411111111111111e-05, - "num_tokens": 244948.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1264750063419342, - "rewards/env_reward/std": 0.15003366768360138, - "reward": 1.3735250234603882, - "reward_std": 0.15003369748592377, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.4722195148933679, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1005, - "step": 201 - }, - { - "loss": 0.0505, - "grad_norm": 8.990918159484863, - "learning_rate": 3.4000000000000007e-05, - "num_tokens": 246165.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.2725749015808105, - "rewards/env_reward/std": 1.4268337488174438, - "reward": 2.7725749015808105, - "reward_std": 1.4268337488174438, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5048771500587463, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.101, - "step": 202 - }, - { - "loss": 0.0222, - "grad_norm": 7.526683807373047, - "learning_rate": 3.388888888888889e-05, - "num_tokens": 247383.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2131499946117401, - "rewards/env_reward/std": 0.15301643311977386, - "reward": 1.7131500244140625, - "reward_std": 0.15301649272441864, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.22175872698426247, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1015, - "step": 203 - }, - { - "loss": 0.0001, - "grad_norm": 0.012850847095251083, - "learning_rate": 3.377777777777778e-05, - "num_tokens": 248595.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7128000259399414, - "rewards/env_reward/std": 0.0, - "reward": 2.2128000259399414, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.0014626781921833754, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.102, - "step": 204 - }, - { - "loss": 0.0268, - "grad_norm": 6.102322578430176, - "learning_rate": 3.366666666666667e-05, - "num_tokens": 249808.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6442500352859497, - "rewards/env_reward/std": 0.2973000109195709, - "reward": 2.14424991607666, - "reward_std": 0.2973000407218933, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2678939402103424, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1025, - "step": 205 - }, - { - "loss": 0.0511, - "grad_norm": 6.538288116455078, - "learning_rate": 3.355555555555556e-05, - "num_tokens": 251026.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.2204499989748001, - "rewards/env_reward/std": 0.024459702894091606, - "reward": 1.2795499563217163, - "reward_std": 0.02445964328944683, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5111789032816887, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.103, - "step": 206 - }, - { - "loss": 0.0408, - "grad_norm": 9.782583236694336, - "learning_rate": 3.3444444444444443e-05, - "num_tokens": 252240.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1491749882698059, - "rewards/env_reward/std": 0.03605110943317413, - "reward": 1.3508250713348389, - "reward_std": 0.036051150411367416, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4080735631287098, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1035, - "step": 207 - }, - { - "loss": 0.0322, - "grad_norm": 5.914154052734375, - "learning_rate": 3.3333333333333335e-05, - "num_tokens": 253457.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8079249858856201, - "rewards/env_reward/std": 0.40994998812675476, - "reward": 2.307924747467041, - "reward_std": 0.40994998812675476, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3219486065208912, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.104, - "step": 208 - }, - { - "loss": 0.0284, - "grad_norm": 2.2945775985717773, - "learning_rate": 3.322222222222222e-05, - "num_tokens": 254673.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.14419999718666077, - "rewards/env_reward/std": 0.30845096707344055, - "reward": 1.6441999673843384, - "reward_std": 0.30845096707344055, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.2838685214519501, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1045, - "step": 209 - }, - { - "loss": 0.0027, - "grad_norm": 0.059150148183107376, - "learning_rate": 3.311111111111112e-05, - "num_tokens": 255885.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.992900013923645, - "rewards/env_reward/std": 0.0, - "reward": 2.4928998947143555, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.02724589966237545, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.105, - "step": 210 - }, - { - "loss": 0.0065, - "grad_norm": 2.316431760787964, - "learning_rate": 3.3e-05, - "num_tokens": 257105.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6849249601364136, - "rewards/env_reward/std": 0.37295001745224, - "reward": 2.184924840927124, - "reward_std": 0.3729499578475952, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.06542554311454296, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1055, - "step": 211 - }, - { - "loss": 0.001, - "grad_norm": 0.02504754438996315, - "learning_rate": 3.2888888888888894e-05, - "num_tokens": 258321.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.04019999876618385, - "rewards/env_reward/std": 0.0, - "reward": 1.4598000049591064, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.009607851505279541, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.106, - "step": 212 - }, - { - "loss": 0.0834, - "grad_norm": 5.390646457672119, - "learning_rate": 3.277777777777778e-05, - "num_tokens": 259533.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.22612501680850983, - "rewards/env_reward/std": 0.2722500264644623, - "reward": 1.7261250019073486, - "reward_std": 0.2722500264644623, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.8344221711158752, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1065, - "step": 213 - }, - { - "loss": 0.073, - "grad_norm": 8.010059356689453, - "learning_rate": 3.266666666666667e-05, - "num_tokens": 260750.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.18207500874996185, - "rewards/env_reward/std": 0.17445001006126404, - "reward": 1.317924976348877, - "reward_std": 0.17444996535778046, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7295764461159706, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.107, - "step": 214 - }, - { - "loss": 0.0429, - "grad_norm": 9.196842193603516, - "learning_rate": 3.2555555555555555e-05, - "num_tokens": 261964.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.060099996626377106, - "rewards/env_reward/std": 0.08995117247104645, - "reward": 1.4398999214172363, - "reward_std": 0.08995116502046585, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.42901405692100525, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1075, - "step": 215 - }, - { - "loss": 0.049, - "grad_norm": 6.548834323883057, - "learning_rate": 3.2444444444444446e-05, - "num_tokens": 263177.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6177250146865845, - "rewards/env_reward/std": 0.3055500090122223, - "reward": 2.117724895477295, - "reward_std": 0.3055499792098999, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4904120974242687, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.108, - "step": 216 - }, - { - "loss": 0.0098, - "grad_norm": 0.0852428674697876, - "learning_rate": 3.233333333333333e-05, - "num_tokens": 264397.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4487000107765198, - "rewards/env_reward/std": 0.0, - "reward": 1.948699951171875, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.09779202938079834, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1085, - "step": 217 - }, - { - "loss": 0.0026, - "grad_norm": 0.020340105518698692, - "learning_rate": 3.222222222222223e-05, - "num_tokens": 265613.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.013700008392334, - "rewards/env_reward/std": 0.0, - "reward": 2.513700008392334, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.025670906528830528, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.109, - "step": 218 - }, - { - "loss": 0.0207, - "grad_norm": 5.19188928604126, - "learning_rate": 3.2111111111111114e-05, - "num_tokens": 266855.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6587499976158142, - "rewards/env_reward/std": 0.49077048897743225, - "reward": 2.158750057220459, - "reward_std": 0.49077045917510986, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.206882506608963, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1095, - "step": 219 - }, - { - "loss": 0.0066, - "grad_norm": 0.027376951649785042, - "learning_rate": 3.2000000000000005e-05, - "num_tokens": 268067.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.1727999448776245, - "rewards/env_reward/std": 0.0, - "reward": 2.672800064086914, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.06600229442119598, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.11, - "step": 220 - }, - { - "loss": 0.0653, - "grad_norm": 9.92841911315918, - "learning_rate": 3.188888888888889e-05, - "num_tokens": 269285.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.5943999886512756, - "rewards/env_reward/std": 0.7274900674819946, - "reward": 0.9056000709533691, - "reward_std": 0.7274901270866394, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.6533837057650089, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1105, - "step": 221 - }, - { - "loss": 0.0187, - "grad_norm": 0.040416963398456573, - "learning_rate": 3.177777777777778e-05, - "num_tokens": 270497.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.33410000801086426, - "rewards/env_reward/std": 0.0, - "reward": 1.1658999919891357, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.18729273974895477, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.111, - "step": 222 - }, - { - "loss": 0.0588, - "grad_norm": 5.750544548034668, - "learning_rate": 3.1666666666666666e-05, - "num_tokens": 271710.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5752999782562256, - "rewards/env_reward/std": 0.26019999384880066, - "reward": 2.0752999782562256, - "reward_std": 0.26020002365112305, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5881490483880043, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1115, - "step": 223 - }, - { - "loss": 0.113, - "grad_norm": 8.916067123413086, - "learning_rate": 3.155555555555556e-05, - "num_tokens": 272927.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.020275000482797623, - "rewards/env_reward/std": 0.038450002670288086, - "reward": 1.4797250032424927, - "reward_std": 0.038450002670288086, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1298761367797852, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.112, - "step": 224 - }, - { - "loss": 0.0048, - "grad_norm": 0.035957083106040955, - "learning_rate": 3.144444444444445e-05, - "num_tokens": 274143.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.035099983215332, - "rewards/env_reward/std": 0.0, - "reward": 2.535099983215332, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.04778510332107544, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1125, - "step": 225 - }, - { - "loss": 0.0022, - "grad_norm": 0.01710173487663269, - "learning_rate": 3.1333333333333334e-05, - "num_tokens": 275363.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.9266999959945679, - "rewards/env_reward/std": 0.0, - "reward": 2.4267001152038574, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.022123416885733604, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.113, - "step": 226 - }, - { - "loss": 0.0062, - "grad_norm": 0.031508758664131165, - "learning_rate": 3.1222222222222225e-05, - "num_tokens": 276575.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6335999965667725, - "rewards/env_reward/std": 0.0, - "reward": 2.1335999965667725, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.06243076175451279, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1135, - "step": 227 - }, - { - "loss": 0.0029, - "grad_norm": 4.613099575042725, - "learning_rate": 3.111111111111111e-05, - "num_tokens": 277787.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.0832500085234642, - "rewards/env_reward/std": 0.22603262960910797, - "reward": 1.4167499542236328, - "reward_std": 0.22603262960910797, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.029001038521528244, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.114, - "step": 228 - }, - { - "loss": 0.0037, - "grad_norm": 0.026186607778072357, - "learning_rate": 3.1e-05, - "num_tokens": 279003.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0399999618530273, - "rewards/env_reward/std": 0.0, - "reward": 2.5399999618530273, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03700479120016098, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1145, - "step": 229 - }, - { - "loss": 0.0038, - "grad_norm": 0.018804756924510002, - "learning_rate": 3.088888888888889e-05, - "num_tokens": 280219.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5902000069618225, - "rewards/env_reward/std": 0.0, - "reward": 2.0901999473571777, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03775995969772339, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.115, - "step": 230 - }, - { - "loss": 0.0608, - "grad_norm": 7.023393630981445, - "learning_rate": 3.077777777777778e-05, - "num_tokens": 281436.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11394999921321869, - "rewards/env_reward/std": 0.22169999778270721, - "reward": 1.6139500141143799, - "reward_std": 0.2217000275850296, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6076318472623825, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1155, - "step": 231 - }, - { - "loss": 0.0438, - "grad_norm": 2.462205410003662, - "learning_rate": 3.066666666666667e-05, - "num_tokens": 282682.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.28862500190734863, - "rewards/env_reward/std": 0.37281543016433716, - "reward": 1.7886250019073486, - "reward_std": 0.37281543016433716, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4382362440228462, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.116, - "step": 232 - }, - { - "loss": 0.0013, - "grad_norm": 0.008755974471569061, - "learning_rate": 3.055555555555556e-05, - "num_tokens": 283898.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.036400001496076584, - "rewards/env_reward/std": 0.0, - "reward": 1.5363999605178833, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.012826403602957726, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1165, - "step": 233 - }, - { - "loss": 0.0897, - "grad_norm": 3.114485740661621, - "learning_rate": 3.044444444444445e-05, - "num_tokens": 285115.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.33834999799728394, - "rewards/env_reward/std": 0.29102057218551636, - "reward": 1.8383500576019287, - "reward_std": 0.29102060198783875, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8970653489232063, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.117, - "step": 234 - }, - { - "loss": 0.001, - "grad_norm": 0.006958460435271263, - "learning_rate": 3.0333333333333337e-05, - "num_tokens": 286331.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.31310001015663147, - "rewards/env_reward/std": 0.0, - "reward": 1.813099980354309, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.010390917770564556, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1175, - "step": 235 - }, - { - "loss": 0.0761, - "grad_norm": 5.068345546722412, - "learning_rate": 3.0222222222222225e-05, - "num_tokens": 287544.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6761999726295471, - "rewards/env_reward/std": 0.37300002574920654, - "reward": 2.1761999130249023, - "reward_std": 0.3729999363422394, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7609379142522812, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.118, - "step": 236 - }, - { - "loss": 0.0307, - "grad_norm": 2.9388842582702637, - "learning_rate": 3.0111111111111113e-05, - "num_tokens": 288788.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03700000047683716, - "rewards/env_reward/std": 0.15161217749118805, - "reward": 1.5369999408721924, - "reward_std": 0.15161222219467163, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.3069487512111664, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1185, - "step": 237 - }, - { - "loss": 0.0455, - "grad_norm": 3.424729585647583, - "learning_rate": 3e-05, - "num_tokens": 290004.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.203000009059906, - "rewards/env_reward/std": 0.4246000349521637, - "reward": 1.2969999313354492, - "reward_std": 0.4246000051498413, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.45477381348609924, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.119, - "step": 238 - }, - { - "loss": 0.0037, - "grad_norm": 0.014718780294060707, - "learning_rate": 2.988888888888889e-05, - "num_tokens": 291220.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0318000316619873, - "rewards/env_reward/std": 0.0, - "reward": 2.5318000316619873, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03718819469213486, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1195, - "step": 239 - }, - { - "loss": 0.0032, - "grad_norm": 0.023474005982279778, - "learning_rate": 2.9777777777777777e-05, - "num_tokens": 292432.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1080000028014183, - "rewards/env_reward/std": 0.0, - "reward": 1.3919999599456787, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03223689645528793, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.12, - "step": 240 - }, - { - "loss": 0.0038, - "grad_norm": 0.01683114469051361, - "learning_rate": 2.9666666666666672e-05, - "num_tokens": 293648.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.031300000846385956, - "rewards/env_reward/std": 0.0, - "reward": 1.5312999486923218, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.038463495671749115, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1205, - "step": 241 - }, - { - "loss": 0.0649, - "grad_norm": 7.739398002624512, - "learning_rate": 2.955555555555556e-05, - "num_tokens": 294865.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.43119996786117554, - "rewards/env_reward/std": 0.19619999825954437, - "reward": 1.9312000274658203, - "reward_std": 0.19620005786418915, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6485915556550026, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.121, - "step": 242 - }, - { - "loss": 0.0063, - "grad_norm": 0.0356922373175621, - "learning_rate": 2.9444444444444448e-05, - "num_tokens": 296085.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.021800000220537186, - "rewards/env_reward/std": 0.0, - "reward": 1.5218000411987305, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.06295116990804672, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1215, - "step": 243 - }, - { - "loss": 0.0596, - "grad_norm": 5.311740875244141, - "learning_rate": 2.9333333333333336e-05, - "num_tokens": 297302.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2790749967098236, - "rewards/env_reward/std": 0.10045000165700912, - "reward": 1.7790749073028564, - "reward_std": 0.10045000165700912, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5959512740373611, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.122, - "step": 244 - }, - { - "loss": 0.0042, - "grad_norm": 0.01989293284714222, - "learning_rate": 2.9222222222222224e-05, - "num_tokens": 298514.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.792900025844574, - "rewards/env_reward/std": 0.0, - "reward": 2.2929000854492188, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.04173398017883301, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1225, - "step": 245 - }, - { - "loss": 0.0031, - "grad_norm": 0.007336912676692009, - "learning_rate": 2.9111111111111112e-05, - "num_tokens": 299726.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7305999994277954, - "rewards/env_reward/std": 0.0, - "reward": 2.230599880218506, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.031048377975821495, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.123, - "step": 246 - }, - { - "loss": 0.1736, - "grad_norm": 9.066658020019531, - "learning_rate": 2.9e-05, - "num_tokens": 300938.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.2737250030040741, - "rewards/env_reward/std": 0.12075000256299973, - "reward": 1.2262749671936035, - "reward_std": 0.12074998766183853, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 1.7364354468882084, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1235, - "step": 247 - }, - { - "loss": 0.0322, - "grad_norm": 3.391655683517456, - "learning_rate": 2.8888888888888888e-05, - "num_tokens": 302150.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17912501096725464, - "rewards/env_reward/std": 0.02304999530315399, - "reward": 1.6791250705718994, - "reward_std": 0.023049989715218544, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.32196859270334244, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.124, - "step": 248 - }, - { - "loss": 0.0347, - "grad_norm": 2.445664882659912, - "learning_rate": 2.877777777777778e-05, - "num_tokens": 303362.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3643999993801117, - "rewards/env_reward/std": 0.16140000522136688, - "reward": 1.864400029182434, - "reward_std": 0.16140003502368927, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.34707972407341003, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1245, - "step": 249 - }, - { - "loss": 0.0907, - "grad_norm": 8.821626663208008, - "learning_rate": 2.8666666666666668e-05, - "num_tokens": 304575.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6588499546051025, - "rewards/env_reward/std": 1.5048999786376953, - "reward": 2.1588497161865234, - "reward_std": 1.5048998594284058, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9067427404224873, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.125, - "step": 250 - }, - { - "loss": 0.0037, - "grad_norm": 0.010974357835948467, - "learning_rate": 2.855555555555556e-05, - "num_tokens": 305787.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3894999921321869, - "rewards/env_reward/std": 0.0, - "reward": 1.8895000219345093, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03672550246119499, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1255, - "step": 251 - }, - { - "loss": 0.0292, - "grad_norm": 3.933217763900757, - "learning_rate": 2.8444444444444447e-05, - "num_tokens": 306999.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1211249977350235, - "rewards/env_reward/std": 0.10684999823570251, - "reward": 1.6211249828338623, - "reward_std": 0.10684998333454132, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.2923688758164644, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.126, - "step": 252 - }, - { - "loss": 0.0503, - "grad_norm": 5.908778190612793, - "learning_rate": 2.8333333333333335e-05, - "num_tokens": 308241.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13635000586509705, - "rewards/env_reward/std": 0.009872685186564922, - "reward": 1.6363499164581299, - "reward_std": 0.00987267680466175, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5032139346003532, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1265, - "step": 253 - }, - { - "loss": 0.0028, - "grad_norm": 0.014478636905550957, - "learning_rate": 2.8222222222222223e-05, - "num_tokens": 309457.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.01720000058412552, - "rewards/env_reward/std": 0.0, - "reward": 1.517199993133545, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.0278167724609375, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.127, - "step": 254 - }, - { - "loss": 0.0018, - "grad_norm": 0.010732459835708141, - "learning_rate": 2.811111111111111e-05, - "num_tokens": 310669.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5572999715805054, - "rewards/env_reward/std": 0.0, - "reward": 2.057300090789795, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.018153250217437744, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1275, - "step": 255 - }, - { - "loss": 0.0475, - "grad_norm": 6.396478176116943, - "learning_rate": 2.8000000000000003e-05, - "num_tokens": 311891.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4102250039577484, - "rewards/env_reward/std": 0.24782662093639374, - "reward": 1.9102249145507812, - "reward_std": 0.24782662093639374, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.47455649450421333, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.128, - "step": 256 - }, - { - "loss": 0.0102, - "grad_norm": 0.08098237961530685, - "learning_rate": 2.788888888888889e-05, - "num_tokens": 313103.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.664900004863739, - "rewards/env_reward/std": 0.0, - "reward": 2.164900064468384, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.10185647010803223, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1285, - "step": 257 - }, - { - "loss": 0.0108, - "grad_norm": 2.5040080547332764, - "learning_rate": 2.777777777777778e-05, - "num_tokens": 314319.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7252249717712402, - "rewards/env_reward/std": 0.4409500062465668, - "reward": 2.2252249717712402, - "reward_std": 0.440949946641922, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.10804228484630585, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.129, - "step": 258 - }, - { - "loss": 0.01, - "grad_norm": 2.200788736343384, - "learning_rate": 2.7666666666666667e-05, - "num_tokens": 315531.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7825750112533569, - "rewards/env_reward/std": 0.48044997453689575, - "reward": 2.2825751304626465, - "reward_std": 0.48044994473457336, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.09993956610560417, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1295, - "step": 259 - }, - { - "loss": 0.0588, - "grad_norm": 7.3923139572143555, - "learning_rate": 2.7555555555555555e-05, - "num_tokens": 316748.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4410000145435333, - "rewards/env_reward/std": 0.43780001997947693, - "reward": 1.940999984741211, - "reward_std": 0.43779999017715454, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.588453009724617, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.13, - "step": 260 - }, - { - "loss": 0.0036, - "grad_norm": 0.051993146538734436, - "learning_rate": 2.7444444444444443e-05, - "num_tokens": 317960.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.03790000081062317, - "rewards/env_reward/std": 0.0, - "reward": 1.4621000289916992, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03645044565200806, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1305, - "step": 261 - }, - { - "loss": 0.0301, - "grad_norm": 3.2377870082855225, - "learning_rate": 2.733333333333333e-05, - "num_tokens": 319176.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1784999966621399, - "rewards/env_reward/std": 0.21696822345256805, - "reward": 1.3215000629425049, - "reward_std": 0.21696823835372925, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.3008611798286438, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.131, - "step": 262 - }, - { - "loss": 0.0148, - "grad_norm": 0.13123686611652374, - "learning_rate": 2.7222222222222223e-05, - "num_tokens": 320392.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.14329999685287476, - "rewards/env_reward/std": 0.0, - "reward": 1.6433000564575195, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.14758411049842834, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1315, - "step": 263 - }, - { - "loss": 0.0522, - "grad_norm": 3.171525001525879, - "learning_rate": 2.7111111111111114e-05, - "num_tokens": 321628.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.20374999940395355, - "rewards/env_reward/std": 0.10271061211824417, - "reward": 1.7037500143051147, - "reward_std": 0.10271065682172775, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.5219772905111313, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.132, - "step": 264 - }, - { - "loss": 0.0283, - "grad_norm": 9.769635200500488, - "learning_rate": 2.7000000000000002e-05, - "num_tokens": 322847.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06989999860525131, - "rewards/env_reward/std": 0.11860000342130661, - "reward": 1.5699000358581543, - "reward_std": 0.11860001087188721, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2829606235027313, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1325, - "step": 265 - }, - { - "loss": 0.0067, - "grad_norm": 0.035419248044490814, - "learning_rate": 2.688888888888889e-05, - "num_tokens": 324063.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1979999989271164, - "rewards/env_reward/std": 0.0, - "reward": 1.6979999542236328, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.06715995073318481, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.133, - "step": 266 - }, - { - "loss": 0.0036, - "grad_norm": 0.03618660196661949, - "learning_rate": 2.677777777777778e-05, - "num_tokens": 325275.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17569999396800995, - "rewards/env_reward/std": 0.0, - "reward": 1.6756999492645264, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03648458048701286, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1335, - "step": 267 - }, - { - "loss": 0.0012, - "grad_norm": 0.00686953729018569, - "learning_rate": 2.6666666666666667e-05, - "num_tokens": 326487.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.03020000085234642, - "rewards/env_reward/std": 0.0, - "reward": 1.4697999954223633, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.0123521089553833, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.134, - "step": 268 - }, - { - "loss": 0.0084, - "grad_norm": 0.03367152810096741, - "learning_rate": 2.6555555555555555e-05, - "num_tokens": 327727.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.03959999978542328, - "rewards/env_reward/std": 0.0, - "reward": 1.4603999853134155, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.08355914056301117, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1345, - "step": 269 - }, - { - "loss": 0.0686, - "grad_norm": 13.220355987548828, - "learning_rate": 2.6444444444444443e-05, - "num_tokens": 328940.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12467499822378159, - "rewards/env_reward/std": 0.23714999854564667, - "reward": 1.6246750354766846, - "reward_std": 0.23714995384216309, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6856659427285194, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.135, - "step": 270 - }, - { - "loss": 0.0568, - "grad_norm": 13.717921257019043, - "learning_rate": 2.633333333333333e-05, - "num_tokens": 330157.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.0213250033557415, - "rewards/env_reward/std": 0.10735000669956207, - "reward": 1.4786748886108398, - "reward_std": 0.10734999179840088, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5684559792280197, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1355, - "step": 271 - }, - { - "loss": 0.0026, - "grad_norm": 0.015910228714346886, - "learning_rate": 2.6222222222222226e-05, - "num_tokens": 331373.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.04749999940395355, - "rewards/env_reward/std": 0.0, - "reward": 1.4524999856948853, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.025526802986860275, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.136, - "step": 272 - }, - { - "loss": 0.026, - "grad_norm": 14.18847942352295, - "learning_rate": 2.6111111111111114e-05, - "num_tokens": 332591.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.24905000627040863, - "rewards/env_reward/std": 0.2707195580005646, - "reward": 1.7490500211715698, - "reward_std": 0.2707195281982422, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.25982359051704407, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1365, - "step": 273 - }, - { - "loss": 0.0008, - "grad_norm": 0.011627077125012875, - "learning_rate": 2.6000000000000002e-05, - "num_tokens": 333807.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0943000316619873, - "rewards/env_reward/std": 0.0, - "reward": 2.5943000316619873, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.008376996032893658, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.137, - "step": 274 - }, - { - "loss": 0.0015, - "grad_norm": 0.025715822353959084, - "learning_rate": 2.588888888888889e-05, - "num_tokens": 335019.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.09470000118017197, - "rewards/env_reward/std": 0.0, - "reward": 1.4053000211715698, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.015157541260123253, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1375, - "step": 275 - }, - { - "loss": 0.0037, - "grad_norm": 0.042894382029771805, - "learning_rate": 2.5777777777777778e-05, - "num_tokens": 336235.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.019700000062584877, - "rewards/env_reward/std": 0.0, - "reward": 1.519700050354004, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.03723999112844467, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.138, - "step": 276 - }, - { - "loss": 0.0266, - "grad_norm": 0.18419010937213898, - "learning_rate": 2.5666666666666666e-05, - "num_tokens": 337483.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.05939999967813492, - "rewards/env_reward/std": 0.0, - "reward": 1.559399962425232, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.26566994190216064, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1385, - "step": 277 - }, - { - "loss": 0.009, - "grad_norm": 14.892497062683105, - "learning_rate": 2.5555555555555554e-05, - "num_tokens": 338701.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4440499544143677, - "rewards/env_reward/std": 1.7966563701629639, - "reward": 2.9440500736236572, - "reward_std": 1.7966562509536743, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.08950217813253403, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.139, - "step": 278 - }, - { - "loss": 0.0093, - "grad_norm": 7.610400199890137, - "learning_rate": 2.5444444444444442e-05, - "num_tokens": 339917.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.014999985694885254, - "rewards/env_reward/std": 0.5485435128211975, - "reward": 1.5150001049041748, - "reward_std": 0.5485435128211975, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.09344403445720673, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1395, - "step": 279 - }, - { - "loss": 0.0179, - "grad_norm": 2.456191062927246, - "learning_rate": 2.5333333333333337e-05, - "num_tokens": 341129.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5827499628067017, - "rewards/env_reward/std": 0.148499995470047, - "reward": 2.082750082015991, - "reward_std": 0.1485000103712082, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.17908194661140442, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.14, - "step": 280 - }, - { - "loss": 0.15, - "grad_norm": 11.619885444641113, - "learning_rate": 2.5222222222222225e-05, - "num_tokens": 342370.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.45319998264312744, - "rewards/env_reward/std": 1.5305131673812866, - "reward": 1.953200101852417, - "reward_std": 1.5305134057998657, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.4995090551674366, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1405, - "step": 281 - }, - { - "loss": 0.0366, - "grad_norm": 0.21036075055599213, - "learning_rate": 2.5111111111111113e-05, - "num_tokens": 343614.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.0576000213623047, - "rewards/env_reward/std": 0.0, - "reward": 2.5576000213623047, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.3662685751914978, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.141, - "step": 282 - }, - { - "loss": 0.0113, - "grad_norm": 8.74146842956543, - "learning_rate": 2.5e-05, - "num_tokens": 344828.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4428500235080719, - "rewards/env_reward/std": 0.15349557995796204, - "reward": 1.9428499937057495, - "reward_std": 0.1534956395626068, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.11289814859628677, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1415, - "step": 283 - }, - { - "loss": 0.0467, - "grad_norm": 4.497031211853027, - "learning_rate": 2.488888888888889e-05, - "num_tokens": 346041.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4057999849319458, - "rewards/env_reward/std": 1.4797999858856201, - "reward": 1.9057999849319458, - "reward_std": 1.4797998666763306, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4667878672480583, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.142, - "step": 284 - }, - { - "loss": 0.1547, - "grad_norm": 10.665767669677734, - "learning_rate": 2.477777777777778e-05, - "num_tokens": 347254.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2880000174045563, - "rewards/env_reward/std": 1.1435999870300293, - "reward": 1.7880001068115234, - "reward_std": 1.1435999870300293, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.5469659715890884, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1425, - "step": 285 - }, - { - "loss": 0.1671, - "grad_norm": 40.458736419677734, - "learning_rate": 2.466666666666667e-05, - "num_tokens": 348472.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3573499917984009, - "rewards/env_reward/std": 0.0411650724709034, - "reward": 1.8573499917984009, - "reward_std": 0.04116509109735489, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.6707031056284904, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.143, - "step": 286 - }, - { - "loss": 0.0732, - "grad_norm": 6.045069694519043, - "learning_rate": 2.4555555555555557e-05, - "num_tokens": 349684.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.439674973487854, - "rewards/env_reward/std": 0.4867240786552429, - "reward": 1.9396750926971436, - "reward_std": 0.4867240786552429, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.7324017907958478, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1435, - "step": 287 - }, - { - "loss": 0.303, - "grad_norm": 30.9366512298584, - "learning_rate": 2.4444444444444445e-05, - "num_tokens": 350902.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.23154999315738678, - "rewards/env_reward/std": 0.25824877619743347, - "reward": 1.7315499782562256, - "reward_std": 0.2582487463951111, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 3.029875487089157, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.144, - "step": 288 - }, - { - "loss": 0.0467, - "grad_norm": 3.2864396572113037, - "learning_rate": 2.4333333333333336e-05, - "num_tokens": 352114.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.299049973487854, - "rewards/env_reward/std": 0.22349999845027924, - "reward": 1.200950026512146, - "reward_std": 0.22350001335144043, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.467236191034317, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1445, - "step": 289 - }, - { - "loss": 0.0158, - "grad_norm": 0.14807219803333282, - "learning_rate": 2.4222222222222224e-05, - "num_tokens": 353354.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17890000343322754, - "rewards/env_reward/std": 0.0, - "reward": 1.6789000034332275, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.15768074989318848, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.145, - "step": 290 - }, - { - "loss": 0.2747, - "grad_norm": 36.407737731933594, - "learning_rate": 2.4111111111111113e-05, - "num_tokens": 354567.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.335999995470047, - "rewards/env_reward/std": 0.2505159378051758, - "reward": 1.1640000343322754, - "reward_std": 0.2505159080028534, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.7468889504671097, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1455, - "step": 291 - }, - { - "loss": 0.1787, - "grad_norm": 13.456348419189453, - "learning_rate": 2.4e-05, - "num_tokens": 355781.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.40004998445510864, - "rewards/env_reward/std": 0.11044710874557495, - "reward": 1.9000499248504639, - "reward_std": 0.11044712364673615, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.787174940109253, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.146, - "step": 292 - }, - { - "loss": 0.0, - "grad_norm": 0.012113215401768684, - "learning_rate": 2.3888888888888892e-05, - "num_tokens": 357001.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.024900000542402267, - "rewards/env_reward/std": 0.0, - "reward": 1.524899959564209, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.000368654727935791, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1465, - "step": 293 - }, - { - "loss": 0.0683, - "grad_norm": 3.7350590229034424, - "learning_rate": 2.377777777777778e-05, - "num_tokens": 358215.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.042649999260902405, - "rewards/env_reward/std": 0.11010069400072098, - "reward": 1.4573500156402588, - "reward_std": 0.11010072380304337, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6829928830265999, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.147, - "step": 294 - }, - { - "loss": 0.0682, - "grad_norm": 8.174044609069824, - "learning_rate": 2.3666666666666668e-05, - "num_tokens": 359456.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.030500005930662155, - "rewards/env_reward/std": 0.2613101601600647, - "reward": 1.530500054359436, - "reward_std": 0.2613101899623871, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6816707998514175, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1475, - "step": 295 - }, - { - "loss": 0.0335, - "grad_norm": 13.81272029876709, - "learning_rate": 2.3555555555555556e-05, - "num_tokens": 360672.0, - "completions/mean_length": 4.0, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 0.25, - "rewards/format_valid/std": 1.5, - "rewards/action_legal/mean": 0.125, - "rewards/action_legal/std": 0.75, - "rewards/env_reward/mean": -0.808650016784668, - "rewards/env_reward/std": 1.4733041524887085, - "reward": -0.43365007638931274, - "reward_std": 3.7158007621765137, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.33455391973257065, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.148, - "step": 296 - }, - { - "loss": 0.0277, - "grad_norm": 1.6418352127075195, - "learning_rate": 2.3444444444444448e-05, - "num_tokens": 361888.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09337499737739563, - "rewards/env_reward/std": 0.007350001484155655, - "reward": 1.5933749675750732, - "reward_std": 0.00735000753775239, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2769545763731003, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1485, - "step": 297 - }, - { - "loss": 0.0079, - "grad_norm": 12.594855308532715, - "learning_rate": 2.3333333333333336e-05, - "num_tokens": 363102.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.166100025177002, - "rewards/env_reward/std": 1.2303334474563599, - "reward": 2.666100025177002, - "reward_std": 1.2303334474563599, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.07933932542800903, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.149, - "step": 298 - }, - { - "loss": 0.0161, - "grad_norm": 3.4360291957855225, - "learning_rate": 2.3222222222222224e-05, - "num_tokens": 364314.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3107999861240387, - "rewards/env_reward/std": 0.5266494154930115, - "reward": 1.8107999563217163, - "reward_std": 0.5266494154930115, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.16086430102586746, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1495, - "step": 299 - }, - { - "loss": 0.0199, - "grad_norm": 12.927040100097656, - "learning_rate": 2.3111111111111112e-05, - "num_tokens": 365532.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.22199998795986176, - "rewards/env_reward/std": 0.3220459520816803, - "reward": 1.722000002861023, - "reward_std": 0.3220460116863251, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.19921859353780746, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.15, - "step": 300 - }, - { - "loss": 0.0097, - "grad_norm": 11.500689506530762, - "learning_rate": 2.3000000000000003e-05, - "num_tokens": 366755.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.149049997329712, - "rewards/env_reward/std": 1.404900074005127, - "reward": 3.649049997329712, - "reward_std": 1.404900074005127, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.09673605859279633, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1505, - "step": 301 - }, - { - "loss": 0.0063, - "grad_norm": 3.8352203369140625, - "learning_rate": 2.288888888888889e-05, - "num_tokens": 367971.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2719250023365021, - "rewards/env_reward/std": 0.4681147336959839, - "reward": 1.7719249725341797, - "reward_std": 0.46811479330062866, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.0630109328776598, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.151, - "step": 302 - }, - { - "loss": 0.0224, - "grad_norm": 10.689901351928711, - "learning_rate": 2.277777777777778e-05, - "num_tokens": 369186.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17434999346733093, - "rewards/env_reward/std": 0.13670000433921814, - "reward": 1.6743500232696533, - "reward_std": 0.13669998943805695, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.22385764122009277, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1515, - "step": 303 - }, - { - "loss": 0.0436, - "grad_norm": 16.9063663482666, - "learning_rate": 2.2666666666666668e-05, - "num_tokens": 370404.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17260000109672546, - "rewards/env_reward/std": 0.1812880039215088, - "reward": 1.6726000308990479, - "reward_std": 0.1812879890203476, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4355325996875763, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.152, - "step": 304 - }, - { - "loss": 0.019, - "grad_norm": 10.453081130981445, - "learning_rate": 2.255555555555556e-05, - "num_tokens": 371622.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.5529749393463135, - "rewards/env_reward/std": 1.6722427606582642, - "reward": 3.0529751777648926, - "reward_std": 1.6722426414489746, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.19028642773628235, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1525, - "step": 305 - }, - { - "loss": 0.0007, - "grad_norm": 2.4359123706817627, - "learning_rate": 2.2444444444444447e-05, - "num_tokens": 372834.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.21952499449253082, - "rewards/env_reward/std": 0.2609499990940094, - "reward": 1.2804749011993408, - "reward_std": 0.2609499990940094, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.0068905456573702395, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.153, - "step": 306 - }, - { - "loss": 0.0293, - "grad_norm": 0.18433877825737, - "learning_rate": 2.2333333333333335e-05, - "num_tokens": 374050.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.23100000619888306, - "rewards/env_reward/std": 0.0, - "reward": 1.7309999465942383, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2930239737033844, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1535, - "step": 307 - }, - { - "loss": 0.0439, - "grad_norm": 7.052338600158691, - "learning_rate": 2.2222222222222223e-05, - "num_tokens": 375269.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.196974992752075, - "rewards/env_reward/std": 1.4914499521255493, - "reward": 3.696974992752075, - "reward_std": 1.4914498329162598, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.43932508677244186, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.154, - "step": 308 - }, - { - "loss": 0.0413, - "grad_norm": 7.099660873413086, - "learning_rate": 2.211111111111111e-05, - "num_tokens": 376484.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.0833749771118164, - "rewards/env_reward/std": 1.1990500688552856, - "reward": 3.5833749771118164, - "reward_std": 1.199049949645996, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.41285115480422974, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1545, - "step": 309 - }, - { - "loss": 0.2951, - "grad_norm": 31.55593490600586, - "learning_rate": 2.2000000000000003e-05, - "num_tokens": 377697.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7319250106811523, - "rewards/env_reward/std": 1.3156499862670898, - "reward": 2.2319250106811523, - "reward_std": 1.3156499862670898, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.950943909585476, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.155, - "step": 310 - }, - { - "loss": 0.0574, - "grad_norm": 4.8634724617004395, - "learning_rate": 2.188888888888889e-05, - "num_tokens": 378912.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2956249713897705, - "rewards/env_reward/std": 1.3313499689102173, - "reward": 3.7956249713897705, - "reward_std": 1.3313499689102173, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5737996399402618, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1555, - "step": 311 - }, - { - "loss": 0.2786, - "grad_norm": 24.31743049621582, - "learning_rate": 2.177777777777778e-05, - "num_tokens": 380130.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4107999801635742, - "rewards/env_reward/std": 0.11281422525644302, - "reward": 1.9107999801635742, - "reward_std": 0.11281431466341019, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.786042869091034, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.156, - "step": 312 - }, - { - "loss": 0.3142, - "grad_norm": 28.74258804321289, - "learning_rate": 2.1666666666666667e-05, - "num_tokens": 381344.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4592249393463135, - "rewards/env_reward/std": 1.7793108224868774, - "reward": 2.9592249393463135, - "reward_std": 1.7793108224868774, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 3.1422307789325714, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1565, - "step": 313 - }, - { - "loss": 0.0197, - "grad_norm": 1.6756749153137207, - "learning_rate": 2.1555555555555555e-05, - "num_tokens": 382560.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.0082499980926514, - "rewards/env_reward/std": 1.2208999395370483, - "reward": 3.5082499980926514, - "reward_std": 1.2209001779556274, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.19709030538797379, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.157, - "step": 314 - }, - { - "loss": 0.0153, - "grad_norm": 0.08564022183418274, - "learning_rate": 2.1444444444444443e-05, - "num_tokens": 383776.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.7123000621795654, - "rewards/env_reward/std": 0.0, - "reward": 4.2123003005981445, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1531214416027069, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1575, - "step": 315 - }, - { - "loss": 0.0935, - "grad_norm": 2.8297767639160156, - "learning_rate": 2.1333333333333335e-05, - "num_tokens": 384991.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03192500025033951, - "rewards/env_reward/std": 0.009549999609589577, - "reward": 1.5319249629974365, - "reward_std": 0.009549975395202637, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9346872717142105, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.158, - "step": 316 - }, - { - "loss": 0.232, - "grad_norm": 44.11138153076172, - "learning_rate": 2.1222222222222223e-05, - "num_tokens": 386208.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6417250037193298, - "rewards/env_reward/std": 0.5402948260307312, - "reward": 2.1417250633239746, - "reward_std": 0.5402949452400208, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.319624111056328, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1585, - "step": 317 - }, - { - "loss": 0.2981, - "grad_norm": 25.64458656311035, - "learning_rate": 2.111111111111111e-05, - "num_tokens": 387422.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.2137749195098877, - "rewards/env_reward/std": 1.311942219734192, - "reward": 2.7137749195098877, - "reward_std": 1.311942219734192, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.980517789721489, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.159, - "step": 318 - }, - { - "loss": 0.0255, - "grad_norm": 0.06195315718650818, - "learning_rate": 2.1e-05, - "num_tokens": 388638.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13220000267028809, - "rewards/env_reward/std": 0.0, - "reward": 1.632200002670288, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2548181116580963, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1595, - "step": 319 - }, - { - "loss": 0.0244, - "grad_norm": 0.08441329002380371, - "learning_rate": 2.088888888888889e-05, - "num_tokens": 389858.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11670000106096268, - "rewards/env_reward/std": 0.0, - "reward": 1.6167000532150269, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.24376285076141357, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.16, - "step": 320 - }, - { - "loss": 0.0031, - "grad_norm": 0.04484923183917999, - "learning_rate": 2.077777777777778e-05, - "num_tokens": 391102.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.989500045776367, - "rewards/env_reward/std": 0.0, - "reward": 4.489500045776367, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.03081400692462921, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1605, - "step": 321 - }, - { - "loss": 0.0686, - "grad_norm": 17.365371704101562, - "learning_rate": 2.0666666666666666e-05, - "num_tokens": 392321.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11549999564886093, - "rewards/env_reward/std": 0.009599998593330383, - "reward": 1.6154999732971191, - "reward_std": 0.009600004181265831, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6855819001793861, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.161, - "step": 322 - }, - { - "loss": 0.0571, - "grad_norm": 7.864797592163086, - "learning_rate": 2.0555555555555555e-05, - "num_tokens": 393568.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.226325035095215, - "rewards/env_reward/std": 1.5473500490188599, - "reward": 3.726325035095215, - "reward_std": 1.5473499298095703, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5705475509166718, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1615, - "step": 323 - }, - { - "loss": 0.0432, - "grad_norm": 4.899349212646484, - "learning_rate": 2.0444444444444446e-05, - "num_tokens": 394787.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.43231137096881866, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.162, - "step": 324 - }, - { - "loss": 0.0341, - "grad_norm": 7.9208664894104, - "learning_rate": 2.0333333333333334e-05, - "num_tokens": 396006.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4809750020503998, - "rewards/env_reward/std": 0.11704999208450317, - "reward": 1.9809750318527222, - "reward_std": 0.11704997718334198, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3410384729504585, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1625, - "step": 325 - }, - { - "loss": 0.0173, - "grad_norm": 0.05077933520078659, - "learning_rate": 2.0222222222222222e-05, - "num_tokens": 397230.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.17307066917419434, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.163, - "step": 326 - }, - { - "loss": 0.107, - "grad_norm": 15.94080924987793, - "learning_rate": 2.011111111111111e-05, - "num_tokens": 398448.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12385000288486481, - "rewards/env_reward/std": 0.14749404788017273, - "reward": 1.6238499879837036, - "reward_std": 0.14749404788017273, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0698204934597015, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1635, - "step": 327 - }, - { - "loss": 0.0232, - "grad_norm": 0.06655026227235794, - "learning_rate": 2e-05, - "num_tokens": 399668.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2168000042438507, - "rewards/env_reward/std": 0.0, - "reward": 1.7167999744415283, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.23233091831207275, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.164, - "step": 328 - }, - { - "loss": 0.0141, - "grad_norm": 0.044783495366573334, - "learning_rate": 1.988888888888889e-05, - "num_tokens": 400892.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1407936066389084, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1645, - "step": 329 - }, - { - "loss": 0.0325, - "grad_norm": 0.3753240704536438, - "learning_rate": 1.9777777777777778e-05, - "num_tokens": 402104.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4124999940395355, - "rewards/env_reward/std": 0.0, - "reward": 1.912500023841858, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.3249969184398651, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.165, - "step": 330 - }, - { - "loss": 0.03, - "grad_norm": 10.282318115234375, - "learning_rate": 1.9666666666666666e-05, - "num_tokens": 403323.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6045999526977539, - "rewards/env_reward/std": 0.4702000021934509, - "reward": 2.104599952697754, - "reward_std": 0.4702000021934509, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.30002758651971817, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1655, - "step": 331 - }, - { - "loss": 0.023, - "grad_norm": 0.25242263078689575, - "learning_rate": 1.9555555555555557e-05, - "num_tokens": 404535.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11249999701976776, - "rewards/env_reward/std": 0.0, - "reward": 1.6124999523162842, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.22953550517559052, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.166, - "step": 332 - }, - { - "loss": 0.0502, - "grad_norm": 14.25239086151123, - "learning_rate": 1.9444444444444445e-05, - "num_tokens": 405753.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.031175002455711365, - "rewards/env_reward/std": 0.34885352849960327, - "reward": 1.531174898147583, - "reward_std": 0.34885352849960327, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5018511228263378, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1665, - "step": 333 - }, - { - "loss": 0.0413, - "grad_norm": 2.6266722679138184, - "learning_rate": 1.9333333333333333e-05, - "num_tokens": 406973.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12707500159740448, - "rewards/env_reward/std": 0.004249997902661562, - "reward": 1.627074956893921, - "reward_std": 0.004250010009855032, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.41331926733255386, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.167, - "step": 334 - }, - { - "loss": 0.0229, - "grad_norm": 0.1149856448173523, - "learning_rate": 1.922222222222222e-05, - "num_tokens": 408209.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1868000030517578, - "rewards/env_reward/std": 0.0, - "reward": 1.3131999969482422, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.22919198870658875, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1675, - "step": 335 - }, - { - "loss": 0.0535, - "grad_norm": 9.98381233215332, - "learning_rate": 1.9111111111111113e-05, - "num_tokens": 409432.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.37014999985694885, - "rewards/env_reward/std": 0.23389999568462372, - "reward": 1.870150089263916, - "reward_std": 0.23389999568462372, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5347674936056137, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.168, - "step": 336 - }, - { - "loss": 0.0317, - "grad_norm": 0.06461849808692932, - "learning_rate": 1.9e-05, - "num_tokens": 410652.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5595999956130981, - "rewards/env_reward/std": 0.0, - "reward": 2.0595998764038086, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.31705617904663086, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1685, - "step": 337 - }, - { - "loss": 0.0226, - "grad_norm": 0.05863592401146889, - "learning_rate": 1.888888888888889e-05, - "num_tokens": 411868.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13179999589920044, - "rewards/env_reward/std": 0.0, - "reward": 1.6317999362945557, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.22567623853683472, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.169, - "step": 338 - }, - { - "loss": 0.0174, - "grad_norm": 0.0803116038441658, - "learning_rate": 1.8777777777777777e-05, - "num_tokens": 413112.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.745500087738037, - "rewards/env_reward/std": 0.0, - "reward": 4.245500087738037, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1738945096731186, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1695, - "step": 339 - }, - { - "loss": 0.0219, - "grad_norm": 10.494765281677246, - "learning_rate": 1.866666666666667e-05, - "num_tokens": 414359.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11049999296665192, - "rewards/env_reward/std": 0.06040000170469284, - "reward": 1.6104999780654907, - "reward_std": 0.06040000915527344, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.21919090673327446, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.17, - "step": 340 - }, - { - "loss": 0.0399, - "grad_norm": 10.061318397521973, - "learning_rate": 1.8555555555555557e-05, - "num_tokens": 415577.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3518500030040741, - "rewards/env_reward/std": 0.3896750807762146, - "reward": 1.8518500328063965, - "reward_std": 0.3896750807762146, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3991617187857628, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1705, - "step": 341 - }, - { - "loss": 0.0533, - "grad_norm": 8.98741626739502, - "learning_rate": 1.8444444444444445e-05, - "num_tokens": 416796.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.437375009059906, - "rewards/env_reward/std": 0.2680499851703644, - "reward": 1.9373749494552612, - "reward_std": 0.268049955368042, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5329404026269913, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.171, - "step": 342 - }, - { - "loss": 0.0387, - "grad_norm": 8.799539566040039, - "learning_rate": 1.8333333333333333e-05, - "num_tokens": 418015.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10607500374317169, - "rewards/env_reward/std": 0.030049998313188553, - "reward": 1.6060749292373657, - "reward_std": 0.030049998313188553, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3870438188314438, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1715, - "step": 343 - }, - { - "loss": 0.0693, - "grad_norm": 7.4406280517578125, - "learning_rate": 1.8222222222222224e-05, - "num_tokens": 419229.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2102999985218048, - "rewards/env_reward/std": 0.4063391089439392, - "reward": 1.7102999687194824, - "reward_std": 0.406339168548584, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6930159032344818, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.172, - "step": 344 - }, - { - "loss": 0.0704, - "grad_norm": 5.985000133514404, - "learning_rate": 1.8111111111111112e-05, - "num_tokens": 420448.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.17270000278949738, - "rewards/env_reward/std": 0.0, - "reward": 1.672700047492981, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.7036964893341064, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1725, - "step": 345 - }, - { - "loss": 0.026, - "grad_norm": 0.07850353419780731, - "learning_rate": 1.8e-05, - "num_tokens": 421668.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5697000026702881, - "rewards/env_reward/std": 0.0, - "reward": 2.069700002670288, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.25952446460723877, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.173, - "step": 346 - }, - { - "loss": 0.0499, - "grad_norm": 0.07321112602949142, - "learning_rate": 1.788888888888889e-05, - "num_tokens": 422892.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09549999982118607, - "rewards/env_reward/std": 0.0, - "reward": 1.5954999923706055, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4991249144077301, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1735, - "step": 347 - }, - { - "loss": 0.1159, - "grad_norm": 14.562685012817383, - "learning_rate": 1.777777777777778e-05, - "num_tokens": 424111.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 1.1592684239149094, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.174, - "step": 348 - }, - { - "loss": 0.0324, - "grad_norm": 0.08439615368843079, - "learning_rate": 1.7666666666666668e-05, - "num_tokens": 425327.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.0551999993622303, - "rewards/env_reward/std": 0.0, - "reward": 1.4448000192642212, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.32355356216430664, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1745, - "step": 349 - }, - { - "loss": 0.2581, - "grad_norm": 17.2807674407959, - "learning_rate": 1.7555555555555556e-05, - "num_tokens": 426546.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.07095000147819519, - "rewards/env_reward/std": 0.10130000859498978, - "reward": 1.5709500312805176, - "reward_std": 0.10130000114440918, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.581086255609989, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.175, - "step": 350 - }, - { - "loss": 0.0209, - "grad_norm": 0.06290078908205032, - "learning_rate": 1.7444444444444448e-05, - "num_tokens": 427762.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.48750001192092896, - "rewards/env_reward/std": 0.0, - "reward": 1.9874999523162842, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.20860141515731812, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1755, - "step": 351 - }, - { - "loss": 0.0244, - "grad_norm": 0.07568230479955673, - "learning_rate": 1.7333333333333336e-05, - "num_tokens": 428978.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4765999913215637, - "rewards/env_reward/std": 0.0, - "reward": 1.976599931716919, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.24379196763038635, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.176, - "step": 352 - }, - { - "loss": 0.1051, - "grad_norm": 1.451683759689331, - "learning_rate": 1.7222222222222224e-05, - "num_tokens": 430201.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.14972499012947083, - "rewards/env_reward/std": 0.017549999058246613, - "reward": 1.6497249603271484, - "reward_std": 0.017549991607666016, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0505332052707672, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1765, - "step": 353 - }, - { - "loss": 0.2366, - "grad_norm": 13.9566068649292, - "learning_rate": 1.7111111111111112e-05, - "num_tokens": 431416.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.07552500069141388, - "rewards/env_reward/std": 0.1043500006198883, - "reward": 1.5755250453948975, - "reward_std": 0.10434997081756592, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 2.365892544388771, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.177, - "step": 354 - }, - { - "loss": 0.1949, - "grad_norm": 32.534908294677734, - "learning_rate": 1.7000000000000003e-05, - "num_tokens": 432639.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.27617502212524414, - "rewards/env_reward/std": 0.2965500056743622, - "reward": 1.7761750221252441, - "reward_std": 0.2965499460697174, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.948879137635231, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1775, - "step": 355 - }, - { - "loss": 0.0072, - "grad_norm": 2.4790847301483154, - "learning_rate": 1.688888888888889e-05, - "num_tokens": 433883.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.28629994392395, - "rewards/env_reward/std": 1.42739999294281, - "reward": 3.78629994392395, - "reward_std": 1.42739999294281, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.0723607949912548, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.178, - "step": 356 - }, - { - "loss": 0.1356, - "grad_norm": 5.789950847625732, - "learning_rate": 1.677777777777778e-05, - "num_tokens": 435098.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.129325032234192, - "rewards/env_reward/std": 1.6126071214675903, - "reward": 2.6293251514434814, - "reward_std": 1.6126071214675903, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.3564762622117996, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1785, - "step": 357 - }, - { - "loss": 0.0798, - "grad_norm": 1.1189056634902954, - "learning_rate": 1.6666666666666667e-05, - "num_tokens": 436314.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11904999613761902, - "rewards/env_reward/std": 0.0018999986350536346, - "reward": 1.6190500259399414, - "reward_std": 0.0019000370521098375, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7980380952358246, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.179, - "step": 358 - }, - { - "loss": 0.067, - "grad_norm": 18.612730026245117, - "learning_rate": 1.655555555555556e-05, - "num_tokens": 437529.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 0.25, - "rewards/format_valid/std": 1.5, - "rewards/action_legal/mean": 0.125, - "rewards/action_legal/std": 0.75, - "rewards/env_reward/mean": -0.595674991607666, - "rewards/env_reward/std": 1.6049399375915527, - "reward": -0.2206750512123108, - "reward_std": 3.8537392616271973, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 0.6701930351555347, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1795, - "step": 359 - }, - { - "loss": 0.0139, - "grad_norm": 0.04511000216007233, - "learning_rate": 1.6444444444444447e-05, - "num_tokens": 438773.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.9381000995635986, - "rewards/env_reward/std": 0.0, - "reward": 4.4380998611450195, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.13874168694019318, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.18, - "step": 360 - }, - { - "loss": 0.0321, - "grad_norm": 0.04865608364343643, - "learning_rate": 1.6333333333333335e-05, - "num_tokens": 439989.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.120899997651577, - "rewards/env_reward/std": 0.0, - "reward": 1.62090003490448, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3211500346660614, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1805, - "step": 361 - }, - { - "loss": 0.0416, - "grad_norm": 13.253110885620117, - "learning_rate": 1.6222222222222223e-05, - "num_tokens": 441204.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08607499301433563, - "rewards/env_reward/std": 0.028149999678134918, - "reward": 1.5860750675201416, - "reward_std": 0.028149962425231934, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.41631054133176804, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.181, - "step": 362 - }, - { - "loss": 0.0345, - "grad_norm": 0.05830829218029976, - "learning_rate": 1.6111111111111115e-05, - "num_tokens": 442420.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.41110000014305115, - "rewards/env_reward/std": 0.0, - "reward": 1.9111000299453735, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3447533845901489, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1815, - "step": 363 - }, - { - "loss": 0.0929, - "grad_norm": 6.0469584465026855, - "learning_rate": 1.6000000000000003e-05, - "num_tokens": 443634.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10462500154972076, - "rewards/env_reward/std": 0.0596962571144104, - "reward": 1.6046249866485596, - "reward_std": 0.0596962608397007, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9288121908903122, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.182, - "step": 364 - }, - { - "loss": 0.0555, - "grad_norm": 10.187057495117188, - "learning_rate": 1.588888888888889e-05, - "num_tokens": 444873.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.0332250595092773, - "rewards/env_reward/std": 1.4233499765396118, - "reward": 3.5332248210906982, - "reward_std": 1.4233498573303223, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.555173322558403, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1825, - "step": 365 - }, - { - "loss": 0.1928, - "grad_norm": 19.102651596069336, - "learning_rate": 1.577777777777778e-05, - "num_tokens": 446088.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5183249711990356, - "rewards/env_reward/std": 0.5100499987602234, - "reward": 2.018324851989746, - "reward_std": 0.5100500583648682, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.9275872558355331, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.183, - "step": 366 - }, - { - "loss": 0.0008, - "grad_norm": 7.459122180938721, - "learning_rate": 1.5666666666666667e-05, - "num_tokens": 447304.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.17572498321533203, - "rewards/env_reward/std": 0.24214999377727509, - "reward": 1.324275016784668, - "reward_std": 0.24214999377727509, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.007747650146484375, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1835, - "step": 367 - }, - { - "loss": 0.0527, - "grad_norm": 0.05531112104654312, - "learning_rate": 1.5555555555555555e-05, - "num_tokens": 448524.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0478999987244606, - "rewards/env_reward/std": 0.0, - "reward": 1.5478999614715576, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.527458131313324, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.184, - "step": 368 - }, - { - "loss": 0.1785, - "grad_norm": 12.54236125946045, - "learning_rate": 1.5444444444444446e-05, - "num_tokens": 449738.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.155324935913086, - "rewards/env_reward/std": 1.2862874269485474, - "reward": 2.655324935913086, - "reward_std": 1.286287546157837, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.7845973372459412, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1845, - "step": 369 - }, - { - "loss": 0.0947, - "grad_norm": 11.19451904296875, - "learning_rate": 1.5333333333333334e-05, - "num_tokens": 450952.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8645250201225281, - "rewards/env_reward/std": 0.9372963905334473, - "reward": 2.3645248413085938, - "reward_std": 0.9372963905334473, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9465900436043739, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.185, - "step": 370 - }, - { - "loss": 0.0309, - "grad_norm": 0.05680959299206734, - "learning_rate": 1.5222222222222224e-05, - "num_tokens": 452168.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.22120000422000885, - "rewards/env_reward/std": 0.0, - "reward": 1.7211999893188477, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.30913686752319336, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1855, - "step": 371 - }, - { - "loss": 0.0533, - "grad_norm": 12.08806324005127, - "learning_rate": 1.5111111111111112e-05, - "num_tokens": 453383.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.36172500252723694, - "rewards/env_reward/std": 0.1968500018119812, - "reward": 1.8617249727249146, - "reward_std": 0.19685006141662598, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5326454341411591, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.186, - "step": 372 - }, - { - "loss": 0.0237, - "grad_norm": 0.037441231310367584, - "learning_rate": 1.5e-05, - "num_tokens": 454603.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12919999659061432, - "rewards/env_reward/std": 0.0, - "reward": 1.6291999816894531, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.23727068305015564, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1865, - "step": 373 - }, - { - "loss": 0.0995, - "grad_norm": 18.8474178314209, - "learning_rate": 1.4888888888888888e-05, - "num_tokens": 455845.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4505000114440918, - "rewards/env_reward/std": 0.09029757976531982, - "reward": 1.9505000114440918, - "reward_std": 0.09029749035835266, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9948283955454826, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.187, - "step": 374 - }, - { - "loss": 0.0285, - "grad_norm": 14.588151931762695, - "learning_rate": 1.477777777777778e-05, - "num_tokens": 457082.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7173500061035156, - "rewards/env_reward/std": 0.4462999999523163, - "reward": 2.2173500061035156, - "reward_std": 0.44630002975463867, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.28469008952379227, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1875, - "step": 375 - }, - { - "loss": 0.0764, - "grad_norm": 12.01020336151123, - "learning_rate": 1.4666666666666668e-05, - "num_tokens": 458297.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.287075012922287, - "rewards/env_reward/std": 0.4023500382900238, - "reward": 1.7870750427246094, - "reward_std": 0.4023500382900238, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7636995688080788, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.188, - "step": 376 - }, - { - "loss": 0.0493, - "grad_norm": 14.367756843566895, - "learning_rate": 1.4555555555555556e-05, - "num_tokens": 459511.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.49560001492500305, - "rewards/env_reward/std": 0.3432924747467041, - "reward": 1.9955999851226807, - "reward_std": 0.3432925045490265, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4925762265920639, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1885, - "step": 377 - }, - { - "loss": 0.0477, - "grad_norm": 11.141998291015625, - "learning_rate": 1.4444444444444444e-05, - "num_tokens": 460754.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2250499725341797, - "rewards/env_reward/std": 1.5499000549316406, - "reward": 3.7250499725341797, - "reward_std": 1.5499000549316406, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.47659212350845337, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.189, - "step": 378 - }, - { - "loss": 0.0055, - "grad_norm": 6.927285194396973, - "learning_rate": 1.4333333333333334e-05, - "num_tokens": 461966.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.1862250119447708, - "rewards/env_reward/std": 0.19915001094341278, - "reward": 1.3137750625610352, - "reward_std": 0.1991499662399292, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.05537472292780876, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1895, - "step": 379 - }, - { - "loss": 0.0674, - "grad_norm": 10.835347175598145, - "learning_rate": 1.4222222222222224e-05, - "num_tokens": 463189.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1679999977350235, - "rewards/env_reward/std": 0.07199999690055847, - "reward": 1.6679999828338623, - "reward_std": 0.07200002670288086, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6744091212749481, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.19, - "step": 380 - }, - { - "loss": 0.0151, - "grad_norm": 0.025390751659870148, - "learning_rate": 1.4111111111111112e-05, - "num_tokens": 464405.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.15128396451473236, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1905, - "step": 381 - }, - { - "loss": 0.0431, - "grad_norm": 10.737701416015625, - "learning_rate": 1.4000000000000001e-05, - "num_tokens": 465624.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.07417500019073486, - "rewards/env_reward/std": 0.1214500144124031, - "reward": 1.5741748809814453, - "reward_std": 0.1214500293135643, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.43060215562582016, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.191, - "step": 382 - }, - { - "loss": 0.0116, - "grad_norm": 0.03365212678909302, - "learning_rate": 1.388888888888889e-05, - "num_tokens": 466848.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1161755621433258, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1915, - "step": 383 - }, - { - "loss": 0.0193, - "grad_norm": 12.718644142150879, - "learning_rate": 1.3777777777777778e-05, - "num_tokens": 468061.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.26397499442100525, - "rewards/env_reward/std": 0.42328521609306335, - "reward": 1.7639750242233276, - "reward_std": 0.42328527569770813, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.192903034389019, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.192, - "step": 384 - }, - { - "loss": 0.028, - "grad_norm": 10.580364227294922, - "learning_rate": 1.3666666666666666e-05, - "num_tokens": 469276.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.20807498693466187, - "rewards/env_reward/std": 0.07964999973773956, - "reward": 1.2919249534606934, - "reward_std": 0.07965004444122314, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.28047777712345123, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1925, - "step": 385 - }, - { - "loss": 0.0389, - "grad_norm": 0.11868631094694138, - "learning_rate": 1.3555555555555557e-05, - "num_tokens": 470492.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.23100000619888306, - "rewards/env_reward/std": 0.0, - "reward": 1.7309999465942383, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.38852906227111816, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.193, - "step": 386 - }, - { - "loss": 0.0184, - "grad_norm": 4.448313236236572, - "learning_rate": 1.3444444444444445e-05, - "num_tokens": 471704.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.014475002884864807, - "rewards/env_reward/std": 0.1984500139951706, - "reward": 1.5144751071929932, - "reward_std": 0.1984500139951706, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.18447205191478133, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1935, - "step": 387 - }, - { - "loss": 0.0291, - "grad_norm": 12.654869079589844, - "learning_rate": 1.3333333333333333e-05, - "num_tokens": 472918.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.6022250652313232, - "rewards/env_reward/std": 1.6144437789916992, - "reward": 3.1022250652313232, - "reward_std": 1.6144436597824097, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.29140961170196533, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.194, - "step": 388 - }, - { - "loss": 0.0443, - "grad_norm": 10.518233299255371, - "learning_rate": 1.3222222222222221e-05, - "num_tokens": 474133.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.7465500831604004, - "rewards/env_reward/std": 0.98170006275177, - "reward": 3.2465500831604004, - "reward_std": 0.9817000031471252, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4428248852491379, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1945, - "step": 389 - }, - { - "loss": 0.0398, - "grad_norm": 0.08597031235694885, - "learning_rate": 1.3111111111111113e-05, - "num_tokens": 475349.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.31470000743865967, - "rewards/env_reward/std": 0.0, - "reward": 1.8147000074386597, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.397882878780365, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.195, - "step": 390 - }, - { - "loss": 0.0287, - "grad_norm": 0.1028384119272232, - "learning_rate": 1.3000000000000001e-05, - "num_tokens": 476565.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.835400104522705, - "rewards/env_reward/std": 0.0, - "reward": 4.335400104522705, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.28692421317100525, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1955, - "step": 391 - }, - { - "loss": 0.0439, - "grad_norm": 10.946956634521484, - "learning_rate": 1.2888888888888889e-05, - "num_tokens": 477780.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1368750035762787, - "rewards/env_reward/std": 0.07964999973773956, - "reward": 1.6368749141693115, - "reward_std": 0.07965000718832016, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.43886157870292664, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.196, - "step": 392 - }, - { - "loss": 0.0498, - "grad_norm": 12.383152961730957, - "learning_rate": 1.2777777777777777e-05, - "num_tokens": 478998.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.5355499982833862, - "rewards/env_reward/std": 1.6910011768341064, - "reward": 3.035550117492676, - "reward_std": 1.6910011768341064, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4976438209414482, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1965, - "step": 393 - }, - { - "loss": 0.0249, - "grad_norm": 10.40050220489502, - "learning_rate": 1.2666666666666668e-05, - "num_tokens": 480213.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.1511499881744385, - "rewards/env_reward/std": 1.332900047302246, - "reward": 3.6511502265930176, - "reward_std": 1.332900047302246, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.24874845519661903, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.197, - "step": 394 - }, - { - "loss": 0.0341, - "grad_norm": 10.021830558776855, - "learning_rate": 1.2555555555555557e-05, - "num_tokens": 481428.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11985000222921371, - "rewards/env_reward/std": 0.04410000145435333, - "reward": 1.6198500394821167, - "reward_std": 0.044100046157836914, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3408486172556877, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1975, - "step": 395 - }, - { - "loss": 0.0329, - "grad_norm": 11.029459953308105, - "learning_rate": 1.2444444444444445e-05, - "num_tokens": 482646.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.09814999997615814, - "rewards/env_reward/std": 0.27741682529449463, - "reward": 1.5981500148773193, - "reward_std": 0.27741679549217224, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3289405405521393, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.198, - "step": 396 - }, - { - "loss": 0.0352, - "grad_norm": 0.10097940266132355, - "learning_rate": 1.2333333333333334e-05, - "num_tokens": 483866.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5396999716758728, - "rewards/env_reward/std": 0.0, - "reward": 2.0397000312805176, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3520781695842743, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1985, - "step": 397 - }, - { - "loss": 0.0342, - "grad_norm": 9.53077507019043, - "learning_rate": 1.2222222222222222e-05, - "num_tokens": 485081.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.18497499823570251, - "rewards/env_reward/std": 0.07945000380277634, - "reward": 1.6849749088287354, - "reward_std": 0.07945001125335693, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3423382416367531, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.199, - "step": 398 - }, - { - "loss": 0.0543, - "grad_norm": 14.226777076721191, - "learning_rate": 1.2111111111111112e-05, - "num_tokens": 486300.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6295000314712524, - "rewards/env_reward/std": 0.07639998197555542, - "reward": 2.129499912261963, - "reward_std": 0.0764000415802002, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5428656786680222, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.1995, - "step": 399 - }, - { - "loss": 0.0213, - "grad_norm": 0.054999712854623795, - "learning_rate": 1.2e-05, - "num_tokens": 487520.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.12160000205039978, - "rewards/env_reward/std": 0.0, - "reward": 1.6216000318527222, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2132653295993805, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2, - "step": 400 - }, - { - "loss": 0.0678, - "grad_norm": 8.299275398254395, - "learning_rate": 1.188888888888889e-05, - "num_tokens": 488743.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.673675000667572, - "rewards/env_reward/std": 0.41065001487731934, - "reward": 2.173675060272217, - "reward_std": 0.41064998507499695, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.678008109331131, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2005, - "step": 401 - }, - { - "loss": 0.0118, - "grad_norm": 0.09446023404598236, - "learning_rate": 1.1777777777777778e-05, - "num_tokens": 489991.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5418000221252441, - "rewards/env_reward/std": 0.0, - "reward": 2.041800022125244, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1179627776145935, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.201, - "step": 402 - }, - { - "loss": 0.1018, - "grad_norm": 3.7508544921875, - "learning_rate": 1.1666666666666668e-05, - "num_tokens": 491204.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.010974999517202377, - "rewards/env_reward/std": 0.08034929633140564, - "reward": 1.5109750032424927, - "reward_std": 0.08034923672676086, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.0181541293859482, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2015, - "step": 403 - }, - { - "loss": 0.1123, - "grad_norm": 27.579870223999023, - "learning_rate": 1.1555555555555556e-05, - "num_tokens": 492437.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.03229999914765358, - "rewards/env_reward/std": 0.10499999672174454, - "reward": 1.4677000045776367, - "reward_std": 0.10500001907348633, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1227066367864609, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.202, - "step": 404 - }, - { - "loss": 0.0542, - "grad_norm": 13.787353515625, - "learning_rate": 1.1444444444444446e-05, - "num_tokens": 493652.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.28747498989105225, - "rewards/env_reward/std": 0.048350006341934204, - "reward": 1.7874749898910522, - "reward_std": 0.048350054770708084, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5424342602491379, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2025, - "step": 405 - }, - { - "loss": 0.1413, - "grad_norm": 9.871804237365723, - "learning_rate": 1.1333333333333334e-05, - "num_tokens": 494893.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.21519999206066132, - "rewards/env_reward/std": 0.532200038433075, - "reward": 1.2848000526428223, - "reward_std": 0.532200038433075, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.4125559478998184, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.203, - "step": 406 - }, - { - "loss": 0.0222, - "grad_norm": 0.07422968745231628, - "learning_rate": 1.1222222222222224e-05, - "num_tokens": 496109.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.775099992752075, - "rewards/env_reward/std": 0.0, - "reward": 4.275099754333496, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.22246673703193665, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2035, - "step": 407 - }, - { - "loss": 0.0368, - "grad_norm": 0.09009969979524612, - "learning_rate": 1.1111111111111112e-05, - "num_tokens": 497325.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11050000041723251, - "rewards/env_reward/std": 0.0, - "reward": 1.6104999780654907, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.36780449748039246, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.204, - "step": 408 - }, - { - "loss": 0.0286, - "grad_norm": 1.5730061531066895, - "learning_rate": 1.1000000000000001e-05, - "num_tokens": 498545.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11737500131130219, - "rewards/env_reward/std": 0.006950002163648605, - "reward": 1.6173748970031738, - "reward_std": 0.006950021255761385, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.28572753444314003, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2045, - "step": 409 - }, - { - "loss": 0.0659, - "grad_norm": 2.1238832473754883, - "learning_rate": 1.088888888888889e-05, - "num_tokens": 499758.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.46617498993873596, - "rewards/env_reward/std": 1.444044589996338, - "reward": 1.9661749601364136, - "reward_std": 1.4440444707870483, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6587927043437958, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.205, - "step": 410 - }, - { - "loss": 0.0483, - "grad_norm": 0.0901445522904396, - "learning_rate": 1.0777777777777778e-05, - "num_tokens": 500978.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.6664999723434448, - "rewards/env_reward/std": 0.0, - "reward": 3.1665000915527344, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.48277437686920166, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2055, - "step": 411 - }, - { - "loss": 0.0311, - "grad_norm": 0.17845037579536438, - "learning_rate": 1.0666666666666667e-05, - "num_tokens": 502190.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4124999940395355, - "rewards/env_reward/std": 0.0, - "reward": 1.912500023841858, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.3107207417488098, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.206, - "step": 412 - }, - { - "loss": 0.0175, - "grad_norm": 0.05390382558107376, - "learning_rate": 1.0555555555555555e-05, - "num_tokens": 503406.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.05739999935030937, - "rewards/env_reward/std": 0.0, - "reward": 1.5573999881744385, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.1745627522468567, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2065, - "step": 413 - }, - { - "loss": 0.0625, - "grad_norm": 10.375347137451172, - "learning_rate": 1.0444444444444445e-05, - "num_tokens": 504623.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03642500936985016, - "rewards/env_reward/std": 0.5068695545196533, - "reward": 1.536424994468689, - "reward_std": 0.5068694949150085, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6247732043266296, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.207, - "step": 414 - }, - { - "loss": 0.044, - "grad_norm": 0.110545314848423, - "learning_rate": 1.0333333333333333e-05, - "num_tokens": 505839.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0925000011920929, - "rewards/env_reward/std": 0.0, - "reward": 1.5924999713897705, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4403933882713318, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2075, - "step": 415 - }, - { - "loss": 0.0276, - "grad_norm": 2.9608986377716064, - "learning_rate": 1.0222222222222223e-05, - "num_tokens": 507054.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5720750093460083, - "rewards/env_reward/std": 0.11894048005342484, - "reward": 2.0720748901367188, - "reward_std": 0.11894050985574722, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.27571701258420944, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.208, - "step": 416 - }, - { - "loss": 0.0714, - "grad_norm": 4.221258640289307, - "learning_rate": 1.0111111111111111e-05, - "num_tokens": 508268.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11145000159740448, - "rewards/env_reward/std": 0.08967870473861694, - "reward": 1.611449956893921, - "reward_std": 0.08967869728803635, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7143204063177109, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2085, - "step": 417 - }, - { - "loss": 0.0089, - "grad_norm": 1.7130541801452637, - "learning_rate": 1e-05, - "num_tokens": 509484.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5528500080108643, - "rewards/env_reward/std": 0.09589999914169312, - "reward": 2.0528500080108643, - "reward_std": 0.09590005874633789, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.0889815017580986, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.209, - "step": 418 - }, - { - "loss": 0.1025, - "grad_norm": 0.15961776673793793, - "learning_rate": 9.888888888888889e-06, - "num_tokens": 510700.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4427000284194946, - "rewards/env_reward/std": 0.0, - "reward": 2.942699909210205, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 1.024822473526001, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2095, - "step": 419 - }, - { - "loss": 0.0877, - "grad_norm": 2.687859296798706, - "learning_rate": 9.777777777777779e-06, - "num_tokens": 511918.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2002749890089035, - "rewards/env_reward/std": 0.2704545259475708, - "reward": 1.70027494430542, - "reward_std": 0.2704544961452484, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8766704872250557, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.21, - "step": 420 - }, - { - "loss": 0.0133, - "grad_norm": 0.07045546919107437, - "learning_rate": 9.666666666666667e-06, - "num_tokens": 513134.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.02889999933540821, - "rewards/env_reward/std": 0.0, - "reward": 1.5289000272750854, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.13296586275100708, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2105, - "step": 421 - }, - { - "loss": 0.0111, - "grad_norm": 0.04775848612189293, - "learning_rate": 9.555555555555556e-06, - "num_tokens": 514350.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.11139857769012451, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.211, - "step": 422 - }, - { - "loss": 0.0531, - "grad_norm": 14.071207046508789, - "learning_rate": 9.444444444444445e-06, - "num_tokens": 515565.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.38374999165534973, - "rewards/env_reward/std": 0.058500006794929504, - "reward": 1.883750081062317, - "reward_std": 0.0585000142455101, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5313576012849808, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2115, - "step": 423 - }, - { - "loss": 0.0495, - "grad_norm": 6.999293804168701, - "learning_rate": 9.333333333333334e-06, - "num_tokens": 516780.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2377500534057617, - "rewards/env_reward/std": 1.5245001316070557, - "reward": 3.7377500534057617, - "reward_std": 1.5245000123977661, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.494718536734581, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.212, - "step": 424 - }, - { - "loss": 0.0561, - "grad_norm": 6.606105327606201, - "learning_rate": 9.222222222222222e-06, - "num_tokens": 517994.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06355000287294388, - "rewards/env_reward/std": 0.053404901176691055, - "reward": 1.5635499954223633, - "reward_std": 0.05340488255023956, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5608605295419693, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2125, - "step": 425 - }, - { - "loss": 0.0411, - "grad_norm": 6.317296504974365, - "learning_rate": 9.111111111111112e-06, - "num_tokens": 519213.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5113000273704529, - "rewards/env_reward/std": 0.04380001127719879, - "reward": 2.0113000869750977, - "reward_std": 0.0437999963760376, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.41063985228538513, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.213, - "step": 426 - }, - { - "loss": 0.0445, - "grad_norm": 16.5034236907959, - "learning_rate": 9e-06, - "num_tokens": 520428.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.34769999980926514, - "rewards/env_reward/std": 0.3646000027656555, - "reward": 1.8476999998092651, - "reward_std": 0.36459994316101074, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.44513607025146484, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2135, - "step": 427 - }, - { - "loss": 0.0686, - "grad_norm": 4.950837135314941, - "learning_rate": 8.88888888888889e-06, - "num_tokens": 521642.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.04944999888539314, - "rewards/env_reward/std": 0.033890459686517715, - "reward": 1.5494499206542969, - "reward_std": 0.03389044106006622, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6864325404167175, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.214, - "step": 428 - }, - { - "loss": 0.0708, - "grad_norm": 18.683910369873047, - "learning_rate": 8.777777777777778e-06, - "num_tokens": 522861.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.26930001378059387, - "rewards/env_reward/std": 0.33180001378059387, - "reward": 1.7692999839782715, - "reward_std": 0.3317999839782715, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7076273337006569, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2145, - "step": 429 - }, - { - "loss": 0.0819, - "grad_norm": 3.3483595848083496, - "learning_rate": 8.666666666666668e-06, - "num_tokens": 524075.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4747999906539917, - "rewards/env_reward/std": 1.761149287223816, - "reward": 2.9748001098632812, - "reward_std": 1.761149287223816, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.8185893446207047, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.215, - "step": 430 - }, - { - "loss": 0.1114, - "grad_norm": 17.29606819152832, - "learning_rate": 8.555555555555556e-06, - "num_tokens": 525288.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4809499979019165, - "rewards/env_reward/std": 0.36256280541419983, - "reward": 1.9809499979019165, - "reward_std": 0.36256274580955505, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1136217415332794, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2155, - "step": 431 - }, - { - "loss": 0.0464, - "grad_norm": 6.7067790031433105, - "learning_rate": 8.444444444444446e-06, - "num_tokens": 526507.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19207501411437988, - "rewards/env_reward/std": 0.037450000643730164, - "reward": 1.6920750141143799, - "reward_std": 0.037449996918439865, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4635205864906311, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.216, - "step": 432 - }, - { - "loss": 0.0313, - "grad_norm": 0.09291418641805649, - "learning_rate": 8.333333333333334e-06, - "num_tokens": 527727.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.11069999635219574, - "rewards/env_reward/std": 0.0, - "reward": 1.610700011253357, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.3125154972076416, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2165, - "step": 433 - }, - { - "loss": 0.0213, - "grad_norm": 0.07645757496356964, - "learning_rate": 8.222222222222223e-06, - "num_tokens": 528943.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.061500001698732376, - "rewards/env_reward/std": 0.0, - "reward": 1.5614999532699585, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.21342608332633972, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.217, - "step": 434 - }, - { - "loss": 0.0431, - "grad_norm": 6.316166400909424, - "learning_rate": 8.111111111111112e-06, - "num_tokens": 530162.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.105675220489502, - "rewards/env_reward/std": 1.4696500301361084, - "reward": 3.605674982070923, - "reward_std": 1.4696499109268188, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4310971572995186, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2175, - "step": 435 - }, - { - "loss": 0.0591, - "grad_norm": 6.796760559082031, - "learning_rate": 8.000000000000001e-06, - "num_tokens": 531377.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.9459749460220337, - "rewards/env_reward/std": 1.3590500354766846, - "reward": 3.445974826812744, - "reward_std": 1.3590497970581055, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5905175358057022, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.218, - "step": 436 - }, - { - "loss": 0.1627, - "grad_norm": 34.66132736206055, - "learning_rate": 7.88888888888889e-06, - "num_tokens": 532595.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.08609999716281891, - "rewards/env_reward/std": 0.2364826649427414, - "reward": 1.5860999822616577, - "reward_std": 0.2364826798439026, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.6273682713508606, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2185, - "step": 437 - }, - { - "loss": 0.0281, - "grad_norm": 1.5878809690475464, - "learning_rate": 7.777777777777777e-06, - "num_tokens": 533815.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10175000131130219, - "rewards/env_reward/std": 0.014699999243021011, - "reward": 1.6017498970031738, - "reward_std": 0.01470001507550478, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2811654843389988, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.219, - "step": 438 - }, - { - "loss": 0.0671, - "grad_norm": 2.8122143745422363, - "learning_rate": 7.666666666666667e-06, - "num_tokens": 535030.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.307950019836426, - "rewards/env_reward/std": 1.3840999603271484, - "reward": 3.807950019836426, - "reward_std": 1.3840999603271484, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6714244410395622, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2195, - "step": 439 - }, - { - "loss": 0.1292, - "grad_norm": 22.363697052001953, - "learning_rate": 7.555555555555556e-06, - "num_tokens": 536244.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.25197499990463257, - "rewards/env_reward/std": 0.3354390859603882, - "reward": 1.7519750595092773, - "reward_std": 0.33543911576271057, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.2917723655700684, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.22, - "step": 440 - }, - { - "loss": 0.0524, - "grad_norm": 6.302943706512451, - "learning_rate": 7.444444444444444e-06, - "num_tokens": 537463.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.46480000019073486, - "rewards/env_reward/std": 0.0, - "reward": 1.9648000001907349, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.5239357650279999, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2205, - "step": 441 - }, - { - "loss": 0.0212, - "grad_norm": 0.08151021599769592, - "learning_rate": 7.333333333333334e-06, - "num_tokens": 538683.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.8804000020027161, - "rewards/env_reward/std": 0.0, - "reward": 2.3803999423980713, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.21206602454185486, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.221, - "step": 442 - }, - { - "loss": 0.0324, - "grad_norm": 0.10439272969961166, - "learning_rate": 7.222222222222222e-06, - "num_tokens": 539927.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4124999940395355, - "rewards/env_reward/std": 0.0, - "reward": 1.912500023841858, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.3237926661968231, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2215, - "step": 443 - }, - { - "loss": 0.0711, - "grad_norm": 3.5051026344299316, - "learning_rate": 7.111111111111112e-06, - "num_tokens": 541146.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.10379999876022339, - "rewards/env_reward/std": 0.028999999165534973, - "reward": 1.6037999391555786, - "reward_std": 0.02900000475347042, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.711153905838728, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.222, - "step": 444 - }, - { - "loss": 0.1239, - "grad_norm": 21.986900329589844, - "learning_rate": 7.000000000000001e-06, - "num_tokens": 542360.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.010450005531311035, - "rewards/env_reward/std": 0.47714316844940186, - "reward": 1.510450005531311, - "reward_std": 0.47714319825172424, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.239209771156311, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2225, - "step": 445 - }, - { - "loss": 0.023, - "grad_norm": 0.07607124745845795, - "learning_rate": 6.888888888888889e-06, - "num_tokens": 543580.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4551999866962433, - "rewards/env_reward/std": 0.0, - "reward": 1.955199956893921, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.22952622175216675, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.223, - "step": 446 - }, - { - "loss": 0.1395, - "grad_norm": 17.633258819580078, - "learning_rate": 6.777777777777779e-06, - "num_tokens": 544793.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.14509999752044678, - "rewards/env_reward/std": 0.186971977353096, - "reward": 1.3548998832702637, - "reward_std": 0.186971977353096, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.3945640623569489, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2235, - "step": 447 - }, - { - "loss": 0.0274, - "grad_norm": 0.10130514949560165, - "learning_rate": 6.666666666666667e-06, - "num_tokens": 546013.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2011999934911728, - "rewards/env_reward/std": 0.0, - "reward": 1.701200008392334, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2743580937385559, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.224, - "step": 448 - }, - { - "loss": 0.0986, - "grad_norm": 6.136234283447266, - "learning_rate": 6.555555555555556e-06, - "num_tokens": 547227.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0943249985575676, - "rewards/env_reward/std": 0.05175946652889252, - "reward": 1.594325065612793, - "reward_std": 0.051759425550699234, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.9864560812711716, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2245, - "step": 449 - }, - { - "loss": 0.0313, - "grad_norm": 0.13909797370433807, - "learning_rate": 6.4444444444444445e-06, - "num_tokens": 548439.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.436599999666214, - "rewards/env_reward/std": 0.0, - "reward": 1.9365999698638916, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.31275004148483276, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.225, - "step": 450 - }, - { - "loss": 0.0496, - "grad_norm": 5.083831787109375, - "learning_rate": 6.333333333333334e-06, - "num_tokens": 549662.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.296875, - "rewards/env_reward/std": 1.40625, - "reward": 3.796875, - "reward_std": 1.40625, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.49592315405607224, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2255, - "step": 451 - }, - { - "loss": 0.0498, - "grad_norm": 17.404560089111328, - "learning_rate": 6.222222222222222e-06, - "num_tokens": 550877.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7101999521255493, - "rewards/env_reward/std": 0.46720001101493835, - "reward": 2.2101998329162598, - "reward_std": 0.46720001101493835, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.49799713492393494, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.226, - "step": 452 - }, - { - "loss": 0.0237, - "grad_norm": 0.10003330558538437, - "learning_rate": 6.111111111111111e-06, - "num_tokens": 552093.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.22120000422000885, - "rewards/env_reward/std": 0.0, - "reward": 1.7211999893188477, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.23683232069015503, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2265, - "step": 453 - }, - { - "loss": 0.0352, - "grad_norm": 0.07923237234354019, - "learning_rate": 6e-06, - "num_tokens": 553313.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 3.0, - "rewards/env_reward/std": 0.0, - "reward": 4.5, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.35178661346435547, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.227, - "step": 454 - }, - { - "loss": 0.1115, - "grad_norm": 23.17422866821289, - "learning_rate": 5.888888888888889e-06, - "num_tokens": 554526.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.22169999778270721, - "rewards/env_reward/std": 0.4673518240451813, - "reward": 1.7217000722885132, - "reward_std": 0.4673517942428589, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.1154318749904633, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2275, - "step": 455 - }, - { - "loss": 0.0082, - "grad_norm": 0.048594508320093155, - "learning_rate": 5.777777777777778e-06, - "num_tokens": 555742.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4713999927043915, - "rewards/env_reward/std": 0.0, - "reward": 1.9714000225067139, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.08181055635213852, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.228, - "step": 456 - }, - { - "loss": 0.0641, - "grad_norm": 13.334370613098145, - "learning_rate": 5.666666666666667e-06, - "num_tokens": 556960.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.34242498874664307, - "rewards/env_reward/std": 0.4131866991519928, - "reward": 1.842424988746643, - "reward_std": 0.4131866991519928, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6406428962945938, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2285, - "step": 457 - }, - { - "loss": 0.0471, - "grad_norm": 7.503530979156494, - "learning_rate": 5.555555555555556e-06, - "num_tokens": 558179.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.43992501497268677, - "rewards/env_reward/std": 0.013150006532669067, - "reward": 1.9399250745773315, - "reward_std": 0.013150015845894814, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.47112637758255005, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.229, - "step": 458 - }, - { - "loss": 0.0201, - "grad_norm": 0.08755699545145035, - "learning_rate": 5.444444444444445e-06, - "num_tokens": 559399.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2887001037597656, - "rewards/env_reward/std": 0.0, - "reward": 3.7887001037597656, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.20145288109779358, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2295, - "step": 459 - }, - { - "loss": 0.0792, - "grad_norm": 22.58326530456543, - "learning_rate": 5.333333333333334e-06, - "num_tokens": 560617.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.43185001611709595, - "rewards/env_reward/std": 0.3804160952568054, - "reward": 1.9318499565124512, - "reward_std": 0.3804161548614502, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.7920261472463608, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.23, - "step": 460 - }, - { - "loss": 0.0454, - "grad_norm": 13.632698059082031, - "learning_rate": 5.2222222222222226e-06, - "num_tokens": 561832.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6263749599456787, - "rewards/env_reward/std": 0.4093500077724457, - "reward": 2.1263749599456787, - "reward_std": 0.4093499183654785, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.45408768951892853, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2305, - "step": 461 - }, - { - "loss": 0.0126, - "grad_norm": 0.061295535415410995, - "learning_rate": 5.1111111111111115e-06, - "num_tokens": 563052.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.1851000040769577, - "rewards/env_reward/std": 0.0, - "reward": 1.6850999593734741, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.12610086798667908, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.231, - "step": 462 - }, - { - "loss": 0.0453, - "grad_norm": 10.238383293151855, - "learning_rate": 5e-06, - "num_tokens": 564265.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5235750079154968, - "rewards/env_reward/std": 0.44271576404571533, - "reward": 2.0235750675201416, - "reward_std": 0.44271576404571533, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4533032178878784, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2315, - "step": 463 - }, - { - "loss": 0.0593, - "grad_norm": 8.796762466430664, - "learning_rate": 4.888888888888889e-06, - "num_tokens": 565478.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.2136250138282776, - "rewards/env_reward/std": 0.33112218976020813, - "reward": 1.2863750457763672, - "reward_std": 0.33112218976020813, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5929864794015884, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.232, - "step": 464 - }, - { - "loss": 0.0466, - "grad_norm": 7.1668620109558105, - "learning_rate": 4.777777777777778e-06, - "num_tokens": 566693.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.13312500715255737, - "rewards/env_reward/std": 0.004950001835823059, - "reward": 1.6331250667572021, - "reward_std": 0.004949966911226511, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.46607181429862976, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2325, - "step": 465 - }, - { - "loss": 0.0366, - "grad_norm": 9.879788398742676, - "learning_rate": 4.666666666666667e-06, - "num_tokens": 567907.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2449999898672104, - "rewards/env_reward/std": 0.8202992677688599, - "reward": 1.7450000047683716, - "reward_std": 0.8202992677688599, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3659088909626007, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.233, - "step": 466 - }, - { - "loss": 0.0282, - "grad_norm": 13.147476196289062, - "learning_rate": 4.555555555555556e-06, - "num_tokens": 569126.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.4874749779701233, - "rewards/env_reward/std": 0.2195499986410141, - "reward": 1.9874749183654785, - "reward_std": 0.21955005824565887, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2823686748743057, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2335, - "step": 467 - }, - { - "loss": 0.0267, - "grad_norm": 8.822482109069824, - "learning_rate": 4.444444444444445e-06, - "num_tokens": 570345.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2204749584198, - "rewards/env_reward/std": 1.4490500688552856, - "reward": 3.7204747200012207, - "reward_std": 1.4490498304367065, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2672518938779831, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.234, - "step": 468 - }, - { - "loss": 0.1027, - "grad_norm": 14.732349395751953, - "learning_rate": 4.333333333333334e-06, - "num_tokens": 571563.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 5.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 5.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": -0.07499999552965164, - "rewards/env_reward/std": 0.375, - "reward": 1.4249999523162842, - "reward_std": 0.3749999701976776, - "frac_reward_zero_std": 0.0, - "completion_length": 5.0, - "kl": 1.0270851105451584, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2345, - "step": 469 - }, - { - "loss": 0.0364, - "grad_norm": 8.678610801696777, - "learning_rate": 4.222222222222223e-06, - "num_tokens": 572781.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.2231000661849976, - "rewards/env_reward/std": 1.4263650178909302, - "reward": 2.723099946975708, - "reward_std": 1.4263651371002197, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3637717515230179, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.235, - "step": 470 - }, - { - "loss": 0.0408, - "grad_norm": 7.956890106201172, - "learning_rate": 4.111111111111112e-06, - "num_tokens": 573999.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2797500193119049, - "rewards/env_reward/std": 0.29265886545181274, - "reward": 1.7797499895095825, - "reward_std": 0.29265889525413513, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.40757453441619873, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2355, - "step": 471 - }, - { - "loss": 0.0313, - "grad_norm": 4.231257438659668, - "learning_rate": 4.000000000000001e-06, - "num_tokens": 575211.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.06790000200271606, - "rewards/env_reward/std": 0.24300001561641693, - "reward": 1.5678999423980713, - "reward_std": 0.24299998581409454, - "frac_reward_zero_std": 0.0, - "completion_length": 3.0, - "kl": 0.312759205698967, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.236, - "step": 472 - }, - { - "loss": 0.027, - "grad_norm": 12.84277057647705, - "learning_rate": 3.888888888888889e-06, - "num_tokens": 576430.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.41770002245903015, - "rewards/env_reward/std": 0.40940001606941223, - "reward": 1.9177000522613525, - "reward_std": 0.40939995646476746, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.26984886825084686, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2365, - "step": 473 - }, - { - "loss": 0.1004, - "grad_norm": 0.3565484881401062, - "learning_rate": 3.777777777777778e-06, - "num_tokens": 577650.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.060750000178813934, - "rewards/env_reward/std": 0.013299999758601189, - "reward": 1.5607500076293945, - "reward_std": 0.013299982063472271, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 1.004371426999569, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.237, - "step": 474 - }, - { - "loss": 0.0206, - "grad_norm": 0.07799467444419861, - "learning_rate": 3.666666666666667e-06, - "num_tokens": 578870.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.0617000013589859, - "rewards/env_reward/std": 0.0, - "reward": 1.5616999864578247, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.20640861988067627, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2375, - "step": 475 - }, - { - "loss": 0.0421, - "grad_norm": 4.742053508758545, - "learning_rate": 3.555555555555556e-06, - "num_tokens": 580087.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7774249911308289, - "rewards/env_reward/std": 1.4874924421310425, - "reward": 2.2774250507354736, - "reward_std": 1.4874924421310425, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.42091556638479233, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.238, - "step": 476 - }, - { - "loss": 0.0288, - "grad_norm": 0.08009364455938339, - "learning_rate": 3.4444444444444444e-06, - "num_tokens": 581311.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.41600000858306885, - "rewards/env_reward/std": 0.0, - "reward": 1.9160000085830688, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.28848007321357727, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2385, - "step": 477 - }, - { - "loss": 0.028, - "grad_norm": 9.957799911499023, - "learning_rate": 3.3333333333333333e-06, - "num_tokens": 582530.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.16455000638961792, - "rewards/env_reward/std": 0.0925000011920929, - "reward": 1.6645500659942627, - "reward_std": 0.09250004589557648, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.27987441420555115, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.239, - "step": 478 - }, - { - "loss": 0.0446, - "grad_norm": 0.18692584335803986, - "learning_rate": 3.2222222222222222e-06, - "num_tokens": 583746.0, - "completions/mean_length": 3.0, - "completions/min_length": 3.0, - "completions/max_length": 3.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.0, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 3.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.18970000743865967, - "rewards/env_reward/std": 0.0, - "reward": 1.6897000074386597, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 3.0, - "kl": 0.4460113048553467, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2395, - "step": 479 - }, - { - "loss": 0.0583, - "grad_norm": 8.525734901428223, - "learning_rate": 3.111111111111111e-06, - "num_tokens": 584960.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3813000023365021, - "rewards/env_reward/std": 0.6251548528671265, - "reward": 1.8812999725341797, - "reward_std": 0.6251548528671265, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5830349922180176, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.24, - "step": 480 - }, - { - "loss": 0.0252, - "grad_norm": 0.07544836401939392, - "learning_rate": 3e-06, - "num_tokens": 586180.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3840000033378601, - "rewards/env_reward/std": 0.0, - "reward": 1.8840000629425049, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.25199753046035767, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2405, - "step": 481 - }, - { - "loss": 0.0475, - "grad_norm": 0.11984675377607346, - "learning_rate": 2.888888888888889e-06, - "num_tokens": 587396.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.19089999794960022, - "rewards/env_reward/std": 0.0, - "reward": 1.6908999681472778, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.4745121896266937, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.241, - "step": 482 - }, - { - "loss": 0.0356, - "grad_norm": 10.510693550109863, - "learning_rate": 2.777777777777778e-06, - "num_tokens": 588614.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.7358250021934509, - "rewards/env_reward/std": 0.5424174070358276, - "reward": 2.2358250617980957, - "reward_std": 0.5424175262451172, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.35567909479141235, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2415, - "step": 483 - }, - { - "loss": 0.0343, - "grad_norm": 3.6369059085845947, - "learning_rate": 2.666666666666667e-06, - "num_tokens": 589827.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.07247500121593475, - "rewards/env_reward/std": 0.06592877209186554, - "reward": 1.5724749565124512, - "reward_std": 0.06592877209186554, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3429151102900505, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.242, - "step": 484 - }, - { - "loss": 0.0236, - "grad_norm": 8.855422019958496, - "learning_rate": 2.5555555555555557e-06, - "num_tokens": 591046.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.26639986038208, - "rewards/env_reward/std": 1.4672000408172607, - "reward": 3.76639986038208, - "reward_std": 1.4672000408172607, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.23648104071617126, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2425, - "step": 485 - }, - { - "loss": 0.0557, - "grad_norm": 6.969616413116455, - "learning_rate": 2.4444444444444447e-06, - "num_tokens": 592259.0, - "completions/mean_length": 3.25, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.25, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3041999936103821, - "rewards/env_reward/std": 0.09227003902196884, - "reward": 1.8042000532150269, - "reward_std": 0.09227006137371063, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5572411194443703, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.243, - "step": 486 - }, - { - "loss": 0.0434, - "grad_norm": 12.514766693115234, - "learning_rate": 2.3333333333333336e-06, - "num_tokens": 593474.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.2367749959230423, - "rewards/env_reward/std": 0.6161500215530396, - "reward": 1.7367749214172363, - "reward_std": 0.6161500215530396, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4340191185474396, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2435, - "step": 487 - }, - { - "loss": 0.0492, - "grad_norm": 13.703819274902344, - "learning_rate": 2.2222222222222225e-06, - "num_tokens": 594697.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.29727500677108765, - "rewards/env_reward/std": 0.3591500222682953, - "reward": 1.7972750663757324, - "reward_std": 0.3591500222682953, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.4917333871126175, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.244, - "step": 488 - }, - { - "loss": 0.064, - "grad_norm": 5.9716949462890625, - "learning_rate": 2.1111111111111114e-06, - "num_tokens": 595911.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.4799000024795532, - "rewards/env_reward/std": 1.7552603483200073, - "reward": 2.9798998832702637, - "reward_std": 1.7552602291107178, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6398516371846199, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2445, - "step": 489 - }, - { - "loss": 0.0234, - "grad_norm": 1.6018086671829224, - "learning_rate": 2.0000000000000003e-06, - "num_tokens": 597130.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5368000268936157, - "rewards/env_reward/std": 0.0, - "reward": 2.036799907684326, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2344426065683365, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.245, - "step": 490 - }, - { - "loss": 0.0547, - "grad_norm": 4.675817489624023, - "learning_rate": 1.888888888888889e-06, - "num_tokens": 598344.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.03647499904036522, - "rewards/env_reward/std": 0.09896047413349152, - "reward": 1.5364750623703003, - "reward_std": 0.0989605039358139, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.5469909161329269, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2455, - "step": 491 - }, - { - "loss": 0.0462, - "grad_norm": 13.125691413879395, - "learning_rate": 1.777777777777778e-06, - "num_tokens": 599563.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5171999931335449, - "rewards/env_reward/std": 0.026600003242492676, - "reward": 2.017199993133545, - "reward_std": 0.026599964126944542, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.46199870109558105, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.246, - "step": 492 - }, - { - "loss": 0.0404, - "grad_norm": 8.710203170776367, - "learning_rate": 1.6666666666666667e-06, - "num_tokens": 600782.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 1.916100025177002, - "rewards/env_reward/std": 1.25, - "reward": 3.416100025177002, - "reward_std": 1.25, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.40352775156497955, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2465, - "step": 493 - }, - { - "loss": 0.0381, - "grad_norm": 3.4673705101013184, - "learning_rate": 1.5555555555555556e-06, - "num_tokens": 601998.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.3388749957084656, - "rewards/env_reward/std": 0.2771500051021576, - "reward": 1.8388750553131104, - "reward_std": 0.27715003490448, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3812360465526581, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.247, - "step": 494 - }, - { - "loss": 0.0296, - "grad_norm": 9.451112747192383, - "learning_rate": 1.4444444444444445e-06, - "num_tokens": 603241.0, - "completions/mean_length": 3.75, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.75, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 2.2273499965667725, - "rewards/env_reward/std": 1.5453001260757446, - "reward": 3.7273499965667725, - "reward_std": 1.545300006866455, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.2962055504322052, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2475, - "step": 495 - }, - { - "loss": 0.0297, - "grad_norm": 0.08744853734970093, - "learning_rate": 1.3333333333333334e-06, - "num_tokens": 604461.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.5414000153541565, - "rewards/env_reward/std": 0.0, - "reward": 2.0413999557495117, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.2973760664463043, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.248, - "step": 496 - }, - { - "loss": 0.0669, - "grad_norm": 7.617353916168213, - "learning_rate": 1.2222222222222223e-06, - "num_tokens": 605679.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.6072749495506287, - "rewards/env_reward/std": 0.7213825583457947, - "reward": 2.1072750091552734, - "reward_std": 0.7213825583457947, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.6694605350494385, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2485, - "step": 497 - }, - { - "loss": 0.0419, - "grad_norm": 0.11127752810716629, - "learning_rate": 1.1111111111111112e-06, - "num_tokens": 606899.0, - "completions/mean_length": 4.0, - "completions/min_length": 4.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 4.0, - "completions/min_terminated_length": 4.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.21160000562667847, - "rewards/env_reward/std": 0.0, - "reward": 1.7116000652313232, - "reward_std": 0.0, - "frac_reward_zero_std": 1.0, - "completion_length": 4.0, - "kl": 0.41930001974105835, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.249, - "step": 498 - }, - { - "loss": 0.0472, - "grad_norm": 12.42243766784668, - "learning_rate": 1.0000000000000002e-06, - "num_tokens": 608117.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.645550012588501, - "rewards/env_reward/std": 0.4464397430419922, - "reward": 2.145550012588501, - "reward_std": 0.4464397430419922, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.47190549969673157, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.2495, - "step": 499 - }, - { - "loss": 0.0374, - "grad_norm": 9.841089248657227, - "learning_rate": 8.88888888888889e-07, - "num_tokens": 609331.0, - "completions/mean_length": 3.5, - "completions/min_length": 3.0, - "completions/max_length": 4.0, - "completions/clipped_ratio": 0.0, - "completions/mean_terminated_length": 3.5, - "completions/min_terminated_length": 3.0, - "completions/max_terminated_length": 4.0, - "rewards/format_valid/mean": 1.0, - "rewards/format_valid/std": 0.0, - "rewards/action_legal/mean": 0.5, - "rewards/action_legal/std": 0.0, - "rewards/env_reward/mean": 0.9353499412536621, - "rewards/env_reward/std": 1.2736923694610596, - "reward": 2.435349941253662, - "reward_std": 1.2736923694610596, - "frac_reward_zero_std": 0.0, - "completion_length": 4.0, - "kl": 0.3742150366306305, - "clip_ratio/low_mean": 0.0, - "clip_ratio/low_min": 0.0, - "clip_ratio/high_mean": 0.0, - "clip_ratio/high_max": 0.0, - "clip_ratio/region_mean": 0.0, - "epoch": 0.25, - "step": 500 - }, - { - "train_runtime": 1991.1306, - "train_samples_per_second": 1.004, - "train_steps_per_second": 0.251, - "total_flos": 0.0, - "train_loss": 0.05931463603555517, - "epoch": 0.25, - "step": 500 - } -] \ No newline at end of file