diff --git "a/training_log.json" "b/training_log.json" new file mode 100644--- /dev/null +++ "b/training_log.json" @@ -0,0 +1,15511 @@ +[ + { + "loss": 0.0, + "grad_norm": 0.0015333890914916992, + "learning_rate": 0.0, + "num_tokens": 1216.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0943000316619873, + "rewards/env_reward/std": 0.0, + "reward": 2.5943000316619873, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 5.4836273193359375e-06, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0005, + "step": 1 + }, + { + "loss": 0.0, + "grad_norm": 0.001904299482703209, + "learning_rate": 1.0000000000000002e-06, + "num_tokens": 2452.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.024800000712275505, + "rewards/env_reward/std": 0.0, + "reward": 1.4751999378204346, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 5.801518909720471e-06, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.001, + "step": 2 + }, + { + "loss": 0.0, + "grad_norm": 5.467747211456299, + "learning_rate": 2.0000000000000003e-06, + "num_tokens": 3668.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.04742499813437462, + "rewards/env_reward/std": 0.0726500004529953, + "reward": 1.5474250316619873, + "reward_std": 0.07264995574951172, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 2.7855238235119373e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0015, + "step": 3 + }, + { + "loss": 0.0001, + "grad_norm": NaN, + "learning_rate": 3e-06, + "num_tokens": 4918.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.27184998989105225, + "rewards/env_reward/std": 0.30637651681900024, + "reward": 1.2281500101089478, + "reward_std": 0.30637648701667786, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.0009263694344099349, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.002, + "step": 4 + }, + { + "loss": 0.0, + "grad_norm": NaN, + "learning_rate": 3e-06, + "num_tokens": 6134.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0914749950170517, + "rewards/env_reward/std": 0.2446500062942505, + "reward": 1.591475009918213, + "reward_std": 0.2446500062942505, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 8.985400313576974e-06, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0025, + "step": 5 + }, + { + "loss": 0.0, + "grad_norm": 5.823563575744629, + "learning_rate": 3e-06, + "num_tokens": 7346.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.055275000631809235, + "rewards/env_reward/std": 0.05144999921321869, + "reward": 1.4447250366210938, + "reward_std": 0.05145001411437988, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 2.5783977740445607e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.003, + "step": 6 + }, + { + "loss": 0.0, + "grad_norm": 3.515963554382324, + "learning_rate": 4.000000000000001e-06, + "num_tokens": 8558.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.02980000153183937, + "rewards/env_reward/std": 0.025125950574874878, + "reward": 1.5297999382019043, + "reward_std": 0.025125989690423012, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 2.0618240043290825e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0035, + "step": 7 + }, + { + "loss": 0.0, + "grad_norm": 0.0012212995206937194, + "learning_rate": 5e-06, + "num_tokens": 9770.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.017500000074505806, + "rewards/env_reward/std": 0.0, + "reward": 1.5175000429153442, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 2.9007594548602356e-06, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.004, + "step": 8 + }, + { + "loss": 0.0, + "grad_norm": 4.4884257316589355, + "learning_rate": 6e-06, + "num_tokens": 10982.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.3139750063419342, + "rewards/env_reward/std": 0.15238404273986816, + "reward": 1.1860249042510986, + "reward_std": 0.15238407254219055, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 3.519654396200167e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0045, + "step": 9 + }, + { + "loss": 0.0, + "grad_norm": NaN, + "learning_rate": 7.000000000000001e-06, + "num_tokens": 12199.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.40117499232292175, + "rewards/env_reward/std": 0.4204060137271881, + "reward": 1.9011750221252441, + "reward_std": 0.4204059839248657, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.00046923011541366577, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.005, + "step": 10 + }, + { + "loss": 0.0, + "grad_norm": 5.395345687866211, + "learning_rate": 7.000000000000001e-06, + "num_tokens": 13411.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.26269999146461487, + "rewards/env_reward/std": 0.17459072172641754, + "reward": 1.762700080871582, + "reward_std": 0.17459072172641754, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 4.217028799757827e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0055, + "step": 11 + }, + { + "loss": 0.0002, + "grad_norm": 24.1549015045166, + "learning_rate": 8.000000000000001e-06, + "num_tokens": 14624.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.05982499569654465, + "rewards/env_reward/std": 0.05187172815203667, + "reward": 1.5598249435424805, + "reward_std": 0.05187166854739189, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.00168648362159729, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.006, + "step": 12 + }, + { + "loss": 0.0, + "grad_norm": 3.7908565998077393, + "learning_rate": 9e-06, + "num_tokens": 15840.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.20547500252723694, + "rewards/env_reward/std": 0.05324999988079071, + "reward": 1.7054749727249146, + "reward_std": 0.0532500334084034, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 7.70886759937639e-06, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0065, + "step": 13 + }, + { + "loss": 0.0, + "grad_norm": 0.00359143503010273, + "learning_rate": 1e-05, + "num_tokens": 17052.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.792900025844574, + "rewards/env_reward/std": 0.0, + "reward": 2.2929000854492188, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 1.2079874977644067e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.007, + "step": 14 + }, + { + "loss": 0.0, + "grad_norm": 3.261981248855591, + "learning_rate": 1.1000000000000001e-05, + "num_tokens": 18264.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.016324998810887337, + "rewards/env_reward/std": 0.06768739223480225, + "reward": 1.5163249969482422, + "reward_std": 0.06768736243247986, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 7.071098025335232e-05, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0075, + "step": 15 + }, + { + "loss": 0.0002, + "grad_norm": 22.216018676757812, + "learning_rate": 1.2e-05, + "num_tokens": 19477.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.06070000305771828, + "rewards/env_reward/std": 0.10127566754817963, + "reward": 1.4393000602722168, + "reward_std": 0.10127566009759903, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.0017704889178276062, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.008, + "step": 16 + }, + { + "loss": 0.0017, + "grad_norm": 22.440473556518555, + "learning_rate": 1.3000000000000001e-05, + "num_tokens": 20690.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.702750027179718, + "rewards/env_reward/std": 1.534355640411377, + "reward": 2.2027499675750732, + "reward_std": 1.534355640411377, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.01707853004336357, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0085, + "step": 17 + }, + { + "loss": 0.0001, + "grad_norm": 3.5670650005340576, + "learning_rate": 1.4000000000000001e-05, + "num_tokens": 21902.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3511750102043152, + "rewards/env_reward/std": 0.2421124279499054, + "reward": 1.85117506980896, + "reward_std": 0.24211247265338898, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.0013786455432036604, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.009, + "step": 18 + }, + { + "loss": 0.0, + "grad_norm": 0.023953210562467575, + "learning_rate": 1.5e-05, + "num_tokens": 23114.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.021400000900030136, + "rewards/env_reward/std": 0.0, + "reward": 1.521399974822998, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.0004576047358568758, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0095, + "step": 19 + }, + { + "loss": 0.0516, + "grad_norm": 13.440083503723145, + "learning_rate": 1.6000000000000003e-05, + "num_tokens": 24331.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5774250030517578, + "rewards/env_reward/std": 0.35871684551239014, + "reward": 2.077425003051758, + "reward_std": 0.3587168753147125, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5155713111162186, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.01, + "step": 20 + }, + { + "loss": 0.0846, + "grad_norm": 8.484152793884277, + "learning_rate": 1.7000000000000003e-05, + "num_tokens": 25548.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.0176750048995018, + "rewards/env_reward/std": 0.13360247015953064, + "reward": 1.4823249578475952, + "reward_std": 0.13360245525836945, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8458694666624069, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0105, + "step": 21 + }, + { + "loss": 0.0675, + "grad_norm": 9.707590103149414, + "learning_rate": 1.8e-05, + "num_tokens": 26765.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.017500000074505806, + "rewards/env_reward/std": 0.060531098395586014, + "reward": 1.5174999237060547, + "reward_std": 0.06053108721971512, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6749855130910873, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.011, + "step": 22 + }, + { + "loss": 0.0617, + "grad_norm": 3.4466989040374756, + "learning_rate": 1.9e-05, + "num_tokens": 27978.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.29785001277923584, + "rewards/env_reward/std": 0.25378555059432983, + "reward": 1.2021499872207642, + "reward_std": 0.25378552079200745, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6174105107784271, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0115, + "step": 23 + }, + { + "loss": 0.0338, + "grad_norm": 39.08412170410156, + "learning_rate": 2e-05, + "num_tokens": 29196.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1463249921798706, + "rewards/env_reward/std": 0.3274500072002411, + "reward": 1.3536750078201294, + "reward_std": 0.3274500072002411, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.3376213669835124, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.012, + "step": 24 + }, + { + "loss": 0.0151, + "grad_norm": 20.261240005493164, + "learning_rate": 2.1e-05, + "num_tokens": 30410.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.9611750245094299, + "rewards/env_reward/std": 0.5365340113639832, + "reward": 2.461174964904785, + "reward_std": 0.5365338921546936, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.1506122574210167, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0125, + "step": 25 + }, + { + "loss": 0.0145, + "grad_norm": 11.482789993286133, + "learning_rate": 2.2000000000000003e-05, + "num_tokens": 31624.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.14649999141693115, + "rewards/env_reward/std": 0.06558185815811157, + "reward": 1.6464999914169312, + "reward_std": 0.06558185070753098, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.14546086639165878, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.013, + "step": 26 + }, + { + "loss": 0.0124, + "grad_norm": 14.635064125061035, + "learning_rate": 2.3000000000000003e-05, + "num_tokens": 32842.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5030750036239624, + "rewards/env_reward/std": 0.7913716435432434, + "reward": 2.003074884414673, + "reward_std": 0.7913715243339539, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.12426742166280746, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0135, + "step": 27 + }, + { + "loss": 0.0039, + "grad_norm": 16.629444122314453, + "learning_rate": 2.4e-05, + "num_tokens": 34055.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0267250537872314, + "rewards/env_reward/std": 0.4732673764228821, + "reward": 2.5267250537872314, + "reward_std": 0.47326740622520447, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.039139650762081146, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.014, + "step": 28 + }, + { + "loss": 0.0045, + "grad_norm": 26.910884857177734, + "learning_rate": 2.5e-05, + "num_tokens": 35273.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.5891250371932983, + "rewards/env_reward/std": 1.5938708782196045, + "reward": 3.089125156402588, + "reward_std": 1.5938708782196045, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.04533430188894272, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0145, + "step": 29 + }, + { + "loss": 0.0012, + "grad_norm": 22.792469024658203, + "learning_rate": 2.6000000000000002e-05, + "num_tokens": 36490.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.793999969959259, + "rewards/env_reward/std": 1.4766161441802979, + "reward": 2.2939999103546143, + "reward_std": 1.4766160249710083, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.0121101513504982, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.015, + "step": 30 + }, + { + "loss": 0.0044, + "grad_norm": 19.31297492980957, + "learning_rate": 2.7000000000000002e-05, + "num_tokens": 37707.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.05525000020861626, + "rewards/env_reward/std": 0.05696369707584381, + "reward": 1.5552499294281006, + "reward_std": 0.056963708251714706, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.04407123476266861, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0155, + "step": 31 + }, + { + "loss": 0.0139, + "grad_norm": 9.066012382507324, + "learning_rate": 2.8000000000000003e-05, + "num_tokens": 38920.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.021675001829862595, + "rewards/env_reward/std": 0.15254653990268707, + "reward": 1.5216751098632812, + "reward_std": 0.15254652500152588, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.1389305256307125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.016, + "step": 32 + }, + { + "loss": 0.0196, + "grad_norm": 21.37767791748047, + "learning_rate": 2.9e-05, + "num_tokens": 40134.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6998249888420105, + "rewards/env_reward/std": 0.11420779675245285, + "reward": 2.1998250484466553, + "reward_std": 0.11420782655477524, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.1958363577723503, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0165, + "step": 33 + }, + { + "loss": 0.0086, + "grad_norm": 6.699766635894775, + "learning_rate": 3e-05, + "num_tokens": 41350.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.06562499701976776, + "rewards/env_reward/std": 0.1474488228559494, + "reward": 1.4343750476837158, + "reward_std": 0.1474488228559494, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.08599535003304482, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.017, + "step": 34 + }, + { + "loss": 0.0004, + "grad_norm": 4.889353275299072, + "learning_rate": 3.1e-05, + "num_tokens": 42562.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03232499957084656, + "rewards/env_reward/std": 0.020850002765655518, + "reward": 1.532325029373169, + "reward_std": 0.02085002325475216, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.0037796597498527262, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0175, + "step": 35 + }, + { + "loss": 0.1285, + "grad_norm": 15.511324882507324, + "learning_rate": 3.2000000000000005e-05, + "num_tokens": 43775.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3314250111579895, + "rewards/env_reward/std": 0.17125000059604645, + "reward": 1.8314249515533447, + "reward_std": 0.17124998569488525, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.2848919034004211, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.018, + "step": 36 + }, + { + "loss": 0.0175, + "grad_norm": 5.048687934875488, + "learning_rate": 3.3e-05, + "num_tokens": 44991.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.0533749982714653, + "rewards/env_reward/std": 0.06385000050067902, + "reward": 1.446624994277954, + "reward_std": 0.06384996324777603, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.1752433218061924, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0185, + "step": 37 + }, + { + "loss": 0.0024, + "grad_norm": 4.65332555770874, + "learning_rate": 3.4000000000000007e-05, + "num_tokens": 46203.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03635000064969063, + "rewards/env_reward/std": 0.02292080596089363, + "reward": 1.5363500118255615, + "reward_std": 0.022920822724699974, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.023557812673971057, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.019, + "step": 38 + }, + { + "loss": 0.0679, + "grad_norm": 48.29575729370117, + "learning_rate": 3.5e-05, + "num_tokens": 47420.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6989249587059021, + "rewards/env_reward/std": 0.649150013923645, + "reward": 2.198925018310547, + "reward_std": 0.649150013923645, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6788432970643044, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0195, + "step": 39 + }, + { + "loss": 0.0063, + "grad_norm": 0.36195874214172363, + "learning_rate": 3.6e-05, + "num_tokens": 48640.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.41749998927116394, + "rewards/env_reward/std": 0.0, + "reward": 1.9175000190734863, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.06268331408500671, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.02, + "step": 40 + }, + { + "loss": 0.0269, + "grad_norm": 13.39460563659668, + "learning_rate": 3.7e-05, + "num_tokens": 49857.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8564249873161316, + "rewards/env_reward/std": 1.4304662942886353, + "reward": 2.3564250469207764, + "reward_std": 1.4304662942886353, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.26936937868595123, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0205, + "step": 41 + }, + { + "loss": 0.0187, + "grad_norm": 30.891935348510742, + "learning_rate": 3.8e-05, + "num_tokens": 51071.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.38007497787475586, + "rewards/env_reward/std": 0.18915000557899475, + "reward": 1.1199250221252441, + "reward_std": 0.18914997577667236, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.1872375439852476, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.021, + "step": 42 + }, + { + "loss": 0.0032, + "grad_norm": 3.8051974773406982, + "learning_rate": 3.9000000000000006e-05, + "num_tokens": 52291.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.5476000308990479, + "rewards/env_reward/std": 1.6770870685577393, + "reward": 3.047600030899048, + "reward_std": 1.6770870685577393, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.031939536333084106, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0215, + "step": 43 + }, + { + "loss": 0.0046, + "grad_norm": 0.1436671018600464, + "learning_rate": 4e-05, + "num_tokens": 53503.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.09260000288486481, + "rewards/env_reward/std": 0.0, + "reward": 1.4074000120162964, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.04641978070139885, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.022, + "step": 44 + }, + { + "loss": 0.0088, + "grad_norm": 3.5930678844451904, + "learning_rate": 4.1e-05, + "num_tokens": 54723.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11879999935626984, + "rewards/env_reward/std": 0.19900000095367432, + "reward": 1.6187999248504639, + "reward_std": 0.19900000095367432, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.08768663927912712, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0225, + "step": 45 + }, + { + "loss": 0.0091, + "grad_norm": 21.19375228881836, + "learning_rate": 4.2e-05, + "num_tokens": 55940.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.9104750156402588, + "rewards/env_reward/std": 1.393201231956482, + "reward": 2.410475015640259, + "reward_std": 1.393201231956482, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.09141915291547775, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.023, + "step": 46 + }, + { + "loss": 0.0113, + "grad_norm": 7.426270008087158, + "learning_rate": 4.3e-05, + "num_tokens": 57157.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19827499985694885, + "rewards/env_reward/std": 0.44344282150268555, + "reward": 1.6982749700546265, + "reward_std": 0.44344279170036316, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.11329534649848938, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0235, + "step": 47 + }, + { + "loss": 0.0096, + "grad_norm": 23.390466690063477, + "learning_rate": 4.4000000000000006e-05, + "num_tokens": 58372.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4764999747276306, + "rewards/env_reward/std": 0.07039998471736908, + "reward": 1.9765000343322754, + "reward_std": 0.07039991766214371, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.09569882601499557, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.024, + "step": 48 + }, + { + "loss": 0.0228, + "grad_norm": 13.507621765136719, + "learning_rate": 4.5e-05, + "num_tokens": 59585.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19177499413490295, + "rewards/env_reward/std": 0.6475414037704468, + "reward": 1.6917749643325806, + "reward_std": 0.647541344165802, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.22822999954223633, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0245, + "step": 49 + }, + { + "loss": 0.0147, + "grad_norm": 16.81793212890625, + "learning_rate": 4.600000000000001e-05, + "num_tokens": 60803.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08905000984668732, + "rewards/env_reward/std": 0.642199695110321, + "reward": 1.589050054550171, + "reward_std": 0.6421996355056763, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.1465737447142601, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.025, + "step": 50 + }, + { + "loss": 0.011, + "grad_norm": 13.575445175170898, + "learning_rate": 4.7e-05, + "num_tokens": 62051.0, + "completions/mean_length": 4.0, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0112500190734863, + "rewards/env_reward/std": 1.2719687223434448, + "reward": 2.5112500190734863, + "reward_std": 1.2719687223434448, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.11031558783724904, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0255, + "step": 51 + }, + { + "loss": 0.0137, + "grad_norm": 23.81420135498047, + "learning_rate": 4.8e-05, + "num_tokens": 63269.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11865000426769257, + "rewards/env_reward/std": 0.04601481929421425, + "reward": 1.618649959564209, + "reward_std": 0.04601481184363365, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.13692589104175568, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.026, + "step": 52 + }, + { + "loss": 0.0209, + "grad_norm": 23.225088119506836, + "learning_rate": 4.9e-05, + "num_tokens": 64483.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3612000048160553, + "rewards/env_reward/std": 0.09145228564739227, + "reward": 1.8612000942230225, + "reward_std": 0.09145224839448929, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.20900648832321167, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0265, + "step": 53 + }, + { + "loss": 0.0341, + "grad_norm": 28.92061996459961, + "learning_rate": 5e-05, + "num_tokens": 65701.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5038999915122986, + "rewards/env_reward/std": 0.05588749051094055, + "reward": 2.0039000511169434, + "reward_std": 0.05588748678565025, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.34077559411525726, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.027, + "step": 54 + }, + { + "loss": 0.0357, + "grad_norm": 22.28899383544922, + "learning_rate": 4.9888888888888894e-05, + "num_tokens": 66916.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.45639997720718384, + "rewards/env_reward/std": 0.09699998795986176, + "reward": 1.956399917602539, + "reward_std": 0.09700000286102295, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3573853522539139, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0275, + "step": 55 + }, + { + "loss": 0.0179, + "grad_norm": 0.27123039960861206, + "learning_rate": 4.977777777777778e-05, + "num_tokens": 68132.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.05339999869465828, + "rewards/env_reward/std": 0.0, + "reward": 1.5534000396728516, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.17926177382469177, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.028, + "step": 56 + }, + { + "loss": 0.0523, + "grad_norm": 17.443599700927734, + "learning_rate": 4.966666666666667e-05, + "num_tokens": 69350.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10307499766349792, + "rewards/env_reward/std": 0.03798661008477211, + "reward": 1.6030750274658203, + "reward_std": 0.03798658773303032, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5231126323342323, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0285, + "step": 57 + }, + { + "loss": 0.049, + "grad_norm": 18.03072738647461, + "learning_rate": 4.955555555555556e-05, + "num_tokens": 70573.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.35887500643730164, + "rewards/env_reward/std": 0.41454997658729553, + "reward": 1.858875036239624, + "reward_std": 0.41454997658729553, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4901750087738037, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.029, + "step": 58 + }, + { + "loss": 0.1065, + "grad_norm": 27.605003356933594, + "learning_rate": 4.9444444444444446e-05, + "num_tokens": 71790.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08367499709129333, + "rewards/env_reward/std": 0.04114999994635582, + "reward": 1.5836749076843262, + "reward_std": 0.041150014847517014, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0653182864189148, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0295, + "step": 59 + }, + { + "loss": 0.109, + "grad_norm": 25.35848045349121, + "learning_rate": 4.933333333333334e-05, + "num_tokens": 73007.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19287501275539398, + "rewards/env_reward/std": 0.05770479142665863, + "reward": 1.6928750276565552, + "reward_std": 0.05770481005311012, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0902290344238281, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.03, + "step": 60 + }, + { + "loss": 0.102, + "grad_norm": 18.48727798461914, + "learning_rate": 4.922222222222222e-05, + "num_tokens": 74225.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.23907500505447388, + "rewards/env_reward/std": 0.8462180495262146, + "reward": 1.2609249353408813, + "reward_std": 0.8462179899215698, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 1.0198184018954635, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0305, + "step": 61 + }, + { + "loss": 0.0864, + "grad_norm": 19.874589920043945, + "learning_rate": 4.9111111111111114e-05, + "num_tokens": 75443.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2953000068664551, + "rewards/env_reward/std": 0.2296384572982788, + "reward": 1.795300006866455, + "reward_std": 0.2296384572982788, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8640272691845894, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.031, + "step": 62 + }, + { + "loss": 0.0702, + "grad_norm": 19.802614212036133, + "learning_rate": 4.9e-05, + "num_tokens": 76681.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.56659996509552, + "rewards/env_reward/std": 1.6551477909088135, + "reward": 3.0666000843048096, + "reward_std": 1.655147671699524, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7016429305076599, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0315, + "step": 63 + }, + { + "loss": 0.0658, + "grad_norm": 29.09418296813965, + "learning_rate": 4.888888888888889e-05, + "num_tokens": 77895.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.09319999814033508, + "rewards/env_reward/std": 0.07863511145114899, + "reward": 1.4068000316619873, + "reward_std": 0.078635074198246, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6582788825035095, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.032, + "step": 64 + }, + { + "loss": 0.0473, + "grad_norm": 19.2071475982666, + "learning_rate": 4.8777777777777775e-05, + "num_tokens": 79118.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.16110000014305115, + "rewards/env_reward/std": 0.16779999434947968, + "reward": 1.661099910736084, + "reward_std": 0.16780002415180206, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4729412868618965, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0325, + "step": 65 + }, + { + "loss": 0.0064, + "grad_norm": 2.748170852661133, + "learning_rate": 4.866666666666667e-05, + "num_tokens": 80342.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.250649929046631, + "rewards/env_reward/std": 1.4378999471664429, + "reward": 3.750649929046631, + "reward_std": 1.4378999471664429, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.06360267847776413, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.033, + "step": 66 + }, + { + "loss": 0.045, + "grad_norm": 3.3013949394226074, + "learning_rate": 4.855555555555556e-05, + "num_tokens": 81562.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.36434999108314514, + "rewards/env_reward/std": 0.12464992702007294, + "reward": 1.8643500804901123, + "reward_std": 0.12464995682239532, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.45019371435046196, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0335, + "step": 67 + }, + { + "loss": 0.0816, + "grad_norm": 5.779470920562744, + "learning_rate": 4.844444444444445e-05, + "num_tokens": 82779.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2705250084400177, + "rewards/env_reward/std": 0.2717834413051605, + "reward": 1.7705249786376953, + "reward_std": 0.2717834413051605, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.815668910741806, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.034, + "step": 68 + }, + { + "loss": 0.0289, + "grad_norm": 26.5799617767334, + "learning_rate": 4.8333333333333334e-05, + "num_tokens": 83993.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4679499864578247, + "rewards/env_reward/std": 0.042666174471378326, + "reward": 1.9679499864578247, + "reward_std": 0.042666174471378326, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2885260581970215, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0345, + "step": 69 + }, + { + "loss": 0.014, + "grad_norm": 21.3876895904541, + "learning_rate": 4.8222222222222225e-05, + "num_tokens": 85208.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.292875051498413, + "rewards/env_reward/std": 1.4142500162124634, + "reward": 3.792875051498413, + "reward_std": 1.4142500162124634, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.13965822756290436, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.035, + "step": 70 + }, + { + "loss": 0.0306, + "grad_norm": 24.042043685913086, + "learning_rate": 4.811111111111111e-05, + "num_tokens": 86422.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.2664499282836914, + "rewards/env_reward/std": 1.5692957639694214, + "reward": 2.7664499282836914, + "reward_std": 1.5692955255508423, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3056928962469101, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0355, + "step": 71 + }, + { + "loss": 0.0218, + "grad_norm": 3.963625907897949, + "learning_rate": 4.8e-05, + "num_tokens": 87638.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.47062501311302185, + "rewards/env_reward/std": 0.12355001270771027, + "reward": 1.9706251621246338, + "reward_std": 0.12355005741119385, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.21846546977758408, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.036, + "step": 72 + }, + { + "loss": 0.0292, + "grad_norm": 20.61098861694336, + "learning_rate": 4.7888888888888886e-05, + "num_tokens": 88853.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12492500245571136, + "rewards/env_reward/std": 0.15604999661445618, + "reward": 1.6249250173568726, + "reward_std": 0.1560499668121338, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.29186780005693436, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0365, + "step": 73 + }, + { + "loss": 0.0662, + "grad_norm": 14.41640567779541, + "learning_rate": 4.7777777777777784e-05, + "num_tokens": 90072.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4067000150680542, + "rewards/env_reward/std": 0.14239999651908875, + "reward": 1.9067000150680542, + "reward_std": 0.14240002632141113, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6624188795685768, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.037, + "step": 74 + }, + { + "loss": 0.0778, + "grad_norm": 6.708349227905273, + "learning_rate": 4.766666666666667e-05, + "num_tokens": 91286.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 0.25, + "rewards/format_valid/std": 1.5, + "rewards/action_legal/mean": 0.125, + "rewards/action_legal/std": 0.75, + "rewards/env_reward/mean": -0.6858749985694885, + "rewards/env_reward/std": 1.5449819564819336, + "reward": -0.3108749985694885, + "reward_std": 3.793658494949341, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7776949852705002, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0375, + "step": 75 + }, + { + "loss": 0.1064, + "grad_norm": 2.824847936630249, + "learning_rate": 4.755555555555556e-05, + "num_tokens": 92501.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17409999668598175, + "rewards/env_reward/std": 0.07380000501871109, + "reward": 1.6740999221801758, + "reward_std": 0.0737999677658081, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0637851729989052, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.038, + "step": 76 + }, + { + "loss": 0.1088, + "grad_norm": NaN, + "learning_rate": 4.7444444444444445e-05, + "num_tokens": 93720.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.524150013923645, + "rewards/env_reward/std": 0.03130000829696655, + "reward": 2.0241501331329346, + "reward_std": 0.03129998967051506, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.08802729845047, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0385, + "step": 77 + }, + { + "loss": 0.2122, + "grad_norm": 56.26820373535156, + "learning_rate": 4.7444444444444445e-05, + "num_tokens": 94938.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1987999975681305, + "rewards/env_reward/std": 0.04688084498047829, + "reward": 1.6988000869750977, + "reward_std": 0.04688084498047829, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.122272402048111, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.039, + "step": 78 + }, + { + "loss": 0.0738, + "grad_norm": 4.279953479766846, + "learning_rate": 4.7333333333333336e-05, + "num_tokens": 96157.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10649999976158142, + "rewards/env_reward/std": 0.03020000271499157, + "reward": 1.6065000295639038, + "reward_std": 0.03020000457763672, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7380976751446724, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0395, + "step": 79 + }, + { + "loss": 0.0759, + "grad_norm": 0.21008452773094177, + "learning_rate": 4.722222222222222e-05, + "num_tokens": 97373.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.2621999979019165, + "rewards/env_reward/std": 0.0, + "reward": 1.2378000020980835, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.7594332098960876, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.04, + "step": 80 + }, + { + "loss": 0.0536, + "grad_norm": 0.1332825869321823, + "learning_rate": 4.711111111111111e-05, + "num_tokens": 98589.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.07129999995231628, + "rewards/env_reward/std": 0.0, + "reward": 1.4286999702453613, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.5356340408325195, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0405, + "step": 81 + }, + { + "loss": 0.0905, + "grad_norm": 32.49578857421875, + "learning_rate": 4.7e-05, + "num_tokens": 99808.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08722500503063202, + "rewards/env_reward/std": 0.03164999932050705, + "reward": 1.5872249603271484, + "reward_std": 0.031649984419345856, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9049255102872849, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.041, + "step": 82 + }, + { + "loss": 0.0264, + "grad_norm": 0.06416141986846924, + "learning_rate": 4.6888888888888895e-05, + "num_tokens": 101024.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.6410000324249268, + "rewards/env_reward/std": 0.0, + "reward": 4.140999794006348, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2637370824813843, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0415, + "step": 83 + }, + { + "loss": 0.1094, + "grad_norm": 36.0538444519043, + "learning_rate": 4.677777777777778e-05, + "num_tokens": 102242.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.010550007224082947, + "rewards/env_reward/std": 0.5143613815307617, + "reward": 1.5105500221252441, + "reward_std": 0.5143613815307617, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0936070084571838, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.042, + "step": 84 + }, + { + "loss": 0.0578, + "grad_norm": 12.624850273132324, + "learning_rate": 4.666666666666667e-05, + "num_tokens": 103457.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.09380000084638596, + "rewards/env_reward/std": 0.04500000178813934, + "reward": 1.4061999320983887, + "reward_std": 0.04499995708465576, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5784699767827988, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0425, + "step": 85 + }, + { + "loss": 0.0414, + "grad_norm": 0.054093338549137115, + "learning_rate": 4.6555555555555556e-05, + "num_tokens": 104681.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12860000133514404, + "rewards/env_reward/std": 0.0, + "reward": 1.628600001335144, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.41362982988357544, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.043, + "step": 86 + }, + { + "loss": 0.0287, + "grad_norm": 0.052269965410232544, + "learning_rate": 4.644444444444445e-05, + "num_tokens": 105901.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.21299999952316284, + "rewards/env_reward/std": 0.0, + "reward": 1.7130000591278076, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.28686797618865967, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0435, + "step": 87 + }, + { + "loss": 0.0305, + "grad_norm": 0.059413399547338486, + "learning_rate": 4.633333333333333e-05, + "num_tokens": 107121.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11779999732971191, + "rewards/env_reward/std": 0.0, + "reward": 1.617799997329712, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3047557473182678, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.044, + "step": 88 + }, + { + "loss": 0.0364, + "grad_norm": 13.943113327026367, + "learning_rate": 4.6222222222222224e-05, + "num_tokens": 108340.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06332500278949738, + "rewards/env_reward/std": 0.16214999556541443, + "reward": 1.5633249282836914, + "reward_std": 0.16214998066425323, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3640783429145813, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0445, + "step": 89 + }, + { + "loss": 0.0191, + "grad_norm": 0.0353960245847702, + "learning_rate": 4.6111111111111115e-05, + "num_tokens": 109556.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.28200000524520874, + "rewards/env_reward/std": 0.0, + "reward": 1.7820000648498535, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.19095450639724731, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.045, + "step": 90 + }, + { + "loss": 0.0612, + "grad_norm": 25.147192001342773, + "learning_rate": 4.600000000000001e-05, + "num_tokens": 110802.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.296999990940094, + "rewards/env_reward/std": 0.3095349669456482, + "reward": 1.2030000686645508, + "reward_std": 0.3095349371433258, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.6122921258211136, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0455, + "step": 91 + }, + { + "loss": 0.0685, + "grad_norm": 0.17055261135101318, + "learning_rate": 4.588888888888889e-05, + "num_tokens": 112022.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1607999950647354, + "rewards/env_reward/std": 0.0, + "reward": 1.3392000198364258, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.6851677894592285, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.046, + "step": 92 + }, + { + "loss": 0.0863, + "grad_norm": 13.699186325073242, + "learning_rate": 4.577777777777778e-05, + "num_tokens": 113237.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2336750030517578, + "rewards/env_reward/std": 0.3572499752044678, + "reward": 1.7336750030517578, + "reward_std": 0.3572499752044678, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8626954779028893, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0465, + "step": 93 + }, + { + "loss": 0.0274, + "grad_norm": 0.07039373368024826, + "learning_rate": 4.566666666666667e-05, + "num_tokens": 114457.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.6287999153137207, + "rewards/env_reward/std": 0.0, + "reward": 4.128799915313721, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2735567092895508, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.047, + "step": 94 + }, + { + "loss": 0.061, + "grad_norm": 4.729802131652832, + "learning_rate": 4.555555555555556e-05, + "num_tokens": 115673.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0939750000834465, + "rewards/env_reward/std": 0.024350004270672798, + "reward": 1.5939749479293823, + "reward_std": 0.024349967017769814, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6097765266895294, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0475, + "step": 95 + }, + { + "loss": 0.0207, + "grad_norm": 0.062468525022268295, + "learning_rate": 4.5444444444444444e-05, + "num_tokens": 116917.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0908999964594841, + "rewards/env_reward/std": 0.0, + "reward": 1.59089994430542, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.20688144862651825, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.048, + "step": 96 + }, + { + "loss": 0.0827, + "grad_norm": 6.658811569213867, + "learning_rate": 4.5333333333333335e-05, + "num_tokens": 118131.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.163100004196167, + "rewards/env_reward/std": 0.018706146627664566, + "reward": 1.336899995803833, + "reward_std": 0.018706224858760834, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8271568417549133, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0485, + "step": 97 + }, + { + "loss": 0.0433, + "grad_norm": 0.15103508532047272, + "learning_rate": 4.522222222222223e-05, + "num_tokens": 119351.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12250000238418579, + "rewards/env_reward/std": 0.0, + "reward": 1.622499942779541, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4332352876663208, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.049, + "step": 98 + }, + { + "loss": 0.0089, + "grad_norm": 0.036531127989292145, + "learning_rate": 4.511111111111112e-05, + "num_tokens": 120571.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.08936262130737305, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0495, + "step": 99 + }, + { + "loss": 0.0212, + "grad_norm": 0.08901583403348923, + "learning_rate": 4.5e-05, + "num_tokens": 121791.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5421000123023987, + "rewards/env_reward/std": 0.0, + "reward": 2.042099952697754, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2119981050491333, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.05, + "step": 100 + }, + { + "loss": 0.2432, + "grad_norm": 66.02021026611328, + "learning_rate": 4.4888888888888894e-05, + "num_tokens": 123008.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5019000172615051, + "rewards/env_reward/std": 0.054399993270635605, + "reward": 2.0018999576568604, + "reward_std": 0.054399967193603516, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.4316359385848045, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0505, + "step": 101 + }, + { + "loss": 0.0245, + "grad_norm": 2.3296234607696533, + "learning_rate": 4.477777777777778e-05, + "num_tokens": 124224.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2127251625061035, + "rewards/env_reward/std": 1.4261500835418701, + "reward": 3.7127251625061035, + "reward_std": 1.4261500835418701, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.24477409571409225, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.051, + "step": 102 + }, + { + "loss": 0.0983, + "grad_norm": 9.382343292236328, + "learning_rate": 4.466666666666667e-05, + "num_tokens": 125443.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.573775053024292, + "rewards/env_reward/std": 0.14553172886371613, + "reward": 2.073775053024292, + "reward_std": 0.1455318182706833, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9832261502742767, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0515, + "step": 103 + }, + { + "loss": 0.114, + "grad_norm": 2.782944679260254, + "learning_rate": 4.4555555555555555e-05, + "num_tokens": 126662.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5668249726295471, + "rewards/env_reward/std": 0.27695000171661377, + "reward": 2.0668249130249023, + "reward_std": 0.2769499123096466, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1402944773435593, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.052, + "step": 104 + }, + { + "loss": 0.0581, + "grad_norm": 7.829272270202637, + "learning_rate": 4.4444444444444447e-05, + "num_tokens": 127877.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.1490999460220337, + "rewards/env_reward/std": 0.5705999732017517, + "reward": 2.6491000652313232, + "reward_std": 0.5706000328063965, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.580941379070282, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0525, + "step": 105 + }, + { + "loss": 0.0701, + "grad_norm": 27.5728759765625, + "learning_rate": 4.433333333333334e-05, + "num_tokens": 129096.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2233249992132187, + "rewards/env_reward/std": 0.05404999852180481, + "reward": 1.7233250141143799, + "reward_std": 0.05404997244477272, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7010266333818436, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.053, + "step": 106 + }, + { + "loss": 0.0109, + "grad_norm": 2.1162190437316895, + "learning_rate": 4.422222222222222e-05, + "num_tokens": 130316.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06030000001192093, + "rewards/env_reward/std": 0.026399999856948853, + "reward": 1.5602998733520508, + "reward_std": 0.026400011032819748, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.10948272794485092, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0535, + "step": 107 + }, + { + "loss": 0.0407, + "grad_norm": 11.704363822937012, + "learning_rate": 4.4111111111111114e-05, + "num_tokens": 131535.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12099999934434891, + "rewards/env_reward/std": 0.03739999607205391, + "reward": 1.6209999322891235, + "reward_std": 0.03739996626973152, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4071061462163925, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.054, + "step": 108 + }, + { + "loss": 0.0236, + "grad_norm": 0.09328442811965942, + "learning_rate": 4.4000000000000006e-05, + "num_tokens": 132751.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.6695001125335693, + "rewards/env_reward/std": 0.0, + "reward": 4.169500350952148, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.23597905039787292, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0545, + "step": 109 + }, + { + "loss": 0.0301, + "grad_norm": 0.0933489054441452, + "learning_rate": 4.388888888888889e-05, + "num_tokens": 133975.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12200000137090683, + "rewards/env_reward/std": 0.0, + "reward": 1.621999979019165, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.30128175020217896, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.055, + "step": 110 + }, + { + "loss": 0.0356, + "grad_norm": 7.820048809051514, + "learning_rate": 4.377777777777778e-05, + "num_tokens": 135194.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10917499661445618, + "rewards/env_reward/std": 0.04259517788887024, + "reward": 1.6091749668121338, + "reward_std": 0.04259520396590233, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3561866581439972, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0555, + "step": 111 + }, + { + "loss": 0.0152, + "grad_norm": 0.07931513339281082, + "learning_rate": 4.3666666666666666e-05, + "num_tokens": 136442.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5921000242233276, + "rewards/env_reward/std": 0.0, + "reward": 2.092100143432617, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.15175144374370575, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.056, + "step": 112 + }, + { + "loss": 0.0653, + "grad_norm": 9.840944290161133, + "learning_rate": 4.355555555555556e-05, + "num_tokens": 137657.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.1259751319885254, + "rewards/env_reward/std": 1.2256500720977783, + "reward": 3.6259751319885254, + "reward_std": 1.2256500720977783, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6528940796852112, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0565, + "step": 113 + }, + { + "loss": 0.0145, + "grad_norm": 0.17490524053573608, + "learning_rate": 4.344444444444445e-05, + "num_tokens": 138869.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.25099998712539673, + "rewards/env_reward/std": 0.0, + "reward": 1.750999927520752, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.14506137371063232, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.057, + "step": 114 + }, + { + "loss": 0.0443, + "grad_norm": 2.580396890640259, + "learning_rate": 4.3333333333333334e-05, + "num_tokens": 140089.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09184999763965607, + "rewards/env_reward/std": 0.029099998995661736, + "reward": 1.5918500423431396, + "reward_std": 0.029099982231855392, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.44321155548095703, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0575, + "step": 115 + }, + { + "loss": 0.0254, + "grad_norm": 0.03984629362821579, + "learning_rate": 4.3222222222222226e-05, + "num_tokens": 141309.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5141000151634216, + "rewards/env_reward/std": 0.0, + "reward": 2.0141000747680664, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.25392112135887146, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.058, + "step": 116 + }, + { + "loss": 0.0429, + "grad_norm": 0.04275592789053917, + "learning_rate": 4.311111111111111e-05, + "num_tokens": 142533.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1080000028014183, + "rewards/env_reward/std": 0.0, + "reward": 1.6080000400543213, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4292468726634979, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0585, + "step": 117 + }, + { + "loss": 0.0406, + "grad_norm": 0.03664204105734825, + "learning_rate": 4.3e-05, + "num_tokens": 143753.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09380000084638596, + "rewards/env_reward/std": 0.0, + "reward": 1.5937999486923218, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4057614803314209, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.059, + "step": 118 + }, + { + "loss": 0.0157, + "grad_norm": 0.020087143406271935, + "learning_rate": 4.2888888888888886e-05, + "num_tokens": 144993.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.705399990081787, + "rewards/env_reward/std": 0.0, + "reward": 4.205399990081787, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.15697520971298218, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0595, + "step": 119 + }, + { + "loss": 0.0447, + "grad_norm": 0.04034416750073433, + "learning_rate": 4.277777777777778e-05, + "num_tokens": 146209.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4903999865055084, + "rewards/env_reward/std": 0.0, + "reward": 1.990399956703186, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4468948245048523, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.06, + "step": 120 + }, + { + "loss": 0.0335, + "grad_norm": 0.026327671483159065, + "learning_rate": 4.266666666666667e-05, + "num_tokens": 147429.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4788999855518341, + "rewards/env_reward/std": 0.0, + "reward": 1.9788999557495117, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.334785133600235, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0605, + "step": 121 + }, + { + "loss": 0.0422, + "grad_norm": 0.02646907977759838, + "learning_rate": 4.255555555555556e-05, + "num_tokens": 148649.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08839999884366989, + "rewards/env_reward/std": 0.0, + "reward": 1.5884000062942505, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.422196626663208, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.061, + "step": 122 + }, + { + "loss": 0.5017, + "grad_norm": NaN, + "learning_rate": 4.2444444444444445e-05, + "num_tokens": 149864.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0749250054359436, + "rewards/env_reward/std": 0.0864500030875206, + "reward": 1.5749249458312988, + "reward_std": 0.08644998073577881, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 5.016612961888313, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0615, + "step": 123 + }, + { + "loss": 0.0299, + "grad_norm": 0.03146585077047348, + "learning_rate": 4.2444444444444445e-05, + "num_tokens": 151080.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.3415000438690186, + "rewards/env_reward/std": 0.0, + "reward": 3.8415000438690186, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2987973093986511, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.062, + "step": 124 + }, + { + "loss": 0.0243, + "grad_norm": 0.020708182826638222, + "learning_rate": 4.233333333333334e-05, + "num_tokens": 152300.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.16120000183582306, + "rewards/env_reward/std": 0.0, + "reward": 1.6612000465393066, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.24254640936851501, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0625, + "step": 125 + }, + { + "loss": 0.0815, + "grad_norm": 0.03530203923583031, + "learning_rate": 4.222222222222222e-05, + "num_tokens": 153524.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.22169999778270721, + "rewards/env_reward/std": 0.0, + "reward": 1.2783000469207764, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.8152614235877991, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.063, + "step": 126 + }, + { + "loss": 0.5948, + "grad_norm": NaN, + "learning_rate": 4.211111111111111e-05, + "num_tokens": 154743.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.15600000321865082, + "rewards/env_reward/std": 0.0, + "reward": 1.656000018119812, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 5.947892501950264, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0635, + "step": 127 + }, + { + "loss": 0.0188, + "grad_norm": 0.011080100201070309, + "learning_rate": 4.211111111111111e-05, + "num_tokens": 155959.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.18799710273742676, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.064, + "step": 128 + }, + { + "loss": 0.031, + "grad_norm": 0.02280164510011673, + "learning_rate": 4.2e-05, + "num_tokens": 157179.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.31020915508270264, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0645, + "step": 129 + }, + { + "loss": 0.0566, + "grad_norm": 0.06029680743813515, + "learning_rate": 4.188888888888889e-05, + "num_tokens": 158395.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.07129999995231628, + "rewards/env_reward/std": 0.0, + "reward": 1.4286999702453613, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.5657824277877808, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.065, + "step": 130 + }, + { + "loss": 1.6834, + "grad_norm": 317.1106262207031, + "learning_rate": 4.177777777777778e-05, + "num_tokens": 159641.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.45752501487731934, + "rewards/env_reward/std": 0.028950003907084465, + "reward": 1.9575250148773193, + "reward_std": 0.028950056061148643, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 16.833803206682205, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0655, + "step": 131 + }, + { + "loss": 1.0988, + "grad_norm": NaN, + "learning_rate": 4.166666666666667e-05, + "num_tokens": 160860.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.12665000557899475, + "rewards/env_reward/std": 0.35850000381469727, + "reward": 1.3733500242233276, + "reward_std": 0.35850000381469727, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 10.988276034593582, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.066, + "step": 132 + }, + { + "loss": 1.4756, + "grad_norm": 285.3789978027344, + "learning_rate": 4.166666666666667e-05, + "num_tokens": 162077.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3772500157356262, + "rewards/env_reward/std": 0.2277943193912506, + "reward": 1.8772499561309814, + "reward_std": 0.22779428958892822, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 14.75564831495285, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0665, + "step": 133 + }, + { + "loss": 0.012, + "grad_norm": 0.01531064510345459, + "learning_rate": 4.155555555555556e-05, + "num_tokens": 163297.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.119660884141922, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.067, + "step": 134 + }, + { + "loss": 0.0384, + "grad_norm": 0.042683765292167664, + "learning_rate": 4.144444444444445e-05, + "num_tokens": 164517.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.38960000872612, + "rewards/env_reward/std": 0.0, + "reward": 1.8896000385284424, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3839707374572754, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0675, + "step": 135 + }, + { + "loss": 0.0353, + "grad_norm": 0.027836784720420837, + "learning_rate": 4.133333333333333e-05, + "num_tokens": 165737.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.593999981880188, + "rewards/env_reward/std": 0.0, + "reward": 2.0939998626708984, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3534739017486572, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.068, + "step": 136 + }, + { + "loss": 0.3368, + "grad_norm": 24.851154327392578, + "learning_rate": 4.1222222222222224e-05, + "num_tokens": 166960.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.200200080871582, + "rewards/env_reward/std": 1.5382000207901, + "reward": 3.700200080871582, + "reward_std": 1.5382001399993896, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 3.368258073925972, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0685, + "step": 137 + }, + { + "loss": 0.046, + "grad_norm": 0.05512086674571037, + "learning_rate": 4.111111111111111e-05, + "num_tokens": 168176.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.74180006980896, + "rewards/env_reward/std": 0.0, + "reward": 4.241800308227539, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.45961225032806396, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.069, + "step": 138 + }, + { + "loss": 0.0351, + "grad_norm": 0.0317252017557621, + "learning_rate": 4.1e-05, + "num_tokens": 169396.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.27810001373291, + "rewards/env_reward/std": 0.0, + "reward": 3.77810001373291, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3511863350868225, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0695, + "step": 139 + }, + { + "loss": 0.1634, + "grad_norm": 9.790648460388184, + "learning_rate": 4.088888888888889e-05, + "num_tokens": 170639.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13592499494552612, + "rewards/env_reward/std": 0.07474999874830246, + "reward": 1.635925054550171, + "reward_std": 0.07474998384714127, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.633728265762329, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.07, + "step": 140 + }, + { + "loss": 0.0719, + "grad_norm": 10.20471477508545, + "learning_rate": 4.0777777777777783e-05, + "num_tokens": 171858.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11779999732971191, + "rewards/env_reward/std": 0.0, + "reward": 1.617799997329712, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.7185821086168289, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0705, + "step": 141 + }, + { + "loss": 0.0609, + "grad_norm": 0.04811082035303116, + "learning_rate": 4.066666666666667e-05, + "num_tokens": 173074.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.01860000006854534, + "rewards/env_reward/std": 0.0, + "reward": 1.5185999870300293, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.6092149615287781, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.071, + "step": 142 + }, + { + "loss": 0.0715, + "grad_norm": 10.69751262664795, + "learning_rate": 4.055555555555556e-05, + "num_tokens": 174289.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.9459749460220337, + "rewards/env_reward/std": 1.3590500354766846, + "reward": 3.445974826812744, + "reward_std": 1.3590497970581055, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7153288573026657, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0715, + "step": 143 + }, + { + "loss": 0.0376, + "grad_norm": 0.038950882852077484, + "learning_rate": 4.0444444444444444e-05, + "num_tokens": 175505.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.43720000982284546, + "rewards/env_reward/std": 0.0, + "reward": 1.9372000694274902, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.37605082988739014, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.072, + "step": 144 + }, + { + "loss": 0.0852, + "grad_norm": 21.775800704956055, + "learning_rate": 4.0333333333333336e-05, + "num_tokens": 176724.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13734997808933258, + "rewards/env_reward/std": 0.5797000527381897, + "reward": 1.6373498439788818, + "reward_std": 0.5796998739242554, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8518322631716728, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0725, + "step": 145 + }, + { + "loss": 0.0286, + "grad_norm": 0.04734448343515396, + "learning_rate": 4.022222222222222e-05, + "num_tokens": 177944.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.21299999952316284, + "rewards/env_reward/std": 0.0, + "reward": 1.7130000591278076, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2863689064979553, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.073, + "step": 146 + }, + { + "loss": 0.0756, + "grad_norm": 18.95853042602539, + "learning_rate": 4.011111111111111e-05, + "num_tokens": 179159.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.12667499482631683, + "rewards/env_reward/std": 0.16345000267028809, + "reward": 1.373324990272522, + "reward_std": 0.16345000267028809, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7564148157835007, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0735, + "step": 147 + }, + { + "loss": 0.0258, + "grad_norm": 0.031110836192965508, + "learning_rate": 4e-05, + "num_tokens": 180375.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.9486000537872314, + "rewards/env_reward/std": 0.0, + "reward": 4.448599815368652, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2578928470611572, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.074, + "step": 148 + }, + { + "loss": 0.0471, + "grad_norm": 16.788101196289062, + "learning_rate": 3.9888888888888895e-05, + "num_tokens": 181594.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4031500220298767, + "rewards/env_reward/std": 0.4043000340461731, + "reward": 1.903149962425232, + "reward_std": 0.4043000638484955, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4707563817501068, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0745, + "step": 149 + }, + { + "loss": 0.0301, + "grad_norm": 0.05600623041391373, + "learning_rate": 3.977777777777778e-05, + "num_tokens": 182814.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11779999732971191, + "rewards/env_reward/std": 0.0, + "reward": 1.617799997329712, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.30064845085144043, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.075, + "step": 150 + }, + { + "loss": 0.0347, + "grad_norm": 15.009771347045898, + "learning_rate": 3.966666666666667e-05, + "num_tokens": 184033.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.44749999046325684, + "rewards/env_reward/std": 0.054399993270635605, + "reward": 1.9474999904632568, + "reward_std": 0.054399967193603516, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.34688687324523926, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0755, + "step": 151 + }, + { + "loss": 0.026, + "grad_norm": 14.952474594116211, + "learning_rate": 3.9555555555555556e-05, + "num_tokens": 185252.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06487499922513962, + "rewards/env_reward/std": 0.027149999514222145, + "reward": 1.5648750066757202, + "reward_std": 0.027149956673383713, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.25970758497714996, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.076, + "step": 152 + }, + { + "loss": 0.1091, + "grad_norm": 15.009089469909668, + "learning_rate": 3.944444444444445e-05, + "num_tokens": 186471.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1564750075340271, + "rewards/env_reward/std": 0.2308499962091446, + "reward": 1.3435250520706177, + "reward_std": 0.2308499962091446, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0911710932850838, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0765, + "step": 153 + }, + { + "loss": 0.0086, + "grad_norm": 0.06202450767159462, + "learning_rate": 3.933333333333333e-05, + "num_tokens": 187719.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.16519999504089355, + "rewards/env_reward/std": 0.0, + "reward": 1.6651999950408936, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.08641213178634644, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.077, + "step": 154 + }, + { + "loss": 0.0239, + "grad_norm": 0.06951211392879486, + "learning_rate": 3.922222222222223e-05, + "num_tokens": 188939.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08739999681711197, + "rewards/env_reward/std": 0.0, + "reward": 1.587399959564209, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.23888960480690002, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0775, + "step": 155 + }, + { + "loss": 0.0192, + "grad_norm": 0.0887007862329483, + "learning_rate": 3.9111111111111115e-05, + "num_tokens": 190155.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.19150155782699585, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.078, + "step": 156 + }, + { + "loss": 0.0499, + "grad_norm": 17.98279571533203, + "learning_rate": 3.9000000000000006e-05, + "num_tokens": 191374.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11112499237060547, + "rewards/env_reward/std": 0.03234999626874924, + "reward": 1.6111249923706055, + "reward_std": 0.032350022345781326, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4987874999642372, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0785, + "step": 157 + }, + { + "loss": 0.0095, + "grad_norm": 0.2604275941848755, + "learning_rate": 3.888888888888889e-05, + "num_tokens": 192586.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11249999701976776, + "rewards/env_reward/std": 0.0, + "reward": 1.6124999523162842, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.09546057879924774, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.079, + "step": 158 + }, + { + "loss": 0.0635, + "grad_norm": 7.868729114532471, + "learning_rate": 3.877777777777778e-05, + "num_tokens": 193805.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1739250123500824, + "rewards/env_reward/std": 0.12384999543428421, + "reward": 1.6739249229431152, + "reward_std": 0.12384998798370361, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.635490357875824, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0795, + "step": 159 + }, + { + "loss": 0.0751, + "grad_norm": 8.487398147583008, + "learning_rate": 3.866666666666667e-05, + "num_tokens": 195019.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6370000243186951, + "rewards/env_reward/std": 0.4801245331764221, + "reward": 2.13700008392334, + "reward_std": 0.4801245927810669, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.751496285200119, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.08, + "step": 160 + }, + { + "loss": 0.0911, + "grad_norm": 5.337852954864502, + "learning_rate": 3.855555555555556e-05, + "num_tokens": 196237.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.36442500352859497, + "rewards/env_reward/std": 0.24691858887672424, + "reward": 1.8644250631332397, + "reward_std": 0.24691854417324066, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9107427895069122, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0805, + "step": 161 + }, + { + "loss": 0.0027, + "grad_norm": 3.4733355045318604, + "learning_rate": 3.844444444444444e-05, + "num_tokens": 197449.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.49985000491142273, + "rewards/env_reward/std": 0.7634302973747253, + "reward": 1.9998500347137451, + "reward_std": 0.7634302377700806, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.027130742906592786, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.081, + "step": 162 + }, + { + "loss": 0.1171, + "grad_norm": 11.301681518554688, + "learning_rate": 3.8333333333333334e-05, + "num_tokens": 198671.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3044999837875366, + "rewards/env_reward/std": 0.2737325131893158, + "reward": 1.8045001029968262, + "reward_std": 0.27373257279396057, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1712135076522827, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0815, + "step": 163 + }, + { + "loss": 0.0945, + "grad_norm": 4.037586212158203, + "learning_rate": 3.8222222222222226e-05, + "num_tokens": 199885.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.31325000524520874, + "rewards/env_reward/std": 0.40956011414527893, + "reward": 1.813249945640564, + "reward_std": 0.40956008434295654, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9451059624552727, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.082, + "step": 164 + }, + { + "loss": 0.1085, + "grad_norm": 6.316772937774658, + "learning_rate": 3.811111111111112e-05, + "num_tokens": 201098.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.448199987411499, + "rewards/env_reward/std": 0.2181999832391739, + "reward": 1.9482001066207886, + "reward_std": 0.21819999814033508, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0854874588549137, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0825, + "step": 165 + }, + { + "loss": 0.1035, + "grad_norm": 5.041471004486084, + "learning_rate": 3.8e-05, + "num_tokens": 202340.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.14730000495910645, + "rewards/env_reward/std": 0.5735984444618225, + "reward": 1.6473000049591064, + "reward_std": 0.5735983848571777, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0352106094360352, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.083, + "step": 166 + }, + { + "loss": 0.1103, + "grad_norm": 9.183646202087402, + "learning_rate": 3.7888888888888894e-05, + "num_tokens": 203553.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6046750545501709, + "rewards/env_reward/std": 1.2963500022888184, + "reward": 2.104675054550171, + "reward_std": 1.2963500022888184, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.103184774518013, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0835, + "step": 167 + }, + { + "loss": 0.0571, + "grad_norm": 7.774601459503174, + "learning_rate": 3.777777777777778e-05, + "num_tokens": 204772.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.536175012588501, + "rewards/env_reward/std": 0.05065000429749489, + "reward": 2.036175012588501, + "reward_std": 0.050650037825107574, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5715078748762608, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.084, + "step": 168 + }, + { + "loss": 0.0006, + "grad_norm": 0.020840495824813843, + "learning_rate": 3.766666666666667e-05, + "num_tokens": 205984.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.38589999079704285, + "rewards/env_reward/std": 0.0, + "reward": 1.8859000205993652, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.0058740577660501, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0845, + "step": 169 + }, + { + "loss": 0.1056, + "grad_norm": 9.08940601348877, + "learning_rate": 3.7555555555555554e-05, + "num_tokens": 207201.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13997501134872437, + "rewards/env_reward/std": 0.24175000190734863, + "reward": 1.6399749517440796, + "reward_std": 0.24174998700618744, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0561846308410168, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.085, + "step": 170 + }, + { + "loss": 0.1105, + "grad_norm": 9.16537857055664, + "learning_rate": 3.7444444444444446e-05, + "num_tokens": 208418.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.16380000114440918, + "rewards/env_reward/std": 0.25220000743865967, + "reward": 1.6638000011444092, + "reward_std": 0.2521999776363373, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1045464426279068, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0855, + "step": 171 + }, + { + "loss": 0.0097, + "grad_norm": 2.578529119491577, + "learning_rate": 3.733333333333334e-05, + "num_tokens": 209630.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.30320000648498535, + "rewards/env_reward/std": 0.09060001373291016, + "reward": 1.8032000064849854, + "reward_std": 0.09060001373291016, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.09665492875501513, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.086, + "step": 172 + }, + { + "loss": 0.0034, + "grad_norm": 3.320836305618286, + "learning_rate": 3.722222222222222e-05, + "num_tokens": 210846.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19130000472068787, + "rewards/env_reward/std": 0.04619999974966049, + "reward": 1.6913000345230103, + "reward_std": 0.046199996024370193, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.03439147397875786, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0865, + "step": 173 + }, + { + "loss": 0.0757, + "grad_norm": 9.950850486755371, + "learning_rate": 3.7111111111111113e-05, + "num_tokens": 212060.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4722000360488892, + "rewards/env_reward/std": 1.7641515731811523, + "reward": 2.9721999168395996, + "reward_std": 1.7641514539718628, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7566990703344345, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.087, + "step": 174 + }, + { + "loss": 0.0018, + "grad_norm": 3.615767478942871, + "learning_rate": 3.7e-05, + "num_tokens": 213272.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.18514999747276306, + "rewards/env_reward/std": 0.20029999315738678, + "reward": 1.3148499727249146, + "reward_std": 0.20029997825622559, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.018229197012260556, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0875, + "step": 175 + }, + { + "loss": 0.0004, + "grad_norm": 0.01711435616016388, + "learning_rate": 3.688888888888889e-05, + "num_tokens": 214484.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.04910000041127205, + "rewards/env_reward/std": 0.0, + "reward": 1.4509000778198242, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.003533522365614772, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.088, + "step": 176 + }, + { + "loss": 0.1084, + "grad_norm": 3.395838737487793, + "learning_rate": 3.677777777777778e-05, + "num_tokens": 215701.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.49125000834465027, + "rewards/env_reward/std": 0.42663922905921936, + "reward": 1.9912500381469727, + "reward_std": 0.42663925886154175, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0841109305620193, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0885, + "step": 177 + }, + { + "loss": 0.1782, + "grad_norm": 3.345154047012329, + "learning_rate": 3.6666666666666666e-05, + "num_tokens": 216917.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.07502499967813492, + "rewards/env_reward/std": 0.2228500097990036, + "reward": 1.4249749183654785, + "reward_std": 0.22284995019435883, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 1.782272845506668, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.089, + "step": 178 + }, + { + "loss": 0.148, + "grad_norm": 10.321990013122559, + "learning_rate": 3.655555555555556e-05, + "num_tokens": 218130.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8711000084877014, + "rewards/env_reward/std": 1.4336090087890625, + "reward": 2.3710999488830566, + "reward_std": 1.4336090087890625, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.4800196141004562, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0895, + "step": 179 + }, + { + "loss": 0.0863, + "grad_norm": 8.806055068969727, + "learning_rate": 3.644444444444445e-05, + "num_tokens": 219343.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7229999899864197, + "rewards/env_reward/std": 1.5180000066757202, + "reward": 2.2230000495910645, + "reward_std": 1.5180000066757202, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8633862249553204, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.09, + "step": 180 + }, + { + "loss": 0.1071, + "grad_norm": 8.860750198364258, + "learning_rate": 3.633333333333333e-05, + "num_tokens": 220556.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7496999502182007, + "rewards/env_reward/std": 1.5017735958099365, + "reward": 2.2497000694274902, + "reward_std": 1.501773476600647, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0707316398620605, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0905, + "step": 181 + }, + { + "loss": 0.0593, + "grad_norm": 9.149721145629883, + "learning_rate": 3.6222222222222225e-05, + "num_tokens": 221774.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.2278499603271484, + "rewards/env_reward/std": 1.6160610914230347, + "reward": 2.7278499603271484, + "reward_std": 1.6160610914230347, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5934920236468315, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.091, + "step": 182 + }, + { + "loss": 0.0992, + "grad_norm": 5.919835090637207, + "learning_rate": 3.611111111111111e-05, + "num_tokens": 222995.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.25929999351501465, + "rewards/env_reward/std": 0.2668980360031128, + "reward": 1.7592999935150146, + "reward_std": 0.2668980360031128, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9921397641301155, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0915, + "step": 183 + }, + { + "loss": 0.0752, + "grad_norm": 8.201157569885254, + "learning_rate": 3.6e-05, + "num_tokens": 224212.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6631500124931335, + "rewards/env_reward/std": 1.2741000652313232, + "reward": 2.1631500720977783, + "reward_std": 1.2740998268127441, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7523262202739716, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.092, + "step": 184 + }, + { + "loss": 0.0011, + "grad_norm": 2.42414927482605, + "learning_rate": 3.5888888888888886e-05, + "num_tokens": 225424.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.016649994999170303, + "rewards/env_reward/std": 0.1965000033378601, + "reward": 1.5166499614715576, + "reward_std": 0.1964999884366989, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.0112856529885903, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0925, + "step": 185 + }, + { + "loss": 0.0532, + "grad_norm": 9.443381309509277, + "learning_rate": 3.577777777777778e-05, + "num_tokens": 226643.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11549999564886093, + "rewards/env_reward/std": 0.009599998593330383, + "reward": 1.6154999732971191, + "reward_std": 0.009600004181265831, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5324121937155724, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.093, + "step": 186 + }, + { + "loss": 0.0581, + "grad_norm": 4.851520538330078, + "learning_rate": 3.566666666666667e-05, + "num_tokens": 227857.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2538999915122986, + "rewards/env_reward/std": 0.20722638070583344, + "reward": 1.7539000511169434, + "reward_std": 0.2072264403104782, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5809661895036697, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0935, + "step": 187 + }, + { + "loss": 0.0899, + "grad_norm": 8.911284446716309, + "learning_rate": 3.555555555555556e-05, + "num_tokens": 229075.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09845000505447388, + "rewards/env_reward/std": 0.061433784663677216, + "reward": 1.598449945449829, + "reward_std": 0.061433833092451096, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8985702842473984, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.094, + "step": 188 + }, + { + "loss": 0.0026, + "grad_norm": 7.717309951782227, + "learning_rate": 3.5444444444444445e-05, + "num_tokens": 230317.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6454499959945679, + "rewards/env_reward/std": 0.48630210757255554, + "reward": 2.1454498767852783, + "reward_std": 0.4863020181655884, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.02618086338043213, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0945, + "step": 189 + }, + { + "loss": 0.0833, + "grad_norm": 3.0679268836975098, + "learning_rate": 3.5333333333333336e-05, + "num_tokens": 231534.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.28885000944137573, + "rewards/env_reward/std": 0.4101759195327759, + "reward": 1.7888500690460205, + "reward_std": 0.4101759195327759, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8325570225715637, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.095, + "step": 190 + }, + { + "loss": 0.1207, + "grad_norm": 6.385967254638672, + "learning_rate": 3.522222222222222e-05, + "num_tokens": 232747.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6820499897003174, + "rewards/env_reward/std": 0.5525884628295898, + "reward": 2.1820499897003174, + "reward_std": 0.5525885224342346, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.2066741809248924, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0955, + "step": 191 + }, + { + "loss": 0.0654, + "grad_norm": 7.2965192794799805, + "learning_rate": 3.511111111111111e-05, + "num_tokens": 233961.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03577499836683273, + "rewards/env_reward/std": 0.09620396047830582, + "reward": 1.5357749462127686, + "reward_std": 0.09620393812656403, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6539988666772842, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.096, + "step": 192 + }, + { + "loss": 0.035, + "grad_norm": 9.981371879577637, + "learning_rate": 3.5e-05, + "num_tokens": 235184.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.225299835205078, + "rewards/env_reward/std": 1.3604000806808472, + "reward": 3.725299835205078, + "reward_std": 1.3603999614715576, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3499831482768059, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0965, + "step": 193 + }, + { + "loss": 0.0672, + "grad_norm": 8.588502883911133, + "learning_rate": 3.4888888888888895e-05, + "num_tokens": 236401.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6177250146865845, + "rewards/env_reward/std": 1.22762131690979, + "reward": 2.117724895477295, + "reward_std": 1.22762131690979, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6720898002386093, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.097, + "step": 194 + }, + { + "loss": 0.0034, + "grad_norm": 3.138629674911499, + "learning_rate": 3.477777777777778e-05, + "num_tokens": 237650.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 0.25, + "rewards/format_valid/std": 1.5, + "rewards/action_legal/mean": 0.125, + "rewards/action_legal/std": 0.75, + "rewards/env_reward/mean": -0.2300250232219696, + "rewards/env_reward/std": 1.8820770978927612, + "reward": 0.14497500658035278, + "reward_std": 4.112740993499756, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.03350692242383957, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0975, + "step": 195 + }, + { + "loss": 0.0167, + "grad_norm": 6.246030330657959, + "learning_rate": 3.466666666666667e-05, + "num_tokens": 238865.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3423500061035156, + "rewards/env_reward/std": 0.4099000096321106, + "reward": 1.8423500061035156, + "reward_std": 0.4099000096321106, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.16748806089162827, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.098, + "step": 196 + }, + { + "loss": 0.0006, + "grad_norm": 0.06681685894727707, + "learning_rate": 3.4555555555555556e-05, + "num_tokens": 240081.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2660999894142151, + "rewards/env_reward/std": 0.0, + "reward": 1.7660999298095703, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.005917628761380911, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0985, + "step": 197 + }, + { + "loss": 0.009, + "grad_norm": 2.279668092727661, + "learning_rate": 3.444444444444445e-05, + "num_tokens": 241293.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8304250240325928, + "rewards/env_reward/std": 0.6611500382423401, + "reward": 2.3304250240325928, + "reward_std": 0.6611500382423401, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.09011396765708923, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.099, + "step": 198 + }, + { + "loss": 0.0256, + "grad_norm": 13.054587364196777, + "learning_rate": 3.433333333333333e-05, + "num_tokens": 242512.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12422500550746918, + "rewards/env_reward/std": 0.0044499970972537994, + "reward": 1.6242250204086304, + "reward_std": 0.004450043197721243, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2554884999990463, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.0995, + "step": 199 + }, + { + "loss": 0.0225, + "grad_norm": 8.720185279846191, + "learning_rate": 3.4222222222222224e-05, + "num_tokens": 243734.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.3869000673294067, + "rewards/env_reward/std": 1.5606932640075684, + "reward": 2.886899948120117, + "reward_std": 1.5606932640075684, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2250380516052246, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1, + "step": 200 + }, + { + "loss": 0.0472, + "grad_norm": 10.663931846618652, + "learning_rate": 3.411111111111111e-05, + "num_tokens": 244948.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1264750063419342, + "rewards/env_reward/std": 0.15003366768360138, + "reward": 1.3735250234603882, + "reward_std": 0.15003369748592377, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.4722195148933679, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1005, + "step": 201 + }, + { + "loss": 0.0505, + "grad_norm": 8.990918159484863, + "learning_rate": 3.4000000000000007e-05, + "num_tokens": 246165.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.2725749015808105, + "rewards/env_reward/std": 1.4268337488174438, + "reward": 2.7725749015808105, + "reward_std": 1.4268337488174438, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5048771500587463, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.101, + "step": 202 + }, + { + "loss": 0.0222, + "grad_norm": 7.526683807373047, + "learning_rate": 3.388888888888889e-05, + "num_tokens": 247383.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2131499946117401, + "rewards/env_reward/std": 0.15301643311977386, + "reward": 1.7131500244140625, + "reward_std": 0.15301649272441864, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.22175872698426247, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1015, + "step": 203 + }, + { + "loss": 0.0001, + "grad_norm": 0.012850847095251083, + "learning_rate": 3.377777777777778e-05, + "num_tokens": 248595.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7128000259399414, + "rewards/env_reward/std": 0.0, + "reward": 2.2128000259399414, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.0014626781921833754, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.102, + "step": 204 + }, + { + "loss": 0.0268, + "grad_norm": 6.102322578430176, + "learning_rate": 3.366666666666667e-05, + "num_tokens": 249808.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6442500352859497, + "rewards/env_reward/std": 0.2973000109195709, + "reward": 2.14424991607666, + "reward_std": 0.2973000407218933, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2678939402103424, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1025, + "step": 205 + }, + { + "loss": 0.0511, + "grad_norm": 6.538288116455078, + "learning_rate": 3.355555555555556e-05, + "num_tokens": 251026.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.2204499989748001, + "rewards/env_reward/std": 0.024459702894091606, + "reward": 1.2795499563217163, + "reward_std": 0.02445964328944683, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5111789032816887, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.103, + "step": 206 + }, + { + "loss": 0.0408, + "grad_norm": 9.782583236694336, + "learning_rate": 3.3444444444444443e-05, + "num_tokens": 252240.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1491749882698059, + "rewards/env_reward/std": 0.03605110943317413, + "reward": 1.3508250713348389, + "reward_std": 0.036051150411367416, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4080735631287098, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1035, + "step": 207 + }, + { + "loss": 0.0322, + "grad_norm": 5.914154052734375, + "learning_rate": 3.3333333333333335e-05, + "num_tokens": 253457.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8079249858856201, + "rewards/env_reward/std": 0.40994998812675476, + "reward": 2.307924747467041, + "reward_std": 0.40994998812675476, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3219486065208912, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.104, + "step": 208 + }, + { + "loss": 0.0284, + "grad_norm": 2.2945775985717773, + "learning_rate": 3.322222222222222e-05, + "num_tokens": 254673.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.14419999718666077, + "rewards/env_reward/std": 0.30845096707344055, + "reward": 1.6441999673843384, + "reward_std": 0.30845096707344055, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.2838685214519501, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1045, + "step": 209 + }, + { + "loss": 0.0027, + "grad_norm": 0.059150148183107376, + "learning_rate": 3.311111111111112e-05, + "num_tokens": 255885.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.992900013923645, + "rewards/env_reward/std": 0.0, + "reward": 2.4928998947143555, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.02724589966237545, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.105, + "step": 210 + }, + { + "loss": 0.0065, + "grad_norm": 2.316431760787964, + "learning_rate": 3.3e-05, + "num_tokens": 257105.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6849249601364136, + "rewards/env_reward/std": 0.37295001745224, + "reward": 2.184924840927124, + "reward_std": 0.3729499578475952, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.06542554311454296, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1055, + "step": 211 + }, + { + "loss": 0.001, + "grad_norm": 0.02504754438996315, + "learning_rate": 3.2888888888888894e-05, + "num_tokens": 258321.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.04019999876618385, + "rewards/env_reward/std": 0.0, + "reward": 1.4598000049591064, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.009607851505279541, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.106, + "step": 212 + }, + { + "loss": 0.0834, + "grad_norm": 5.390646457672119, + "learning_rate": 3.277777777777778e-05, + "num_tokens": 259533.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.22612501680850983, + "rewards/env_reward/std": 0.2722500264644623, + "reward": 1.7261250019073486, + "reward_std": 0.2722500264644623, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.8344221711158752, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1065, + "step": 213 + }, + { + "loss": 0.073, + "grad_norm": 8.010059356689453, + "learning_rate": 3.266666666666667e-05, + "num_tokens": 260750.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.18207500874996185, + "rewards/env_reward/std": 0.17445001006126404, + "reward": 1.317924976348877, + "reward_std": 0.17444996535778046, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7295764461159706, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.107, + "step": 214 + }, + { + "loss": 0.0429, + "grad_norm": 9.196842193603516, + "learning_rate": 3.2555555555555555e-05, + "num_tokens": 261964.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.060099996626377106, + "rewards/env_reward/std": 0.08995117247104645, + "reward": 1.4398999214172363, + "reward_std": 0.08995116502046585, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.42901405692100525, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1075, + "step": 215 + }, + { + "loss": 0.049, + "grad_norm": 6.548834323883057, + "learning_rate": 3.2444444444444446e-05, + "num_tokens": 263177.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6177250146865845, + "rewards/env_reward/std": 0.3055500090122223, + "reward": 2.117724895477295, + "reward_std": 0.3055499792098999, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4904120974242687, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.108, + "step": 216 + }, + { + "loss": 0.0098, + "grad_norm": 0.0852428674697876, + "learning_rate": 3.233333333333333e-05, + "num_tokens": 264397.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4487000107765198, + "rewards/env_reward/std": 0.0, + "reward": 1.948699951171875, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.09779202938079834, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1085, + "step": 217 + }, + { + "loss": 0.0026, + "grad_norm": 0.020340105518698692, + "learning_rate": 3.222222222222223e-05, + "num_tokens": 265613.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.013700008392334, + "rewards/env_reward/std": 0.0, + "reward": 2.513700008392334, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.025670906528830528, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.109, + "step": 218 + }, + { + "loss": 0.0207, + "grad_norm": 5.19188928604126, + "learning_rate": 3.2111111111111114e-05, + "num_tokens": 266855.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6587499976158142, + "rewards/env_reward/std": 0.49077048897743225, + "reward": 2.158750057220459, + "reward_std": 0.49077045917510986, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.206882506608963, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1095, + "step": 219 + }, + { + "loss": 0.0066, + "grad_norm": 0.027376951649785042, + "learning_rate": 3.2000000000000005e-05, + "num_tokens": 268067.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.1727999448776245, + "rewards/env_reward/std": 0.0, + "reward": 2.672800064086914, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.06600229442119598, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.11, + "step": 220 + }, + { + "loss": 0.0653, + "grad_norm": 9.92841911315918, + "learning_rate": 3.188888888888889e-05, + "num_tokens": 269285.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.5943999886512756, + "rewards/env_reward/std": 0.7274900674819946, + "reward": 0.9056000709533691, + "reward_std": 0.7274901270866394, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.6533837057650089, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1105, + "step": 221 + }, + { + "loss": 0.0187, + "grad_norm": 0.040416963398456573, + "learning_rate": 3.177777777777778e-05, + "num_tokens": 270497.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.33410000801086426, + "rewards/env_reward/std": 0.0, + "reward": 1.1658999919891357, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.18729273974895477, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.111, + "step": 222 + }, + { + "loss": 0.0588, + "grad_norm": 5.750544548034668, + "learning_rate": 3.1666666666666666e-05, + "num_tokens": 271710.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5752999782562256, + "rewards/env_reward/std": 0.26019999384880066, + "reward": 2.0752999782562256, + "reward_std": 0.26020002365112305, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5881490483880043, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1115, + "step": 223 + }, + { + "loss": 0.113, + "grad_norm": 8.916067123413086, + "learning_rate": 3.155555555555556e-05, + "num_tokens": 272927.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.020275000482797623, + "rewards/env_reward/std": 0.038450002670288086, + "reward": 1.4797250032424927, + "reward_std": 0.038450002670288086, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1298761367797852, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.112, + "step": 224 + }, + { + "loss": 0.0048, + "grad_norm": 0.035957083106040955, + "learning_rate": 3.144444444444445e-05, + "num_tokens": 274143.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.035099983215332, + "rewards/env_reward/std": 0.0, + "reward": 2.535099983215332, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.04778510332107544, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1125, + "step": 225 + }, + { + "loss": 0.0022, + "grad_norm": 0.01710173487663269, + "learning_rate": 3.1333333333333334e-05, + "num_tokens": 275363.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.9266999959945679, + "rewards/env_reward/std": 0.0, + "reward": 2.4267001152038574, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.022123416885733604, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.113, + "step": 226 + }, + { + "loss": 0.0062, + "grad_norm": 0.031508758664131165, + "learning_rate": 3.1222222222222225e-05, + "num_tokens": 276575.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6335999965667725, + "rewards/env_reward/std": 0.0, + "reward": 2.1335999965667725, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.06243076175451279, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1135, + "step": 227 + }, + { + "loss": 0.0029, + "grad_norm": 4.613099575042725, + "learning_rate": 3.111111111111111e-05, + "num_tokens": 277787.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.0832500085234642, + "rewards/env_reward/std": 0.22603262960910797, + "reward": 1.4167499542236328, + "reward_std": 0.22603262960910797, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.029001038521528244, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.114, + "step": 228 + }, + { + "loss": 0.0037, + "grad_norm": 0.026186607778072357, + "learning_rate": 3.1e-05, + "num_tokens": 279003.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0399999618530273, + "rewards/env_reward/std": 0.0, + "reward": 2.5399999618530273, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03700479120016098, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1145, + "step": 229 + }, + { + "loss": 0.0038, + "grad_norm": 0.018804756924510002, + "learning_rate": 3.088888888888889e-05, + "num_tokens": 280219.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5902000069618225, + "rewards/env_reward/std": 0.0, + "reward": 2.0901999473571777, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03775995969772339, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.115, + "step": 230 + }, + { + "loss": 0.0608, + "grad_norm": 7.023393630981445, + "learning_rate": 3.077777777777778e-05, + "num_tokens": 281436.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11394999921321869, + "rewards/env_reward/std": 0.22169999778270721, + "reward": 1.6139500141143799, + "reward_std": 0.2217000275850296, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6076318472623825, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1155, + "step": 231 + }, + { + "loss": 0.0438, + "grad_norm": 2.462205410003662, + "learning_rate": 3.066666666666667e-05, + "num_tokens": 282682.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.28862500190734863, + "rewards/env_reward/std": 0.37281543016433716, + "reward": 1.7886250019073486, + "reward_std": 0.37281543016433716, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4382362440228462, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.116, + "step": 232 + }, + { + "loss": 0.0013, + "grad_norm": 0.008755974471569061, + "learning_rate": 3.055555555555556e-05, + "num_tokens": 283898.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.036400001496076584, + "rewards/env_reward/std": 0.0, + "reward": 1.5363999605178833, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.012826403602957726, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1165, + "step": 233 + }, + { + "loss": 0.0897, + "grad_norm": 3.114485740661621, + "learning_rate": 3.044444444444445e-05, + "num_tokens": 285115.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.33834999799728394, + "rewards/env_reward/std": 0.29102057218551636, + "reward": 1.8383500576019287, + "reward_std": 0.29102060198783875, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8970653489232063, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.117, + "step": 234 + }, + { + "loss": 0.001, + "grad_norm": 0.006958460435271263, + "learning_rate": 3.0333333333333337e-05, + "num_tokens": 286331.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.31310001015663147, + "rewards/env_reward/std": 0.0, + "reward": 1.813099980354309, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.010390917770564556, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1175, + "step": 235 + }, + { + "loss": 0.0761, + "grad_norm": 5.068345546722412, + "learning_rate": 3.0222222222222225e-05, + "num_tokens": 287544.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6761999726295471, + "rewards/env_reward/std": 0.37300002574920654, + "reward": 2.1761999130249023, + "reward_std": 0.3729999363422394, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7609379142522812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.118, + "step": 236 + }, + { + "loss": 0.0307, + "grad_norm": 2.9388842582702637, + "learning_rate": 3.0111111111111113e-05, + "num_tokens": 288788.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03700000047683716, + "rewards/env_reward/std": 0.15161217749118805, + "reward": 1.5369999408721924, + "reward_std": 0.15161222219467163, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.3069487512111664, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1185, + "step": 237 + }, + { + "loss": 0.0455, + "grad_norm": 3.424729585647583, + "learning_rate": 3e-05, + "num_tokens": 290004.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.203000009059906, + "rewards/env_reward/std": 0.4246000349521637, + "reward": 1.2969999313354492, + "reward_std": 0.4246000051498413, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.45477381348609924, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.119, + "step": 238 + }, + { + "loss": 0.0037, + "grad_norm": 0.014718780294060707, + "learning_rate": 2.988888888888889e-05, + "num_tokens": 291220.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0318000316619873, + "rewards/env_reward/std": 0.0, + "reward": 2.5318000316619873, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03718819469213486, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1195, + "step": 239 + }, + { + "loss": 0.0032, + "grad_norm": 0.023474005982279778, + "learning_rate": 2.9777777777777777e-05, + "num_tokens": 292432.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1080000028014183, + "rewards/env_reward/std": 0.0, + "reward": 1.3919999599456787, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03223689645528793, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.12, + "step": 240 + }, + { + "loss": 0.0038, + "grad_norm": 0.01683114469051361, + "learning_rate": 2.9666666666666672e-05, + "num_tokens": 293648.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.031300000846385956, + "rewards/env_reward/std": 0.0, + "reward": 1.5312999486923218, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.038463495671749115, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1205, + "step": 241 + }, + { + "loss": 0.0649, + "grad_norm": 7.739398002624512, + "learning_rate": 2.955555555555556e-05, + "num_tokens": 294865.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.43119996786117554, + "rewards/env_reward/std": 0.19619999825954437, + "reward": 1.9312000274658203, + "reward_std": 0.19620005786418915, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6485915556550026, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.121, + "step": 242 + }, + { + "loss": 0.0063, + "grad_norm": 0.0356922373175621, + "learning_rate": 2.9444444444444448e-05, + "num_tokens": 296085.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.021800000220537186, + "rewards/env_reward/std": 0.0, + "reward": 1.5218000411987305, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.06295116990804672, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1215, + "step": 243 + }, + { + "loss": 0.0596, + "grad_norm": 5.311740875244141, + "learning_rate": 2.9333333333333336e-05, + "num_tokens": 297302.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2790749967098236, + "rewards/env_reward/std": 0.10045000165700912, + "reward": 1.7790749073028564, + "reward_std": 0.10045000165700912, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5959512740373611, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.122, + "step": 244 + }, + { + "loss": 0.0042, + "grad_norm": 0.01989293284714222, + "learning_rate": 2.9222222222222224e-05, + "num_tokens": 298514.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.792900025844574, + "rewards/env_reward/std": 0.0, + "reward": 2.2929000854492188, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.04173398017883301, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1225, + "step": 245 + }, + { + "loss": 0.0031, + "grad_norm": 0.007336912676692009, + "learning_rate": 2.9111111111111112e-05, + "num_tokens": 299726.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7305999994277954, + "rewards/env_reward/std": 0.0, + "reward": 2.230599880218506, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.031048377975821495, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.123, + "step": 246 + }, + { + "loss": 0.1736, + "grad_norm": 9.066658020019531, + "learning_rate": 2.9e-05, + "num_tokens": 300938.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.2737250030040741, + "rewards/env_reward/std": 0.12075000256299973, + "reward": 1.2262749671936035, + "reward_std": 0.12074998766183853, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 1.7364354468882084, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1235, + "step": 247 + }, + { + "loss": 0.0322, + "grad_norm": 3.391655683517456, + "learning_rate": 2.8888888888888888e-05, + "num_tokens": 302150.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17912501096725464, + "rewards/env_reward/std": 0.02304999530315399, + "reward": 1.6791250705718994, + "reward_std": 0.023049989715218544, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.32196859270334244, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.124, + "step": 248 + }, + { + "loss": 0.0347, + "grad_norm": 2.445664882659912, + "learning_rate": 2.877777777777778e-05, + "num_tokens": 303362.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3643999993801117, + "rewards/env_reward/std": 0.16140000522136688, + "reward": 1.864400029182434, + "reward_std": 0.16140003502368927, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.34707972407341003, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1245, + "step": 249 + }, + { + "loss": 0.0907, + "grad_norm": 8.821626663208008, + "learning_rate": 2.8666666666666668e-05, + "num_tokens": 304575.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6588499546051025, + "rewards/env_reward/std": 1.5048999786376953, + "reward": 2.1588497161865234, + "reward_std": 1.5048998594284058, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9067427404224873, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.125, + "step": 250 + }, + { + "loss": 0.0037, + "grad_norm": 0.010974357835948467, + "learning_rate": 2.855555555555556e-05, + "num_tokens": 305787.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3894999921321869, + "rewards/env_reward/std": 0.0, + "reward": 1.8895000219345093, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03672550246119499, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1255, + "step": 251 + }, + { + "loss": 0.0292, + "grad_norm": 3.933217763900757, + "learning_rate": 2.8444444444444447e-05, + "num_tokens": 306999.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1211249977350235, + "rewards/env_reward/std": 0.10684999823570251, + "reward": 1.6211249828338623, + "reward_std": 0.10684998333454132, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.2923688758164644, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.126, + "step": 252 + }, + { + "loss": 0.0503, + "grad_norm": 5.908778190612793, + "learning_rate": 2.8333333333333335e-05, + "num_tokens": 308241.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13635000586509705, + "rewards/env_reward/std": 0.009872685186564922, + "reward": 1.6363499164581299, + "reward_std": 0.00987267680466175, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5032139346003532, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1265, + "step": 253 + }, + { + "loss": 0.0028, + "grad_norm": 0.014478636905550957, + "learning_rate": 2.8222222222222223e-05, + "num_tokens": 309457.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.01720000058412552, + "rewards/env_reward/std": 0.0, + "reward": 1.517199993133545, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.0278167724609375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.127, + "step": 254 + }, + { + "loss": 0.0018, + "grad_norm": 0.010732459835708141, + "learning_rate": 2.811111111111111e-05, + "num_tokens": 310669.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5572999715805054, + "rewards/env_reward/std": 0.0, + "reward": 2.057300090789795, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.018153250217437744, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1275, + "step": 255 + }, + { + "loss": 0.0475, + "grad_norm": 6.396478176116943, + "learning_rate": 2.8000000000000003e-05, + "num_tokens": 311891.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4102250039577484, + "rewards/env_reward/std": 0.24782662093639374, + "reward": 1.9102249145507812, + "reward_std": 0.24782662093639374, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.47455649450421333, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.128, + "step": 256 + }, + { + "loss": 0.0102, + "grad_norm": 0.08098237961530685, + "learning_rate": 2.788888888888889e-05, + "num_tokens": 313103.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.664900004863739, + "rewards/env_reward/std": 0.0, + "reward": 2.164900064468384, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.10185647010803223, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1285, + "step": 257 + }, + { + "loss": 0.0108, + "grad_norm": 2.5040080547332764, + "learning_rate": 2.777777777777778e-05, + "num_tokens": 314319.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7252249717712402, + "rewards/env_reward/std": 0.4409500062465668, + "reward": 2.2252249717712402, + "reward_std": 0.440949946641922, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.10804228484630585, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.129, + "step": 258 + }, + { + "loss": 0.01, + "grad_norm": 2.200788736343384, + "learning_rate": 2.7666666666666667e-05, + "num_tokens": 315531.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7825750112533569, + "rewards/env_reward/std": 0.48044997453689575, + "reward": 2.2825751304626465, + "reward_std": 0.48044994473457336, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.09993956610560417, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1295, + "step": 259 + }, + { + "loss": 0.0588, + "grad_norm": 7.3923139572143555, + "learning_rate": 2.7555555555555555e-05, + "num_tokens": 316748.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4410000145435333, + "rewards/env_reward/std": 0.43780001997947693, + "reward": 1.940999984741211, + "reward_std": 0.43779999017715454, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.588453009724617, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.13, + "step": 260 + }, + { + "loss": 0.0036, + "grad_norm": 0.051993146538734436, + "learning_rate": 2.7444444444444443e-05, + "num_tokens": 317960.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.03790000081062317, + "rewards/env_reward/std": 0.0, + "reward": 1.4621000289916992, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03645044565200806, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1305, + "step": 261 + }, + { + "loss": 0.0301, + "grad_norm": 3.2377870082855225, + "learning_rate": 2.733333333333333e-05, + "num_tokens": 319176.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1784999966621399, + "rewards/env_reward/std": 0.21696822345256805, + "reward": 1.3215000629425049, + "reward_std": 0.21696823835372925, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.3008611798286438, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.131, + "step": 262 + }, + { + "loss": 0.0148, + "grad_norm": 0.13123686611652374, + "learning_rate": 2.7222222222222223e-05, + "num_tokens": 320392.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.14329999685287476, + "rewards/env_reward/std": 0.0, + "reward": 1.6433000564575195, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.14758411049842834, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1315, + "step": 263 + }, + { + "loss": 0.0522, + "grad_norm": 3.171525001525879, + "learning_rate": 2.7111111111111114e-05, + "num_tokens": 321628.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.20374999940395355, + "rewards/env_reward/std": 0.10271061211824417, + "reward": 1.7037500143051147, + "reward_std": 0.10271065682172775, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.5219772905111313, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.132, + "step": 264 + }, + { + "loss": 0.0283, + "grad_norm": 9.769635200500488, + "learning_rate": 2.7000000000000002e-05, + "num_tokens": 322847.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06989999860525131, + "rewards/env_reward/std": 0.11860000342130661, + "reward": 1.5699000358581543, + "reward_std": 0.11860001087188721, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2829606235027313, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1325, + "step": 265 + }, + { + "loss": 0.0067, + "grad_norm": 0.035419248044490814, + "learning_rate": 2.688888888888889e-05, + "num_tokens": 324063.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1979999989271164, + "rewards/env_reward/std": 0.0, + "reward": 1.6979999542236328, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.06715995073318481, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.133, + "step": 266 + }, + { + "loss": 0.0036, + "grad_norm": 0.03618660196661949, + "learning_rate": 2.677777777777778e-05, + "num_tokens": 325275.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17569999396800995, + "rewards/env_reward/std": 0.0, + "reward": 1.6756999492645264, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03648458048701286, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1335, + "step": 267 + }, + { + "loss": 0.0012, + "grad_norm": 0.00686953729018569, + "learning_rate": 2.6666666666666667e-05, + "num_tokens": 326487.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.03020000085234642, + "rewards/env_reward/std": 0.0, + "reward": 1.4697999954223633, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.0123521089553833, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.134, + "step": 268 + }, + { + "loss": 0.0084, + "grad_norm": 0.03367152810096741, + "learning_rate": 2.6555555555555555e-05, + "num_tokens": 327727.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.03959999978542328, + "rewards/env_reward/std": 0.0, + "reward": 1.4603999853134155, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.08355914056301117, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1345, + "step": 269 + }, + { + "loss": 0.0686, + "grad_norm": 13.220355987548828, + "learning_rate": 2.6444444444444443e-05, + "num_tokens": 328940.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12467499822378159, + "rewards/env_reward/std": 0.23714999854564667, + "reward": 1.6246750354766846, + "reward_std": 0.23714995384216309, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6856659427285194, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.135, + "step": 270 + }, + { + "loss": 0.0568, + "grad_norm": 13.717921257019043, + "learning_rate": 2.633333333333333e-05, + "num_tokens": 330157.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.0213250033557415, + "rewards/env_reward/std": 0.10735000669956207, + "reward": 1.4786748886108398, + "reward_std": 0.10734999179840088, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5684559792280197, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1355, + "step": 271 + }, + { + "loss": 0.0026, + "grad_norm": 0.015910228714346886, + "learning_rate": 2.6222222222222226e-05, + "num_tokens": 331373.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.04749999940395355, + "rewards/env_reward/std": 0.0, + "reward": 1.4524999856948853, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.025526802986860275, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.136, + "step": 272 + }, + { + "loss": 0.026, + "grad_norm": 14.18847942352295, + "learning_rate": 2.6111111111111114e-05, + "num_tokens": 332591.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.24905000627040863, + "rewards/env_reward/std": 0.2707195580005646, + "reward": 1.7490500211715698, + "reward_std": 0.2707195281982422, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.25982359051704407, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1365, + "step": 273 + }, + { + "loss": 0.0008, + "grad_norm": 0.011627077125012875, + "learning_rate": 2.6000000000000002e-05, + "num_tokens": 333807.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0943000316619873, + "rewards/env_reward/std": 0.0, + "reward": 2.5943000316619873, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.008376996032893658, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.137, + "step": 274 + }, + { + "loss": 0.0015, + "grad_norm": 0.025715822353959084, + "learning_rate": 2.588888888888889e-05, + "num_tokens": 335019.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.09470000118017197, + "rewards/env_reward/std": 0.0, + "reward": 1.4053000211715698, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.015157541260123253, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1375, + "step": 275 + }, + { + "loss": 0.0037, + "grad_norm": 0.042894382029771805, + "learning_rate": 2.5777777777777778e-05, + "num_tokens": 336235.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.019700000062584877, + "rewards/env_reward/std": 0.0, + "reward": 1.519700050354004, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.03723999112844467, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.138, + "step": 276 + }, + { + "loss": 0.0266, + "grad_norm": 0.18419010937213898, + "learning_rate": 2.5666666666666666e-05, + "num_tokens": 337483.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.05939999967813492, + "rewards/env_reward/std": 0.0, + "reward": 1.559399962425232, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.26566994190216064, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1385, + "step": 277 + }, + { + "loss": 0.009, + "grad_norm": 14.892497062683105, + "learning_rate": 2.5555555555555554e-05, + "num_tokens": 338701.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4440499544143677, + "rewards/env_reward/std": 1.7966563701629639, + "reward": 2.9440500736236572, + "reward_std": 1.7966562509536743, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.08950217813253403, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.139, + "step": 278 + }, + { + "loss": 0.0093, + "grad_norm": 7.610400199890137, + "learning_rate": 2.5444444444444442e-05, + "num_tokens": 339917.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.014999985694885254, + "rewards/env_reward/std": 0.5485435128211975, + "reward": 1.5150001049041748, + "reward_std": 0.5485435128211975, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.09344403445720673, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1395, + "step": 279 + }, + { + "loss": 0.0179, + "grad_norm": 2.456191062927246, + "learning_rate": 2.5333333333333337e-05, + "num_tokens": 341129.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5827499628067017, + "rewards/env_reward/std": 0.148499995470047, + "reward": 2.082750082015991, + "reward_std": 0.1485000103712082, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.17908194661140442, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.14, + "step": 280 + }, + { + "loss": 0.15, + "grad_norm": 11.619885444641113, + "learning_rate": 2.5222222222222225e-05, + "num_tokens": 342370.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.45319998264312744, + "rewards/env_reward/std": 1.5305131673812866, + "reward": 1.953200101852417, + "reward_std": 1.5305134057998657, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.4995090551674366, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1405, + "step": 281 + }, + { + "loss": 0.0366, + "grad_norm": 0.21036075055599213, + "learning_rate": 2.5111111111111113e-05, + "num_tokens": 343614.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.0576000213623047, + "rewards/env_reward/std": 0.0, + "reward": 2.5576000213623047, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.3662685751914978, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.141, + "step": 282 + }, + { + "loss": 0.0113, + "grad_norm": 8.74146842956543, + "learning_rate": 2.5e-05, + "num_tokens": 344828.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4428500235080719, + "rewards/env_reward/std": 0.15349557995796204, + "reward": 1.9428499937057495, + "reward_std": 0.1534956395626068, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.11289814859628677, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1415, + "step": 283 + }, + { + "loss": 0.0467, + "grad_norm": 4.497031211853027, + "learning_rate": 2.488888888888889e-05, + "num_tokens": 346041.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4057999849319458, + "rewards/env_reward/std": 1.4797999858856201, + "reward": 1.9057999849319458, + "reward_std": 1.4797998666763306, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4667878672480583, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.142, + "step": 284 + }, + { + "loss": 0.1547, + "grad_norm": 10.665767669677734, + "learning_rate": 2.477777777777778e-05, + "num_tokens": 347254.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2880000174045563, + "rewards/env_reward/std": 1.1435999870300293, + "reward": 1.7880001068115234, + "reward_std": 1.1435999870300293, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.5469659715890884, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1425, + "step": 285 + }, + { + "loss": 0.1671, + "grad_norm": 40.458736419677734, + "learning_rate": 2.466666666666667e-05, + "num_tokens": 348472.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3573499917984009, + "rewards/env_reward/std": 0.0411650724709034, + "reward": 1.8573499917984009, + "reward_std": 0.04116509109735489, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.6707031056284904, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.143, + "step": 286 + }, + { + "loss": 0.0732, + "grad_norm": 6.045069694519043, + "learning_rate": 2.4555555555555557e-05, + "num_tokens": 349684.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.439674973487854, + "rewards/env_reward/std": 0.4867240786552429, + "reward": 1.9396750926971436, + "reward_std": 0.4867240786552429, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.7324017907958478, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1435, + "step": 287 + }, + { + "loss": 0.303, + "grad_norm": 30.9366512298584, + "learning_rate": 2.4444444444444445e-05, + "num_tokens": 350902.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.23154999315738678, + "rewards/env_reward/std": 0.25824877619743347, + "reward": 1.7315499782562256, + "reward_std": 0.2582487463951111, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 3.029875487089157, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.144, + "step": 288 + }, + { + "loss": 0.0467, + "grad_norm": 3.2864396572113037, + "learning_rate": 2.4333333333333336e-05, + "num_tokens": 352114.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.299049973487854, + "rewards/env_reward/std": 0.22349999845027924, + "reward": 1.200950026512146, + "reward_std": 0.22350001335144043, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.467236191034317, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1445, + "step": 289 + }, + { + "loss": 0.0158, + "grad_norm": 0.14807219803333282, + "learning_rate": 2.4222222222222224e-05, + "num_tokens": 353354.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17890000343322754, + "rewards/env_reward/std": 0.0, + "reward": 1.6789000034332275, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.15768074989318848, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.145, + "step": 290 + }, + { + "loss": 0.2747, + "grad_norm": 36.407737731933594, + "learning_rate": 2.4111111111111113e-05, + "num_tokens": 354567.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.335999995470047, + "rewards/env_reward/std": 0.2505159378051758, + "reward": 1.1640000343322754, + "reward_std": 0.2505159080028534, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.7468889504671097, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1455, + "step": 291 + }, + { + "loss": 0.1787, + "grad_norm": 13.456348419189453, + "learning_rate": 2.4e-05, + "num_tokens": 355781.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.40004998445510864, + "rewards/env_reward/std": 0.11044710874557495, + "reward": 1.9000499248504639, + "reward_std": 0.11044712364673615, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.787174940109253, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.146, + "step": 292 + }, + { + "loss": 0.0, + "grad_norm": 0.012113215401768684, + "learning_rate": 2.3888888888888892e-05, + "num_tokens": 357001.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.024900000542402267, + "rewards/env_reward/std": 0.0, + "reward": 1.524899959564209, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.000368654727935791, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1465, + "step": 293 + }, + { + "loss": 0.0683, + "grad_norm": 3.7350590229034424, + "learning_rate": 2.377777777777778e-05, + "num_tokens": 358215.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.042649999260902405, + "rewards/env_reward/std": 0.11010069400072098, + "reward": 1.4573500156402588, + "reward_std": 0.11010072380304337, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6829928830265999, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.147, + "step": 294 + }, + { + "loss": 0.0682, + "grad_norm": 8.174044609069824, + "learning_rate": 2.3666666666666668e-05, + "num_tokens": 359456.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.030500005930662155, + "rewards/env_reward/std": 0.2613101601600647, + "reward": 1.530500054359436, + "reward_std": 0.2613101899623871, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6816707998514175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1475, + "step": 295 + }, + { + "loss": 0.0335, + "grad_norm": 13.81272029876709, + "learning_rate": 2.3555555555555556e-05, + "num_tokens": 360672.0, + "completions/mean_length": 4.0, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 0.25, + "rewards/format_valid/std": 1.5, + "rewards/action_legal/mean": 0.125, + "rewards/action_legal/std": 0.75, + "rewards/env_reward/mean": -0.808650016784668, + "rewards/env_reward/std": 1.4733041524887085, + "reward": -0.43365007638931274, + "reward_std": 3.7158007621765137, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.33455391973257065, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.148, + "step": 296 + }, + { + "loss": 0.0277, + "grad_norm": 1.6418352127075195, + "learning_rate": 2.3444444444444448e-05, + "num_tokens": 361888.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09337499737739563, + "rewards/env_reward/std": 0.007350001484155655, + "reward": 1.5933749675750732, + "reward_std": 0.00735000753775239, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2769545763731003, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1485, + "step": 297 + }, + { + "loss": 0.0079, + "grad_norm": 12.594855308532715, + "learning_rate": 2.3333333333333336e-05, + "num_tokens": 363102.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.166100025177002, + "rewards/env_reward/std": 1.2303334474563599, + "reward": 2.666100025177002, + "reward_std": 1.2303334474563599, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.07933932542800903, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.149, + "step": 298 + }, + { + "loss": 0.0161, + "grad_norm": 3.4360291957855225, + "learning_rate": 2.3222222222222224e-05, + "num_tokens": 364314.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3107999861240387, + "rewards/env_reward/std": 0.5266494154930115, + "reward": 1.8107999563217163, + "reward_std": 0.5266494154930115, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.16086430102586746, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1495, + "step": 299 + }, + { + "loss": 0.0199, + "grad_norm": 12.927040100097656, + "learning_rate": 2.3111111111111112e-05, + "num_tokens": 365532.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.22199998795986176, + "rewards/env_reward/std": 0.3220459520816803, + "reward": 1.722000002861023, + "reward_std": 0.3220460116863251, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.19921859353780746, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.15, + "step": 300 + }, + { + "loss": 0.0097, + "grad_norm": 11.500689506530762, + "learning_rate": 2.3000000000000003e-05, + "num_tokens": 366755.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.149049997329712, + "rewards/env_reward/std": 1.404900074005127, + "reward": 3.649049997329712, + "reward_std": 1.404900074005127, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.09673605859279633, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1505, + "step": 301 + }, + { + "loss": 0.0063, + "grad_norm": 3.8352203369140625, + "learning_rate": 2.288888888888889e-05, + "num_tokens": 367971.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2719250023365021, + "rewards/env_reward/std": 0.4681147336959839, + "reward": 1.7719249725341797, + "reward_std": 0.46811479330062866, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.0630109328776598, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.151, + "step": 302 + }, + { + "loss": 0.0224, + "grad_norm": 10.689901351928711, + "learning_rate": 2.277777777777778e-05, + "num_tokens": 369186.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17434999346733093, + "rewards/env_reward/std": 0.13670000433921814, + "reward": 1.6743500232696533, + "reward_std": 0.13669998943805695, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.22385764122009277, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1515, + "step": 303 + }, + { + "loss": 0.0436, + "grad_norm": 16.9063663482666, + "learning_rate": 2.2666666666666668e-05, + "num_tokens": 370404.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17260000109672546, + "rewards/env_reward/std": 0.1812880039215088, + "reward": 1.6726000308990479, + "reward_std": 0.1812879890203476, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4355325996875763, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.152, + "step": 304 + }, + { + "loss": 0.019, + "grad_norm": 10.453081130981445, + "learning_rate": 2.255555555555556e-05, + "num_tokens": 371622.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.5529749393463135, + "rewards/env_reward/std": 1.6722427606582642, + "reward": 3.0529751777648926, + "reward_std": 1.6722426414489746, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.19028642773628235, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1525, + "step": 305 + }, + { + "loss": 0.0007, + "grad_norm": 2.4359123706817627, + "learning_rate": 2.2444444444444447e-05, + "num_tokens": 372834.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.21952499449253082, + "rewards/env_reward/std": 0.2609499990940094, + "reward": 1.2804749011993408, + "reward_std": 0.2609499990940094, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.0068905456573702395, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.153, + "step": 306 + }, + { + "loss": 0.0293, + "grad_norm": 0.18433877825737, + "learning_rate": 2.2333333333333335e-05, + "num_tokens": 374050.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.23100000619888306, + "rewards/env_reward/std": 0.0, + "reward": 1.7309999465942383, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2930239737033844, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1535, + "step": 307 + }, + { + "loss": 0.0439, + "grad_norm": 7.052338600158691, + "learning_rate": 2.2222222222222223e-05, + "num_tokens": 375269.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.196974992752075, + "rewards/env_reward/std": 1.4914499521255493, + "reward": 3.696974992752075, + "reward_std": 1.4914498329162598, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.43932508677244186, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.154, + "step": 308 + }, + { + "loss": 0.0413, + "grad_norm": 7.099660873413086, + "learning_rate": 2.211111111111111e-05, + "num_tokens": 376484.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.0833749771118164, + "rewards/env_reward/std": 1.1990500688552856, + "reward": 3.5833749771118164, + "reward_std": 1.199049949645996, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.41285115480422974, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1545, + "step": 309 + }, + { + "loss": 0.2951, + "grad_norm": 31.55593490600586, + "learning_rate": 2.2000000000000003e-05, + "num_tokens": 377697.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7319250106811523, + "rewards/env_reward/std": 1.3156499862670898, + "reward": 2.2319250106811523, + "reward_std": 1.3156499862670898, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.950943909585476, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.155, + "step": 310 + }, + { + "loss": 0.0574, + "grad_norm": 4.8634724617004395, + "learning_rate": 2.188888888888889e-05, + "num_tokens": 378912.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2956249713897705, + "rewards/env_reward/std": 1.3313499689102173, + "reward": 3.7956249713897705, + "reward_std": 1.3313499689102173, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5737996399402618, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1555, + "step": 311 + }, + { + "loss": 0.2786, + "grad_norm": 24.31743049621582, + "learning_rate": 2.177777777777778e-05, + "num_tokens": 380130.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4107999801635742, + "rewards/env_reward/std": 0.11281422525644302, + "reward": 1.9107999801635742, + "reward_std": 0.11281431466341019, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.786042869091034, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.156, + "step": 312 + }, + { + "loss": 0.3142, + "grad_norm": 28.74258804321289, + "learning_rate": 2.1666666666666667e-05, + "num_tokens": 381344.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4592249393463135, + "rewards/env_reward/std": 1.7793108224868774, + "reward": 2.9592249393463135, + "reward_std": 1.7793108224868774, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 3.1422307789325714, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1565, + "step": 313 + }, + { + "loss": 0.0197, + "grad_norm": 1.6756749153137207, + "learning_rate": 2.1555555555555555e-05, + "num_tokens": 382560.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.0082499980926514, + "rewards/env_reward/std": 1.2208999395370483, + "reward": 3.5082499980926514, + "reward_std": 1.2209001779556274, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.19709030538797379, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.157, + "step": 314 + }, + { + "loss": 0.0153, + "grad_norm": 0.08564022183418274, + "learning_rate": 2.1444444444444443e-05, + "num_tokens": 383776.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.7123000621795654, + "rewards/env_reward/std": 0.0, + "reward": 4.2123003005981445, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1531214416027069, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1575, + "step": 315 + }, + { + "loss": 0.0935, + "grad_norm": 2.8297767639160156, + "learning_rate": 2.1333333333333335e-05, + "num_tokens": 384991.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03192500025033951, + "rewards/env_reward/std": 0.009549999609589577, + "reward": 1.5319249629974365, + "reward_std": 0.009549975395202637, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9346872717142105, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.158, + "step": 316 + }, + { + "loss": 0.232, + "grad_norm": 44.11138153076172, + "learning_rate": 2.1222222222222223e-05, + "num_tokens": 386208.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6417250037193298, + "rewards/env_reward/std": 0.5402948260307312, + "reward": 2.1417250633239746, + "reward_std": 0.5402949452400208, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.319624111056328, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1585, + "step": 317 + }, + { + "loss": 0.2981, + "grad_norm": 25.64458656311035, + "learning_rate": 2.111111111111111e-05, + "num_tokens": 387422.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.2137749195098877, + "rewards/env_reward/std": 1.311942219734192, + "reward": 2.7137749195098877, + "reward_std": 1.311942219734192, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.980517789721489, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.159, + "step": 318 + }, + { + "loss": 0.0255, + "grad_norm": 0.06195315718650818, + "learning_rate": 2.1e-05, + "num_tokens": 388638.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13220000267028809, + "rewards/env_reward/std": 0.0, + "reward": 1.632200002670288, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2548181116580963, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1595, + "step": 319 + }, + { + "loss": 0.0244, + "grad_norm": 0.08441329002380371, + "learning_rate": 2.088888888888889e-05, + "num_tokens": 389858.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11670000106096268, + "rewards/env_reward/std": 0.0, + "reward": 1.6167000532150269, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.24376285076141357, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.16, + "step": 320 + }, + { + "loss": 0.0031, + "grad_norm": 0.04484923183917999, + "learning_rate": 2.077777777777778e-05, + "num_tokens": 391102.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.989500045776367, + "rewards/env_reward/std": 0.0, + "reward": 4.489500045776367, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.03081400692462921, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1605, + "step": 321 + }, + { + "loss": 0.0686, + "grad_norm": 17.365371704101562, + "learning_rate": 2.0666666666666666e-05, + "num_tokens": 392321.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11549999564886093, + "rewards/env_reward/std": 0.009599998593330383, + "reward": 1.6154999732971191, + "reward_std": 0.009600004181265831, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6855819001793861, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.161, + "step": 322 + }, + { + "loss": 0.0571, + "grad_norm": 7.864797592163086, + "learning_rate": 2.0555555555555555e-05, + "num_tokens": 393568.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.226325035095215, + "rewards/env_reward/std": 1.5473500490188599, + "reward": 3.726325035095215, + "reward_std": 1.5473499298095703, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5705475509166718, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1615, + "step": 323 + }, + { + "loss": 0.0432, + "grad_norm": 4.899349212646484, + "learning_rate": 2.0444444444444446e-05, + "num_tokens": 394787.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.43231137096881866, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.162, + "step": 324 + }, + { + "loss": 0.0341, + "grad_norm": 7.9208664894104, + "learning_rate": 2.0333333333333334e-05, + "num_tokens": 396006.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4809750020503998, + "rewards/env_reward/std": 0.11704999208450317, + "reward": 1.9809750318527222, + "reward_std": 0.11704997718334198, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3410384729504585, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1625, + "step": 325 + }, + { + "loss": 0.0173, + "grad_norm": 0.05077933520078659, + "learning_rate": 2.0222222222222222e-05, + "num_tokens": 397230.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.17307066917419434, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.163, + "step": 326 + }, + { + "loss": 0.107, + "grad_norm": 15.94080924987793, + "learning_rate": 2.011111111111111e-05, + "num_tokens": 398448.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12385000288486481, + "rewards/env_reward/std": 0.14749404788017273, + "reward": 1.6238499879837036, + "reward_std": 0.14749404788017273, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0698204934597015, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1635, + "step": 327 + }, + { + "loss": 0.0232, + "grad_norm": 0.06655026227235794, + "learning_rate": 2e-05, + "num_tokens": 399668.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2168000042438507, + "rewards/env_reward/std": 0.0, + "reward": 1.7167999744415283, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.23233091831207275, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.164, + "step": 328 + }, + { + "loss": 0.0141, + "grad_norm": 0.044783495366573334, + "learning_rate": 1.988888888888889e-05, + "num_tokens": 400892.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1407936066389084, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1645, + "step": 329 + }, + { + "loss": 0.0325, + "grad_norm": 0.3753240704536438, + "learning_rate": 1.9777777777777778e-05, + "num_tokens": 402104.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4124999940395355, + "rewards/env_reward/std": 0.0, + "reward": 1.912500023841858, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.3249969184398651, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.165, + "step": 330 + }, + { + "loss": 0.03, + "grad_norm": 10.282318115234375, + "learning_rate": 1.9666666666666666e-05, + "num_tokens": 403323.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6045999526977539, + "rewards/env_reward/std": 0.4702000021934509, + "reward": 2.104599952697754, + "reward_std": 0.4702000021934509, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.30002758651971817, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1655, + "step": 331 + }, + { + "loss": 0.023, + "grad_norm": 0.25242263078689575, + "learning_rate": 1.9555555555555557e-05, + "num_tokens": 404535.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11249999701976776, + "rewards/env_reward/std": 0.0, + "reward": 1.6124999523162842, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.22953550517559052, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.166, + "step": 332 + }, + { + "loss": 0.0502, + "grad_norm": 14.25239086151123, + "learning_rate": 1.9444444444444445e-05, + "num_tokens": 405753.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.031175002455711365, + "rewards/env_reward/std": 0.34885352849960327, + "reward": 1.531174898147583, + "reward_std": 0.34885352849960327, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5018511228263378, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1665, + "step": 333 + }, + { + "loss": 0.0413, + "grad_norm": 2.6266722679138184, + "learning_rate": 1.9333333333333333e-05, + "num_tokens": 406973.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12707500159740448, + "rewards/env_reward/std": 0.004249997902661562, + "reward": 1.627074956893921, + "reward_std": 0.004250010009855032, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.41331926733255386, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.167, + "step": 334 + }, + { + "loss": 0.0229, + "grad_norm": 0.1149856448173523, + "learning_rate": 1.922222222222222e-05, + "num_tokens": 408209.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1868000030517578, + "rewards/env_reward/std": 0.0, + "reward": 1.3131999969482422, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.22919198870658875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1675, + "step": 335 + }, + { + "loss": 0.0535, + "grad_norm": 9.98381233215332, + "learning_rate": 1.9111111111111113e-05, + "num_tokens": 409432.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.37014999985694885, + "rewards/env_reward/std": 0.23389999568462372, + "reward": 1.870150089263916, + "reward_std": 0.23389999568462372, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5347674936056137, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.168, + "step": 336 + }, + { + "loss": 0.0317, + "grad_norm": 0.06461849808692932, + "learning_rate": 1.9e-05, + "num_tokens": 410652.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5595999956130981, + "rewards/env_reward/std": 0.0, + "reward": 2.0595998764038086, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.31705617904663086, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1685, + "step": 337 + }, + { + "loss": 0.0226, + "grad_norm": 0.05863592401146889, + "learning_rate": 1.888888888888889e-05, + "num_tokens": 411868.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13179999589920044, + "rewards/env_reward/std": 0.0, + "reward": 1.6317999362945557, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.22567623853683472, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.169, + "step": 338 + }, + { + "loss": 0.0174, + "grad_norm": 0.0803116038441658, + "learning_rate": 1.8777777777777777e-05, + "num_tokens": 413112.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.745500087738037, + "rewards/env_reward/std": 0.0, + "reward": 4.245500087738037, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1738945096731186, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1695, + "step": 339 + }, + { + "loss": 0.0219, + "grad_norm": 10.494765281677246, + "learning_rate": 1.866666666666667e-05, + "num_tokens": 414359.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11049999296665192, + "rewards/env_reward/std": 0.06040000170469284, + "reward": 1.6104999780654907, + "reward_std": 0.06040000915527344, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.21919090673327446, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.17, + "step": 340 + }, + { + "loss": 0.0399, + "grad_norm": 10.061318397521973, + "learning_rate": 1.8555555555555557e-05, + "num_tokens": 415577.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3518500030040741, + "rewards/env_reward/std": 0.3896750807762146, + "reward": 1.8518500328063965, + "reward_std": 0.3896750807762146, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3991617187857628, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1705, + "step": 341 + }, + { + "loss": 0.0533, + "grad_norm": 8.98741626739502, + "learning_rate": 1.8444444444444445e-05, + "num_tokens": 416796.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.437375009059906, + "rewards/env_reward/std": 0.2680499851703644, + "reward": 1.9373749494552612, + "reward_std": 0.268049955368042, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5329404026269913, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.171, + "step": 342 + }, + { + "loss": 0.0387, + "grad_norm": 8.799539566040039, + "learning_rate": 1.8333333333333333e-05, + "num_tokens": 418015.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10607500374317169, + "rewards/env_reward/std": 0.030049998313188553, + "reward": 1.6060749292373657, + "reward_std": 0.030049998313188553, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3870438188314438, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1715, + "step": 343 + }, + { + "loss": 0.0693, + "grad_norm": 7.4406280517578125, + "learning_rate": 1.8222222222222224e-05, + "num_tokens": 419229.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2102999985218048, + "rewards/env_reward/std": 0.4063391089439392, + "reward": 1.7102999687194824, + "reward_std": 0.406339168548584, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6930159032344818, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.172, + "step": 344 + }, + { + "loss": 0.0704, + "grad_norm": 5.985000133514404, + "learning_rate": 1.8111111111111112e-05, + "num_tokens": 420448.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.17270000278949738, + "rewards/env_reward/std": 0.0, + "reward": 1.672700047492981, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.7036964893341064, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1725, + "step": 345 + }, + { + "loss": 0.026, + "grad_norm": 0.07850353419780731, + "learning_rate": 1.8e-05, + "num_tokens": 421668.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5697000026702881, + "rewards/env_reward/std": 0.0, + "reward": 2.069700002670288, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.25952446460723877, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.173, + "step": 346 + }, + { + "loss": 0.0499, + "grad_norm": 0.07321112602949142, + "learning_rate": 1.788888888888889e-05, + "num_tokens": 422892.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09549999982118607, + "rewards/env_reward/std": 0.0, + "reward": 1.5954999923706055, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4991249144077301, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1735, + "step": 347 + }, + { + "loss": 0.1159, + "grad_norm": 14.562685012817383, + "learning_rate": 1.777777777777778e-05, + "num_tokens": 424111.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 1.1592684239149094, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.174, + "step": 348 + }, + { + "loss": 0.0324, + "grad_norm": 0.08439615368843079, + "learning_rate": 1.7666666666666668e-05, + "num_tokens": 425327.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.0551999993622303, + "rewards/env_reward/std": 0.0, + "reward": 1.4448000192642212, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.32355356216430664, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1745, + "step": 349 + }, + { + "loss": 0.2581, + "grad_norm": 17.2807674407959, + "learning_rate": 1.7555555555555556e-05, + "num_tokens": 426546.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.07095000147819519, + "rewards/env_reward/std": 0.10130000859498978, + "reward": 1.5709500312805176, + "reward_std": 0.10130000114440918, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.581086255609989, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.175, + "step": 350 + }, + { + "loss": 0.0209, + "grad_norm": 0.06290078908205032, + "learning_rate": 1.7444444444444448e-05, + "num_tokens": 427762.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.48750001192092896, + "rewards/env_reward/std": 0.0, + "reward": 1.9874999523162842, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.20860141515731812, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1755, + "step": 351 + }, + { + "loss": 0.0244, + "grad_norm": 0.07568230479955673, + "learning_rate": 1.7333333333333336e-05, + "num_tokens": 428978.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4765999913215637, + "rewards/env_reward/std": 0.0, + "reward": 1.976599931716919, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.24379196763038635, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.176, + "step": 352 + }, + { + "loss": 0.1051, + "grad_norm": 1.451683759689331, + "learning_rate": 1.7222222222222224e-05, + "num_tokens": 430201.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.14972499012947083, + "rewards/env_reward/std": 0.017549999058246613, + "reward": 1.6497249603271484, + "reward_std": 0.017549991607666016, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0505332052707672, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1765, + "step": 353 + }, + { + "loss": 0.2366, + "grad_norm": 13.9566068649292, + "learning_rate": 1.7111111111111112e-05, + "num_tokens": 431416.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.07552500069141388, + "rewards/env_reward/std": 0.1043500006198883, + "reward": 1.5755250453948975, + "reward_std": 0.10434997081756592, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 2.365892544388771, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.177, + "step": 354 + }, + { + "loss": 0.1949, + "grad_norm": 32.534908294677734, + "learning_rate": 1.7000000000000003e-05, + "num_tokens": 432639.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.27617502212524414, + "rewards/env_reward/std": 0.2965500056743622, + "reward": 1.7761750221252441, + "reward_std": 0.2965499460697174, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.948879137635231, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1775, + "step": 355 + }, + { + "loss": 0.0072, + "grad_norm": 2.4790847301483154, + "learning_rate": 1.688888888888889e-05, + "num_tokens": 433883.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.28629994392395, + "rewards/env_reward/std": 1.42739999294281, + "reward": 3.78629994392395, + "reward_std": 1.42739999294281, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.0723607949912548, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.178, + "step": 356 + }, + { + "loss": 0.1356, + "grad_norm": 5.789950847625732, + "learning_rate": 1.677777777777778e-05, + "num_tokens": 435098.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.129325032234192, + "rewards/env_reward/std": 1.6126071214675903, + "reward": 2.6293251514434814, + "reward_std": 1.6126071214675903, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.3564762622117996, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1785, + "step": 357 + }, + { + "loss": 0.0798, + "grad_norm": 1.1189056634902954, + "learning_rate": 1.6666666666666667e-05, + "num_tokens": 436314.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11904999613761902, + "rewards/env_reward/std": 0.0018999986350536346, + "reward": 1.6190500259399414, + "reward_std": 0.0019000370521098375, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7980380952358246, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.179, + "step": 358 + }, + { + "loss": 0.067, + "grad_norm": 18.612730026245117, + "learning_rate": 1.655555555555556e-05, + "num_tokens": 437529.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 0.25, + "rewards/format_valid/std": 1.5, + "rewards/action_legal/mean": 0.125, + "rewards/action_legal/std": 0.75, + "rewards/env_reward/mean": -0.595674991607666, + "rewards/env_reward/std": 1.6049399375915527, + "reward": -0.2206750512123108, + "reward_std": 3.8537392616271973, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 0.6701930351555347, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1795, + "step": 359 + }, + { + "loss": 0.0139, + "grad_norm": 0.04511000216007233, + "learning_rate": 1.6444444444444447e-05, + "num_tokens": 438773.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.9381000995635986, + "rewards/env_reward/std": 0.0, + "reward": 4.4380998611450195, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.13874168694019318, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.18, + "step": 360 + }, + { + "loss": 0.0321, + "grad_norm": 0.04865608364343643, + "learning_rate": 1.6333333333333335e-05, + "num_tokens": 439989.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.120899997651577, + "rewards/env_reward/std": 0.0, + "reward": 1.62090003490448, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3211500346660614, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1805, + "step": 361 + }, + { + "loss": 0.0416, + "grad_norm": 13.253110885620117, + "learning_rate": 1.6222222222222223e-05, + "num_tokens": 441204.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08607499301433563, + "rewards/env_reward/std": 0.028149999678134918, + "reward": 1.5860750675201416, + "reward_std": 0.028149962425231934, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.41631054133176804, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.181, + "step": 362 + }, + { + "loss": 0.0345, + "grad_norm": 0.05830829218029976, + "learning_rate": 1.6111111111111115e-05, + "num_tokens": 442420.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.41110000014305115, + "rewards/env_reward/std": 0.0, + "reward": 1.9111000299453735, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3447533845901489, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1815, + "step": 363 + }, + { + "loss": 0.0929, + "grad_norm": 6.0469584465026855, + "learning_rate": 1.6000000000000003e-05, + "num_tokens": 443634.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10462500154972076, + "rewards/env_reward/std": 0.0596962571144104, + "reward": 1.6046249866485596, + "reward_std": 0.0596962608397007, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9288121908903122, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.182, + "step": 364 + }, + { + "loss": 0.0555, + "grad_norm": 10.187057495117188, + "learning_rate": 1.588888888888889e-05, + "num_tokens": 444873.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.0332250595092773, + "rewards/env_reward/std": 1.4233499765396118, + "reward": 3.5332248210906982, + "reward_std": 1.4233498573303223, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.555173322558403, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1825, + "step": 365 + }, + { + "loss": 0.1928, + "grad_norm": 19.102651596069336, + "learning_rate": 1.577777777777778e-05, + "num_tokens": 446088.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5183249711990356, + "rewards/env_reward/std": 0.5100499987602234, + "reward": 2.018324851989746, + "reward_std": 0.5100500583648682, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.9275872558355331, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.183, + "step": 366 + }, + { + "loss": 0.0008, + "grad_norm": 7.459122180938721, + "learning_rate": 1.5666666666666667e-05, + "num_tokens": 447304.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.17572498321533203, + "rewards/env_reward/std": 0.24214999377727509, + "reward": 1.324275016784668, + "reward_std": 0.24214999377727509, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.007747650146484375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1835, + "step": 367 + }, + { + "loss": 0.0527, + "grad_norm": 0.05531112104654312, + "learning_rate": 1.5555555555555555e-05, + "num_tokens": 448524.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0478999987244606, + "rewards/env_reward/std": 0.0, + "reward": 1.5478999614715576, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.527458131313324, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.184, + "step": 368 + }, + { + "loss": 0.1785, + "grad_norm": 12.54236125946045, + "learning_rate": 1.5444444444444446e-05, + "num_tokens": 449738.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.155324935913086, + "rewards/env_reward/std": 1.2862874269485474, + "reward": 2.655324935913086, + "reward_std": 1.286287546157837, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.7845973372459412, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1845, + "step": 369 + }, + { + "loss": 0.0947, + "grad_norm": 11.19451904296875, + "learning_rate": 1.5333333333333334e-05, + "num_tokens": 450952.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8645250201225281, + "rewards/env_reward/std": 0.9372963905334473, + "reward": 2.3645248413085938, + "reward_std": 0.9372963905334473, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9465900436043739, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.185, + "step": 370 + }, + { + "loss": 0.0309, + "grad_norm": 0.05680959299206734, + "learning_rate": 1.5222222222222224e-05, + "num_tokens": 452168.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.22120000422000885, + "rewards/env_reward/std": 0.0, + "reward": 1.7211999893188477, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.30913686752319336, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1855, + "step": 371 + }, + { + "loss": 0.0533, + "grad_norm": 12.08806324005127, + "learning_rate": 1.5111111111111112e-05, + "num_tokens": 453383.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.36172500252723694, + "rewards/env_reward/std": 0.1968500018119812, + "reward": 1.8617249727249146, + "reward_std": 0.19685006141662598, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5326454341411591, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.186, + "step": 372 + }, + { + "loss": 0.0237, + "grad_norm": 0.037441231310367584, + "learning_rate": 1.5e-05, + "num_tokens": 454603.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12919999659061432, + "rewards/env_reward/std": 0.0, + "reward": 1.6291999816894531, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.23727068305015564, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1865, + "step": 373 + }, + { + "loss": 0.0995, + "grad_norm": 18.8474178314209, + "learning_rate": 1.4888888888888888e-05, + "num_tokens": 455845.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4505000114440918, + "rewards/env_reward/std": 0.09029757976531982, + "reward": 1.9505000114440918, + "reward_std": 0.09029749035835266, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9948283955454826, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.187, + "step": 374 + }, + { + "loss": 0.0285, + "grad_norm": 14.588151931762695, + "learning_rate": 1.477777777777778e-05, + "num_tokens": 457082.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7173500061035156, + "rewards/env_reward/std": 0.4462999999523163, + "reward": 2.2173500061035156, + "reward_std": 0.44630002975463867, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.28469008952379227, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1875, + "step": 375 + }, + { + "loss": 0.0764, + "grad_norm": 12.01020336151123, + "learning_rate": 1.4666666666666668e-05, + "num_tokens": 458297.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.287075012922287, + "rewards/env_reward/std": 0.4023500382900238, + "reward": 1.7870750427246094, + "reward_std": 0.4023500382900238, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7636995688080788, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.188, + "step": 376 + }, + { + "loss": 0.0493, + "grad_norm": 14.367756843566895, + "learning_rate": 1.4555555555555556e-05, + "num_tokens": 459511.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.49560001492500305, + "rewards/env_reward/std": 0.3432924747467041, + "reward": 1.9955999851226807, + "reward_std": 0.3432925045490265, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4925762265920639, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1885, + "step": 377 + }, + { + "loss": 0.0477, + "grad_norm": 11.141998291015625, + "learning_rate": 1.4444444444444444e-05, + "num_tokens": 460754.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2250499725341797, + "rewards/env_reward/std": 1.5499000549316406, + "reward": 3.7250499725341797, + "reward_std": 1.5499000549316406, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.47659212350845337, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.189, + "step": 378 + }, + { + "loss": 0.0055, + "grad_norm": 6.927285194396973, + "learning_rate": 1.4333333333333334e-05, + "num_tokens": 461966.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.1862250119447708, + "rewards/env_reward/std": 0.19915001094341278, + "reward": 1.3137750625610352, + "reward_std": 0.1991499662399292, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.05537472292780876, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1895, + "step": 379 + }, + { + "loss": 0.0674, + "grad_norm": 10.835347175598145, + "learning_rate": 1.4222222222222224e-05, + "num_tokens": 463189.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1679999977350235, + "rewards/env_reward/std": 0.07199999690055847, + "reward": 1.6679999828338623, + "reward_std": 0.07200002670288086, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6744091212749481, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.19, + "step": 380 + }, + { + "loss": 0.0151, + "grad_norm": 0.025390751659870148, + "learning_rate": 1.4111111111111112e-05, + "num_tokens": 464405.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.15128396451473236, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1905, + "step": 381 + }, + { + "loss": 0.0431, + "grad_norm": 10.737701416015625, + "learning_rate": 1.4000000000000001e-05, + "num_tokens": 465624.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.07417500019073486, + "rewards/env_reward/std": 0.1214500144124031, + "reward": 1.5741748809814453, + "reward_std": 0.1214500293135643, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.43060215562582016, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.191, + "step": 382 + }, + { + "loss": 0.0116, + "grad_norm": 0.03365212678909302, + "learning_rate": 1.388888888888889e-05, + "num_tokens": 466848.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1161755621433258, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1915, + "step": 383 + }, + { + "loss": 0.0193, + "grad_norm": 12.718644142150879, + "learning_rate": 1.3777777777777778e-05, + "num_tokens": 468061.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.26397499442100525, + "rewards/env_reward/std": 0.42328521609306335, + "reward": 1.7639750242233276, + "reward_std": 0.42328527569770813, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.192903034389019, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.192, + "step": 384 + }, + { + "loss": 0.028, + "grad_norm": 10.580364227294922, + "learning_rate": 1.3666666666666666e-05, + "num_tokens": 469276.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.20807498693466187, + "rewards/env_reward/std": 0.07964999973773956, + "reward": 1.2919249534606934, + "reward_std": 0.07965004444122314, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.28047777712345123, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1925, + "step": 385 + }, + { + "loss": 0.0389, + "grad_norm": 0.11868631094694138, + "learning_rate": 1.3555555555555557e-05, + "num_tokens": 470492.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.23100000619888306, + "rewards/env_reward/std": 0.0, + "reward": 1.7309999465942383, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.38852906227111816, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.193, + "step": 386 + }, + { + "loss": 0.0184, + "grad_norm": 4.448313236236572, + "learning_rate": 1.3444444444444445e-05, + "num_tokens": 471704.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.014475002884864807, + "rewards/env_reward/std": 0.1984500139951706, + "reward": 1.5144751071929932, + "reward_std": 0.1984500139951706, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.18447205191478133, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1935, + "step": 387 + }, + { + "loss": 0.0291, + "grad_norm": 12.654869079589844, + "learning_rate": 1.3333333333333333e-05, + "num_tokens": 472918.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.6022250652313232, + "rewards/env_reward/std": 1.6144437789916992, + "reward": 3.1022250652313232, + "reward_std": 1.6144436597824097, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.29140961170196533, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.194, + "step": 388 + }, + { + "loss": 0.0443, + "grad_norm": 10.518233299255371, + "learning_rate": 1.3222222222222221e-05, + "num_tokens": 474133.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.7465500831604004, + "rewards/env_reward/std": 0.98170006275177, + "reward": 3.2465500831604004, + "reward_std": 0.9817000031471252, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4428248852491379, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1945, + "step": 389 + }, + { + "loss": 0.0398, + "grad_norm": 0.08597031235694885, + "learning_rate": 1.3111111111111113e-05, + "num_tokens": 475349.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.31470000743865967, + "rewards/env_reward/std": 0.0, + "reward": 1.8147000074386597, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.397882878780365, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.195, + "step": 390 + }, + { + "loss": 0.0287, + "grad_norm": 0.1028384119272232, + "learning_rate": 1.3000000000000001e-05, + "num_tokens": 476565.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.835400104522705, + "rewards/env_reward/std": 0.0, + "reward": 4.335400104522705, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.28692421317100525, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1955, + "step": 391 + }, + { + "loss": 0.0439, + "grad_norm": 10.946956634521484, + "learning_rate": 1.2888888888888889e-05, + "num_tokens": 477780.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1368750035762787, + "rewards/env_reward/std": 0.07964999973773956, + "reward": 1.6368749141693115, + "reward_std": 0.07965000718832016, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.43886157870292664, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.196, + "step": 392 + }, + { + "loss": 0.0498, + "grad_norm": 12.383152961730957, + "learning_rate": 1.2777777777777777e-05, + "num_tokens": 478998.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.5355499982833862, + "rewards/env_reward/std": 1.6910011768341064, + "reward": 3.035550117492676, + "reward_std": 1.6910011768341064, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4976438209414482, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1965, + "step": 393 + }, + { + "loss": 0.0249, + "grad_norm": 10.40050220489502, + "learning_rate": 1.2666666666666668e-05, + "num_tokens": 480213.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.1511499881744385, + "rewards/env_reward/std": 1.332900047302246, + "reward": 3.6511502265930176, + "reward_std": 1.332900047302246, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.24874845519661903, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.197, + "step": 394 + }, + { + "loss": 0.0341, + "grad_norm": 10.021830558776855, + "learning_rate": 1.2555555555555557e-05, + "num_tokens": 481428.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11985000222921371, + "rewards/env_reward/std": 0.04410000145435333, + "reward": 1.6198500394821167, + "reward_std": 0.044100046157836914, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3408486172556877, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1975, + "step": 395 + }, + { + "loss": 0.0329, + "grad_norm": 11.029459953308105, + "learning_rate": 1.2444444444444445e-05, + "num_tokens": 482646.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.09814999997615814, + "rewards/env_reward/std": 0.27741682529449463, + "reward": 1.5981500148773193, + "reward_std": 0.27741679549217224, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3289405405521393, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.198, + "step": 396 + }, + { + "loss": 0.0352, + "grad_norm": 0.10097940266132355, + "learning_rate": 1.2333333333333334e-05, + "num_tokens": 483866.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5396999716758728, + "rewards/env_reward/std": 0.0, + "reward": 2.0397000312805176, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3520781695842743, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1985, + "step": 397 + }, + { + "loss": 0.0342, + "grad_norm": 9.53077507019043, + "learning_rate": 1.2222222222222222e-05, + "num_tokens": 485081.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.18497499823570251, + "rewards/env_reward/std": 0.07945000380277634, + "reward": 1.6849749088287354, + "reward_std": 0.07945001125335693, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3423382416367531, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.199, + "step": 398 + }, + { + "loss": 0.0543, + "grad_norm": 14.226777076721191, + "learning_rate": 1.2111111111111112e-05, + "num_tokens": 486300.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6295000314712524, + "rewards/env_reward/std": 0.07639998197555542, + "reward": 2.129499912261963, + "reward_std": 0.0764000415802002, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5428656786680222, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.1995, + "step": 399 + }, + { + "loss": 0.0213, + "grad_norm": 0.054999712854623795, + "learning_rate": 1.2e-05, + "num_tokens": 487520.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.12160000205039978, + "rewards/env_reward/std": 0.0, + "reward": 1.6216000318527222, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2132653295993805, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2, + "step": 400 + }, + { + "loss": 0.0678, + "grad_norm": 8.299275398254395, + "learning_rate": 1.188888888888889e-05, + "num_tokens": 488743.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.673675000667572, + "rewards/env_reward/std": 0.41065001487731934, + "reward": 2.173675060272217, + "reward_std": 0.41064998507499695, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.678008109331131, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2005, + "step": 401 + }, + { + "loss": 0.0118, + "grad_norm": 0.09446023404598236, + "learning_rate": 1.1777777777777778e-05, + "num_tokens": 489991.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5418000221252441, + "rewards/env_reward/std": 0.0, + "reward": 2.041800022125244, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1179627776145935, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.201, + "step": 402 + }, + { + "loss": 0.1018, + "grad_norm": 3.7508544921875, + "learning_rate": 1.1666666666666668e-05, + "num_tokens": 491204.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.010974999517202377, + "rewards/env_reward/std": 0.08034929633140564, + "reward": 1.5109750032424927, + "reward_std": 0.08034923672676086, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.0181541293859482, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2015, + "step": 403 + }, + { + "loss": 0.1123, + "grad_norm": 27.579870223999023, + "learning_rate": 1.1555555555555556e-05, + "num_tokens": 492437.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.03229999914765358, + "rewards/env_reward/std": 0.10499999672174454, + "reward": 1.4677000045776367, + "reward_std": 0.10500001907348633, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1227066367864609, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.202, + "step": 404 + }, + { + "loss": 0.0542, + "grad_norm": 13.787353515625, + "learning_rate": 1.1444444444444446e-05, + "num_tokens": 493652.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.28747498989105225, + "rewards/env_reward/std": 0.048350006341934204, + "reward": 1.7874749898910522, + "reward_std": 0.048350054770708084, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5424342602491379, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2025, + "step": 405 + }, + { + "loss": 0.1413, + "grad_norm": 9.871804237365723, + "learning_rate": 1.1333333333333334e-05, + "num_tokens": 494893.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.21519999206066132, + "rewards/env_reward/std": 0.532200038433075, + "reward": 1.2848000526428223, + "reward_std": 0.532200038433075, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.4125559478998184, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.203, + "step": 406 + }, + { + "loss": 0.0222, + "grad_norm": 0.07422968745231628, + "learning_rate": 1.1222222222222224e-05, + "num_tokens": 496109.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.775099992752075, + "rewards/env_reward/std": 0.0, + "reward": 4.275099754333496, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.22246673703193665, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2035, + "step": 407 + }, + { + "loss": 0.0368, + "grad_norm": 0.09009969979524612, + "learning_rate": 1.1111111111111112e-05, + "num_tokens": 497325.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11050000041723251, + "rewards/env_reward/std": 0.0, + "reward": 1.6104999780654907, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.36780449748039246, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.204, + "step": 408 + }, + { + "loss": 0.0286, + "grad_norm": 1.5730061531066895, + "learning_rate": 1.1000000000000001e-05, + "num_tokens": 498545.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11737500131130219, + "rewards/env_reward/std": 0.006950002163648605, + "reward": 1.6173748970031738, + "reward_std": 0.006950021255761385, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.28572753444314003, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2045, + "step": 409 + }, + { + "loss": 0.0659, + "grad_norm": 2.1238832473754883, + "learning_rate": 1.088888888888889e-05, + "num_tokens": 499758.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.46617498993873596, + "rewards/env_reward/std": 1.444044589996338, + "reward": 1.9661749601364136, + "reward_std": 1.4440444707870483, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6587927043437958, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.205, + "step": 410 + }, + { + "loss": 0.0483, + "grad_norm": 0.0901445522904396, + "learning_rate": 1.0777777777777778e-05, + "num_tokens": 500978.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.6664999723434448, + "rewards/env_reward/std": 0.0, + "reward": 3.1665000915527344, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.48277437686920166, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2055, + "step": 411 + }, + { + "loss": 0.0311, + "grad_norm": 0.17845037579536438, + "learning_rate": 1.0666666666666667e-05, + "num_tokens": 502190.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4124999940395355, + "rewards/env_reward/std": 0.0, + "reward": 1.912500023841858, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.3107207417488098, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.206, + "step": 412 + }, + { + "loss": 0.0175, + "grad_norm": 0.05390382558107376, + "learning_rate": 1.0555555555555555e-05, + "num_tokens": 503406.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.05739999935030937, + "rewards/env_reward/std": 0.0, + "reward": 1.5573999881744385, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.1745627522468567, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2065, + "step": 413 + }, + { + "loss": 0.0625, + "grad_norm": 10.375347137451172, + "learning_rate": 1.0444444444444445e-05, + "num_tokens": 504623.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03642500936985016, + "rewards/env_reward/std": 0.5068695545196533, + "reward": 1.536424994468689, + "reward_std": 0.5068694949150085, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6247732043266296, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.207, + "step": 414 + }, + { + "loss": 0.044, + "grad_norm": 0.110545314848423, + "learning_rate": 1.0333333333333333e-05, + "num_tokens": 505839.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0925000011920929, + "rewards/env_reward/std": 0.0, + "reward": 1.5924999713897705, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4403933882713318, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2075, + "step": 415 + }, + { + "loss": 0.0276, + "grad_norm": 2.9608986377716064, + "learning_rate": 1.0222222222222223e-05, + "num_tokens": 507054.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5720750093460083, + "rewards/env_reward/std": 0.11894048005342484, + "reward": 2.0720748901367188, + "reward_std": 0.11894050985574722, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.27571701258420944, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.208, + "step": 416 + }, + { + "loss": 0.0714, + "grad_norm": 4.221258640289307, + "learning_rate": 1.0111111111111111e-05, + "num_tokens": 508268.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11145000159740448, + "rewards/env_reward/std": 0.08967870473861694, + "reward": 1.611449956893921, + "reward_std": 0.08967869728803635, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7143204063177109, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2085, + "step": 417 + }, + { + "loss": 0.0089, + "grad_norm": 1.7130541801452637, + "learning_rate": 1e-05, + "num_tokens": 509484.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5528500080108643, + "rewards/env_reward/std": 0.09589999914169312, + "reward": 2.0528500080108643, + "reward_std": 0.09590005874633789, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.0889815017580986, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.209, + "step": 418 + }, + { + "loss": 0.1025, + "grad_norm": 0.15961776673793793, + "learning_rate": 9.888888888888889e-06, + "num_tokens": 510700.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4427000284194946, + "rewards/env_reward/std": 0.0, + "reward": 2.942699909210205, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 1.024822473526001, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2095, + "step": 419 + }, + { + "loss": 0.0877, + "grad_norm": 2.687859296798706, + "learning_rate": 9.777777777777779e-06, + "num_tokens": 511918.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2002749890089035, + "rewards/env_reward/std": 0.2704545259475708, + "reward": 1.70027494430542, + "reward_std": 0.2704544961452484, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8766704872250557, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.21, + "step": 420 + }, + { + "loss": 0.0133, + "grad_norm": 0.07045546919107437, + "learning_rate": 9.666666666666667e-06, + "num_tokens": 513134.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.02889999933540821, + "rewards/env_reward/std": 0.0, + "reward": 1.5289000272750854, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.13296586275100708, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2105, + "step": 421 + }, + { + "loss": 0.0111, + "grad_norm": 0.04775848612189293, + "learning_rate": 9.555555555555556e-06, + "num_tokens": 514350.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.11139857769012451, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.211, + "step": 422 + }, + { + "loss": 0.0531, + "grad_norm": 14.071207046508789, + "learning_rate": 9.444444444444445e-06, + "num_tokens": 515565.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.38374999165534973, + "rewards/env_reward/std": 0.058500006794929504, + "reward": 1.883750081062317, + "reward_std": 0.0585000142455101, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5313576012849808, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2115, + "step": 423 + }, + { + "loss": 0.0495, + "grad_norm": 6.999293804168701, + "learning_rate": 9.333333333333334e-06, + "num_tokens": 516780.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2377500534057617, + "rewards/env_reward/std": 1.5245001316070557, + "reward": 3.7377500534057617, + "reward_std": 1.5245000123977661, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.494718536734581, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.212, + "step": 424 + }, + { + "loss": 0.0561, + "grad_norm": 6.606105327606201, + "learning_rate": 9.222222222222222e-06, + "num_tokens": 517994.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06355000287294388, + "rewards/env_reward/std": 0.053404901176691055, + "reward": 1.5635499954223633, + "reward_std": 0.05340488255023956, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5608605295419693, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2125, + "step": 425 + }, + { + "loss": 0.0411, + "grad_norm": 6.317296504974365, + "learning_rate": 9.111111111111112e-06, + "num_tokens": 519213.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5113000273704529, + "rewards/env_reward/std": 0.04380001127719879, + "reward": 2.0113000869750977, + "reward_std": 0.0437999963760376, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.41063985228538513, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.213, + "step": 426 + }, + { + "loss": 0.0445, + "grad_norm": 16.5034236907959, + "learning_rate": 9e-06, + "num_tokens": 520428.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.34769999980926514, + "rewards/env_reward/std": 0.3646000027656555, + "reward": 1.8476999998092651, + "reward_std": 0.36459994316101074, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.44513607025146484, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2135, + "step": 427 + }, + { + "loss": 0.0686, + "grad_norm": 4.950837135314941, + "learning_rate": 8.88888888888889e-06, + "num_tokens": 521642.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.04944999888539314, + "rewards/env_reward/std": 0.033890459686517715, + "reward": 1.5494499206542969, + "reward_std": 0.03389044106006622, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6864325404167175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.214, + "step": 428 + }, + { + "loss": 0.0708, + "grad_norm": 18.683910369873047, + "learning_rate": 8.777777777777778e-06, + "num_tokens": 522861.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.26930001378059387, + "rewards/env_reward/std": 0.33180001378059387, + "reward": 1.7692999839782715, + "reward_std": 0.3317999839782715, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7076273337006569, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2145, + "step": 429 + }, + { + "loss": 0.0819, + "grad_norm": 3.3483595848083496, + "learning_rate": 8.666666666666668e-06, + "num_tokens": 524075.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4747999906539917, + "rewards/env_reward/std": 1.761149287223816, + "reward": 2.9748001098632812, + "reward_std": 1.761149287223816, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.8185893446207047, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.215, + "step": 430 + }, + { + "loss": 0.1114, + "grad_norm": 17.29606819152832, + "learning_rate": 8.555555555555556e-06, + "num_tokens": 525288.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4809499979019165, + "rewards/env_reward/std": 0.36256280541419983, + "reward": 1.9809499979019165, + "reward_std": 0.36256274580955505, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1136217415332794, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2155, + "step": 431 + }, + { + "loss": 0.0464, + "grad_norm": 6.7067790031433105, + "learning_rate": 8.444444444444446e-06, + "num_tokens": 526507.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19207501411437988, + "rewards/env_reward/std": 0.037450000643730164, + "reward": 1.6920750141143799, + "reward_std": 0.037449996918439865, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4635205864906311, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.216, + "step": 432 + }, + { + "loss": 0.0313, + "grad_norm": 0.09291418641805649, + "learning_rate": 8.333333333333334e-06, + "num_tokens": 527727.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.11069999635219574, + "rewards/env_reward/std": 0.0, + "reward": 1.610700011253357, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.3125154972076416, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2165, + "step": 433 + }, + { + "loss": 0.0213, + "grad_norm": 0.07645757496356964, + "learning_rate": 8.222222222222223e-06, + "num_tokens": 528943.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.061500001698732376, + "rewards/env_reward/std": 0.0, + "reward": 1.5614999532699585, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.21342608332633972, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.217, + "step": 434 + }, + { + "loss": 0.0431, + "grad_norm": 6.316166400909424, + "learning_rate": 8.111111111111112e-06, + "num_tokens": 530162.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.105675220489502, + "rewards/env_reward/std": 1.4696500301361084, + "reward": 3.605674982070923, + "reward_std": 1.4696499109268188, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4310971572995186, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2175, + "step": 435 + }, + { + "loss": 0.0591, + "grad_norm": 6.796760559082031, + "learning_rate": 8.000000000000001e-06, + "num_tokens": 531377.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.9459749460220337, + "rewards/env_reward/std": 1.3590500354766846, + "reward": 3.445974826812744, + "reward_std": 1.3590497970581055, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5905175358057022, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.218, + "step": 436 + }, + { + "loss": 0.1627, + "grad_norm": 34.66132736206055, + "learning_rate": 7.88888888888889e-06, + "num_tokens": 532595.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.08609999716281891, + "rewards/env_reward/std": 0.2364826649427414, + "reward": 1.5860999822616577, + "reward_std": 0.2364826798439026, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.6273682713508606, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2185, + "step": 437 + }, + { + "loss": 0.0281, + "grad_norm": 1.5878809690475464, + "learning_rate": 7.777777777777777e-06, + "num_tokens": 533815.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10175000131130219, + "rewards/env_reward/std": 0.014699999243021011, + "reward": 1.6017498970031738, + "reward_std": 0.01470001507550478, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2811654843389988, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.219, + "step": 438 + }, + { + "loss": 0.0671, + "grad_norm": 2.8122143745422363, + "learning_rate": 7.666666666666667e-06, + "num_tokens": 535030.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.307950019836426, + "rewards/env_reward/std": 1.3840999603271484, + "reward": 3.807950019836426, + "reward_std": 1.3840999603271484, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6714244410395622, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2195, + "step": 439 + }, + { + "loss": 0.1292, + "grad_norm": 22.363697052001953, + "learning_rate": 7.555555555555556e-06, + "num_tokens": 536244.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.25197499990463257, + "rewards/env_reward/std": 0.3354390859603882, + "reward": 1.7519750595092773, + "reward_std": 0.33543911576271057, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.2917723655700684, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.22, + "step": 440 + }, + { + "loss": 0.0524, + "grad_norm": 6.302943706512451, + "learning_rate": 7.444444444444444e-06, + "num_tokens": 537463.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.46480000019073486, + "rewards/env_reward/std": 0.0, + "reward": 1.9648000001907349, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.5239357650279999, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2205, + "step": 441 + }, + { + "loss": 0.0212, + "grad_norm": 0.08151021599769592, + "learning_rate": 7.333333333333334e-06, + "num_tokens": 538683.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.8804000020027161, + "rewards/env_reward/std": 0.0, + "reward": 2.3803999423980713, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.21206602454185486, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.221, + "step": 442 + }, + { + "loss": 0.0324, + "grad_norm": 0.10439272969961166, + "learning_rate": 7.222222222222222e-06, + "num_tokens": 539927.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4124999940395355, + "rewards/env_reward/std": 0.0, + "reward": 1.912500023841858, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.3237926661968231, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2215, + "step": 443 + }, + { + "loss": 0.0711, + "grad_norm": 3.5051026344299316, + "learning_rate": 7.111111111111112e-06, + "num_tokens": 541146.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.10379999876022339, + "rewards/env_reward/std": 0.028999999165534973, + "reward": 1.6037999391555786, + "reward_std": 0.02900000475347042, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.711153905838728, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.222, + "step": 444 + }, + { + "loss": 0.1239, + "grad_norm": 21.986900329589844, + "learning_rate": 7.000000000000001e-06, + "num_tokens": 542360.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.010450005531311035, + "rewards/env_reward/std": 0.47714316844940186, + "reward": 1.510450005531311, + "reward_std": 0.47714319825172424, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.239209771156311, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2225, + "step": 445 + }, + { + "loss": 0.023, + "grad_norm": 0.07607124745845795, + "learning_rate": 6.888888888888889e-06, + "num_tokens": 543580.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4551999866962433, + "rewards/env_reward/std": 0.0, + "reward": 1.955199956893921, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.22952622175216675, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.223, + "step": 446 + }, + { + "loss": 0.1395, + "grad_norm": 17.633258819580078, + "learning_rate": 6.777777777777779e-06, + "num_tokens": 544793.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.14509999752044678, + "rewards/env_reward/std": 0.186971977353096, + "reward": 1.3548998832702637, + "reward_std": 0.186971977353096, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.3945640623569489, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2235, + "step": 447 + }, + { + "loss": 0.0274, + "grad_norm": 0.10130514949560165, + "learning_rate": 6.666666666666667e-06, + "num_tokens": 546013.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2011999934911728, + "rewards/env_reward/std": 0.0, + "reward": 1.701200008392334, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2743580937385559, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.224, + "step": 448 + }, + { + "loss": 0.0986, + "grad_norm": 6.136234283447266, + "learning_rate": 6.555555555555556e-06, + "num_tokens": 547227.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0943249985575676, + "rewards/env_reward/std": 0.05175946652889252, + "reward": 1.594325065612793, + "reward_std": 0.051759425550699234, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.9864560812711716, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2245, + "step": 449 + }, + { + "loss": 0.0313, + "grad_norm": 0.13909797370433807, + "learning_rate": 6.4444444444444445e-06, + "num_tokens": 548439.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.436599999666214, + "rewards/env_reward/std": 0.0, + "reward": 1.9365999698638916, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.31275004148483276, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.225, + "step": 450 + }, + { + "loss": 0.0496, + "grad_norm": 5.083831787109375, + "learning_rate": 6.333333333333334e-06, + "num_tokens": 549662.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.296875, + "rewards/env_reward/std": 1.40625, + "reward": 3.796875, + "reward_std": 1.40625, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.49592315405607224, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2255, + "step": 451 + }, + { + "loss": 0.0498, + "grad_norm": 17.404560089111328, + "learning_rate": 6.222222222222222e-06, + "num_tokens": 550877.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7101999521255493, + "rewards/env_reward/std": 0.46720001101493835, + "reward": 2.2101998329162598, + "reward_std": 0.46720001101493835, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.49799713492393494, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.226, + "step": 452 + }, + { + "loss": 0.0237, + "grad_norm": 0.10003330558538437, + "learning_rate": 6.111111111111111e-06, + "num_tokens": 552093.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.22120000422000885, + "rewards/env_reward/std": 0.0, + "reward": 1.7211999893188477, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.23683232069015503, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2265, + "step": 453 + }, + { + "loss": 0.0352, + "grad_norm": 0.07923237234354019, + "learning_rate": 6e-06, + "num_tokens": 553313.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 3.0, + "rewards/env_reward/std": 0.0, + "reward": 4.5, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.35178661346435547, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.227, + "step": 454 + }, + { + "loss": 0.1115, + "grad_norm": 23.17422866821289, + "learning_rate": 5.888888888888889e-06, + "num_tokens": 554526.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.22169999778270721, + "rewards/env_reward/std": 0.4673518240451813, + "reward": 1.7217000722885132, + "reward_std": 0.4673517942428589, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.1154318749904633, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2275, + "step": 455 + }, + { + "loss": 0.0082, + "grad_norm": 0.048594508320093155, + "learning_rate": 5.777777777777778e-06, + "num_tokens": 555742.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4713999927043915, + "rewards/env_reward/std": 0.0, + "reward": 1.9714000225067139, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.08181055635213852, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.228, + "step": 456 + }, + { + "loss": 0.0641, + "grad_norm": 13.334370613098145, + "learning_rate": 5.666666666666667e-06, + "num_tokens": 556960.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.34242498874664307, + "rewards/env_reward/std": 0.4131866991519928, + "reward": 1.842424988746643, + "reward_std": 0.4131866991519928, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6406428962945938, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2285, + "step": 457 + }, + { + "loss": 0.0471, + "grad_norm": 7.503530979156494, + "learning_rate": 5.555555555555556e-06, + "num_tokens": 558179.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.43992501497268677, + "rewards/env_reward/std": 0.013150006532669067, + "reward": 1.9399250745773315, + "reward_std": 0.013150015845894814, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.47112637758255005, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.229, + "step": 458 + }, + { + "loss": 0.0201, + "grad_norm": 0.08755699545145035, + "learning_rate": 5.444444444444445e-06, + "num_tokens": 559399.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2887001037597656, + "rewards/env_reward/std": 0.0, + "reward": 3.7887001037597656, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.20145288109779358, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2295, + "step": 459 + }, + { + "loss": 0.0792, + "grad_norm": 22.58326530456543, + "learning_rate": 5.333333333333334e-06, + "num_tokens": 560617.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.43185001611709595, + "rewards/env_reward/std": 0.3804160952568054, + "reward": 1.9318499565124512, + "reward_std": 0.3804161548614502, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.7920261472463608, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.23, + "step": 460 + }, + { + "loss": 0.0454, + "grad_norm": 13.632698059082031, + "learning_rate": 5.2222222222222226e-06, + "num_tokens": 561832.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6263749599456787, + "rewards/env_reward/std": 0.4093500077724457, + "reward": 2.1263749599456787, + "reward_std": 0.4093499183654785, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.45408768951892853, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2305, + "step": 461 + }, + { + "loss": 0.0126, + "grad_norm": 0.061295535415410995, + "learning_rate": 5.1111111111111115e-06, + "num_tokens": 563052.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.1851000040769577, + "rewards/env_reward/std": 0.0, + "reward": 1.6850999593734741, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.12610086798667908, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.231, + "step": 462 + }, + { + "loss": 0.0453, + "grad_norm": 10.238383293151855, + "learning_rate": 5e-06, + "num_tokens": 564265.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5235750079154968, + "rewards/env_reward/std": 0.44271576404571533, + "reward": 2.0235750675201416, + "reward_std": 0.44271576404571533, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4533032178878784, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2315, + "step": 463 + }, + { + "loss": 0.0593, + "grad_norm": 8.796762466430664, + "learning_rate": 4.888888888888889e-06, + "num_tokens": 565478.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.2136250138282776, + "rewards/env_reward/std": 0.33112218976020813, + "reward": 1.2863750457763672, + "reward_std": 0.33112218976020813, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5929864794015884, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.232, + "step": 464 + }, + { + "loss": 0.0466, + "grad_norm": 7.1668620109558105, + "learning_rate": 4.777777777777778e-06, + "num_tokens": 566693.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.13312500715255737, + "rewards/env_reward/std": 0.004950001835823059, + "reward": 1.6331250667572021, + "reward_std": 0.004949966911226511, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.46607181429862976, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2325, + "step": 465 + }, + { + "loss": 0.0366, + "grad_norm": 9.879788398742676, + "learning_rate": 4.666666666666667e-06, + "num_tokens": 567907.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2449999898672104, + "rewards/env_reward/std": 0.8202992677688599, + "reward": 1.7450000047683716, + "reward_std": 0.8202992677688599, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3659088909626007, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.233, + "step": 466 + }, + { + "loss": 0.0282, + "grad_norm": 13.147476196289062, + "learning_rate": 4.555555555555556e-06, + "num_tokens": 569126.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.4874749779701233, + "rewards/env_reward/std": 0.2195499986410141, + "reward": 1.9874749183654785, + "reward_std": 0.21955005824565887, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2823686748743057, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2335, + "step": 467 + }, + { + "loss": 0.0267, + "grad_norm": 8.822482109069824, + "learning_rate": 4.444444444444445e-06, + "num_tokens": 570345.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2204749584198, + "rewards/env_reward/std": 1.4490500688552856, + "reward": 3.7204747200012207, + "reward_std": 1.4490498304367065, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2672518938779831, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.234, + "step": 468 + }, + { + "loss": 0.1027, + "grad_norm": 14.732349395751953, + "learning_rate": 4.333333333333334e-06, + "num_tokens": 571563.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 5.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 5.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": -0.07499999552965164, + "rewards/env_reward/std": 0.375, + "reward": 1.4249999523162842, + "reward_std": 0.3749999701976776, + "frac_reward_zero_std": 0.0, + "completion_length": 5.0, + "kl": 1.0270851105451584, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2345, + "step": 469 + }, + { + "loss": 0.0364, + "grad_norm": 8.678610801696777, + "learning_rate": 4.222222222222223e-06, + "num_tokens": 572781.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.2231000661849976, + "rewards/env_reward/std": 1.4263650178909302, + "reward": 2.723099946975708, + "reward_std": 1.4263651371002197, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3637717515230179, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.235, + "step": 470 + }, + { + "loss": 0.0408, + "grad_norm": 7.956890106201172, + "learning_rate": 4.111111111111112e-06, + "num_tokens": 573999.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2797500193119049, + "rewards/env_reward/std": 0.29265886545181274, + "reward": 1.7797499895095825, + "reward_std": 0.29265889525413513, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.40757453441619873, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2355, + "step": 471 + }, + { + "loss": 0.0313, + "grad_norm": 4.231257438659668, + "learning_rate": 4.000000000000001e-06, + "num_tokens": 575211.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.06790000200271606, + "rewards/env_reward/std": 0.24300001561641693, + "reward": 1.5678999423980713, + "reward_std": 0.24299998581409454, + "frac_reward_zero_std": 0.0, + "completion_length": 3.0, + "kl": 0.312759205698967, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.236, + "step": 472 + }, + { + "loss": 0.027, + "grad_norm": 12.84277057647705, + "learning_rate": 3.888888888888889e-06, + "num_tokens": 576430.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.41770002245903015, + "rewards/env_reward/std": 0.40940001606941223, + "reward": 1.9177000522613525, + "reward_std": 0.40939995646476746, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.26984886825084686, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2365, + "step": 473 + }, + { + "loss": 0.1004, + "grad_norm": 0.3565484881401062, + "learning_rate": 3.777777777777778e-06, + "num_tokens": 577650.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.060750000178813934, + "rewards/env_reward/std": 0.013299999758601189, + "reward": 1.5607500076293945, + "reward_std": 0.013299982063472271, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 1.004371426999569, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.237, + "step": 474 + }, + { + "loss": 0.0206, + "grad_norm": 0.07799467444419861, + "learning_rate": 3.666666666666667e-06, + "num_tokens": 578870.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.0617000013589859, + "rewards/env_reward/std": 0.0, + "reward": 1.5616999864578247, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.20640861988067627, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2375, + "step": 475 + }, + { + "loss": 0.0421, + "grad_norm": 4.742053508758545, + "learning_rate": 3.555555555555556e-06, + "num_tokens": 580087.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7774249911308289, + "rewards/env_reward/std": 1.4874924421310425, + "reward": 2.2774250507354736, + "reward_std": 1.4874924421310425, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.42091556638479233, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.238, + "step": 476 + }, + { + "loss": 0.0288, + "grad_norm": 0.08009364455938339, + "learning_rate": 3.4444444444444444e-06, + "num_tokens": 581311.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.41600000858306885, + "rewards/env_reward/std": 0.0, + "reward": 1.9160000085830688, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.28848007321357727, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2385, + "step": 477 + }, + { + "loss": 0.028, + "grad_norm": 9.957799911499023, + "learning_rate": 3.3333333333333333e-06, + "num_tokens": 582530.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.16455000638961792, + "rewards/env_reward/std": 0.0925000011920929, + "reward": 1.6645500659942627, + "reward_std": 0.09250004589557648, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.27987441420555115, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.239, + "step": 478 + }, + { + "loss": 0.0446, + "grad_norm": 0.18692584335803986, + "learning_rate": 3.2222222222222222e-06, + "num_tokens": 583746.0, + "completions/mean_length": 3.0, + "completions/min_length": 3.0, + "completions/max_length": 3.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.0, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 3.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.18970000743865967, + "rewards/env_reward/std": 0.0, + "reward": 1.6897000074386597, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 3.0, + "kl": 0.4460113048553467, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2395, + "step": 479 + }, + { + "loss": 0.0583, + "grad_norm": 8.525734901428223, + "learning_rate": 3.111111111111111e-06, + "num_tokens": 584960.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3813000023365021, + "rewards/env_reward/std": 0.6251548528671265, + "reward": 1.8812999725341797, + "reward_std": 0.6251548528671265, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5830349922180176, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.24, + "step": 480 + }, + { + "loss": 0.0252, + "grad_norm": 0.07544836401939392, + "learning_rate": 3e-06, + "num_tokens": 586180.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3840000033378601, + "rewards/env_reward/std": 0.0, + "reward": 1.8840000629425049, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.25199753046035767, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2405, + "step": 481 + }, + { + "loss": 0.0475, + "grad_norm": 0.11984675377607346, + "learning_rate": 2.888888888888889e-06, + "num_tokens": 587396.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.19089999794960022, + "rewards/env_reward/std": 0.0, + "reward": 1.6908999681472778, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.4745121896266937, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.241, + "step": 482 + }, + { + "loss": 0.0356, + "grad_norm": 10.510693550109863, + "learning_rate": 2.777777777777778e-06, + "num_tokens": 588614.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.7358250021934509, + "rewards/env_reward/std": 0.5424174070358276, + "reward": 2.2358250617980957, + "reward_std": 0.5424175262451172, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.35567909479141235, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2415, + "step": 483 + }, + { + "loss": 0.0343, + "grad_norm": 3.6369059085845947, + "learning_rate": 2.666666666666667e-06, + "num_tokens": 589827.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.07247500121593475, + "rewards/env_reward/std": 0.06592877209186554, + "reward": 1.5724749565124512, + "reward_std": 0.06592877209186554, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3429151102900505, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.242, + "step": 484 + }, + { + "loss": 0.0236, + "grad_norm": 8.855422019958496, + "learning_rate": 2.5555555555555557e-06, + "num_tokens": 591046.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.26639986038208, + "rewards/env_reward/std": 1.4672000408172607, + "reward": 3.76639986038208, + "reward_std": 1.4672000408172607, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.23648104071617126, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2425, + "step": 485 + }, + { + "loss": 0.0557, + "grad_norm": 6.969616413116455, + "learning_rate": 2.4444444444444447e-06, + "num_tokens": 592259.0, + "completions/mean_length": 3.25, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.25, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3041999936103821, + "rewards/env_reward/std": 0.09227003902196884, + "reward": 1.8042000532150269, + "reward_std": 0.09227006137371063, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5572411194443703, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.243, + "step": 486 + }, + { + "loss": 0.0434, + "grad_norm": 12.514766693115234, + "learning_rate": 2.3333333333333336e-06, + "num_tokens": 593474.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.2367749959230423, + "rewards/env_reward/std": 0.6161500215530396, + "reward": 1.7367749214172363, + "reward_std": 0.6161500215530396, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4340191185474396, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2435, + "step": 487 + }, + { + "loss": 0.0492, + "grad_norm": 13.703819274902344, + "learning_rate": 2.2222222222222225e-06, + "num_tokens": 594697.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.29727500677108765, + "rewards/env_reward/std": 0.3591500222682953, + "reward": 1.7972750663757324, + "reward_std": 0.3591500222682953, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.4917333871126175, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.244, + "step": 488 + }, + { + "loss": 0.064, + "grad_norm": 5.9716949462890625, + "learning_rate": 2.1111111111111114e-06, + "num_tokens": 595911.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.4799000024795532, + "rewards/env_reward/std": 1.7552603483200073, + "reward": 2.9798998832702637, + "reward_std": 1.7552602291107178, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6398516371846199, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2445, + "step": 489 + }, + { + "loss": 0.0234, + "grad_norm": 1.6018086671829224, + "learning_rate": 2.0000000000000003e-06, + "num_tokens": 597130.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5368000268936157, + "rewards/env_reward/std": 0.0, + "reward": 2.036799907684326, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2344426065683365, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.245, + "step": 490 + }, + { + "loss": 0.0547, + "grad_norm": 4.675817489624023, + "learning_rate": 1.888888888888889e-06, + "num_tokens": 598344.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.03647499904036522, + "rewards/env_reward/std": 0.09896047413349152, + "reward": 1.5364750623703003, + "reward_std": 0.0989605039358139, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.5469909161329269, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2455, + "step": 491 + }, + { + "loss": 0.0462, + "grad_norm": 13.125691413879395, + "learning_rate": 1.777777777777778e-06, + "num_tokens": 599563.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5171999931335449, + "rewards/env_reward/std": 0.026600003242492676, + "reward": 2.017199993133545, + "reward_std": 0.026599964126944542, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.46199870109558105, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.246, + "step": 492 + }, + { + "loss": 0.0404, + "grad_norm": 8.710203170776367, + "learning_rate": 1.6666666666666667e-06, + "num_tokens": 600782.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 1.916100025177002, + "rewards/env_reward/std": 1.25, + "reward": 3.416100025177002, + "reward_std": 1.25, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.40352775156497955, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2465, + "step": 493 + }, + { + "loss": 0.0381, + "grad_norm": 3.4673705101013184, + "learning_rate": 1.5555555555555556e-06, + "num_tokens": 601998.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.3388749957084656, + "rewards/env_reward/std": 0.2771500051021576, + "reward": 1.8388750553131104, + "reward_std": 0.27715003490448, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3812360465526581, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.247, + "step": 494 + }, + { + "loss": 0.0296, + "grad_norm": 9.451112747192383, + "learning_rate": 1.4444444444444445e-06, + "num_tokens": 603241.0, + "completions/mean_length": 3.75, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.75, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 2.2273499965667725, + "rewards/env_reward/std": 1.5453001260757446, + "reward": 3.7273499965667725, + "reward_std": 1.545300006866455, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.2962055504322052, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2475, + "step": 495 + }, + { + "loss": 0.0297, + "grad_norm": 0.08744853734970093, + "learning_rate": 1.3333333333333334e-06, + "num_tokens": 604461.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.5414000153541565, + "rewards/env_reward/std": 0.0, + "reward": 2.0413999557495117, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.2973760664463043, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.248, + "step": 496 + }, + { + "loss": 0.0669, + "grad_norm": 7.617353916168213, + "learning_rate": 1.2222222222222223e-06, + "num_tokens": 605679.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.6072749495506287, + "rewards/env_reward/std": 0.7213825583457947, + "reward": 2.1072750091552734, + "reward_std": 0.7213825583457947, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.6694605350494385, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2485, + "step": 497 + }, + { + "loss": 0.0419, + "grad_norm": 0.11127752810716629, + "learning_rate": 1.1111111111111112e-06, + "num_tokens": 606899.0, + "completions/mean_length": 4.0, + "completions/min_length": 4.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 4.0, + "completions/min_terminated_length": 4.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.21160000562667847, + "rewards/env_reward/std": 0.0, + "reward": 1.7116000652313232, + "reward_std": 0.0, + "frac_reward_zero_std": 1.0, + "completion_length": 4.0, + "kl": 0.41930001974105835, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.249, + "step": 498 + }, + { + "loss": 0.0472, + "grad_norm": 12.42243766784668, + "learning_rate": 1.0000000000000002e-06, + "num_tokens": 608117.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.645550012588501, + "rewards/env_reward/std": 0.4464397430419922, + "reward": 2.145550012588501, + "reward_std": 0.4464397430419922, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.47190549969673157, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.2495, + "step": 499 + }, + { + "loss": 0.0374, + "grad_norm": 9.841089248657227, + "learning_rate": 8.88888888888889e-07, + "num_tokens": 609331.0, + "completions/mean_length": 3.5, + "completions/min_length": 3.0, + "completions/max_length": 4.0, + "completions/clipped_ratio": 0.0, + "completions/mean_terminated_length": 3.5, + "completions/min_terminated_length": 3.0, + "completions/max_terminated_length": 4.0, + "rewards/format_valid/mean": 1.0, + "rewards/format_valid/std": 0.0, + "rewards/action_legal/mean": 0.5, + "rewards/action_legal/std": 0.0, + "rewards/env_reward/mean": 0.9353499412536621, + "rewards/env_reward/std": 1.2736923694610596, + "reward": 2.435349941253662, + "reward_std": 1.2736923694610596, + "frac_reward_zero_std": 0.0, + "completion_length": 4.0, + "kl": 0.3742150366306305, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/high_max": 0.0, + "clip_ratio/region_mean": 0.0, + "epoch": 0.25, + "step": 500 + }, + { + "train_runtime": 1991.1306, + "train_samples_per_second": 1.004, + "train_steps_per_second": 0.251, + "total_flos": 0.0, + "train_loss": 0.05931463603555517, + "epoch": 0.25, + "step": 500 + } +] \ No newline at end of file