{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2195121951219512, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 60.10416793823242, "completions/mean_terminated_length": 60.10416793823242, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 1.2584454119205475, "epoch": 0.024390243902439025, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.09302648901939392, "kl": 1.2248776783962967e-05, "learning_rate": 0.0, "loss": -0.0423424206674099, "num_tokens": 23029.0, "reward": 0.5082165002822876, "reward_std": 0.27811428904533386, "rewards/true_env_reward_fn/mean": 0.5082164406776428, "rewards/true_env_reward_fn/std": 0.27811428904533386, "step": 1, "step_time": 11.815711200999885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 55.875, "completions/mean_terminated_length": 55.875, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 1.3789870142936707, "epoch": 0.04878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.11938872188329697, "kl": 1.2672078355535632e-05, "learning_rate": 2.4390243902439023e-08, "loss": -0.11833255738019943, "num_tokens": 57015.0, "reward": 0.1327376663684845, "reward_std": 0.241567462682724, "rewards/true_env_reward_fn/mean": 0.1327376663684845, "rewards/true_env_reward_fn/std": 0.241567462682724, "step": 2, "step_time": 13.493524850000085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 63.79166793823242, "completions/mean_terminated_length": 63.79166793823242, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.315225213766098, "epoch": 0.07317073170731707, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08519645780324936, "kl": 1.2407871281538974e-05, "learning_rate": 4.878048780487805e-08, "loss": -0.03654177859425545, "num_tokens": 86989.0, "reward": 0.3152047097682953, "reward_std": 0.3069385886192322, "rewards/true_env_reward_fn/mean": 0.3152047097682953, "rewards/true_env_reward_fn/std": 0.30693864822387695, "step": 3, "step_time": 11.449303891999875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 77.20833587646484, "completions/mean_terminated_length": 77.20833587646484, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 1.338063895702362, "epoch": 0.0975609756097561, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08080132305622101, "kl": 1.239982589140709e-05, "learning_rate": 7.317073170731706e-08, "loss": 0.053779490292072296, "num_tokens": 112007.0, "reward": 0.4893929362297058, "reward_std": 0.28476035594940186, "rewards/true_env_reward_fn/mean": 0.4893929064273834, "rewards/true_env_reward_fn/std": 0.28476035594940186, "step": 4, "step_time": 18.835909622000145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 67.41667175292969, "completions/mean_terminated_length": 67.41667175292969, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.3855182826519012, "epoch": 0.12195121951219512, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08783729374408722, "kl": 1.1660237760224845e-05, "learning_rate": 9.75609756097561e-08, "loss": -0.026884621009230614, "num_tokens": 135883.0, "reward": 0.48575252294540405, "reward_std": 0.335994690656662, "rewards/true_env_reward_fn/mean": 0.48575249314308167, "rewards/true_env_reward_fn/std": 0.335994690656662, "step": 5, "step_time": 14.435845696000001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 71.29167175292969, "completions/mean_terminated_length": 71.29167175292969, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "entropy": 1.2962585091590881, "epoch": 0.14634146341463414, "frac_reward_zero_std": 0.0, "grad_norm": 0.08510823547840118, "kl": 1.241418908648484e-05, "learning_rate": 1.219512195121951e-07, "loss": -0.05353507027029991, "num_tokens": 157537.0, "reward": 0.47622889280319214, "reward_std": 0.3605790138244629, "rewards/true_env_reward_fn/mean": 0.47622886300086975, "rewards/true_env_reward_fn/std": 0.3605790138244629, "step": 6, "step_time": 13.232063896999989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 69.45833587646484, "completions/mean_terminated_length": 69.45833587646484, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 1.273663192987442, "epoch": 0.17073170731707318, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0775279700756073, "kl": 1.2900356978207128e-05, "learning_rate": 1.4634146341463413e-07, "loss": -0.010494321584701538, "num_tokens": 179167.0, "reward": 0.5062826871871948, "reward_std": 0.18032674491405487, "rewards/true_env_reward_fn/mean": 0.5062826871871948, "rewards/true_env_reward_fn/std": 0.18032673001289368, "step": 7, "step_time": 9.810652986000036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 65.54167175292969, "completions/mean_terminated_length": 65.54167175292969, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 1.255563884973526, "epoch": 0.1951219512195122, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.07464194297790527, "kl": 1.1561841347429436e-05, "learning_rate": 1.7073170731707317e-07, "loss": 0.0830899029970169, "num_tokens": 201865.0, "reward": 0.38212963938713074, "reward_std": 0.29894331097602844, "rewards/true_env_reward_fn/mean": 0.38212963938713074, "rewards/true_env_reward_fn/std": 0.29894331097602844, "step": 8, "step_time": 19.874756868999953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 68.33333587646484, "completions/mean_terminated_length": 68.33333587646484, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 1.2706169188022614, "epoch": 0.21951219512195122, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.049192048609256744, "kl": 1.157601468548819e-05, "learning_rate": 1.951219512195122e-07, "loss": 0.010864660143852234, "num_tokens": 219953.0, "reward": 0.6740004420280457, "reward_std": 0.18809831142425537, "rewards/true_env_reward_fn/mean": 0.6740004420280457, "rewards/true_env_reward_fn/std": 0.18809829652309418, "step": 9, "step_time": 9.458149736999985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 59.833335876464844, "completions/mean_terminated_length": 59.833335876464844, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.1927059888839722, "epoch": 0.24390243902439024, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0561092346906662, "kl": 1.0622535000948119e-05, "learning_rate": 2.195121951219512e-07, "loss": -0.02407176047563553, "num_tokens": 244913.0, "reward": 0.5113257169723511, "reward_std": 0.32156965136528015, "rewards/true_env_reward_fn/mean": 0.5113256573677063, "rewards/true_env_reward_fn/std": 0.32156962156295776, "step": 10, "step_time": 14.219840567000006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 65.47917175292969, "completions/mean_terminated_length": 65.47917175292969, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 1.2782267928123474, "epoch": 0.2682926829268293, "frac_reward_zero_std": 0.5, "grad_norm": 0.05816411226987839, "kl": 1.2071807759639341e-05, "learning_rate": 2.439024390243902e-07, "loss": 0.007693461142480373, "num_tokens": 269080.0, "reward": 0.37106746435165405, "reward_std": 0.26608046889305115, "rewards/true_env_reward_fn/mean": 0.37106743454933167, "rewards/true_env_reward_fn/std": 0.26608046889305115, "step": 11, "step_time": 9.271131832999913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 67.9375, "completions/mean_terminated_length": 67.9375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.3190773129463196, "epoch": 0.2926829268292683, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.09140665084123611, "kl": 1.2069708191120299e-05, "learning_rate": 2.682926829268293e-07, "loss": 0.07185906916856766, "num_tokens": 291317.0, "reward": 0.4376159906387329, "reward_std": 0.27247554063796997, "rewards/true_env_reward_fn/mean": 0.4376159906387329, "rewards/true_env_reward_fn/std": 0.27247554063796997, "step": 12, "step_time": 12.184364300000084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 66.54167175292969, "completions/mean_terminated_length": 66.54167175292969, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 1.3555113077163696, "epoch": 0.3170731707317073, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08824986964464188, "kl": 1.2127976788178785e-05, "learning_rate": 2.9268292682926825e-07, "loss": -0.0217185840010643, "num_tokens": 313623.0, "reward": 0.5092746615409851, "reward_std": 0.3137436807155609, "rewards/true_env_reward_fn/mean": 0.5092746615409851, "rewards/true_env_reward_fn/std": 0.3137436509132385, "step": 13, "step_time": 10.720424850000086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 69.3125, "completions/mean_terminated_length": 69.3125, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 1.3283279240131378, "epoch": 0.34146341463414637, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.05055573210120201, "kl": 1.3128182672517141e-05, "learning_rate": 3.170731707317073e-07, "loss": -0.024722743779420853, "num_tokens": 339118.0, "reward": 0.45545920729637146, "reward_std": 0.18457132577896118, "rewards/true_env_reward_fn/mean": 0.45545920729637146, "rewards/true_env_reward_fn/std": 0.18457134068012238, "step": 14, "step_time": 14.965493325000011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 66.45833587646484, "completions/mean_terminated_length": 66.45833587646484, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 1.2629931271076202, "epoch": 0.36585365853658536, "frac_reward_zero_std": 0.5, "grad_norm": 0.06967486441135406, "kl": 1.1465989928183262e-05, "learning_rate": 3.4146341463414634e-07, "loss": 0.046319857239723206, "num_tokens": 366364.0, "reward": 0.4448578357696533, "reward_std": 0.24966756999492645, "rewards/true_env_reward_fn/mean": 0.4448578357696533, "rewards/true_env_reward_fn/std": 0.24966755509376526, "step": 15, "step_time": 13.628413805999912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 69.04167175292969, "completions/mean_terminated_length": 69.04167175292969, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 1.2793545722961426, "epoch": 0.3902439024390244, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.04725664108991623, "kl": 1.1130929124192335e-05, "learning_rate": 3.6585365853658536e-07, "loss": 0.006799306720495224, "num_tokens": 392926.0, "reward": 0.414639949798584, "reward_std": 0.2748004198074341, "rewards/true_env_reward_fn/mean": 0.414639949798584, "rewards/true_env_reward_fn/std": 0.2748004198074341, "step": 16, "step_time": 14.229579036999894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 76.4375, "completions/mean_terminated_length": 76.4375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 1.3106227219104767, "epoch": 0.4146341463414634, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06872504949569702, "kl": 1.2065312830600305e-05, "learning_rate": 3.902439024390244e-07, "loss": 0.036527130752801895, "num_tokens": 419219.0, "reward": 0.49165210127830505, "reward_std": 0.267509400844574, "rewards/true_env_reward_fn/mean": 0.49165210127830505, "rewards/true_env_reward_fn/std": 0.267509400844574, "step": 17, "step_time": 17.023353198999985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 71.72917175292969, "completions/mean_terminated_length": 71.72917175292969, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 1.3780030608177185, "epoch": 0.43902439024390244, "frac_reward_zero_std": 0.5, "grad_norm": 0.05453665927052498, "kl": 1.2325858278927626e-05, "learning_rate": 4.146341463414634e-07, "loss": 0.01989848166704178, "num_tokens": 442822.0, "reward": 0.5288735032081604, "reward_std": 0.2950553297996521, "rewards/true_env_reward_fn/mean": 0.5288735032081604, "rewards/true_env_reward_fn/std": 0.2950552701950073, "step": 18, "step_time": 11.965533113999868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 65.4375, "completions/mean_terminated_length": 65.4375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 1.3424750864505768, "epoch": 0.4634146341463415, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.09814280271530151, "kl": 1.2686515219684225e-05, "learning_rate": 4.390243902439024e-07, "loss": 0.06940581649541855, "num_tokens": 467275.0, "reward": 0.5175753831863403, "reward_std": 0.2811976969242096, "rewards/true_env_reward_fn/mean": 0.5175753235816956, "rewards/true_env_reward_fn/std": 0.2811976969242096, "step": 19, "step_time": 10.33812468799988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 65.10417175292969, "completions/mean_terminated_length": 65.10417175292969, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 1.1681120097637177, "epoch": 0.4878048780487805, "frac_reward_zero_std": 0.0, "grad_norm": 0.09474422037601471, "kl": 1.2183225862827385e-05, "learning_rate": 4.634146341463415e-07, "loss": 0.05423373728990555, "num_tokens": 494320.0, "reward": 0.48628994822502136, "reward_std": 0.25381213426589966, "rewards/true_env_reward_fn/mean": 0.48628994822502136, "rewards/true_env_reward_fn/std": 0.25381216406822205, "step": 20, "step_time": 17.317542748000164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 62.395835876464844, "completions/mean_terminated_length": 62.395835876464844, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 1.2504475116729736, "epoch": 0.5121951219512195, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0819205492734909, "kl": 1.0698822279664455e-05, "learning_rate": 4.878048780487804e-07, "loss": 0.05607657879590988, "num_tokens": 518323.0, "reward": 0.4693639278411865, "reward_std": 0.32881346344947815, "rewards/true_env_reward_fn/mean": 0.4693639278411865, "rewards/true_env_reward_fn/std": 0.32881346344947815, "step": 21, "step_time": 12.20283881399996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 68.91667175292969, "completions/mean_terminated_length": 68.91667175292969, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 1.2199381291866302, "epoch": 0.5365853658536586, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06889473646879196, "kl": 1.1745656820494332e-05, "learning_rate": 5.121951219512195e-07, "loss": -0.017973195761442184, "num_tokens": 543591.0, "reward": 0.49388420581817627, "reward_std": 0.2952423393726349, "rewards/true_env_reward_fn/mean": 0.49388420581817627, "rewards/true_env_reward_fn/std": 0.2952423095703125, "step": 22, "step_time": 11.211206898000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 107.0, "completions/max_terminated_length": 107.0, "completions/mean_length": 65.625, "completions/mean_terminated_length": 65.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.2588726878166199, "epoch": 0.5609756097560976, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08144447952508926, "kl": 1.2306870758038713e-05, "learning_rate": 5.365853658536586e-07, "loss": 0.02826106920838356, "num_tokens": 567973.0, "reward": 0.48142755031585693, "reward_std": 0.26756224036216736, "rewards/true_env_reward_fn/mean": 0.48142755031585693, "rewards/true_env_reward_fn/std": 0.26756221055984497, "step": 23, "step_time": 10.428452587999914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 59.5625, "completions/mean_terminated_length": 59.5625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 1.384379804134369, "epoch": 0.5853658536585366, "frac_reward_zero_std": 0.0, "grad_norm": 0.11170398443937302, "kl": 1.2296073691686615e-05, "learning_rate": 5.609756097560975e-07, "loss": 0.07271970808506012, "num_tokens": 590248.0, "reward": 0.38166365027427673, "reward_std": 0.34809473156929016, "rewards/true_env_reward_fn/mean": 0.38166365027427673, "rewards/true_env_reward_fn/std": 0.3480947017669678, "step": 24, "step_time": 11.223491792000118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 63.35416793823242, "completions/mean_terminated_length": 63.35416793823242, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 1.3013385236263275, "epoch": 0.6097560975609756, "frac_reward_zero_std": 0.0, "grad_norm": 0.10069931298494339, "kl": 1.2947949016961502e-05, "learning_rate": 5.853658536585365e-07, "loss": 0.033605337142944336, "num_tokens": 615345.0, "reward": 0.5046355724334717, "reward_std": 0.2754679322242737, "rewards/true_env_reward_fn/mean": 0.5046355128288269, "rewards/true_env_reward_fn/std": 0.2754679322242737, "step": 25, "step_time": 10.92509102200006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 61.41666793823242, "completions/mean_terminated_length": 61.41666793823242, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.2652399837970734, "epoch": 0.6341463414634146, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07595694065093994, "kl": 1.151612354988174e-05, "learning_rate": 6.097560975609756e-07, "loss": 0.04607678949832916, "num_tokens": 644749.0, "reward": 0.3311978578567505, "reward_std": 0.21527718007564545, "rewards/true_env_reward_fn/mean": 0.3311978578567505, "rewards/true_env_reward_fn/std": 0.21527719497680664, "step": 26, "step_time": 10.458724108999945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 71.25, "completions/mean_terminated_length": 71.25, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 1.193794459104538, "epoch": 0.6585365853658537, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.07690244168043137, "kl": 1.2164698546257569e-05, "learning_rate": 6.341463414634146e-07, "loss": 0.00818883627653122, "num_tokens": 671153.0, "reward": 0.3635203242301941, "reward_std": 0.23849114775657654, "rewards/true_env_reward_fn/mean": 0.3635202944278717, "rewards/true_env_reward_fn/std": 0.23849113285541534, "step": 27, "step_time": 14.364785926000081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 111.0, "completions/max_terminated_length": 111.0, "completions/mean_length": 63.4375, "completions/mean_terminated_length": 63.4375, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 1.2883787751197815, "epoch": 0.6829268292682927, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0902288407087326, "kl": 1.1798915693361778e-05, "learning_rate": 6.585365853658536e-07, "loss": 0.038317371159791946, "num_tokens": 697614.0, "reward": 0.44166144728660583, "reward_std": 0.25748196244239807, "rewards/true_env_reward_fn/mean": 0.44166144728660583, "rewards/true_env_reward_fn/std": 0.25748199224472046, "step": 28, "step_time": 10.888908384999922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 69.60417175292969, "completions/mean_terminated_length": 69.60417175292969, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 1.3002805709838867, "epoch": 0.7073170731707317, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07522639632225037, "kl": 1.2230455695316778e-05, "learning_rate": 6.829268292682927e-07, "loss": 0.031045034527778625, "num_tokens": 719187.0, "reward": 0.5349087119102478, "reward_std": 0.29909756779670715, "rewards/true_env_reward_fn/mean": 0.5349087119102478, "rewards/true_env_reward_fn/std": 0.29909753799438477, "step": 29, "step_time": 15.510035302999995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 70.91667175292969, "completions/mean_terminated_length": 70.91667175292969, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 1.2718828916549683, "epoch": 0.7317073170731707, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06776711344718933, "kl": 1.2617916354429326e-05, "learning_rate": 7.073170731707316e-07, "loss": 0.09301326423883438, "num_tokens": 744095.0, "reward": 0.43472790718078613, "reward_std": 0.3138841986656189, "rewards/true_env_reward_fn/mean": 0.43472790718078613, "rewards/true_env_reward_fn/std": 0.3138841688632965, "step": 30, "step_time": 14.50245602599989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 69.77083587646484, "completions/mean_terminated_length": 69.77083587646484, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 1.2918945252895355, "epoch": 0.7560975609756098, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08040682971477509, "kl": 1.2672349157583085e-05, "learning_rate": 7.317073170731707e-07, "loss": 0.0367550291121006, "num_tokens": 764612.0, "reward": 0.5134401321411133, "reward_std": 0.19073942303657532, "rewards/true_env_reward_fn/mean": 0.5134401321411133, "rewards/true_env_reward_fn/std": 0.19073940813541412, "step": 31, "step_time": 11.06186091799998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 71.79167175292969, "completions/mean_terminated_length": 71.79167175292969, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 1.1679067015647888, "epoch": 0.7804878048780488, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0744430273771286, "kl": 1.2661263326663175e-05, "learning_rate": 7.560975609756097e-07, "loss": 0.05885648727416992, "num_tokens": 782058.0, "reward": 0.5372593402862549, "reward_std": 0.18350909650325775, "rewards/true_env_reward_fn/mean": 0.5372593402862549, "rewards/true_env_reward_fn/std": 0.18350908160209656, "step": 32, "step_time": 15.808748693000211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 76.79167175292969, "completions/mean_terminated_length": 76.79167175292969, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.1829756796360016, "epoch": 0.8048780487804879, "frac_reward_zero_std": 0.5, "grad_norm": 0.051698025315999985, "kl": 1.0996191576850833e-05, "learning_rate": 7.804878048780488e-07, "loss": 0.010143717750906944, "num_tokens": 810472.0, "reward": 0.4369215667247772, "reward_std": 0.30869919061660767, "rewards/true_env_reward_fn/mean": 0.4369215667247772, "rewards/true_env_reward_fn/std": 0.30869919061660767, "step": 33, "step_time": 24.20358999299981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 61.85416793823242, "completions/mean_terminated_length": 61.85416793823242, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 1.2468958497047424, "epoch": 0.8292682926829268, "frac_reward_zero_std": 0.0, "grad_norm": 0.09706687182188034, "kl": 1.2097383432774222e-05, "learning_rate": 8.048780487804878e-07, "loss": 0.026558157056570053, "num_tokens": 836713.0, "reward": 0.3587157428264618, "reward_std": 0.2754887044429779, "rewards/true_env_reward_fn/mean": 0.3587157428264618, "rewards/true_env_reward_fn/std": 0.2754887044429779, "step": 34, "step_time": 12.218407348999904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 59.5625, "completions/mean_terminated_length": 59.5625, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 1.2368170320987701, "epoch": 0.8536585365853658, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08964981138706207, "kl": 1.3131634887031396e-05, "learning_rate": 8.292682926829268e-07, "loss": -0.01139204390347004, "num_tokens": 860028.0, "reward": 0.49109315872192383, "reward_std": 0.20359393954277039, "rewards/true_env_reward_fn/mean": 0.49109315872192383, "rewards/true_env_reward_fn/std": 0.20359393954277039, "step": 35, "step_time": 9.66908789599995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 66.02083587646484, "completions/mean_terminated_length": 66.02083587646484, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.1611860394477844, "epoch": 0.8780487804878049, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08851195871829987, "kl": 1.2570341596074286e-05, "learning_rate": 8.536585365853657e-07, "loss": 0.021737128496170044, "num_tokens": 883189.0, "reward": 0.46058258414268494, "reward_std": 0.2632383108139038, "rewards/true_env_reward_fn/mean": 0.46058258414268494, "rewards/true_env_reward_fn/std": 0.2632383108139038, "step": 36, "step_time": 8.370980583999994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 75.58333587646484, "completions/mean_terminated_length": 75.58333587646484, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 1.37085822224617, "epoch": 0.9024390243902439, "frac_reward_zero_std": 0.5, "grad_norm": 0.05852028727531433, "kl": 1.2957561011717189e-05, "learning_rate": 8.780487804878048e-07, "loss": -0.024281952530145645, "num_tokens": 906801.0, "reward": 0.5022324323654175, "reward_std": 0.11637427657842636, "rewards/true_env_reward_fn/mean": 0.5022324323654175, "rewards/true_env_reward_fn/std": 0.11637428402900696, "step": 37, "step_time": 10.285125336999727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 65.14583587646484, "completions/mean_terminated_length": 65.14583587646484, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 1.2760809361934662, "epoch": 0.926829268292683, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.09105321019887924, "kl": 1.3129126955391257e-05, "learning_rate": 9.024390243902439e-07, "loss": -0.011838603764772415, "num_tokens": 929536.0, "reward": 0.49639374017715454, "reward_std": 0.32166802883148193, "rewards/true_env_reward_fn/mean": 0.49639371037483215, "rewards/true_env_reward_fn/std": 0.32166802883148193, "step": 38, "step_time": 12.449738128000035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 72.08333587646484, "completions/mean_terminated_length": 72.08333587646484, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 1.2545586228370667, "epoch": 0.9512195121951219, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06919296830892563, "kl": 1.459557256566768e-05, "learning_rate": 9.26829268292683e-07, "loss": 0.021831180900335312, "num_tokens": 950388.0, "reward": 0.4879913330078125, "reward_std": 0.24854585528373718, "rewards/true_env_reward_fn/mean": 0.4879913330078125, "rewards/true_env_reward_fn/std": 0.24854585528373718, "step": 39, "step_time": 10.279209028999958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 74.20833587646484, "completions/mean_terminated_length": 74.20833587646484, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 1.2255937159061432, "epoch": 0.975609756097561, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06352153420448303, "kl": 1.2041192348988261e-05, "learning_rate": 9.512195121951218e-07, "loss": -0.013997981324791908, "num_tokens": 981254.0, "reward": 0.39802420139312744, "reward_std": 0.20212584733963013, "rewards/true_env_reward_fn/mean": 0.39802420139312744, "rewards/true_env_reward_fn/std": 0.20212584733963013, "step": 40, "step_time": 13.58010066599968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 75.04167175292969, "completions/mean_terminated_length": 75.04167175292969, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 1.2703719735145569, "epoch": 1.0, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.045169439166784286, "kl": 1.1270850109212915e-05, "learning_rate": 9.756097560975609e-07, "loss": -0.010194316506385803, "num_tokens": 1009968.0, "reward": 0.4517599940299988, "reward_std": 0.11791092902421951, "rewards/true_env_reward_fn/mean": 0.4517599642276764, "rewards/true_env_reward_fn/std": 0.11791091412305832, "step": 41, "step_time": 10.35077203700007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 64.33333587646484, "completions/mean_terminated_length": 64.33333587646484, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.329576164484024, "epoch": 1.024390243902439, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08522730320692062, "kl": 1.4469044799625408e-05, "learning_rate": 1e-06, "loss": -0.00014946190640330315, "num_tokens": 1039032.0, "reward": 0.33548423647880554, "reward_std": 0.22271563112735748, "rewards/true_env_reward_fn/mean": 0.33548423647880554, "rewards/true_env_reward_fn/std": 0.22271563112735748, "step": 42, "step_time": 10.548370664999993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 70.02083587646484, "completions/mean_terminated_length": 70.02083587646484, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "entropy": 1.2357364892959595, "epoch": 1.048780487804878, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07030358910560608, "kl": 1.3562755839302554e-05, "learning_rate": 9.999818789066163e-07, "loss": -0.02616041898727417, "num_tokens": 1060833.0, "reward": 0.5167371034622192, "reward_std": 0.24280032515525818, "rewards/true_env_reward_fn/mean": 0.5167370438575745, "rewards/true_env_reward_fn/std": 0.24280032515525818, "step": 43, "step_time": 24.089396637999698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 77.47917175292969, "completions/mean_terminated_length": 77.47917175292969, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "entropy": 1.1693778038024902, "epoch": 1.0731707317073171, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07017157226800919, "kl": 1.332453393843025e-05, "learning_rate": 9.999275169399612e-07, "loss": -0.006466507911682129, "num_tokens": 1088648.0, "reward": 0.4498252272605896, "reward_std": 0.21398545801639557, "rewards/true_env_reward_fn/mean": 0.4498251974582672, "rewards/true_env_reward_fn/std": 0.21398545801639557, "step": 44, "step_time": 19.39071501599983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 72.16667175292969, "completions/mean_terminated_length": 72.16667175292969, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.3268415927886963, "epoch": 1.0975609756097562, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06632921099662781, "kl": 1.4458733630817733e-05, "learning_rate": 9.99836918040428e-07, "loss": -0.03534461930394173, "num_tokens": 1117096.0, "reward": 0.4053138196468353, "reward_std": 0.21476909518241882, "rewards/true_env_reward_fn/mean": 0.4053138196468353, "rewards/true_env_reward_fn/std": 0.21476909518241882, "step": 45, "step_time": 13.893569495999827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 70.16667175292969, "completions/mean_terminated_length": 70.16667175292969, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 1.2670875787734985, "epoch": 1.1219512195121952, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.08321154117584229, "kl": 1.4837954950053245e-05, "learning_rate": 9.997100887750215e-07, "loss": -0.039235007017850876, "num_tokens": 1136480.0, "reward": 0.48141974210739136, "reward_std": 0.2837103307247162, "rewards/true_env_reward_fn/mean": 0.48141971230506897, "rewards/true_env_reward_fn/std": 0.2837103009223938, "step": 46, "step_time": 10.50698806499986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 76.1875, "completions/mean_terminated_length": 76.1875, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 1.3037313222885132, "epoch": 1.146341463414634, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.061912886798381805, "kl": 1.283655774386716e-05, "learning_rate": 9.995470383368808e-07, "loss": -0.01992109790444374, "num_tokens": 1162249.0, "reward": 0.49922606348991394, "reward_std": 0.2621309757232666, "rewards/true_env_reward_fn/mean": 0.49922606348991394, "rewards/true_env_reward_fn/std": 0.2621309757232666, "step": 47, "step_time": 12.964419044000124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 71.375, "completions/mean_terminated_length": 71.375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.2007178366184235, "epoch": 1.170731707317073, "frac_reward_zero_std": 0.1666666716337204, "grad_norm": 0.0889662653207779, "kl": 1.6228528693318367e-05, "learning_rate": 9.993477785446149e-07, "loss": 0.045945264399051666, "num_tokens": 1184555.0, "reward": 0.42501482367515564, "reward_std": 0.27350595593452454, "rewards/true_env_reward_fn/mean": 0.42501482367515564, "rewards/true_env_reward_fn/std": 0.27350592613220215, "step": 48, "step_time": 17.23041258299986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 55.9375, "completions/mean_terminated_length": 55.9375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 1.182040810585022, "epoch": 1.1951219512195121, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08547856658697128, "kl": 1.571832831359643e-05, "learning_rate": 9.991123238414453e-07, "loss": 0.02548346482217312, "num_tokens": 1208384.0, "reward": 0.3845663070678711, "reward_std": 0.315467894077301, "rewards/true_env_reward_fn/mean": 0.3845663070678711, "rewards/true_env_reward_fn/std": 0.31546786427497864, "step": 49, "step_time": 8.691208415999881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 64.75, "completions/mean_terminated_length": 64.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 1.2306177020072937, "epoch": 1.2195121951219512, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07395736873149872, "kl": 1.2643881973417592e-05, "learning_rate": 9.988406912941589e-07, "loss": -0.04186868295073509, "num_tokens": 1227700.0, "reward": 0.5068289637565613, "reward_std": 0.31324177980422974, "rewards/true_env_reward_fn/mean": 0.5068289637565613, "rewards/true_env_reward_fn/std": 0.31324175000190735, "step": 50, "step_time": 10.162109979000206 } ], "logging_steps": 1, "max_steps": 410, "num_input_tokens_seen": 1227700, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }