salespath-grpo / checkpoint-50 /trainer_state.json
Lomesh7777's picture
Upload folder using huggingface_hub
88d2321 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2195121951219512,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 131.0,
"completions/max_terminated_length": 131.0,
"completions/mean_length": 60.10416793823242,
"completions/mean_terminated_length": 60.10416793823242,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 1.2584454119205475,
"epoch": 0.024390243902439025,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.09302648901939392,
"kl": 1.2248776783962967e-05,
"learning_rate": 0.0,
"loss": -0.0423424206674099,
"num_tokens": 23029.0,
"reward": 0.5082165002822876,
"reward_std": 0.27811428904533386,
"rewards/true_env_reward_fn/mean": 0.5082164406776428,
"rewards/true_env_reward_fn/std": 0.27811428904533386,
"step": 1,
"step_time": 11.815711200999885
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 55.875,
"completions/mean_terminated_length": 55.875,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 1.3789870142936707,
"epoch": 0.04878048780487805,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11938872188329697,
"kl": 1.2672078355535632e-05,
"learning_rate": 2.4390243902439023e-08,
"loss": -0.11833255738019943,
"num_tokens": 57015.0,
"reward": 0.1327376663684845,
"reward_std": 0.241567462682724,
"rewards/true_env_reward_fn/mean": 0.1327376663684845,
"rewards/true_env_reward_fn/std": 0.241567462682724,
"step": 2,
"step_time": 13.493524850000085
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 124.0,
"completions/max_terminated_length": 124.0,
"completions/mean_length": 63.79166793823242,
"completions/mean_terminated_length": 63.79166793823242,
"completions/min_length": 7.0,
"completions/min_terminated_length": 7.0,
"entropy": 1.315225213766098,
"epoch": 0.07317073170731707,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08519645780324936,
"kl": 1.2407871281538974e-05,
"learning_rate": 4.878048780487805e-08,
"loss": -0.03654177859425545,
"num_tokens": 86989.0,
"reward": 0.3152047097682953,
"reward_std": 0.3069385886192322,
"rewards/true_env_reward_fn/mean": 0.3152047097682953,
"rewards/true_env_reward_fn/std": 0.30693864822387695,
"step": 3,
"step_time": 11.449303891999875
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 226.0,
"completions/max_terminated_length": 226.0,
"completions/mean_length": 77.20833587646484,
"completions/mean_terminated_length": 77.20833587646484,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 1.338063895702362,
"epoch": 0.0975609756097561,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08080132305622101,
"kl": 1.239982589140709e-05,
"learning_rate": 7.317073170731706e-08,
"loss": 0.053779490292072296,
"num_tokens": 112007.0,
"reward": 0.4893929362297058,
"reward_std": 0.28476035594940186,
"rewards/true_env_reward_fn/mean": 0.4893929064273834,
"rewards/true_env_reward_fn/std": 0.28476035594940186,
"step": 4,
"step_time": 18.835909622000145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 212.0,
"completions/max_terminated_length": 212.0,
"completions/mean_length": 67.41667175292969,
"completions/mean_terminated_length": 67.41667175292969,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.3855182826519012,
"epoch": 0.12195121951219512,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08783729374408722,
"kl": 1.1660237760224845e-05,
"learning_rate": 9.75609756097561e-08,
"loss": -0.026884621009230614,
"num_tokens": 135883.0,
"reward": 0.48575252294540405,
"reward_std": 0.335994690656662,
"rewards/true_env_reward_fn/mean": 0.48575249314308167,
"rewards/true_env_reward_fn/std": 0.335994690656662,
"step": 5,
"step_time": 14.435845696000001
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 164.0,
"completions/max_terminated_length": 164.0,
"completions/mean_length": 71.29167175292969,
"completions/mean_terminated_length": 71.29167175292969,
"completions/min_length": 26.0,
"completions/min_terminated_length": 26.0,
"entropy": 1.2962585091590881,
"epoch": 0.14634146341463414,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08510823547840118,
"kl": 1.241418908648484e-05,
"learning_rate": 1.219512195121951e-07,
"loss": -0.05353507027029991,
"num_tokens": 157537.0,
"reward": 0.47622889280319214,
"reward_std": 0.3605790138244629,
"rewards/true_env_reward_fn/mean": 0.47622886300086975,
"rewards/true_env_reward_fn/std": 0.3605790138244629,
"step": 6,
"step_time": 13.232063896999989
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 114.0,
"completions/max_terminated_length": 114.0,
"completions/mean_length": 69.45833587646484,
"completions/mean_terminated_length": 69.45833587646484,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"entropy": 1.273663192987442,
"epoch": 0.17073170731707318,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.0775279700756073,
"kl": 1.2900356978207128e-05,
"learning_rate": 1.4634146341463413e-07,
"loss": -0.010494321584701538,
"num_tokens": 179167.0,
"reward": 0.5062826871871948,
"reward_std": 0.18032674491405487,
"rewards/true_env_reward_fn/mean": 0.5062826871871948,
"rewards/true_env_reward_fn/std": 0.18032673001289368,
"step": 7,
"step_time": 9.810652986000036
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 287.0,
"completions/max_terminated_length": 287.0,
"completions/mean_length": 65.54167175292969,
"completions/mean_terminated_length": 65.54167175292969,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 1.255563884973526,
"epoch": 0.1951219512195122,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.07464194297790527,
"kl": 1.1561841347429436e-05,
"learning_rate": 1.7073170731707317e-07,
"loss": 0.0830899029970169,
"num_tokens": 201865.0,
"reward": 0.38212963938713074,
"reward_std": 0.29894331097602844,
"rewards/true_env_reward_fn/mean": 0.38212963938713074,
"rewards/true_env_reward_fn/std": 0.29894331097602844,
"step": 8,
"step_time": 19.874756868999953
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 68.33333587646484,
"completions/mean_terminated_length": 68.33333587646484,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 1.2706169188022614,
"epoch": 0.21951219512195122,
"frac_reward_zero_std": 0.6666666865348816,
"grad_norm": 0.049192048609256744,
"kl": 1.157601468548819e-05,
"learning_rate": 1.951219512195122e-07,
"loss": 0.010864660143852234,
"num_tokens": 219953.0,
"reward": 0.6740004420280457,
"reward_std": 0.18809831142425537,
"rewards/true_env_reward_fn/mean": 0.6740004420280457,
"rewards/true_env_reward_fn/std": 0.18809829652309418,
"step": 9,
"step_time": 9.458149736999985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 59.833335876464844,
"completions/mean_terminated_length": 59.833335876464844,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 1.1927059888839722,
"epoch": 0.24390243902439024,
"frac_reward_zero_std": 0.6666666865348816,
"grad_norm": 0.0561092346906662,
"kl": 1.0622535000948119e-05,
"learning_rate": 2.195121951219512e-07,
"loss": -0.02407176047563553,
"num_tokens": 244913.0,
"reward": 0.5113257169723511,
"reward_std": 0.32156965136528015,
"rewards/true_env_reward_fn/mean": 0.5113256573677063,
"rewards/true_env_reward_fn/std": 0.32156962156295776,
"step": 10,
"step_time": 14.219840567000006
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 115.0,
"completions/max_terminated_length": 115.0,
"completions/mean_length": 65.47917175292969,
"completions/mean_terminated_length": 65.47917175292969,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 1.2782267928123474,
"epoch": 0.2682926829268293,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05816411226987839,
"kl": 1.2071807759639341e-05,
"learning_rate": 2.439024390243902e-07,
"loss": 0.007693461142480373,
"num_tokens": 269080.0,
"reward": 0.37106746435165405,
"reward_std": 0.26608046889305115,
"rewards/true_env_reward_fn/mean": 0.37106743454933167,
"rewards/true_env_reward_fn/std": 0.26608046889305115,
"step": 11,
"step_time": 9.271131832999913
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 67.9375,
"completions/mean_terminated_length": 67.9375,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.3190773129463196,
"epoch": 0.2926829268292683,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.09140665084123611,
"kl": 1.2069708191120299e-05,
"learning_rate": 2.682926829268293e-07,
"loss": 0.07185906916856766,
"num_tokens": 291317.0,
"reward": 0.4376159906387329,
"reward_std": 0.27247554063796997,
"rewards/true_env_reward_fn/mean": 0.4376159906387329,
"rewards/true_env_reward_fn/std": 0.27247554063796997,
"step": 12,
"step_time": 12.184364300000084
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 66.54167175292969,
"completions/mean_terminated_length": 66.54167175292969,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"entropy": 1.3555113077163696,
"epoch": 0.3170731707317073,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08824986964464188,
"kl": 1.2127976788178785e-05,
"learning_rate": 2.9268292682926825e-07,
"loss": -0.0217185840010643,
"num_tokens": 313623.0,
"reward": 0.5092746615409851,
"reward_std": 0.3137436807155609,
"rewards/true_env_reward_fn/mean": 0.5092746615409851,
"rewards/true_env_reward_fn/std": 0.3137436509132385,
"step": 13,
"step_time": 10.720424850000086
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 188.0,
"completions/max_terminated_length": 188.0,
"completions/mean_length": 69.3125,
"completions/mean_terminated_length": 69.3125,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 1.3283279240131378,
"epoch": 0.34146341463414637,
"frac_reward_zero_std": 0.6666666865348816,
"grad_norm": 0.05055573210120201,
"kl": 1.3128182672517141e-05,
"learning_rate": 3.170731707317073e-07,
"loss": -0.024722743779420853,
"num_tokens": 339118.0,
"reward": 0.45545920729637146,
"reward_std": 0.18457132577896118,
"rewards/true_env_reward_fn/mean": 0.45545920729637146,
"rewards/true_env_reward_fn/std": 0.18457134068012238,
"step": 14,
"step_time": 14.965493325000011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 181.0,
"completions/max_terminated_length": 181.0,
"completions/mean_length": 66.45833587646484,
"completions/mean_terminated_length": 66.45833587646484,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 1.2629931271076202,
"epoch": 0.36585365853658536,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.06967486441135406,
"kl": 1.1465989928183262e-05,
"learning_rate": 3.4146341463414634e-07,
"loss": 0.046319857239723206,
"num_tokens": 366364.0,
"reward": 0.4448578357696533,
"reward_std": 0.24966756999492645,
"rewards/true_env_reward_fn/mean": 0.4448578357696533,
"rewards/true_env_reward_fn/std": 0.24966755509376526,
"step": 15,
"step_time": 13.628413805999912
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/max_terminated_length": 194.0,
"completions/mean_length": 69.04167175292969,
"completions/mean_terminated_length": 69.04167175292969,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 1.2793545722961426,
"epoch": 0.3902439024390244,
"frac_reward_zero_std": 0.6666666865348816,
"grad_norm": 0.04725664108991623,
"kl": 1.1130929124192335e-05,
"learning_rate": 3.6585365853658536e-07,
"loss": 0.006799306720495224,
"num_tokens": 392926.0,
"reward": 0.414639949798584,
"reward_std": 0.2748004198074341,
"rewards/true_env_reward_fn/mean": 0.414639949798584,
"rewards/true_env_reward_fn/std": 0.2748004198074341,
"step": 16,
"step_time": 14.229579036999894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 195.0,
"completions/max_terminated_length": 195.0,
"completions/mean_length": 76.4375,
"completions/mean_terminated_length": 76.4375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 1.3106227219104767,
"epoch": 0.4146341463414634,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06872504949569702,
"kl": 1.2065312830600305e-05,
"learning_rate": 3.902439024390244e-07,
"loss": 0.036527130752801895,
"num_tokens": 419219.0,
"reward": 0.49165210127830505,
"reward_std": 0.267509400844574,
"rewards/true_env_reward_fn/mean": 0.49165210127830505,
"rewards/true_env_reward_fn/std": 0.267509400844574,
"step": 17,
"step_time": 17.023353198999985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 159.0,
"completions/max_terminated_length": 159.0,
"completions/mean_length": 71.72917175292969,
"completions/mean_terminated_length": 71.72917175292969,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 1.3780030608177185,
"epoch": 0.43902439024390244,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05453665927052498,
"kl": 1.2325858278927626e-05,
"learning_rate": 4.146341463414634e-07,
"loss": 0.01989848166704178,
"num_tokens": 442822.0,
"reward": 0.5288735032081604,
"reward_std": 0.2950553297996521,
"rewards/true_env_reward_fn/mean": 0.5288735032081604,
"rewards/true_env_reward_fn/std": 0.2950552701950073,
"step": 18,
"step_time": 11.965533113999868
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 106.0,
"completions/max_terminated_length": 106.0,
"completions/mean_length": 65.4375,
"completions/mean_terminated_length": 65.4375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 1.3424750864505768,
"epoch": 0.4634146341463415,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.09814280271530151,
"kl": 1.2686515219684225e-05,
"learning_rate": 4.390243902439024e-07,
"loss": 0.06940581649541855,
"num_tokens": 467275.0,
"reward": 0.5175753831863403,
"reward_std": 0.2811976969242096,
"rewards/true_env_reward_fn/mean": 0.5175753235816956,
"rewards/true_env_reward_fn/std": 0.2811976969242096,
"step": 19,
"step_time": 10.33812468799988
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/max_terminated_length": 244.0,
"completions/mean_length": 65.10417175292969,
"completions/mean_terminated_length": 65.10417175292969,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.1681120097637177,
"epoch": 0.4878048780487805,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09474422037601471,
"kl": 1.2183225862827385e-05,
"learning_rate": 4.634146341463415e-07,
"loss": 0.05423373728990555,
"num_tokens": 494320.0,
"reward": 0.48628994822502136,
"reward_std": 0.25381213426589966,
"rewards/true_env_reward_fn/mean": 0.48628994822502136,
"rewards/true_env_reward_fn/std": 0.25381216406822205,
"step": 20,
"step_time": 17.317542748000164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 157.0,
"completions/max_terminated_length": 157.0,
"completions/mean_length": 62.395835876464844,
"completions/mean_terminated_length": 62.395835876464844,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 1.2504475116729736,
"epoch": 0.5121951219512195,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.0819205492734909,
"kl": 1.0698822279664455e-05,
"learning_rate": 4.878048780487804e-07,
"loss": 0.05607657879590988,
"num_tokens": 518323.0,
"reward": 0.4693639278411865,
"reward_std": 0.32881346344947815,
"rewards/true_env_reward_fn/mean": 0.4693639278411865,
"rewards/true_env_reward_fn/std": 0.32881346344947815,
"step": 21,
"step_time": 12.20283881399996
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 68.91667175292969,
"completions/mean_terminated_length": 68.91667175292969,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 1.2199381291866302,
"epoch": 0.5365853658536586,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06889473646879196,
"kl": 1.1745656820494332e-05,
"learning_rate": 5.121951219512195e-07,
"loss": -0.017973195761442184,
"num_tokens": 543591.0,
"reward": 0.49388420581817627,
"reward_std": 0.2952423393726349,
"rewards/true_env_reward_fn/mean": 0.49388420581817627,
"rewards/true_env_reward_fn/std": 0.2952423095703125,
"step": 22,
"step_time": 11.211206898000114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 107.0,
"completions/max_terminated_length": 107.0,
"completions/mean_length": 65.625,
"completions/mean_terminated_length": 65.625,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.2588726878166199,
"epoch": 0.5609756097560976,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08144447952508926,
"kl": 1.2306870758038713e-05,
"learning_rate": 5.365853658536586e-07,
"loss": 0.02826106920838356,
"num_tokens": 567973.0,
"reward": 0.48142755031585693,
"reward_std": 0.26756224036216736,
"rewards/true_env_reward_fn/mean": 0.48142755031585693,
"rewards/true_env_reward_fn/std": 0.26756221055984497,
"step": 23,
"step_time": 10.428452587999914
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 59.5625,
"completions/mean_terminated_length": 59.5625,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 1.384379804134369,
"epoch": 0.5853658536585366,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11170398443937302,
"kl": 1.2296073691686615e-05,
"learning_rate": 5.609756097560975e-07,
"loss": 0.07271970808506012,
"num_tokens": 590248.0,
"reward": 0.38166365027427673,
"reward_std": 0.34809473156929016,
"rewards/true_env_reward_fn/mean": 0.38166365027427673,
"rewards/true_env_reward_fn/std": 0.3480947017669678,
"step": 24,
"step_time": 11.223491792000118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 123.0,
"completions/max_terminated_length": 123.0,
"completions/mean_length": 63.35416793823242,
"completions/mean_terminated_length": 63.35416793823242,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 1.3013385236263275,
"epoch": 0.6097560975609756,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10069931298494339,
"kl": 1.2947949016961502e-05,
"learning_rate": 5.853658536585365e-07,
"loss": 0.033605337142944336,
"num_tokens": 615345.0,
"reward": 0.5046355724334717,
"reward_std": 0.2754679322242737,
"rewards/true_env_reward_fn/mean": 0.5046355128288269,
"rewards/true_env_reward_fn/std": 0.2754679322242737,
"step": 25,
"step_time": 10.92509102200006
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 61.41666793823242,
"completions/mean_terminated_length": 61.41666793823242,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.2652399837970734,
"epoch": 0.6341463414634146,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.07595694065093994,
"kl": 1.151612354988174e-05,
"learning_rate": 6.097560975609756e-07,
"loss": 0.04607678949832916,
"num_tokens": 644749.0,
"reward": 0.3311978578567505,
"reward_std": 0.21527718007564545,
"rewards/true_env_reward_fn/mean": 0.3311978578567505,
"rewards/true_env_reward_fn/std": 0.21527719497680664,
"step": 26,
"step_time": 10.458724108999945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 71.25,
"completions/mean_terminated_length": 71.25,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 1.193794459104538,
"epoch": 0.6585365853658537,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.07690244168043137,
"kl": 1.2164698546257569e-05,
"learning_rate": 6.341463414634146e-07,
"loss": 0.00818883627653122,
"num_tokens": 671153.0,
"reward": 0.3635203242301941,
"reward_std": 0.23849114775657654,
"rewards/true_env_reward_fn/mean": 0.3635202944278717,
"rewards/true_env_reward_fn/std": 0.23849113285541534,
"step": 27,
"step_time": 14.364785926000081
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 111.0,
"completions/max_terminated_length": 111.0,
"completions/mean_length": 63.4375,
"completions/mean_terminated_length": 63.4375,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 1.2883787751197815,
"epoch": 0.6829268292682927,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.0902288407087326,
"kl": 1.1798915693361778e-05,
"learning_rate": 6.585365853658536e-07,
"loss": 0.038317371159791946,
"num_tokens": 697614.0,
"reward": 0.44166144728660583,
"reward_std": 0.25748196244239807,
"rewards/true_env_reward_fn/mean": 0.44166144728660583,
"rewards/true_env_reward_fn/std": 0.25748199224472046,
"step": 28,
"step_time": 10.888908384999922
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 238.0,
"completions/max_terminated_length": 238.0,
"completions/mean_length": 69.60417175292969,
"completions/mean_terminated_length": 69.60417175292969,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 1.3002805709838867,
"epoch": 0.7073170731707317,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.07522639632225037,
"kl": 1.2230455695316778e-05,
"learning_rate": 6.829268292682927e-07,
"loss": 0.031045034527778625,
"num_tokens": 719187.0,
"reward": 0.5349087119102478,
"reward_std": 0.29909756779670715,
"rewards/true_env_reward_fn/mean": 0.5349087119102478,
"rewards/true_env_reward_fn/std": 0.29909753799438477,
"step": 29,
"step_time": 15.510035302999995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 70.91667175292969,
"completions/mean_terminated_length": 70.91667175292969,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 1.2718828916549683,
"epoch": 0.7317073170731707,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06776711344718933,
"kl": 1.2617916354429326e-05,
"learning_rate": 7.073170731707316e-07,
"loss": 0.09301326423883438,
"num_tokens": 744095.0,
"reward": 0.43472790718078613,
"reward_std": 0.3138841986656189,
"rewards/true_env_reward_fn/mean": 0.43472790718078613,
"rewards/true_env_reward_fn/std": 0.3138841688632965,
"step": 30,
"step_time": 14.50245602599989
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 153.0,
"completions/max_terminated_length": 153.0,
"completions/mean_length": 69.77083587646484,
"completions/mean_terminated_length": 69.77083587646484,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 1.2918945252895355,
"epoch": 0.7560975609756098,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08040682971477509,
"kl": 1.2672349157583085e-05,
"learning_rate": 7.317073170731707e-07,
"loss": 0.0367550291121006,
"num_tokens": 764612.0,
"reward": 0.5134401321411133,
"reward_std": 0.19073942303657532,
"rewards/true_env_reward_fn/mean": 0.5134401321411133,
"rewards/true_env_reward_fn/std": 0.19073940813541412,
"step": 31,
"step_time": 11.06186091799998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 269.0,
"completions/max_terminated_length": 269.0,
"completions/mean_length": 71.79167175292969,
"completions/mean_terminated_length": 71.79167175292969,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"entropy": 1.1679067015647888,
"epoch": 0.7804878048780488,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.0744430273771286,
"kl": 1.2661263326663175e-05,
"learning_rate": 7.560975609756097e-07,
"loss": 0.05885648727416992,
"num_tokens": 782058.0,
"reward": 0.5372593402862549,
"reward_std": 0.18350909650325775,
"rewards/true_env_reward_fn/mean": 0.5372593402862549,
"rewards/true_env_reward_fn/std": 0.18350908160209656,
"step": 32,
"step_time": 15.808748693000211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 265.0,
"completions/max_terminated_length": 265.0,
"completions/mean_length": 76.79167175292969,
"completions/mean_terminated_length": 76.79167175292969,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.1829756796360016,
"epoch": 0.8048780487804879,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.051698025315999985,
"kl": 1.0996191576850833e-05,
"learning_rate": 7.804878048780488e-07,
"loss": 0.010143717750906944,
"num_tokens": 810472.0,
"reward": 0.4369215667247772,
"reward_std": 0.30869919061660767,
"rewards/true_env_reward_fn/mean": 0.4369215667247772,
"rewards/true_env_reward_fn/std": 0.30869919061660767,
"step": 33,
"step_time": 24.20358999299981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 137.0,
"completions/max_terminated_length": 137.0,
"completions/mean_length": 61.85416793823242,
"completions/mean_terminated_length": 61.85416793823242,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 1.2468958497047424,
"epoch": 0.8292682926829268,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09706687182188034,
"kl": 1.2097383432774222e-05,
"learning_rate": 8.048780487804878e-07,
"loss": 0.026558157056570053,
"num_tokens": 836713.0,
"reward": 0.3587157428264618,
"reward_std": 0.2754887044429779,
"rewards/true_env_reward_fn/mean": 0.3587157428264618,
"rewards/true_env_reward_fn/std": 0.2754887044429779,
"step": 34,
"step_time": 12.218407348999904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 115.0,
"completions/max_terminated_length": 115.0,
"completions/mean_length": 59.5625,
"completions/mean_terminated_length": 59.5625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 1.2368170320987701,
"epoch": 0.8536585365853658,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08964981138706207,
"kl": 1.3131634887031396e-05,
"learning_rate": 8.292682926829268e-07,
"loss": -0.01139204390347004,
"num_tokens": 860028.0,
"reward": 0.49109315872192383,
"reward_std": 0.20359393954277039,
"rewards/true_env_reward_fn/mean": 0.49109315872192383,
"rewards/true_env_reward_fn/std": 0.20359393954277039,
"step": 35,
"step_time": 9.66908789599995
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 102.0,
"completions/max_terminated_length": 102.0,
"completions/mean_length": 66.02083587646484,
"completions/mean_terminated_length": 66.02083587646484,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.1611860394477844,
"epoch": 0.8780487804878049,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08851195871829987,
"kl": 1.2570341596074286e-05,
"learning_rate": 8.536585365853657e-07,
"loss": 0.021737128496170044,
"num_tokens": 883189.0,
"reward": 0.46058258414268494,
"reward_std": 0.2632383108139038,
"rewards/true_env_reward_fn/mean": 0.46058258414268494,
"rewards/true_env_reward_fn/std": 0.2632383108139038,
"step": 36,
"step_time": 8.370980583999994
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 135.0,
"completions/max_terminated_length": 135.0,
"completions/mean_length": 75.58333587646484,
"completions/mean_terminated_length": 75.58333587646484,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 1.37085822224617,
"epoch": 0.9024390243902439,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.05852028727531433,
"kl": 1.2957561011717189e-05,
"learning_rate": 8.780487804878048e-07,
"loss": -0.024281952530145645,
"num_tokens": 906801.0,
"reward": 0.5022324323654175,
"reward_std": 0.11637427657842636,
"rewards/true_env_reward_fn/mean": 0.5022324323654175,
"rewards/true_env_reward_fn/std": 0.11637428402900696,
"step": 37,
"step_time": 10.285125336999727
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 65.14583587646484,
"completions/mean_terminated_length": 65.14583587646484,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 1.2760809361934662,
"epoch": 0.926829268292683,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.09105321019887924,
"kl": 1.3129126955391257e-05,
"learning_rate": 9.024390243902439e-07,
"loss": -0.011838603764772415,
"num_tokens": 929536.0,
"reward": 0.49639374017715454,
"reward_std": 0.32166802883148193,
"rewards/true_env_reward_fn/mean": 0.49639371037483215,
"rewards/true_env_reward_fn/std": 0.32166802883148193,
"step": 38,
"step_time": 12.449738128000035
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 124.0,
"completions/max_terminated_length": 124.0,
"completions/mean_length": 72.08333587646484,
"completions/mean_terminated_length": 72.08333587646484,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 1.2545586228370667,
"epoch": 0.9512195121951219,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06919296830892563,
"kl": 1.459557256566768e-05,
"learning_rate": 9.26829268292683e-07,
"loss": 0.021831180900335312,
"num_tokens": 950388.0,
"reward": 0.4879913330078125,
"reward_std": 0.24854585528373718,
"rewards/true_env_reward_fn/mean": 0.4879913330078125,
"rewards/true_env_reward_fn/std": 0.24854585528373718,
"step": 39,
"step_time": 10.279209028999958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 179.0,
"completions/max_terminated_length": 179.0,
"completions/mean_length": 74.20833587646484,
"completions/mean_terminated_length": 74.20833587646484,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 1.2255937159061432,
"epoch": 0.975609756097561,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06352153420448303,
"kl": 1.2041192348988261e-05,
"learning_rate": 9.512195121951218e-07,
"loss": -0.013997981324791908,
"num_tokens": 981254.0,
"reward": 0.39802420139312744,
"reward_std": 0.20212584733963013,
"rewards/true_env_reward_fn/mean": 0.39802420139312744,
"rewards/true_env_reward_fn/std": 0.20212584733963013,
"step": 40,
"step_time": 13.58010066599968
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 139.0,
"completions/max_terminated_length": 139.0,
"completions/mean_length": 75.04167175292969,
"completions/mean_terminated_length": 75.04167175292969,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 1.2703719735145569,
"epoch": 1.0,
"frac_reward_zero_std": 0.6666666865348816,
"grad_norm": 0.045169439166784286,
"kl": 1.1270850109212915e-05,
"learning_rate": 9.756097560975609e-07,
"loss": -0.010194316506385803,
"num_tokens": 1009968.0,
"reward": 0.4517599940299988,
"reward_std": 0.11791092902421951,
"rewards/true_env_reward_fn/mean": 0.4517599642276764,
"rewards/true_env_reward_fn/std": 0.11791091412305832,
"step": 41,
"step_time": 10.35077203700007
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 109.0,
"completions/max_terminated_length": 109.0,
"completions/mean_length": 64.33333587646484,
"completions/mean_terminated_length": 64.33333587646484,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.329576164484024,
"epoch": 1.024390243902439,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08522730320692062,
"kl": 1.4469044799625408e-05,
"learning_rate": 1e-06,
"loss": -0.00014946190640330315,
"num_tokens": 1039032.0,
"reward": 0.33548423647880554,
"reward_std": 0.22271563112735748,
"rewards/true_env_reward_fn/mean": 0.33548423647880554,
"rewards/true_env_reward_fn/std": 0.22271563112735748,
"step": 42,
"step_time": 10.548370664999993
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 372.0,
"completions/max_terminated_length": 372.0,
"completions/mean_length": 70.02083587646484,
"completions/mean_terminated_length": 70.02083587646484,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"entropy": 1.2357364892959595,
"epoch": 1.048780487804878,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.07030358910560608,
"kl": 1.3562755839302554e-05,
"learning_rate": 9.999818789066163e-07,
"loss": -0.02616041898727417,
"num_tokens": 1060833.0,
"reward": 0.5167371034622192,
"reward_std": 0.24280032515525818,
"rewards/true_env_reward_fn/mean": 0.5167370438575745,
"rewards/true_env_reward_fn/std": 0.24280032515525818,
"step": 43,
"step_time": 24.089396637999698
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 234.0,
"completions/max_terminated_length": 234.0,
"completions/mean_length": 77.47917175292969,
"completions/mean_terminated_length": 77.47917175292969,
"completions/min_length": 14.0,
"completions/min_terminated_length": 14.0,
"entropy": 1.1693778038024902,
"epoch": 1.0731707317073171,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.07017157226800919,
"kl": 1.332453393843025e-05,
"learning_rate": 9.999275169399612e-07,
"loss": -0.006466507911682129,
"num_tokens": 1088648.0,
"reward": 0.4498252272605896,
"reward_std": 0.21398545801639557,
"rewards/true_env_reward_fn/mean": 0.4498251974582672,
"rewards/true_env_reward_fn/std": 0.21398545801639557,
"step": 44,
"step_time": 19.39071501599983
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 186.0,
"completions/max_terminated_length": 186.0,
"completions/mean_length": 72.16667175292969,
"completions/mean_terminated_length": 72.16667175292969,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.3268415927886963,
"epoch": 1.0975609756097562,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.06632921099662781,
"kl": 1.4458733630817733e-05,
"learning_rate": 9.99836918040428e-07,
"loss": -0.03534461930394173,
"num_tokens": 1117096.0,
"reward": 0.4053138196468353,
"reward_std": 0.21476909518241882,
"rewards/true_env_reward_fn/mean": 0.4053138196468353,
"rewards/true_env_reward_fn/std": 0.21476909518241882,
"step": 45,
"step_time": 13.893569495999827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 160.0,
"completions/max_terminated_length": 160.0,
"completions/mean_length": 70.16667175292969,
"completions/mean_terminated_length": 70.16667175292969,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 1.2670875787734985,
"epoch": 1.1219512195121952,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.08321154117584229,
"kl": 1.4837954950053245e-05,
"learning_rate": 9.997100887750215e-07,
"loss": -0.039235007017850876,
"num_tokens": 1136480.0,
"reward": 0.48141974210739136,
"reward_std": 0.2837103307247162,
"rewards/true_env_reward_fn/mean": 0.48141971230506897,
"rewards/true_env_reward_fn/std": 0.2837103009223938,
"step": 46,
"step_time": 10.50698806499986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 159.0,
"completions/max_terminated_length": 159.0,
"completions/mean_length": 76.1875,
"completions/mean_terminated_length": 76.1875,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 1.3037313222885132,
"epoch": 1.146341463414634,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.061912886798381805,
"kl": 1.283655774386716e-05,
"learning_rate": 9.995470383368808e-07,
"loss": -0.01992109790444374,
"num_tokens": 1162249.0,
"reward": 0.49922606348991394,
"reward_std": 0.2621309757232666,
"rewards/true_env_reward_fn/mean": 0.49922606348991394,
"rewards/true_env_reward_fn/std": 0.2621309757232666,
"step": 47,
"step_time": 12.964419044000124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 231.0,
"completions/max_terminated_length": 231.0,
"completions/mean_length": 71.375,
"completions/mean_terminated_length": 71.375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.2007178366184235,
"epoch": 1.170731707317073,
"frac_reward_zero_std": 0.1666666716337204,
"grad_norm": 0.0889662653207779,
"kl": 1.6228528693318367e-05,
"learning_rate": 9.993477785446149e-07,
"loss": 0.045945264399051666,
"num_tokens": 1184555.0,
"reward": 0.42501482367515564,
"reward_std": 0.27350595593452454,
"rewards/true_env_reward_fn/mean": 0.42501482367515564,
"rewards/true_env_reward_fn/std": 0.27350592613220215,
"step": 48,
"step_time": 17.23041258299986
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 86.0,
"completions/max_terminated_length": 86.0,
"completions/mean_length": 55.9375,
"completions/mean_terminated_length": 55.9375,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 1.182040810585022,
"epoch": 1.1951219512195121,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.08547856658697128,
"kl": 1.571832831359643e-05,
"learning_rate": 9.991123238414453e-07,
"loss": 0.02548346482217312,
"num_tokens": 1208384.0,
"reward": 0.3845663070678711,
"reward_std": 0.315467894077301,
"rewards/true_env_reward_fn/mean": 0.3845663070678711,
"rewards/true_env_reward_fn/std": 0.31546786427497864,
"step": 49,
"step_time": 8.691208415999881
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 109.0,
"completions/max_terminated_length": 109.0,
"completions/mean_length": 64.75,
"completions/mean_terminated_length": 64.75,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 1.2306177020072937,
"epoch": 1.2195121951219512,
"frac_reward_zero_std": 0.3333333432674408,
"grad_norm": 0.07395736873149872,
"kl": 1.2643881973417592e-05,
"learning_rate": 9.988406912941589e-07,
"loss": -0.04186868295073509,
"num_tokens": 1227700.0,
"reward": 0.5068289637565613,
"reward_std": 0.31324177980422974,
"rewards/true_env_reward_fn/mean": 0.5068289637565613,
"rewards/true_env_reward_fn/std": 0.31324175000190735,
"step": 50,
"step_time": 10.162109979000206
}
],
"logging_steps": 1,
"max_steps": 410,
"num_input_tokens_seen": 1227700,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}