CserDu123's picture
Upload 001_test-rl-checkpoint-213/trainer_state.json with huggingface_hub
7cbb2db verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4011299435028249,
"eval_steps": 500,
"global_step": 213,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 290.59375,
"completions/mean_terminated_length": 271.83050537109375,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"entropy": 2.203125,
"epoch": 0.0018832391713747645,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8694089651107788,
"learning_rate": 1e-06,
"loss": -0.0086,
"num_tokens": 140262.0,
"reward": 0.206417053937912,
"reward_std": 0.12193800508975983,
"rewards/acc_reward/mean": 0.19810229539871216,
"rewards/acc_reward/std": 0.28156620264053345,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 504.0,
"completions/mean_length": 247.734375,
"completions/mean_terminated_length": 243.53970336914062,
"completions/min_length": 125.0,
"completions/min_terminated_length": 125.0,
"entropy": 1.6796875,
"epoch": 0.003766478342749529,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8876559734344482,
"learning_rate": 9.981167608286253e-07,
"loss": 0.0222,
"num_tokens": 273701.0,
"reward": 0.41670364141464233,
"reward_std": 0.150786355137825,
"rewards/acc_reward/mean": 0.42307350039482117,
"rewards/acc_reward/std": 0.29976290464401245,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 236.046875,
"completions/mean_terminated_length": 236.046875,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"entropy": 1.6640625,
"epoch": 0.005649717514124294,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8232742547988892,
"learning_rate": 9.962335216572504e-07,
"loss": 0.0698,
"num_tokens": 396392.0,
"reward": 0.4056413769721985,
"reward_std": 0.17224639654159546,
"rewards/acc_reward/mean": 0.3864765465259552,
"rewards/acc_reward/std": 0.3097887337207794,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 227.8125,
"completions/mean_terminated_length": 227.8125,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"entropy": 1.6484375,
"epoch": 0.007532956685499058,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.0226917266845703,
"learning_rate": 9.943502824858757e-07,
"loss": -0.0298,
"num_tokens": 523036.0,
"reward": 0.3570261001586914,
"reward_std": 0.12964516878128052,
"rewards/acc_reward/mean": 0.33766794204711914,
"rewards/acc_reward/std": 0.35702335834503174,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 414.0,
"completions/max_terminated_length": 414.0,
"completions/mean_length": 214.515625,
"completions/mean_terminated_length": 214.515625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 1.8515625,
"epoch": 0.009416195856873822,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8740723133087158,
"learning_rate": 9.92467043314501e-07,
"loss": -0.0099,
"num_tokens": 649309.0,
"reward": 0.4160749912261963,
"reward_std": 0.13374584913253784,
"rewards/acc_reward/mean": 0.37723612785339355,
"rewards/acc_reward/std": 0.31442132592201233,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 266.046875,
"completions/mean_terminated_length": 258.1129150390625,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 1.796875,
"epoch": 0.011299435028248588,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.7314085960388184,
"learning_rate": 9.905838041431261e-07,
"loss": 0.1098,
"num_tokens": 788480.0,
"reward": 0.2147485464811325,
"reward_std": 0.1807367354631424,
"rewards/acc_reward/mean": 0.15006783604621887,
"rewards/acc_reward/std": 0.3153360188007355,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 506.0,
"completions/max_terminated_length": 506.0,
"completions/mean_length": 232.5625,
"completions/mean_terminated_length": 232.5625,
"completions/min_length": 134.0,
"completions/min_terminated_length": 134.0,
"entropy": 1.84375,
"epoch": 0.013182674199623353,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.3759692907333374,
"learning_rate": 9.887005649717514e-07,
"loss": 0.0142,
"num_tokens": 923788.0,
"reward": 0.21130093932151794,
"reward_std": 0.14664816856384277,
"rewards/acc_reward/mean": 0.12713992595672607,
"rewards/acc_reward/std": 0.2867279052734375,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 461.0,
"completions/max_terminated_length": 461.0,
"completions/mean_length": 256.796875,
"completions/mean_terminated_length": 256.796875,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"entropy": 1.6484375,
"epoch": 0.015065913370998116,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.6152563095092773,
"learning_rate": 9.868173258003765e-07,
"loss": 0.0757,
"num_tokens": 1062207.0,
"reward": 0.3737403452396393,
"reward_std": 0.14178410172462463,
"rewards/acc_reward/mean": 0.30589205026626587,
"rewards/acc_reward/std": 0.3208266496658325,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 255.265625,
"completions/mean_terminated_length": 238.15000915527344,
"completions/min_length": 108.0,
"completions/min_terminated_length": 108.0,
"entropy": 1.765625,
"epoch": 0.01694915254237288,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.4234596490859985,
"learning_rate": 9.849340866290019e-07,
"loss": 0.0147,
"num_tokens": 1205848.0,
"reward": 0.3045212924480438,
"reward_std": 0.1969890296459198,
"rewards/acc_reward/mean": 0.22898197174072266,
"rewards/acc_reward/std": 0.3434719443321228,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 487.0,
"completions/max_terminated_length": 487.0,
"completions/mean_length": 224.8125,
"completions/mean_terminated_length": 224.8125,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"entropy": 1.8203125,
"epoch": 0.018832391713747645,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4884109497070312,
"learning_rate": 9.830508474576272e-07,
"loss": -0.0137,
"num_tokens": 1330620.0,
"reward": 0.26978251338005066,
"reward_std": 0.11548593640327454,
"rewards/acc_reward/mean": 0.19038332998752594,
"rewards/acc_reward/std": 0.31174740195274353,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 256.890625,
"completions/mean_terminated_length": 252.84127807617188,
"completions/min_length": 92.0,
"completions/min_terminated_length": 92.0,
"entropy": 1.8046875,
"epoch": 0.02071563088512241,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.1678308248519897,
"learning_rate": 9.811676082862523e-07,
"loss": 0.0051,
"num_tokens": 1474925.0,
"reward": 0.21130861341953278,
"reward_std": 0.11159157752990723,
"rewards/acc_reward/mean": 0.12367624044418335,
"rewards/acc_reward/std": 0.28885576128959656,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 438.0,
"completions/max_terminated_length": 438.0,
"completions/mean_length": 224.640625,
"completions/mean_terminated_length": 224.640625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 1.59375,
"epoch": 0.022598870056497175,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.6417983770370483,
"learning_rate": 9.792843691148776e-07,
"loss": -0.0083,
"num_tokens": 1610966.0,
"reward": 0.3159927725791931,
"reward_std": 0.1229424774646759,
"rewards/acc_reward/mean": 0.24172811210155487,
"rewards/acc_reward/std": 0.31462714076042175,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 219.6875,
"completions/mean_terminated_length": 219.6875,
"completions/min_length": 80.0,
"completions/min_terminated_length": 80.0,
"entropy": 1.4921875,
"epoch": 0.02448210922787194,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.5047811269760132,
"learning_rate": 9.774011299435027e-07,
"loss": 0.0277,
"num_tokens": 1755810.0,
"reward": 0.2687425911426544,
"reward_std": 0.14603173732757568,
"rewards/acc_reward/mean": 0.1874917596578598,
"rewards/acc_reward/std": 0.3036465644836426,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 277.0625,
"completions/mean_terminated_length": 257.15252685546875,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"entropy": 1.890625,
"epoch": 0.026365348399246705,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.3763067722320557,
"learning_rate": 9.75517890772128e-07,
"loss": 0.0409,
"num_tokens": 1901926.0,
"reward": 0.3287838101387024,
"reward_std": 0.17755521833896637,
"rewards/acc_reward/mean": 0.2542042136192322,
"rewards/acc_reward/std": 0.383696049451828,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 232.046875,
"completions/mean_terminated_length": 227.60317993164062,
"completions/min_length": 104.0,
"completions/min_terminated_length": 104.0,
"entropy": 1.59375,
"epoch": 0.02824858757062147,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.1948471069335938,
"learning_rate": 9.736346516007531e-07,
"loss": -0.0154,
"num_tokens": 2028921.0,
"reward": 0.31290093064308167,
"reward_std": 0.08047251403331757,
"rewards/acc_reward/mean": 0.23655660450458527,
"rewards/acc_reward/std": 0.3165181875228882,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 486.0,
"completions/mean_length": 212.40625,
"completions/mean_terminated_length": 207.6508026123047,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"entropy": 1.734375,
"epoch": 0.030131826741996232,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.5261269807815552,
"learning_rate": 9.717514124293785e-07,
"loss": 0.09,
"num_tokens": 2165179.0,
"reward": 0.30339735746383667,
"reward_std": 0.133845254778862,
"rewards/acc_reward/mean": 0.22599703073501587,
"rewards/acc_reward/std": 0.3543343245983124,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 201.40625,
"completions/mean_terminated_length": 201.40625,
"completions/min_length": 89.0,
"completions/min_terminated_length": 89.0,
"entropy": 1.4375,
"epoch": 0.032015065913371,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.5391112565994263,
"learning_rate": 9.698681732580038e-07,
"loss": 0.009,
"num_tokens": 2296853.0,
"reward": 0.1953125,
"reward_std": 0.2162797451019287,
"rewards/acc_reward/mean": 0.109375,
"rewards/acc_reward/std": 0.3145764470100403,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 220.84375,
"completions/mean_terminated_length": 216.2222442626953,
"completions/min_length": 110.0,
"completions/min_terminated_length": 110.0,
"entropy": 1.7421875,
"epoch": 0.03389830508474576,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.1895616054534912,
"learning_rate": 9.679849340866289e-07,
"loss": 0.0424,
"num_tokens": 2431011.0,
"reward": 0.30048421025276184,
"reward_std": 0.06477973610162735,
"rewards/acc_reward/mean": 0.22276021540164948,
"rewards/acc_reward/std": 0.31460005044937134,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 428.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 223.953125,
"completions/mean_terminated_length": 223.953125,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"entropy": 1.9453125,
"epoch": 0.035781544256120526,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.671203374862671,
"learning_rate": 9.661016949152542e-07,
"loss": 0.0416,
"num_tokens": 2571808.0,
"reward": 0.42682912945747375,
"reward_std": 0.13467340171337128,
"rewards/acc_reward/mean": 0.3631434738636017,
"rewards/acc_reward/std": 0.37232744693756104,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 224.140625,
"completions/mean_terminated_length": 214.85482788085938,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"entropy": 1.625,
"epoch": 0.03766478342749529,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.5360738039016724,
"learning_rate": 9.642184557438793e-07,
"loss": 0.0717,
"num_tokens": 2711697.0,
"reward": 0.3133315443992615,
"reward_std": 0.22658446431159973,
"rewards/acc_reward/mean": 0.23703500628471375,
"rewards/acc_reward/std": 0.3616204857826233,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 222.515625,
"completions/mean_terminated_length": 213.1774139404297,
"completions/min_length": 115.0,
"completions/min_terminated_length": 115.0,
"entropy": 1.7109375,
"epoch": 0.03954802259887006,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8884894847869873,
"learning_rate": 9.623352165725046e-07,
"loss": 0.0024,
"num_tokens": 2841242.0,
"reward": 0.5347622036933899,
"reward_std": 0.15656878054141998,
"rewards/acc_reward/mean": 0.4830690622329712,
"rewards/acc_reward/std": 0.31471821665763855,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 485.0,
"completions/mean_length": 226.703125,
"completions/mean_terminated_length": 222.1746063232422,
"completions/min_length": 90.0,
"completions/min_terminated_length": 90.0,
"entropy": 1.84375,
"epoch": 0.04143126177024482,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8899277448654175,
"learning_rate": 9.6045197740113e-07,
"loss": 0.0705,
"num_tokens": 2963015.0,
"reward": 0.43775349855422974,
"reward_std": 0.2172819972038269,
"rewards/acc_reward/mean": 0.3752816617488861,
"rewards/acc_reward/std": 0.3713799715042114,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 229.5,
"completions/mean_terminated_length": 220.3870849609375,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"entropy": 1.7421875,
"epoch": 0.04331450094161959,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.6153416633605957,
"learning_rate": 9.58568738229755e-07,
"loss": -0.0054,
"num_tokens": 3109447.0,
"reward": 0.32855772972106934,
"reward_std": 0.27425360679626465,
"rewards/acc_reward/mean": 0.2556891441345215,
"rewards/acc_reward/std": 0.40467146039009094,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 510.0,
"completions/max_terminated_length": 510.0,
"completions/mean_length": 212.984375,
"completions/mean_terminated_length": 212.984375,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"entropy": 1.484375,
"epoch": 0.04519774011299435,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8740508556365967,
"learning_rate": 9.566854990583804e-07,
"loss": 0.0035,
"num_tokens": 3255782.0,
"reward": 0.4697909355163574,
"reward_std": 0.18794436752796173,
"rewards/acc_reward/mean": 0.41261494159698486,
"rewards/acc_reward/std": 0.39874467253685,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 237.375,
"completions/mean_terminated_length": 237.375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 1.9375,
"epoch": 0.047080979284369114,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9496995210647583,
"learning_rate": 9.548022598870055e-07,
"loss": 0.1125,
"num_tokens": 3398398.0,
"reward": 0.43061739206314087,
"reward_std": 0.25355520844459534,
"rewards/acc_reward/mean": 0.36735260486602783,
"rewards/acc_reward/std": 0.3482816219329834,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 236.890625,
"completions/mean_terminated_length": 236.890625,
"completions/min_length": 86.0,
"completions/min_terminated_length": 86.0,
"entropy": 1.765625,
"epoch": 0.04896421845574388,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.7656739950180054,
"learning_rate": 9.529190207156308e-07,
"loss": 0.0431,
"num_tokens": 3534743.0,
"reward": 0.49807009100914,
"reward_std": 0.23242174088954926,
"rewards/acc_reward/mean": 0.44230008125305176,
"rewards/acc_reward/std": 0.38975948095321655,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 398.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 187.546875,
"completions/mean_terminated_length": 187.546875,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"entropy": 1.78125,
"epoch": 0.05084745762711865,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.076228618621826,
"learning_rate": 9.510357815442561e-07,
"loss": 0.0356,
"num_tokens": 3659290.0,
"reward": 0.44223499298095703,
"reward_std": 0.21930678188800812,
"rewards/acc_reward/mean": 0.383733332157135,
"rewards/acc_reward/std": 0.3709230422973633,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 484.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 209.65625,
"completions/mean_terminated_length": 209.65625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 1.875,
"epoch": 0.05273069679849341,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.8770859241485596,
"learning_rate": 9.491525423728813e-07,
"loss": 0.0769,
"num_tokens": 3796772.0,
"reward": 0.41265854239463806,
"reward_std": 0.3168802857398987,
"rewards/acc_reward/mean": 0.34913450479507446,
"rewards/acc_reward/std": 0.42575517296791077,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 192.75,
"completions/mean_terminated_length": 192.75,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 2.140625,
"epoch": 0.054613935969868174,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.1020395755767822,
"learning_rate": 9.472693032015065e-07,
"loss": 0.0712,
"num_tokens": 3925692.0,
"reward": 0.41072791814804077,
"reward_std": 0.3003063201904297,
"rewards/acc_reward/mean": 0.34698933362960815,
"rewards/acc_reward/std": 0.4590102732181549,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 479.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 189.5625,
"completions/mean_terminated_length": 189.5625,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"entropy": 1.9609375,
"epoch": 0.05649717514124294,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.122809410095215,
"learning_rate": 9.453860640301318e-07,
"loss": -0.0219,
"num_tokens": 4063328.0,
"reward": 0.4698576331138611,
"reward_std": 0.30180490016937256,
"rewards/acc_reward/mean": 0.41095292568206787,
"rewards/acc_reward/std": 0.4547015130519867,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 185.6875,
"completions/mean_terminated_length": 185.6875,
"completions/min_length": 88.0,
"completions/min_terminated_length": 88.0,
"entropy": 1.8359375,
"epoch": 0.0583804143126177,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.101128578186035,
"learning_rate": 9.43502824858757e-07,
"loss": 0.0325,
"num_tokens": 4191876.0,
"reward": 0.4844944179058075,
"reward_std": 0.30700868368148804,
"rewards/acc_reward/mean": 0.42895209789276123,
"rewards/acc_reward/std": 0.4517141282558441,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 435.0,
"completions/mean_length": 179.5,
"completions/mean_terminated_length": 174.22222900390625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 1.859375,
"epoch": 0.060263653483992465,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.2371370792388916,
"learning_rate": 9.416195856873822e-07,
"loss": -0.0023,
"num_tokens": 4322148.0,
"reward": 0.5825260281562805,
"reward_std": 0.3706633746623993,
"rewards/acc_reward/mean": 0.536139965057373,
"rewards/acc_reward/std": 0.4555891752243042,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 188.6875,
"completions/mean_terminated_length": 188.6875,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.625,
"epoch": 0.062146892655367235,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.9997694492340088,
"learning_rate": 9.397363465160075e-07,
"loss": 0.0677,
"num_tokens": 4460208.0,
"reward": 0.6379782557487488,
"reward_std": 0.284807950258255,
"rewards/acc_reward/mean": 0.599489688873291,
"rewards/acc_reward/std": 0.42208555340766907,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 479.0,
"completions/max_terminated_length": 479.0,
"completions/mean_length": 206.375,
"completions/mean_terminated_length": 206.375,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"entropy": 2.28125,
"epoch": 0.064030131826742,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.249376058578491,
"learning_rate": 9.378531073446327e-07,
"loss": 0.0497,
"num_tokens": 4598760.0,
"reward": 0.6912428140640259,
"reward_std": 0.350800096988678,
"rewards/acc_reward/mean": 0.658672571182251,
"rewards/acc_reward/std": 0.45664042234420776,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 171.703125,
"completions/mean_terminated_length": 171.703125,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 1.9609375,
"epoch": 0.06591337099811675,
"frac_reward_zero_std": 0.125,
"grad_norm": 1.9761137962341309,
"learning_rate": 9.359698681732579e-07,
"loss": 0.0538,
"num_tokens": 4729973.0,
"reward": 0.7882718443870544,
"reward_std": 0.24334368109703064,
"rewards/acc_reward/mean": 0.7664825916290283,
"rewards/acc_reward/std": 0.34395766258239746,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 165.21875,
"completions/mean_terminated_length": 165.21875,
"completions/min_length": 84.0,
"completions/min_terminated_length": 84.0,
"entropy": 1.9375,
"epoch": 0.06779661016949153,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3349790573120117,
"learning_rate": 9.340866290018831e-07,
"loss": 0.0651,
"num_tokens": 4870371.0,
"reward": 0.7641240358352661,
"reward_std": 0.3322174549102783,
"rewards/acc_reward/mean": 0.7396517395973206,
"rewards/acc_reward/std": 0.409751832485199,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 398.0,
"completions/mean_length": 164.234375,
"completions/mean_terminated_length": 158.71429443359375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 1.9765625,
"epoch": 0.0696798493408663,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.1828882694244385,
"learning_rate": 9.322033898305083e-07,
"loss": 0.0297,
"num_tokens": 5002066.0,
"reward": 0.7222064733505249,
"reward_std": 0.2505956292152405,
"rewards/acc_reward/mean": 0.6913405656814575,
"rewards/acc_reward/std": 0.3850165903568268,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 189.25,
"completions/mean_terminated_length": 184.1269989013672,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 2.203125,
"epoch": 0.07156308851224105,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.241468906402588,
"learning_rate": 9.303201506591337e-07,
"loss": 0.0657,
"num_tokens": 5135842.0,
"reward": 0.767461359500885,
"reward_std": 0.23975765705108643,
"rewards/acc_reward/mean": 0.74509596824646,
"rewards/acc_reward/std": 0.3569471538066864,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 334.0,
"completions/mean_length": 167.296875,
"completions/mean_terminated_length": 161.82540893554688,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.21875,
"epoch": 0.07344632768361582,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.625629186630249,
"learning_rate": 9.28436911487759e-07,
"loss": 0.0755,
"num_tokens": 5268213.0,
"reward": 0.7769017219543457,
"reward_std": 0.301779568195343,
"rewards/acc_reward/mean": 0.7555853128433228,
"rewards/acc_reward/std": 0.3923026919364929,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 182.0,
"completions/mean_terminated_length": 165.77047729492188,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"entropy": 2.34375,
"epoch": 0.07532956685499058,
"frac_reward_zero_std": 0.25,
"grad_norm": 1.8064289093017578,
"learning_rate": 9.265536723163842e-07,
"loss": -0.0294,
"num_tokens": 5400085.0,
"reward": 0.8888776302337646,
"reward_std": 0.2301492542028427,
"rewards/acc_reward/mean": 0.8782668113708496,
"rewards/acc_reward/std": 0.31358516216278076,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 377.0,
"completions/mean_length": 170.578125,
"completions/mean_terminated_length": 165.15872192382812,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 2.375,
"epoch": 0.07721280602636535,
"frac_reward_zero_std": 0.375,
"grad_norm": 1.6493501663208008,
"learning_rate": 9.246704331450094e-07,
"loss": -0.034,
"num_tokens": 5529746.0,
"reward": 0.8990048170089722,
"reward_std": 0.1494266837835312,
"rewards/acc_reward/mean": 0.889519214630127,
"rewards/acc_reward/std": 0.26512882113456726,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 393.0,
"completions/max_terminated_length": 393.0,
"completions/mean_length": 158.859375,
"completions/mean_terminated_length": 158.859375,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"entropy": 2.4375,
"epoch": 0.07909604519774012,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.683705449104309,
"learning_rate": 9.227871939736346e-07,
"loss": 0.0056,
"num_tokens": 5668777.0,
"reward": 0.9233179092407227,
"reward_std": 0.14834892749786377,
"rewards/acc_reward/mean": 0.9165338277816772,
"rewards/acc_reward/std": 0.2692948877811432,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 460.0,
"completions/max_terminated_length": 460.0,
"completions/mean_length": 187.859375,
"completions/mean_terminated_length": 187.859375,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"entropy": 2.46875,
"epoch": 0.08097928436911488,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.3769292831420898,
"learning_rate": 9.209039548022598e-07,
"loss": 0.0487,
"num_tokens": 5800544.0,
"reward": 0.9674270153045654,
"reward_std": 0.048613592982292175,
"rewards/acc_reward/mean": 0.9672800302505493,
"rewards/acc_reward/std": 0.12451620399951935,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 453.0,
"completions/mean_length": 164.546875,
"completions/mean_terminated_length": 153.33871459960938,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 2.671875,
"epoch": 0.08286252354048965,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0136916637420654,
"learning_rate": 9.190207156308852e-07,
"loss": -0.0441,
"num_tokens": 5929459.0,
"reward": 0.9583332538604736,
"reward_std": 0.016743799671530724,
"rewards/acc_reward/mean": 0.9571758508682251,
"rewards/acc_reward/std": 0.0556831993162632,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 294.0,
"completions/max_terminated_length": 294.0,
"completions/mean_length": 166.890625,
"completions/mean_terminated_length": 166.890625,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 2.25,
"epoch": 0.0847457627118644,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.0560693740844727,
"learning_rate": 9.171374764595104e-07,
"loss": -0.041,
"num_tokens": 6061324.0,
"reward": 0.9419167041778564,
"reward_std": 0.08913865685462952,
"rewards/acc_reward/mean": 0.9458796381950378,
"rewards/acc_reward/std": 0.15155728161334991,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 462.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 184.484375,
"completions/mean_terminated_length": 184.484375,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"entropy": 2.453125,
"epoch": 0.08662900188323917,
"frac_reward_zero_std": 0.5,
"grad_norm": 1.4456909894943237,
"learning_rate": 9.152542372881356e-07,
"loss": 0.0865,
"num_tokens": 6199275.0,
"reward": 0.956250011920929,
"reward_std": 0.1237436980009079,
"rewards/acc_reward/mean": 0.953125,
"rewards/acc_reward/std": 0.21304203569889069,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 395.0,
"completions/mean_length": 182.65625,
"completions/mean_terminated_length": 172.03225708007812,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 2.734375,
"epoch": 0.08851224105461393,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9000487923622131,
"learning_rate": 9.133709981167608e-07,
"loss": 0.0061,
"num_tokens": 6339829.0,
"reward": 0.9662767648696899,
"reward_std": 0.039774756878614426,
"rewards/acc_reward/mean": 0.9625297784805298,
"rewards/acc_reward/std": 0.12811994552612305,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 360.0,
"completions/mean_length": 174.3125,
"completions/mean_terminated_length": 168.952392578125,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 2.546875,
"epoch": 0.0903954802259887,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0134769678115845,
"learning_rate": 9.11487758945386e-07,
"loss": -0.0181,
"num_tokens": 6473609.0,
"reward": 0.9744918346405029,
"reward_std": 0.008838837035000324,
"rewards/acc_reward/mean": 0.9751298427581787,
"rewards/acc_reward/std": 0.04131903871893883,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 188.421875,
"completions/mean_terminated_length": 177.98385620117188,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 2.625,
"epoch": 0.09227871939736347,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.7283865213394165,
"learning_rate": 9.096045197740112e-07,
"loss": 0.0749,
"num_tokens": 6607932.0,
"reward": 0.9752188920974731,
"reward_std": 0.015992172062397003,
"rewards/acc_reward/mean": 0.981145977973938,
"rewards/acc_reward/std": 0.025175929069519043,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 509.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 194.421875,
"completions/mean_terminated_length": 194.421875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 2.53125,
"epoch": 0.09416195856873823,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.1576530933380127,
"learning_rate": 9.077212806026365e-07,
"loss": 0.0122,
"num_tokens": 6752119.0,
"reward": 0.9867604374885559,
"reward_std": 0.007432654500007629,
"rewards/acc_reward/mean": 0.9870254993438721,
"rewards/acc_reward/std": 0.017179692164063454,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 509.0,
"completions/max_terminated_length": 509.0,
"completions/mean_length": 186.71875,
"completions/mean_terminated_length": 186.71875,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 2.78125,
"epoch": 0.096045197740113,
"frac_reward_zero_std": 0.125,
"grad_norm": 2.141554355621338,
"learning_rate": 9.058380414312617e-07,
"loss": -0.0311,
"num_tokens": 6890533.0,
"reward": 0.9580291509628296,
"reward_std": 0.0707060694694519,
"rewards/acc_reward/mean": 0.9637823700904846,
"rewards/acc_reward/std": 0.13111592829227448,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 307.0,
"completions/mean_length": 169.390625,
"completions/mean_terminated_length": 163.952392578125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 2.421875,
"epoch": 0.09792843691148775,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.8654880523681641,
"learning_rate": 9.03954802259887e-07,
"loss": 0.0053,
"num_tokens": 7027358.0,
"reward": 0.9961555004119873,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9974644780158997,
"rewards/acc_reward/std": 0.006761432159692049,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 176.078125,
"completions/mean_terminated_length": 170.74603271484375,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 2.609375,
"epoch": 0.09981167608286252,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.4219504594802856,
"learning_rate": 9.020715630885122e-07,
"loss": -0.0008,
"num_tokens": 7162123.0,
"reward": 0.9900810718536377,
"reward_std": 0.021838055923581123,
"rewards/acc_reward/mean": 0.9924511909484863,
"rewards/acc_reward/std": 0.04345937818288803,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 171.125,
"completions/mean_terminated_length": 165.71429443359375,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 2.453125,
"epoch": 0.1016949152542373,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9306376576423645,
"learning_rate": 9.001883239171374e-07,
"loss": 0.0039,
"num_tokens": 7294099.0,
"reward": 0.9847477674484253,
"reward_std": 0.005786377005279064,
"rewards/acc_reward/mean": 0.986525297164917,
"rewards/acc_reward/std": 0.02815171889960766,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 317.0,
"completions/max_terminated_length": 317.0,
"completions/mean_length": 185.28125,
"completions/mean_terminated_length": 185.28125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 2.59375,
"epoch": 0.10357815442561205,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.0906463861465454,
"learning_rate": 8.983050847457627e-07,
"loss": -0.0101,
"num_tokens": 7421381.0,
"reward": 0.9820950031280518,
"reward_std": 0.013950306922197342,
"rewards/acc_reward/mean": 0.9818416833877563,
"rewards/acc_reward/std": 0.03777196630835533,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 512.0,
"completions/mean_length": 189.296875,
"completions/mean_terminated_length": 189.296875,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 2.71875,
"epoch": 0.10546139359698682,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.0221842527389526,
"learning_rate": 8.964218455743879e-07,
"loss": -0.0019,
"num_tokens": 7550840.0,
"reward": 0.9848359823226929,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9848872423171997,
"rewards/acc_reward/std": 0.026450620964169502,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 412.0,
"completions/max_terminated_length": 412.0,
"completions/mean_length": 204.6875,
"completions/mean_terminated_length": 204.6875,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"entropy": 2.609375,
"epoch": 0.10734463276836158,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.1911293268203735,
"learning_rate": 8.945386064030131e-07,
"loss": 0.0145,
"num_tokens": 7693764.0,
"reward": 0.9755189418792725,
"reward_std": 0.04419417679309845,
"rewards/acc_reward/mean": 0.9745348691940308,
"rewards/acc_reward/std": 0.1250728815793991,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 210.546875,
"completions/mean_terminated_length": 210.546875,
"completions/min_length": 87.0,
"completions/min_terminated_length": 87.0,
"entropy": 2.65625,
"epoch": 0.10922787193973635,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.7357279062271118,
"learning_rate": 8.926553672316383e-07,
"loss": 0.0164,
"num_tokens": 7828263.0,
"reward": 0.9803258180618286,
"reward_std": 0.007434290833771229,
"rewards/acc_reward/mean": 0.9781398177146912,
"rewards/acc_reward/std": 0.03639683872461319,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 289.0,
"completions/max_terminated_length": 289.0,
"completions/mean_length": 154.046875,
"completions/mean_terminated_length": 154.046875,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"entropy": 2.5625,
"epoch": 0.1111111111111111,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.907721280602636e-07,
"loss": 0.0,
"num_tokens": 7968106.0,
"reward": 0.9852668046951294,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9836297631263733,
"rewards/acc_reward/std": 0.024946285411715508,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 177.296875,
"completions/mean_terminated_length": 166.5,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 2.546875,
"epoch": 0.11299435028248588,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.5839695930480957,
"learning_rate": 8.888888888888888e-07,
"loss": 0.0779,
"num_tokens": 8098157.0,
"reward": 0.9858125448226929,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9859722852706909,
"rewards/acc_reward/std": 0.019654173403978348,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 167.1875,
"completions/mean_terminated_length": 161.71429443359375,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.765625,
"epoch": 0.11487758945386065,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.4451484680175781,
"learning_rate": 8.870056497175141e-07,
"loss": -0.0,
"num_tokens": 8227641.0,
"reward": 0.9645360708236694,
"reward_std": 0.008838837035000324,
"rewards/acc_reward/mean": 0.964067816734314,
"rewards/acc_reward/std": 0.040923893451690674,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 459.0,
"completions/max_terminated_length": 459.0,
"completions/mean_length": 181.734375,
"completions/mean_terminated_length": 181.734375,
"completions/min_length": 16.0,
"completions/min_terminated_length": 16.0,
"entropy": 2.625,
"epoch": 0.1167608286252354,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.3860294222831726,
"learning_rate": 8.851224105461393e-07,
"loss": -0.0431,
"num_tokens": 8359496.0,
"reward": 0.991644024848938,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9924516677856445,
"rewards/acc_reward/std": 0.020128827542066574,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 499.0,
"completions/mean_length": 207.34375,
"completions/mean_terminated_length": 202.50794982910156,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"entropy": 2.75,
"epoch": 0.11864406779661017,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6225361227989197,
"learning_rate": 8.832391713747645e-07,
"loss": -0.0011,
"num_tokens": 8500670.0,
"reward": 0.9970052242279053,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9984085559844971,
"rewards/acc_reward/std": 0.004243890754878521,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 369.0,
"completions/mean_length": 169.6875,
"completions/mean_terminated_length": 164.25396728515625,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"entropy": 2.3125,
"epoch": 0.12052730696798493,
"frac_reward_zero_std": 0.625,
"grad_norm": 1.537191390991211,
"learning_rate": 8.813559322033897e-07,
"loss": 0.0021,
"num_tokens": 8631754.0,
"reward": 0.9799120426177979,
"reward_std": 0.013258256018161774,
"rewards/acc_reward/mean": 0.982888400554657,
"rewards/acc_reward/std": 0.022922541946172714,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 500.0,
"completions/max_terminated_length": 500.0,
"completions/mean_length": 172.59375,
"completions/mean_terminated_length": 172.59375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 2.609375,
"epoch": 0.1224105461393597,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.79472693032015e-07,
"loss": 0.0,
"num_tokens": 8762784.0,
"reward": 0.9871211051940918,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.985690176486969,
"rewards/acc_reward/std": 0.017727959901094437,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 181.234375,
"completions/mean_terminated_length": 181.234375,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.453125,
"epoch": 0.12429378531073447,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.775894538606404e-07,
"loss": 0.0,
"num_tokens": 8887127.0,
"reward": 0.9676999449729919,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9641109704971313,
"rewards/acc_reward/std": 0.03724094480276108,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 163.078125,
"completions/mean_terminated_length": 157.53968811035156,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.296875,
"epoch": 0.12617702448210924,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6480050086975098,
"learning_rate": 8.757062146892656e-07,
"loss": -0.0023,
"num_tokens": 9024028.0,
"reward": 0.9807539582252502,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9803516268730164,
"rewards/acc_reward/std": 0.02998371422290802,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 382.0,
"completions/max_terminated_length": 382.0,
"completions/mean_length": 150.859375,
"completions/mean_terminated_length": 150.859375,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"entropy": 2.375,
"epoch": 0.128060263653484,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.738229755178908e-07,
"loss": 0.0,
"num_tokens": 9149267.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/acc_reward/mean": 1.0,
"rewards/acc_reward/std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 347.0,
"completions/max_terminated_length": 347.0,
"completions/mean_length": 156.34375,
"completions/mean_terminated_length": 156.34375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 2.359375,
"epoch": 0.12994350282485875,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.1719481945037842,
"learning_rate": 8.71939736346516e-07,
"loss": -0.0147,
"num_tokens": 9283337.0,
"reward": 0.9746325016021729,
"reward_std": 0.04419417679309845,
"rewards/acc_reward/mean": 0.9735499620437622,
"rewards/acc_reward/std": 0.1257747858762741,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 427.0,
"completions/max_terminated_length": 427.0,
"completions/mean_length": 153.515625,
"completions/mean_terminated_length": 153.515625,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 2.53125,
"epoch": 0.1318267419962335,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.700564971751412e-07,
"loss": 0.0,
"num_tokens": 9422506.0,
"reward": 0.9936791658401489,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9929767847061157,
"rewards/acc_reward/std": 0.018728474155068398,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 311.0,
"completions/max_terminated_length": 311.0,
"completions/mean_length": 159.90625,
"completions/mean_terminated_length": 159.90625,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.234375,
"epoch": 0.1337099811676083,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.681732580037664e-07,
"loss": 0.0,
"num_tokens": 9558724.0,
"reward": 0.9724128246307373,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9693475365638733,
"rewards/acc_reward/std": 0.049584269523620605,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 393.0,
"completions/max_terminated_length": 393.0,
"completions/mean_length": 171.421875,
"completions/mean_terminated_length": 171.421875,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 2.65625,
"epoch": 0.13559322033898305,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.662900188323917e-07,
"loss": 0.0,
"num_tokens": 9696479.0,
"reward": 0.9782567620277405,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9758408665657043,
"rewards/acc_reward/std": 0.035969410091638565,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 355.0,
"completions/max_terminated_length": 355.0,
"completions/mean_length": 146.640625,
"completions/mean_terminated_length": 146.640625,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"entropy": 2.1875,
"epoch": 0.1374764595103578,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.64406779661017e-07,
"loss": 0.0,
"num_tokens": 9833768.0,
"reward": 0.9815881252288818,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9795423150062561,
"rewards/acc_reward/std": 0.03601390868425369,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 361.0,
"completions/mean_length": 159.484375,
"completions/mean_terminated_length": 153.88888549804688,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.40625,
"epoch": 0.1393596986817326,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.625235404896422e-07,
"loss": 0.0,
"num_tokens": 9965639.0,
"reward": 0.9888086318969727,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9875651597976685,
"rewards/acc_reward/std": 0.02026844024658203,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 159.8125,
"completions/mean_terminated_length": 159.8125,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 2.28125,
"epoch": 0.14124293785310735,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.606403013182674e-07,
"loss": 0.0,
"num_tokens": 10102131.0,
"reward": 0.9853819608688354,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.983757734298706,
"rewards/acc_reward/std": 0.023104524239897728,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 184.40625,
"completions/mean_terminated_length": 173.8386993408203,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 2.390625,
"epoch": 0.1431261770244821,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.7094928026199341,
"learning_rate": 8.587570621468926e-07,
"loss": -0.0279,
"num_tokens": 10234637.0,
"reward": 0.9758157730102539,
"reward_std": 0.009495548903942108,
"rewards/acc_reward/mean": 0.973128616809845,
"rewards/acc_reward/std": 0.04172620549798012,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 493.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 187.34375,
"completions/mean_terminated_length": 187.34375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.484375,
"epoch": 0.14500941619585686,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6446596384048462,
"learning_rate": 8.568738229755178e-07,
"loss": -0.0238,
"num_tokens": 10380091.0,
"reward": 0.9779398441314697,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9772248268127441,
"rewards/acc_reward/std": 0.026913031935691833,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 433.0,
"completions/max_terminated_length": 433.0,
"completions/mean_length": 161.046875,
"completions/mean_terminated_length": 161.046875,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"entropy": 2.359375,
"epoch": 0.14689265536723164,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9724075198173523,
"learning_rate": 8.549905838041431e-07,
"loss": 0.0269,
"num_tokens": 10505822.0,
"reward": 0.988434910774231,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9888859987258911,
"rewards/acc_reward/std": 0.024498289451003075,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 446.0,
"completions/max_terminated_length": 446.0,
"completions/mean_length": 159.9375,
"completions/mean_terminated_length": 159.9375,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 2.375,
"epoch": 0.1487758945386064,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.531073446327683e-07,
"loss": 0.0,
"num_tokens": 10638682.0,
"reward": 0.990105152130127,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9890057444572449,
"rewards/acc_reward/std": 0.019625553861260414,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 471.0,
"completions/mean_length": 165.125,
"completions/mean_terminated_length": 159.6190643310547,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 2.734375,
"epoch": 0.15065913370998116,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6837416291236877,
"learning_rate": 8.512241054613935e-07,
"loss": -0.0191,
"num_tokens": 10769954.0,
"reward": 0.98872971534729,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9892135858535767,
"rewards/acc_reward/std": 0.02876383066177368,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 184.4375,
"completions/mean_terminated_length": 179.23809814453125,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"entropy": 2.625,
"epoch": 0.15254237288135594,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.493408662900188e-07,
"loss": 0.0,
"num_tokens": 10908782.0,
"reward": 0.9871925115585327,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.985769510269165,
"rewards/acc_reward/std": 0.02491430565714836,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 339.0,
"completions/max_terminated_length": 339.0,
"completions/mean_length": 155.265625,
"completions/mean_terminated_length": 155.265625,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"entropy": 2.4375,
"epoch": 0.1544256120527307,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.47457627118644e-07,
"loss": 0.0,
"num_tokens": 11042303.0,
"reward": 0.9902667999267578,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9891853332519531,
"rewards/acc_reward/std": 0.015684949234128,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 369.0,
"completions/max_terminated_length": 369.0,
"completions/mean_length": 153.015625,
"completions/mean_terminated_length": 153.015625,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"entropy": 2.5,
"epoch": 0.15630885122410546,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6075497269630432,
"learning_rate": 8.455743879472693e-07,
"loss": -0.027,
"num_tokens": 11173760.0,
"reward": 0.9887193441390991,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.989202082157135,
"rewards/acc_reward/std": 0.012971931137144566,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 142.296875,
"completions/mean_terminated_length": 142.296875,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 2.265625,
"epoch": 0.15819209039548024,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.436911487758945e-07,
"loss": 0.0,
"num_tokens": 11310011.0,
"reward": 0.9917968511581421,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9908854365348816,
"rewards/acc_reward/std": 0.01598946936428547,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 244.0,
"completions/max_terminated_length": 244.0,
"completions/mean_length": 138.125,
"completions/mean_terminated_length": 138.125,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.265625,
"epoch": 0.160075329566855,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6712620854377747,
"learning_rate": 8.418079096045197e-07,
"loss": -0.0066,
"num_tokens": 11440675.0,
"reward": 0.9767186641693115,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9758679866790771,
"rewards/acc_reward/std": 0.04487896338105202,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 451.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 172.578125,
"completions/mean_terminated_length": 172.578125,
"completions/min_length": 48.0,
"completions/min_terminated_length": 48.0,
"entropy": 2.640625,
"epoch": 0.16195856873822975,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.770696222782135,
"learning_rate": 8.399246704331449e-07,
"loss": -0.0185,
"num_tokens": 11569064.0,
"reward": 0.9869691133499146,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9872573614120483,
"rewards/acc_reward/std": 0.022249845787882805,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.0,
"completions/max_terminated_length": 426.0,
"completions/mean_length": 172.6875,
"completions/mean_terminated_length": 172.6875,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 2.3125,
"epoch": 0.1638418079096045,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.380414312617701e-07,
"loss": 0.0,
"num_tokens": 11708700.0,
"reward": 0.9803584814071655,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9781760573387146,
"rewards/acc_reward/std": 0.02318427711725235,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 276.0,
"completions/mean_length": 155.0625,
"completions/mean_terminated_length": 143.5483856201172,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 2.40625,
"epoch": 0.1657250470809793,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9133210778236389,
"learning_rate": 8.361581920903954e-07,
"loss": 0.0155,
"num_tokens": 11832608.0,
"reward": 0.9537662267684937,
"reward_std": 0.006725744344294071,
"rewards/acc_reward/mean": 0.9486291408538818,
"rewards/acc_reward/std": 0.06949032843112946,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 420.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 170.796875,
"completions/mean_terminated_length": 170.796875,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"entropy": 2.359375,
"epoch": 0.16760828625235405,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.9491801857948303,
"learning_rate": 8.342749529190208e-07,
"loss": -0.0359,
"num_tokens": 11968843.0,
"reward": 0.9640452265739441,
"reward_std": 0.010205795988440514,
"rewards/acc_reward/mean": 0.9652585983276367,
"rewards/acc_reward/std": 0.046331144869327545,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 275.0,
"completions/max_terminated_length": 275.0,
"completions/mean_length": 151.96875,
"completions/mean_terminated_length": 151.96875,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"entropy": 2.3125,
"epoch": 0.1694915254237288,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.32391713747646e-07,
"loss": 0.0,
"num_tokens": 12106473.0,
"reward": 0.9978047609329224,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9975607991218567,
"rewards/acc_reward/std": 0.006504515651613474,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 505.0,
"completions/max_terminated_length": 505.0,
"completions/mean_length": 170.75,
"completions/mean_terminated_length": 170.75,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"entropy": 2.484375,
"epoch": 0.1713747645951036,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.305084745762712e-07,
"loss": 0.0,
"num_tokens": 12241945.0,
"reward": 0.9895379543304443,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9883755445480347,
"rewards/acc_reward/std": 0.018741585314273834,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 324.0,
"completions/max_terminated_length": 324.0,
"completions/mean_length": 165.765625,
"completions/mean_terminated_length": 165.765625,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"entropy": 2.34375,
"epoch": 0.17325800376647835,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.286252354048964e-07,
"loss": 0.0,
"num_tokens": 12375658.0,
"reward": 0.9758948087692261,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9732165336608887,
"rewards/acc_reward/std": 0.04799450561404228,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 195.265625,
"completions/mean_terminated_length": 190.23809814453125,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"entropy": 2.46875,
"epoch": 0.1751412429378531,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.267419962335216e-07,
"loss": 0.0,
"num_tokens": 12513219.0,
"reward": 0.9753564596176147,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9726182222366333,
"rewards/acc_reward/std": 0.045861802995204926,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 155.5,
"completions/mean_terminated_length": 155.5,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"entropy": 2.1875,
"epoch": 0.17702448210922786,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.248587570621468e-07,
"loss": 0.0,
"num_tokens": 12648475.0,
"reward": 0.9844207763671875,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9826897382736206,
"rewards/acc_reward/std": 0.02190142311155796,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 187.34375,
"completions/mean_terminated_length": 182.19049072265625,
"completions/min_length": 30.0,
"completions/min_terminated_length": 30.0,
"entropy": 2.5625,
"epoch": 0.17890772128060264,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.2475651502609253,
"learning_rate": 8.229755178907722e-07,
"loss": -0.0212,
"num_tokens": 12785969.0,
"reward": 0.9898383617401123,
"reward_std": 0.008838837035000324,
"rewards/acc_reward/mean": 0.9921815395355225,
"rewards/acc_reward/std": 0.015049039386212826,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 153.0,
"completions/mean_terminated_length": 153.0,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"entropy": 2.34375,
"epoch": 0.1807909604519774,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.210922787193974e-07,
"loss": 0.0,
"num_tokens": 12907345.0,
"reward": 0.9843592047691345,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9826213121414185,
"rewards/acc_reward/std": 0.03606174886226654,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 474.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 168.28125,
"completions/mean_terminated_length": 168.28125,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"entropy": 2.171875,
"epoch": 0.18267419962335216,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.192090395480226e-07,
"loss": 0.0,
"num_tokens": 13033539.0,
"reward": 0.9740588665008545,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9711765050888062,
"rewards/acc_reward/std": 0.033579710870981216,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 367.0,
"completions/mean_length": 164.03125,
"completions/mean_terminated_length": 158.5079345703125,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"entropy": 2.546875,
"epoch": 0.18455743879472694,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.173258003766478e-07,
"loss": 0.0,
"num_tokens": 13159461.0,
"reward": 0.983502984046936,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9816700220108032,
"rewards/acc_reward/std": 0.024904713034629822,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 322.0,
"completions/mean_length": 174.546875,
"completions/mean_terminated_length": 169.19049072265625,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 2.140625,
"epoch": 0.1864406779661017,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.15442561205273e-07,
"loss": 0.0,
"num_tokens": 13287976.0,
"reward": 0.9533977508544922,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9482197761535645,
"rewards/acc_reward/std": 0.043142788112163544,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 382.0,
"completions/mean_length": 161.15625,
"completions/mean_terminated_length": 155.58731079101562,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"entropy": 2.25,
"epoch": 0.18832391713747645,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.5508601665496826,
"learning_rate": 8.135593220338983e-07,
"loss": -0.0178,
"num_tokens": 13424754.0,
"reward": 0.9814343452453613,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9811076521873474,
"rewards/acc_reward/std": 0.01872284896671772,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 310.0,
"completions/max_terminated_length": 310.0,
"completions/mean_length": 152.96875,
"completions/mean_terminated_length": 152.96875,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.40625,
"epoch": 0.1902071563088512,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.733428418636322,
"learning_rate": 8.116760828625235e-07,
"loss": -0.0236,
"num_tokens": 13554768.0,
"reward": 0.9719411730766296,
"reward_std": 0.039774756878614426,
"rewards/acc_reward/mean": 0.9688235521316528,
"rewards/acc_reward/std": 0.1259699910879135,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 331.0,
"completions/mean_length": 163.6875,
"completions/mean_terminated_length": 158.1587371826172,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 2.28125,
"epoch": 0.192090395480226,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.097928436911488e-07,
"loss": 0.0,
"num_tokens": 13684028.0,
"reward": 0.9804370403289795,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9782633781433105,
"rewards/acc_reward/std": 0.03396952524781227,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 189.28125,
"completions/mean_terminated_length": 178.8709716796875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 2.5625,
"epoch": 0.19397363465160075,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.07909604519774e-07,
"loss": 0.0,
"num_tokens": 13813006.0,
"reward": 0.9762270450592041,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9735856056213379,
"rewards/acc_reward/std": 0.05650464445352554,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 495.0,
"completions/max_terminated_length": 495.0,
"completions/mean_length": 176.3125,
"completions/mean_terminated_length": 176.3125,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.515625,
"epoch": 0.1958568738229755,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.060263653483992e-07,
"loss": 0.0,
"num_tokens": 13945954.0,
"reward": 0.9901642799377441,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9890714883804321,
"rewards/acc_reward/std": 0.014226200059056282,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 385.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 175.34375,
"completions/mean_terminated_length": 175.34375,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 2.515625,
"epoch": 0.1977401129943503,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.041431261770244e-07,
"loss": 0.0,
"num_tokens": 14080760.0,
"reward": 0.9843112826347351,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.982568085193634,
"rewards/acc_reward/std": 0.029767252504825592,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 427.0,
"completions/max_terminated_length": 427.0,
"completions/mean_length": 162.34375,
"completions/mean_terminated_length": 162.34375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 2.25,
"epoch": 0.19962335216572505,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.022598870056497e-07,
"loss": 0.0,
"num_tokens": 14202574.0,
"reward": 0.9882901310920715,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9869890213012695,
"rewards/acc_reward/std": 0.017871392890810966,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 393.0,
"completions/max_terminated_length": 393.0,
"completions/mean_length": 177.140625,
"completions/mean_terminated_length": 177.140625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 2.515625,
"epoch": 0.2015065913370998,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 8.003766478342749e-07,
"loss": 0.0,
"num_tokens": 14339895.0,
"reward": 0.9751439094543457,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.972382128238678,
"rewards/acc_reward/std": 0.04191429913043976,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 457.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 182.734375,
"completions/mean_terminated_length": 182.734375,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"entropy": 2.46875,
"epoch": 0.2033898305084746,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.984934086629001e-07,
"loss": 0.0,
"num_tokens": 14480414.0,
"reward": 0.9581470489501953,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9534966945648193,
"rewards/acc_reward/std": 0.06734198331832886,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.0,
"completions/max_terminated_length": 315.0,
"completions/mean_length": 150.65625,
"completions/mean_terminated_length": 150.65625,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"entropy": 2.203125,
"epoch": 0.20527306967984935,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.966101694915253e-07,
"loss": 0.0,
"num_tokens": 14604040.0,
"reward": 0.9401878118515015,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9335420727729797,
"rewards/acc_reward/std": 0.09204845130443573,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 350.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 153.203125,
"completions/mean_terminated_length": 153.203125,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.28125,
"epoch": 0.2071563088512241,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.947269303201506e-07,
"loss": 0.0,
"num_tokens": 14742229.0,
"reward": 0.9852752685546875,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9836391806602478,
"rewards/acc_reward/std": 0.04362887144088745,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 336.0,
"completions/max_terminated_length": 336.0,
"completions/mean_length": 170.71875,
"completions/mean_terminated_length": 170.71875,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"entropy": 2.375,
"epoch": 0.20903954802259886,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.928436911487758e-07,
"loss": 0.0,
"num_tokens": 14873859.0,
"reward": 0.9933172464370728,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9925747513771057,
"rewards/acc_reward/std": 0.010412875562906265,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 349.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 168.53125,
"completions/mean_terminated_length": 168.53125,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 2.5,
"epoch": 0.21092278719397364,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.909604519774011e-07,
"loss": 0.0,
"num_tokens": 15006189.0,
"reward": 0.9692245125770569,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9658049941062927,
"rewards/acc_reward/std": 0.0648469477891922,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 379.0,
"completions/max_terminated_length": 379.0,
"completions/mean_length": 169.125,
"completions/mean_terminated_length": 169.125,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 2.3125,
"epoch": 0.2128060263653484,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.890772128060263e-07,
"loss": 0.0,
"num_tokens": 15144437.0,
"reward": 0.9728338718414307,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9698153734207153,
"rewards/acc_reward/std": 0.040937572717666626,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 362.0,
"completions/max_terminated_length": 362.0,
"completions/mean_length": 158.234375,
"completions/mean_terminated_length": 158.234375,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 2.421875,
"epoch": 0.21468926553672316,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.871939736346515e-07,
"loss": 0.0,
"num_tokens": 15266348.0,
"reward": 0.9900326728820801,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9889251589775085,
"rewards/acc_reward/std": 0.011797359213232994,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 184.890625,
"completions/mean_terminated_length": 179.69842529296875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 2.609375,
"epoch": 0.21657250470809794,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.853107344632767e-07,
"loss": 0.0,
"num_tokens": 15406085.0,
"reward": 0.9864563941955566,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9849514961242676,
"rewards/acc_reward/std": 0.023492755368351936,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 179.328125,
"completions/mean_terminated_length": 174.04762268066406,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.34375,
"epoch": 0.2184557438794727,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.8667036294937134,
"learning_rate": 7.83427495291902e-07,
"loss": -0.0178,
"num_tokens": 15537066.0,
"reward": 0.9839749336242676,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9839304685592651,
"rewards/acc_reward/std": 0.025710513815283775,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 151.453125,
"completions/mean_terminated_length": 151.453125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 2.34375,
"epoch": 0.22033898305084745,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.815442561205274e-07,
"loss": 0.0,
"num_tokens": 15661063.0,
"reward": 0.9836729168891907,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9818587899208069,
"rewards/acc_reward/std": 0.03338773176074028,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 448.0,
"completions/max_terminated_length": 448.0,
"completions/mean_length": 173.40625,
"completions/mean_terminated_length": 173.40625,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 2.53125,
"epoch": 0.2222222222222222,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.796610169491526e-07,
"loss": 0.0,
"num_tokens": 15794785.0,
"reward": 0.970079779624939,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9667552709579468,
"rewards/acc_reward/std": 0.03764721751213074,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 353.0,
"completions/max_terminated_length": 353.0,
"completions/mean_length": 156.765625,
"completions/mean_terminated_length": 156.765625,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"entropy": 2.171875,
"epoch": 0.224105461393597,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.2777234315872192,
"learning_rate": 7.777777777777778e-07,
"loss": 0.0405,
"num_tokens": 15925402.0,
"reward": 0.9777387380599976,
"reward_std": 0.013079374097287655,
"rewards/acc_reward/mean": 0.9752652645111084,
"rewards/acc_reward/std": 0.04798175394535065,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 203.3125,
"completions/mean_terminated_length": 198.4127197265625,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.65625,
"epoch": 0.22598870056497175,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.75894538606403e-07,
"loss": 0.0,
"num_tokens": 16073038.0,
"reward": 0.9984081983566284,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9982313513755798,
"rewards/acc_reward/std": 0.0047163767740130424,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 458.0,
"completions/mean_length": 204.984375,
"completions/mean_terminated_length": 195.08062744140625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 2.59375,
"epoch": 0.2278719397363465,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.740112994350282e-07,
"loss": 0.0,
"num_tokens": 16218901.0,
"reward": 0.9989771842956543,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9988635778427124,
"rewards/acc_reward/std": 0.0020075358916074038,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 404.0,
"completions/max_terminated_length": 404.0,
"completions/mean_length": 174.4375,
"completions/mean_terminated_length": 174.4375,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"entropy": 2.328125,
"epoch": 0.2297551789077213,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.721280602636534e-07,
"loss": 0.0,
"num_tokens": 16352369.0,
"reward": 0.988227367401123,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9869192838668823,
"rewards/acc_reward/std": 0.026611221954226494,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 341.0,
"completions/max_terminated_length": 341.0,
"completions/mean_length": 167.890625,
"completions/mean_terminated_length": 167.890625,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"entropy": 2.390625,
"epoch": 0.23163841807909605,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.702448210922787e-07,
"loss": 0.0,
"num_tokens": 16482378.0,
"reward": 0.9788169860839844,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9764633178710938,
"rewards/acc_reward/std": 0.03820019215345383,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 178.171875,
"completions/mean_terminated_length": 178.171875,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"entropy": 2.125,
"epoch": 0.2335216572504708,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.68361581920904e-07,
"loss": 0.0,
"num_tokens": 16619925.0,
"reward": 0.9605213403701782,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9561347961425781,
"rewards/acc_reward/std": 0.08166956156492233,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 320.0,
"completions/mean_length": 173.6875,
"completions/mean_terminated_length": 168.3174591064453,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"entropy": 2.390625,
"epoch": 0.23540489642184556,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.9748631119728088,
"learning_rate": 7.664783427495292e-07,
"loss": 0.0211,
"num_tokens": 16749945.0,
"reward": 0.9801042079925537,
"reward_std": 0.039774756878614426,
"rewards/acc_reward/mean": 0.9778935313224792,
"rewards/acc_reward/std": 0.12537133693695068,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 406.0,
"completions/mean_length": 178.25,
"completions/mean_terminated_length": 167.48387145996094,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 2.5625,
"epoch": 0.23728813559322035,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.645951035781544e-07,
"loss": 0.0,
"num_tokens": 16879657.0,
"reward": 0.9902485609054565,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9891650676727295,
"rewards/acc_reward/std": 0.015085420571267605,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 162.15625,
"completions/mean_terminated_length": 162.15625,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 2.328125,
"epoch": 0.2391713747645951,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.627118644067796e-07,
"loss": 0.0,
"num_tokens": 17009299.0,
"reward": 0.9965801239013672,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9962000846862793,
"rewards/acc_reward/std": 0.010133087635040283,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 343.0,
"completions/max_terminated_length": 343.0,
"completions/mean_length": 160.796875,
"completions/mean_terminated_length": 160.796875,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"entropy": 2.28125,
"epoch": 0.24105461393596986,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.608286252354048e-07,
"loss": 0.0,
"num_tokens": 17146406.0,
"reward": 0.9833929538726807,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9815477728843689,
"rewards/acc_reward/std": 0.024218887090682983,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 436.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 175.1875,
"completions/mean_terminated_length": 175.1875,
"completions/min_length": 56.0,
"completions/min_terminated_length": 56.0,
"entropy": 2.484375,
"epoch": 0.24293785310734464,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.589453860640301e-07,
"loss": 0.0,
"num_tokens": 17274242.0,
"reward": 0.9791369438171387,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9768187999725342,
"rewards/acc_reward/std": 0.03209559619426727,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 441.0,
"completions/mean_length": 174.9375,
"completions/mean_terminated_length": 169.58731079101562,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"entropy": 2.359375,
"epoch": 0.2448210922787194,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.570621468926553e-07,
"loss": 0.0,
"num_tokens": 17409982.0,
"reward": 0.9768315553665161,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9742573499679565,
"rewards/acc_reward/std": 0.05567716062068939,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 186.6875,
"completions/mean_terminated_length": 181.52381896972656,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"entropy": 2.421875,
"epoch": 0.24670433145009416,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.551789077212806e-07,
"loss": 0.0,
"num_tokens": 17545354.0,
"reward": 0.9810404777526855,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9789338707923889,
"rewards/acc_reward/std": 0.03438215330243111,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 155.46875,
"completions/mean_terminated_length": 155.46875,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.234375,
"epoch": 0.24858757062146894,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.42140984535217285,
"learning_rate": 7.532956685499058e-07,
"loss": 0.0881,
"num_tokens": 17679648.0,
"reward": 0.9783517122268677,
"reward_std": 0.039774756878614426,
"rewards/acc_reward/mean": 0.975946307182312,
"rewards/acc_reward/std": 0.12442652136087418,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 370.0,
"completions/mean_length": 180.71875,
"completions/mean_terminated_length": 170.03225708007812,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 2.5625,
"epoch": 0.2504708097928437,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.51412429378531e-07,
"loss": 0.0,
"num_tokens": 17811718.0,
"reward": 0.9774539470672607,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9749488830566406,
"rewards/acc_reward/std": 0.04454605653882027,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 498.0,
"completions/mean_length": 177.40625,
"completions/mean_terminated_length": 172.09524536132812,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 2.21875,
"epoch": 0.2523540489642185,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.495291902071563e-07,
"loss": 0.0,
"num_tokens": 17944256.0,
"reward": 0.9872071743011475,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.985785722732544,
"rewards/acc_reward/std": 0.01633262448012829,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 167.875,
"completions/mean_terminated_length": 167.875,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 2.375,
"epoch": 0.2542372881355932,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.476459510357815e-07,
"loss": 0.0,
"num_tokens": 18074344.0,
"reward": 0.996717095375061,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9963523149490356,
"rewards/acc_reward/std": 0.009727060794830322,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 341.0,
"completions/max_terminated_length": 341.0,
"completions/mean_length": 168.921875,
"completions/mean_terminated_length": 168.921875,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"entropy": 2.3125,
"epoch": 0.256120527306968,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.457627118644067e-07,
"loss": 0.0,
"num_tokens": 18202499.0,
"reward": 0.9629114866256714,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9587906002998352,
"rewards/acc_reward/std": 0.05813661590218544,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 413.0,
"completions/max_terminated_length": 413.0,
"completions/mean_length": 152.171875,
"completions/mean_terminated_length": 152.171875,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.234375,
"epoch": 0.2580037664783427,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.438794726930319e-07,
"loss": 0.0,
"num_tokens": 18325262.0,
"reward": 0.9783220887184143,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.975913405418396,
"rewards/acc_reward/std": 0.03493288531899452,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 182.3125,
"completions/mean_terminated_length": 182.3125,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.421875,
"epoch": 0.2598870056497175,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.419962335216571e-07,
"loss": 0.0,
"num_tokens": 18463394.0,
"reward": 0.9732788801193237,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9703099131584167,
"rewards/acc_reward/std": 0.02722685970366001,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 312.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 137.0,
"completions/mean_terminated_length": 137.0,
"completions/min_length": 47.0,
"completions/min_terminated_length": 47.0,
"entropy": 2.171875,
"epoch": 0.2617702448210923,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.0246717929840088,
"learning_rate": 7.401129943502824e-07,
"loss": 0.0013,
"num_tokens": 18588066.0,
"reward": 0.9853101968765259,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9854141473770142,
"rewards/acc_reward/std": 0.01550329476594925,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 462.0,
"completions/mean_length": 180.15625,
"completions/mean_terminated_length": 174.88890075683594,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"entropy": 2.4375,
"epoch": 0.263653483992467,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.382297551789078e-07,
"loss": 0.0,
"num_tokens": 18720780.0,
"reward": 0.973831057548523,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9709233641624451,
"rewards/acc_reward/std": 0.047117821872234344,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 451.0,
"completions/mean_length": 190.109375,
"completions/mean_terminated_length": 179.72579956054688,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.40625,
"epoch": 0.2655367231638418,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.36346516007533e-07,
"loss": 0.0,
"num_tokens": 18857011.0,
"reward": 0.9712393283843994,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9680436849594116,
"rewards/acc_reward/std": 0.04809395968914032,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 410.0,
"completions/max_terminated_length": 410.0,
"completions/mean_length": 160.671875,
"completions/mean_terminated_length": 160.671875,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"entropy": 2.140625,
"epoch": 0.2674199623352166,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.3321737051010132,
"learning_rate": 7.344632768361582e-07,
"loss": -0.0087,
"num_tokens": 18989918.0,
"reward": 0.9906257390975952,
"reward_std": 0.008838837035000324,
"rewards/acc_reward/mean": 0.9930564165115356,
"rewards/acc_reward/std": 0.010120646096765995,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 352.0,
"completions/max_terminated_length": 352.0,
"completions/mean_length": 147.78125,
"completions/mean_terminated_length": 147.78125,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.25,
"epoch": 0.2693032015065913,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.1365125179290771,
"learning_rate": 7.325800376647834e-07,
"loss": 0.0151,
"num_tokens": 19115760.0,
"reward": 0.9455662965774536,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9412541389465332,
"rewards/acc_reward/std": 0.09196340292692184,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 161.796875,
"completions/mean_terminated_length": 161.796875,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"entropy": 2.234375,
"epoch": 0.2711864406779661,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.306967984934086e-07,
"loss": 0.0,
"num_tokens": 19244779.0,
"reward": 0.9764645099639893,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9738494753837585,
"rewards/acc_reward/std": 0.043730027973651886,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 354.0,
"completions/mean_length": 164.109375,
"completions/mean_terminated_length": 158.58731079101562,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.28125,
"epoch": 0.2730696798493409,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.4404025077819824,
"learning_rate": 7.288135593220338e-07,
"loss": 0.0261,
"num_tokens": 19387506.0,
"reward": 0.9824453592300415,
"reward_std": 0.017121607437729836,
"rewards/acc_reward/mean": 0.9822309613227844,
"rewards/acc_reward/std": 0.04615851864218712,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 322.0,
"completions/max_terminated_length": 322.0,
"completions/mean_length": 161.578125,
"completions/mean_terminated_length": 161.578125,
"completions/min_length": 83.0,
"completions/min_terminated_length": 83.0,
"entropy": 2.21875,
"epoch": 0.2749529190207156,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.269303201506592e-07,
"loss": 0.0,
"num_tokens": 19526271.0,
"reward": 0.9767446517944336,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9741606712341309,
"rewards/acc_reward/std": 0.02946525067090988,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 299.0,
"completions/max_terminated_length": 299.0,
"completions/mean_length": 162.734375,
"completions/mean_terminated_length": 162.734375,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 2.296875,
"epoch": 0.2768361581920904,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.250470809792844e-07,
"loss": 0.0,
"num_tokens": 19656430.0,
"reward": 0.9855356216430664,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9839285016059875,
"rewards/acc_reward/std": 0.03281255066394806,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 328.0,
"completions/max_terminated_length": 328.0,
"completions/mean_length": 152.265625,
"completions/mean_terminated_length": 152.265625,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.328125,
"epoch": 0.2787193973634652,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.231638418079096e-07,
"loss": 0.0,
"num_tokens": 19791039.0,
"reward": 0.9486349821090698,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9429277777671814,
"rewards/acc_reward/std": 0.0684126690030098,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 138.96875,
"completions/mean_terminated_length": 138.96875,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"entropy": 2.046875,
"epoch": 0.2806026365348399,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.212806026365348e-07,
"loss": 0.0,
"num_tokens": 19920157.0,
"reward": 0.9899982213973999,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9888868927955627,
"rewards/acc_reward/std": 0.014463546685874462,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 171.4375,
"completions/mean_terminated_length": 166.03173828125,
"completions/min_length": 54.0,
"completions/min_terminated_length": 54.0,
"entropy": 2.1875,
"epoch": 0.2824858757062147,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.1939736346516e-07,
"loss": 0.0,
"num_tokens": 20054233.0,
"reward": 0.9820305109024048,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9800339341163635,
"rewards/acc_reward/std": 0.03505489602684975,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 353.0,
"completions/max_terminated_length": 353.0,
"completions/mean_length": 154.09375,
"completions/mean_terminated_length": 154.09375,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 2.265625,
"epoch": 0.2843691148775895,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.175141242937853e-07,
"loss": 0.0,
"num_tokens": 20177839.0,
"reward": 0.9695570468902588,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9661744832992554,
"rewards/acc_reward/std": 0.051941804587841034,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 317.0,
"completions/max_terminated_length": 317.0,
"completions/mean_length": 159.078125,
"completions/mean_terminated_length": 159.078125,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.21875,
"epoch": 0.2862523540489642,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.135265588760376,
"learning_rate": 7.156308851224105e-07,
"loss": 0.0085,
"num_tokens": 20313964.0,
"reward": 0.9910129308700562,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9917504787445068,
"rewards/acc_reward/std": 0.015448656864464283,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.0,
"completions/max_terminated_length": 384.0,
"completions/mean_length": 176.328125,
"completions/mean_terminated_length": 176.328125,
"completions/min_length": 44.0,
"completions/min_terminated_length": 44.0,
"entropy": 2.546875,
"epoch": 0.288135593220339,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.137476459510358e-07,
"loss": 0.0,
"num_tokens": 20454113.0,
"reward": 0.9844459295272827,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9827176928520203,
"rewards/acc_reward/std": 0.018290938809514046,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 154.140625,
"completions/mean_terminated_length": 154.140625,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 2.265625,
"epoch": 0.2900188323917137,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.11864406779661e-07,
"loss": 0.0,
"num_tokens": 20591402.0,
"reward": 0.9614624977111816,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9571805596351624,
"rewards/acc_reward/std": 0.06432777643203735,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 323.0,
"completions/max_terminated_length": 323.0,
"completions/mean_length": 151.9375,
"completions/mean_terminated_length": 151.9375,
"completions/min_length": 29.0,
"completions/min_terminated_length": 29.0,
"entropy": 2.1875,
"epoch": 0.2919020715630885,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.099811676082862e-07,
"loss": 0.0,
"num_tokens": 20714630.0,
"reward": 0.9820876717567444,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9800974130630493,
"rewards/acc_reward/std": 0.037367500364780426,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 173.15625,
"completions/mean_terminated_length": 167.77780151367188,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 2.296875,
"epoch": 0.2937853107344633,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.080979284369114e-07,
"loss": 0.0,
"num_tokens": 20853136.0,
"reward": 0.9967447519302368,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9963830709457397,
"rewards/acc_reward/std": 0.009645064361393452,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 382.0,
"completions/max_terminated_length": 382.0,
"completions/mean_length": 158.5,
"completions/mean_terminated_length": 158.5,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"entropy": 2.21875,
"epoch": 0.295668549905838,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.062146892655367e-07,
"loss": 0.0,
"num_tokens": 20979664.0,
"reward": 0.9870873689651489,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9856526851654053,
"rewards/acc_reward/std": 0.020136423408985138,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 383.0,
"completions/max_terminated_length": 383.0,
"completions/mean_length": 164.1875,
"completions/mean_terminated_length": 164.1875,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.234375,
"epoch": 0.2975517890772128,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.043314500941619e-07,
"loss": 0.0,
"num_tokens": 21116156.0,
"reward": 0.9893269538879395,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9881410598754883,
"rewards/acc_reward/std": 0.023672664538025856,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 512.0,
"completions/max_terminated_length": 404.0,
"completions/mean_length": 177.203125,
"completions/mean_terminated_length": 166.40322875976562,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.328125,
"epoch": 0.2994350282485876,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.024482109227871e-07,
"loss": 0.0,
"num_tokens": 21245321.0,
"reward": 0.9947940111160278,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9942155480384827,
"rewards/acc_reward/std": 0.0101578738540411,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 427.0,
"completions/max_terminated_length": 427.0,
"completions/mean_length": 167.0625,
"completions/mean_terminated_length": 167.0625,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.1875,
"epoch": 0.3013182674199623,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.005649717514124e-07,
"loss": 0.0,
"num_tokens": 21376237.0,
"reward": 0.9860028624534607,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9844475984573364,
"rewards/acc_reward/std": 0.019196392968297005,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 338.0,
"completions/mean_length": 169.265625,
"completions/mean_terminated_length": 163.82540893554688,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 2.359375,
"epoch": 0.3032015065913371,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.986817325800376e-07,
"loss": 0.0,
"num_tokens": 21516414.0,
"reward": 0.9679353833198547,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9643726348876953,
"rewards/acc_reward/std": 0.05980806425213814,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 348.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 153.296875,
"completions/mean_terminated_length": 153.296875,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.09375,
"epoch": 0.3050847457627119,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.967984934086629e-07,
"loss": 0.0,
"num_tokens": 21642289.0,
"reward": 0.9967857003211975,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9964286088943481,
"rewards/acc_reward/std": 0.009523809887468815,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 327.0,
"completions/max_terminated_length": 327.0,
"completions/mean_length": 164.515625,
"completions/mean_terminated_length": 164.515625,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"entropy": 2.171875,
"epoch": 0.3069679849340866,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.949152542372881e-07,
"loss": 0.0,
"num_tokens": 21782162.0,
"reward": 0.9368254542350769,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.929806113243103,
"rewards/acc_reward/std": 0.0714760348200798,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 387.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 168.53125,
"completions/mean_terminated_length": 168.53125,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 2.234375,
"epoch": 0.3088512241054614,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.930320150659133e-07,
"loss": 0.0,
"num_tokens": 21911732.0,
"reward": 0.9614138603210449,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9571264982223511,
"rewards/acc_reward/std": 0.08462820202112198,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 172.609375,
"completions/mean_terminated_length": 172.609375,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 2.015625,
"epoch": 0.3107344632768362,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.911487758945385e-07,
"loss": 0.0,
"num_tokens": 22043643.0,
"reward": 0.9874755144119263,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9860839247703552,
"rewards/acc_reward/std": 0.02431458979845047,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 156.015625,
"completions/mean_terminated_length": 156.015625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 2.234375,
"epoch": 0.3126177024482109,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.892655367231638e-07,
"loss": 0.0,
"num_tokens": 22167892.0,
"reward": 0.9494754076004028,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9438616037368774,
"rewards/acc_reward/std": 0.05841909348964691,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 144.0,
"completions/mean_terminated_length": 144.0,
"completions/min_length": 51.0,
"completions/min_terminated_length": 51.0,
"entropy": 1.8828125,
"epoch": 0.3145009416195857,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.87382297551789e-07,
"loss": 0.0,
"num_tokens": 22302132.0,
"reward": 0.9812496900558472,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9791662693023682,
"rewards/acc_reward/std": 0.023800579831004143,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 309.0,
"completions/mean_length": 163.0,
"completions/mean_terminated_length": 157.4603271484375,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 2.203125,
"epoch": 0.3163841807909605,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.854990583804144e-07,
"loss": 0.0,
"num_tokens": 22428188.0,
"reward": 0.9839420318603516,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9821577668190002,
"rewards/acc_reward/std": 0.03933866694569588,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 388.0,
"completions/max_terminated_length": 388.0,
"completions/mean_length": 160.859375,
"completions/mean_terminated_length": 160.859375,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"entropy": 2.28125,
"epoch": 0.3182674199623352,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.836158192090396e-07,
"loss": 0.0,
"num_tokens": 22559187.0,
"reward": 0.9785705804824829,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9761895537376404,
"rewards/acc_reward/std": 0.0258196871727705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 296.0,
"completions/max_terminated_length": 296.0,
"completions/mean_length": 162.125,
"completions/mean_terminated_length": 162.125,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"entropy": 2.015625,
"epoch": 0.32015065913371,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.817325800376648e-07,
"loss": 0.0,
"num_tokens": 22686427.0,
"reward": 0.9450536966323853,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9389485120773315,
"rewards/acc_reward/std": 0.07760415226221085,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 280.0,
"completions/max_terminated_length": 280.0,
"completions/mean_length": 136.40625,
"completions/mean_terminated_length": 136.40625,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 1.953125,
"epoch": 0.3220338983050847,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.7984934086629e-07,
"loss": 0.0,
"num_tokens": 22812501.0,
"reward": 0.9613984823226929,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9571093916893005,
"rewards/acc_reward/std": 0.07542510330677032,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 316.0,
"completions/max_terminated_length": 316.0,
"completions/mean_length": 157.84375,
"completions/mean_terminated_length": 157.84375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 2.046875,
"epoch": 0.3239171374764595,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.779661016949152e-07,
"loss": 0.0,
"num_tokens": 22934187.0,
"reward": 0.9912809729576111,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9903122186660767,
"rewards/acc_reward/std": 0.020957766100764275,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/max_terminated_length": 389.0,
"completions/mean_length": 176.359375,
"completions/mean_terminated_length": 176.359375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.328125,
"epoch": 0.3258003766478343,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.760828625235404e-07,
"loss": 0.0,
"num_tokens": 23070498.0,
"reward": 0.9652288556098938,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9613653421401978,
"rewards/acc_reward/std": 0.06810762733221054,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 464.0,
"completions/mean_length": 196.28125,
"completions/mean_terminated_length": 191.2698516845703,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"entropy": 2.171875,
"epoch": 0.327683615819209,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.741996233521658e-07,
"loss": 0.0,
"num_tokens": 23202324.0,
"reward": 0.9608478546142578,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9564976692199707,
"rewards/acc_reward/std": 0.06687356531620026,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 296.0,
"completions/mean_length": 150.796875,
"completions/mean_terminated_length": 145.06350708007812,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 1.9609375,
"epoch": 0.3295668549905838,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.72316384180791e-07,
"loss": 0.0,
"num_tokens": 23333319.0,
"reward": 0.9884651899337769,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9871835112571716,
"rewards/acc_reward/std": 0.026243234053254128,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 420.0,
"completions/max_terminated_length": 420.0,
"completions/mean_length": 165.828125,
"completions/mean_terminated_length": 165.828125,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 2.28125,
"epoch": 0.3314500941619586,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.704331450094162e-07,
"loss": 0.0,
"num_tokens": 23460796.0,
"reward": 0.9813232421875,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.979248046875,
"rewards/acc_reward/std": 0.04185057431459427,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.0,
"completions/max_terminated_length": 384.0,
"completions/mean_length": 146.734375,
"completions/mean_terminated_length": 146.734375,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"entropy": 2.09375,
"epoch": 0.3333333333333333,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.685499058380414e-07,
"loss": 0.0,
"num_tokens": 23584811.0,
"reward": 0.9937499761581421,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9930555820465088,
"rewards/acc_reward/std": 0.01851852796971798,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 436.0,
"completions/max_terminated_length": 436.0,
"completions/mean_length": 161.828125,
"completions/mean_terminated_length": 161.828125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 2.34375,
"epoch": 0.3352165725047081,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.0041779279708862,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0029,
"num_tokens": 23723072.0,
"reward": 0.9907628893852234,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9914726614952087,
"rewards/acc_reward/std": 0.014887169934809208,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 170.140625,
"completions/mean_terminated_length": 170.140625,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 2.296875,
"epoch": 0.3370998116760829,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.647834274952919e-07,
"loss": 0.0,
"num_tokens": 23852745.0,
"reward": 0.9886301755905151,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9873669147491455,
"rewards/acc_reward/std": 0.02594558708369732,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 399.0,
"completions/max_terminated_length": 399.0,
"completions/mean_length": 159.09375,
"completions/mean_terminated_length": 159.09375,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 2.203125,
"epoch": 0.3389830508474576,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.629001883239171e-07,
"loss": 0.0,
"num_tokens": 23976431.0,
"reward": 0.9778439402580261,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9753821492195129,
"rewards/acc_reward/std": 0.02938609942793846,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 366.0,
"completions/mean_length": 155.328125,
"completions/mean_terminated_length": 149.66668701171875,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"entropy": 2.265625,
"epoch": 0.3408662900188324,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.610169491525423e-07,
"loss": 0.0,
"num_tokens": 24103236.0,
"reward": 0.9857558012008667,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9841731190681458,
"rewards/acc_reward/std": 0.02304108813405037,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 379.0,
"completions/max_terminated_length": 379.0,
"completions/mean_length": 157.25,
"completions/mean_terminated_length": 157.25,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"entropy": 1.921875,
"epoch": 0.3427495291902072,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.591337099811676e-07,
"loss": 0.0,
"num_tokens": 24240244.0,
"reward": 0.9847475290298462,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9830528497695923,
"rewards/acc_reward/std": 0.018334772437810898,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 353.0,
"completions/max_terminated_length": 353.0,
"completions/mean_length": 153.484375,
"completions/mean_terminated_length": 153.484375,
"completions/min_length": 45.0,
"completions/min_terminated_length": 45.0,
"entropy": 2.09375,
"epoch": 0.3446327683615819,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.572504708097928e-07,
"loss": 0.0,
"num_tokens": 24373251.0,
"reward": 0.9889024496078491,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9876693487167358,
"rewards/acc_reward/std": 0.014938557520508766,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 365.0,
"completions/mean_length": 173.65625,
"completions/mean_terminated_length": 168.2857208251953,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 2.34375,
"epoch": 0.3465160075329567,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.55367231638418e-07,
"loss": 0.0,
"num_tokens": 24507229.0,
"reward": 0.9813482165336609,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9792758226394653,
"rewards/acc_reward/std": 0.03865106776356697,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 350.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 165.078125,
"completions/mean_terminated_length": 165.078125,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 2.203125,
"epoch": 0.3483992467043315,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.534839924670433e-07,
"loss": 0.0,
"num_tokens": 24646554.0,
"reward": 0.9556671380996704,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9507412910461426,
"rewards/acc_reward/std": 0.07888054847717285,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 154.4375,
"completions/mean_terminated_length": 154.4375,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 1.984375,
"epoch": 0.3502824858757062,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.516007532956685e-07,
"loss": 0.0,
"num_tokens": 24785782.0,
"reward": 0.9671032428741455,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9634479880332947,
"rewards/acc_reward/std": 0.06446754932403564,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 309.0,
"completions/max_terminated_length": 309.0,
"completions/mean_length": 140.21875,
"completions/mean_terminated_length": 140.21875,
"completions/min_length": 49.0,
"completions/min_terminated_length": 49.0,
"entropy": 2.09375,
"epoch": 0.352165725047081,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.060726284980774,
"learning_rate": 6.497175141242937e-07,
"loss": 0.0088,
"num_tokens": 24903940.0,
"reward": 0.932281494140625,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9264933466911316,
"rewards/acc_reward/std": 0.08563832193613052,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 460.0,
"completions/mean_length": 150.15625,
"completions/mean_terminated_length": 144.4127197265625,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"entropy": 2.015625,
"epoch": 0.3540489642184557,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.478342749529189e-07,
"loss": 0.0,
"num_tokens": 25035214.0,
"reward": 0.9913280010223389,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9903644323348999,
"rewards/acc_reward/std": 0.01791100949048996,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 493.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 156.28125,
"completions/mean_terminated_length": 156.28125,
"completions/min_length": 74.0,
"completions/min_terminated_length": 74.0,
"entropy": 2.109375,
"epoch": 0.3559322033898305,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.459510357815442e-07,
"loss": 0.0,
"num_tokens": 25168800.0,
"reward": 0.9506161212921143,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9451290369033813,
"rewards/acc_reward/std": 0.07419686019420624,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 319.0,
"completions/max_terminated_length": 319.0,
"completions/mean_length": 138.859375,
"completions/mean_terminated_length": 138.859375,
"completions/min_length": 58.0,
"completions/min_terminated_length": 58.0,
"entropy": 1.8359375,
"epoch": 0.3578154425612053,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.440677966101694e-07,
"loss": 0.0,
"num_tokens": 25302271.0,
"reward": 0.9889830350875854,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9877589344978333,
"rewards/acc_reward/std": 0.013018240220844746,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 250.0,
"completions/max_terminated_length": 250.0,
"completions/mean_length": 143.65625,
"completions/mean_terminated_length": 143.65625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 2.203125,
"epoch": 0.35969868173258,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.421845574387948e-07,
"loss": 0.0,
"num_tokens": 25422569.0,
"reward": 0.9856148958206177,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9840165376663208,
"rewards/acc_reward/std": 0.020176060497760773,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 493.0,
"completions/mean_length": 172.125,
"completions/mean_terminated_length": 166.73016357421875,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 2.015625,
"epoch": 0.3615819209039548,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.4030131826742e-07,
"loss": 0.0,
"num_tokens": 25560809.0,
"reward": 0.9829681515693665,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9810757637023926,
"rewards/acc_reward/std": 0.0207473486661911,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 359.0,
"completions/max_terminated_length": 359.0,
"completions/mean_length": 150.859375,
"completions/mean_terminated_length": 150.859375,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 1.9453125,
"epoch": 0.3634651600753296,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.384180790960452e-07,
"loss": 0.0,
"num_tokens": 25691888.0,
"reward": 0.959905743598938,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9554507732391357,
"rewards/acc_reward/std": 0.07276061922311783,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 329.0,
"completions/max_terminated_length": 329.0,
"completions/mean_length": 164.3125,
"completions/mean_terminated_length": 164.3125,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"entropy": 2.171875,
"epoch": 0.3653483992467043,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.365348399246704e-07,
"loss": 0.0,
"num_tokens": 25819652.0,
"reward": 0.9704650044441223,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9671833515167236,
"rewards/acc_reward/std": 0.04358522593975067,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 288.0,
"completions/max_terminated_length": 288.0,
"completions/mean_length": 145.09375,
"completions/mean_terminated_length": 145.09375,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"entropy": 2.15625,
"epoch": 0.3672316384180791,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.346516007532956e-07,
"loss": 0.0,
"num_tokens": 25950602.0,
"reward": 0.9926788806915283,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9918653964996338,
"rewards/acc_reward/std": 0.011418163776397705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.0,
"completions/max_terminated_length": 405.0,
"completions/mean_length": 138.3125,
"completions/mean_terminated_length": 138.3125,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 1.953125,
"epoch": 0.3691148775894539,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.32768361581921e-07,
"loss": 0.0,
"num_tokens": 26070558.0,
"reward": 0.9680017232894897,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9644463658332825,
"rewards/acc_reward/std": 0.051819488406181335,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 164.3125,
"completions/mean_terminated_length": 164.3125,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 2.109375,
"epoch": 0.3709981167608286,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.308851224105462e-07,
"loss": 0.0,
"num_tokens": 26211378.0,
"reward": 0.992339015007019,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9914877414703369,
"rewards/acc_reward/std": 0.012202701531350613,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 301.0,
"completions/max_terminated_length": 301.0,
"completions/mean_length": 150.03125,
"completions/mean_terminated_length": 150.03125,
"completions/min_length": 57.0,
"completions/min_terminated_length": 57.0,
"entropy": 2.078125,
"epoch": 0.3728813559322034,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.290018832391714e-07,
"loss": 0.0,
"num_tokens": 26341204.0,
"reward": 0.9656549692153931,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9618387818336487,
"rewards/acc_reward/std": 0.03533172979950905,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 349.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 144.90625,
"completions/mean_terminated_length": 144.90625,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 2.046875,
"epoch": 0.3747645951035782,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.271186440677966e-07,
"loss": 0.0,
"num_tokens": 26460142.0,
"reward": 0.9934231042861938,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9926923513412476,
"rewards/acc_reward/std": 0.0151524618268013,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 340.0,
"completions/max_terminated_length": 340.0,
"completions/mean_length": 156.90625,
"completions/mean_terminated_length": 156.90625,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 2.0625,
"epoch": 0.3766478342749529,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.252354048964218e-07,
"loss": 0.0,
"num_tokens": 26586088.0,
"reward": 0.9820280075073242,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9800311326980591,
"rewards/acc_reward/std": 0.018364734947681427,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 468.0,
"completions/max_terminated_length": 468.0,
"completions/mean_length": 143.34375,
"completions/mean_terminated_length": 143.34375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 2.125,
"epoch": 0.3785310734463277,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.23352165725047e-07,
"loss": 0.0,
"num_tokens": 26717886.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/acc_reward/mean": 1.0,
"rewards/acc_reward/std": 0.0,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 409.0,
"completions/max_terminated_length": 409.0,
"completions/mean_length": 175.90625,
"completions/mean_terminated_length": 175.90625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 2.140625,
"epoch": 0.3804143126177024,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.214689265536723e-07,
"loss": 0.0,
"num_tokens": 26854168.0,
"reward": 0.9935948252677917,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9928831458091736,
"rewards/acc_reward/std": 0.0077277072705328465,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 202
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 396.0,
"completions/max_terminated_length": 396.0,
"completions/mean_length": 165.25,
"completions/mean_terminated_length": 165.25,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.171875,
"epoch": 0.3822975517890772,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.195856873822976e-07,
"loss": 0.0,
"num_tokens": 26989488.0,
"reward": 0.9852421283721924,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9836024045944214,
"rewards/acc_reward/std": 0.01928338035941124,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 512.0,
"completions/max_terminated_length": 261.0,
"completions/mean_length": 154.296875,
"completions/mean_terminated_length": 148.61904907226562,
"completions/min_length": 46.0,
"completions/min_terminated_length": 46.0,
"entropy": 2.1875,
"epoch": 0.384180790960452,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.177024482109228e-07,
"loss": 0.0,
"num_tokens": 27112987.0,
"reward": 0.9929645657539368,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9921828508377075,
"rewards/acc_reward/std": 0.02084571123123169,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 342.0,
"completions/max_terminated_length": 342.0,
"completions/mean_length": 147.5,
"completions/mean_terminated_length": 147.5,
"completions/min_length": 50.0,
"completions/min_terminated_length": 50.0,
"entropy": 2.125,
"epoch": 0.3860640301318267,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.15819209039548e-07,
"loss": 0.0,
"num_tokens": 27235691.0,
"reward": 0.9827622175216675,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9808468818664551,
"rewards/acc_reward/std": 0.027835894376039505,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 367.0,
"completions/max_terminated_length": 367.0,
"completions/mean_length": 156.53125,
"completions/mean_terminated_length": 156.53125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 2.09375,
"epoch": 0.3879472693032015,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.139359698681732e-07,
"loss": 0.0,
"num_tokens": 27372173.0,
"reward": 0.9731494784355164,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9701660871505737,
"rewards/acc_reward/std": 0.020826132968068123,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 206
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 409.0,
"completions/max_terminated_length": 409.0,
"completions/mean_length": 164.34375,
"completions/mean_terminated_length": 164.34375,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"entropy": 2.09375,
"epoch": 0.3898305084745763,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.120527306967984e-07,
"loss": 0.0,
"num_tokens": 27499555.0,
"reward": 0.9984294176101685,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9982548952102661,
"rewards/acc_reward/std": 0.0046535334549844265,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 207
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 416.0,
"completions/max_terminated_length": 416.0,
"completions/mean_length": 157.484375,
"completions/mean_terminated_length": 157.484375,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 2.25,
"epoch": 0.391713747645951,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.101694915254237e-07,
"loss": 0.0,
"num_tokens": 27636098.0,
"reward": 0.9711763262748718,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9679737091064453,
"rewards/acc_reward/std": 0.029890142381191254,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 408.0,
"completions/max_terminated_length": 408.0,
"completions/mean_length": 174.828125,
"completions/mean_terminated_length": 174.828125,
"completions/min_length": 27.0,
"completions/min_terminated_length": 27.0,
"entropy": 2.046875,
"epoch": 0.3935969868173258,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.4382655620574951,
"learning_rate": 6.082862523540489e-07,
"loss": -0.0223,
"num_tokens": 27777591.0,
"reward": 0.9556211829185486,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9524263143539429,
"rewards/acc_reward/std": 0.04751761257648468,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 209
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.0,
"completions/max_terminated_length": 384.0,
"completions/mean_length": 139.09375,
"completions/mean_terminated_length": 139.09375,
"completions/min_length": 52.0,
"completions/min_terminated_length": 52.0,
"entropy": 2.15625,
"epoch": 0.3954802259887006,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.064030131826741e-07,
"loss": 0.0,
"num_tokens": 27906397.0,
"reward": 0.9859374761581421,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.984375,
"rewards/acc_reward/std": 0.0416666679084301,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 363.0,
"completions/max_terminated_length": 363.0,
"completions/mean_length": 144.625,
"completions/mean_terminated_length": 144.625,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 1.8984375,
"epoch": 0.3973634651600753,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 6.045197740112994e-07,
"loss": 0.0,
"num_tokens": 28038757.0,
"reward": 0.9920339584350586,
"reward_std": 0.0,
"rewards/acc_reward/mean": 0.9911487698554993,
"rewards/acc_reward/std": 0.013452098704874516,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 211
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 455.0,
"completions/max_terminated_length": 455.0,
"completions/mean_length": 171.578125,
"completions/mean_terminated_length": 171.578125,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"entropy": 2.1875,
"epoch": 0.3992467043314501,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.4742763042449951,
"learning_rate": 6.026365348399246e-07,
"loss": 0.0537,
"num_tokens": 28163002.0,
"reward": 0.9887884259223938,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9892787933349609,
"rewards/acc_reward/std": 0.023635946214199066,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.0,
"completions/max_terminated_length": 381.0,
"completions/mean_length": 135.84375,
"completions/mean_terminated_length": 135.84375,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 1.921875,
"epoch": 0.4011299435028249,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.6700330972671509,
"learning_rate": 6.007532956685499e-07,
"loss": -0.0233,
"num_tokens": 28279440.0,
"reward": 0.9760647416114807,
"reward_std": 0.004419418517500162,
"rewards/acc_reward/mean": 0.9751413464546204,
"rewards/acc_reward/std": 0.03155434504151344,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 213
}
],
"logging_steps": 1.0,
"max_steps": 531,
"num_input_tokens_seen": 28279440,
"num_train_epochs": 1,
"save_steps": 213,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}