{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4011299435028249, "eval_steps": 500, "global_step": 213, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 512.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 290.59375, "completions/mean_terminated_length": 271.83050537109375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 2.203125, "epoch": 0.0018832391713747645, "frac_reward_zero_std": 0.0, "grad_norm": 1.8694089651107788, "learning_rate": 1e-06, "loss": -0.0086, "num_tokens": 140262.0, "reward": 0.206417053937912, "reward_std": 0.12193800508975983, "rewards/acc_reward/mean": 0.19810229539871216, "rewards/acc_reward/std": 0.28156620264053345, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.4531635046005249, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 247.734375, "completions/mean_terminated_length": 243.53970336914062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 1.6796875, "epoch": 0.003766478342749529, "frac_reward_zero_std": 0.0, "grad_norm": 1.8876559734344482, "learning_rate": 9.981167608286253e-07, "loss": 0.0222, "num_tokens": 273701.0, "reward": 0.41670364141464233, "reward_std": 0.150786355137825, "rewards/acc_reward/mean": 0.42307350039482117, "rewards/acc_reward/std": 0.29976290464401245, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.4836103618144989, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 236.046875, "completions/mean_terminated_length": 236.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 1.6640625, "epoch": 0.005649717514124294, "frac_reward_zero_std": 0.0, "grad_norm": 1.8232742547988892, "learning_rate": 9.962335216572504e-07, "loss": 0.0698, "num_tokens": 396392.0, "reward": 0.4056413769721985, "reward_std": 0.17224639654159546, "rewards/acc_reward/mean": 0.3864765465259552, "rewards/acc_reward/std": 0.3097887337207794, "rewards/format_reward/mean": 0.578125, "rewards/format_reward/std": 0.49776285886764526, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 227.8125, "completions/mean_terminated_length": 227.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 1.6484375, "epoch": 0.007532956685499058, "frac_reward_zero_std": 0.0, "grad_norm": 2.0226917266845703, "learning_rate": 9.943502824858757e-07, "loss": -0.0298, "num_tokens": 523036.0, "reward": 0.3570261001586914, "reward_std": 0.12964516878128052, "rewards/acc_reward/mean": 0.33766794204711914, "rewards/acc_reward/std": 0.35702335834503174, "rewards/format_reward/mean": 0.53125, "rewards/format_reward/std": 0.5029674172401428, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 214.515625, "completions/mean_terminated_length": 214.515625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 1.8515625, "epoch": 0.009416195856873822, "frac_reward_zero_std": 0.125, "grad_norm": 1.8740723133087158, "learning_rate": 9.92467043314501e-07, "loss": -0.0099, "num_tokens": 649309.0, "reward": 0.4160749912261963, "reward_std": 0.13374584913253784, "rewards/acc_reward/mean": 0.37723612785339355, "rewards/acc_reward/std": 0.31442132592201233, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.42695629596710205, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 266.046875, "completions/mean_terminated_length": 258.1129150390625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 1.796875, "epoch": 0.011299435028248588, "frac_reward_zero_std": 0.0, "grad_norm": 1.7314085960388184, "learning_rate": 9.905838041431261e-07, "loss": 0.1098, "num_tokens": 788480.0, "reward": 0.2147485464811325, "reward_std": 0.1807367354631424, "rewards/acc_reward/mean": 0.15006783604621887, "rewards/acc_reward/std": 0.3153360188007355, "rewards/format_reward/mean": 0.796875, "rewards/format_reward/std": 0.40550529956817627, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 232.5625, "completions/mean_terminated_length": 232.5625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 1.84375, "epoch": 0.013182674199623353, "frac_reward_zero_std": 0.5, "grad_norm": 1.3759692907333374, "learning_rate": 9.887005649717514e-07, "loss": 0.0142, "num_tokens": 923788.0, "reward": 0.21130093932151794, "reward_std": 0.14664816856384277, "rewards/acc_reward/mean": 0.12713992595672607, "rewards/acc_reward/std": 0.2867279052734375, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 256.796875, "completions/mean_terminated_length": 256.796875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 1.6484375, "epoch": 0.015065913370998116, "frac_reward_zero_std": 0.25, "grad_norm": 1.6152563095092773, "learning_rate": 9.868173258003765e-07, "loss": 0.0757, "num_tokens": 1062207.0, "reward": 0.3737403452396393, "reward_std": 0.14178410172462463, "rewards/acc_reward/mean": 0.30589205026626587, "rewards/acc_reward/std": 0.3208266496658325, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 255.265625, "completions/mean_terminated_length": 238.15000915527344, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 1.765625, "epoch": 0.01694915254237288, "frac_reward_zero_std": 0.25, "grad_norm": 1.4234596490859985, "learning_rate": 9.849340866290019e-07, "loss": 0.0147, "num_tokens": 1205848.0, "reward": 0.3045212924480438, "reward_std": 0.1969890296459198, "rewards/acc_reward/mean": 0.22898197174072266, "rewards/acc_reward/std": 0.3434719443321228, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 224.8125, "completions/mean_terminated_length": 224.8125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 1.8203125, "epoch": 0.018832391713747645, "frac_reward_zero_std": 0.5, "grad_norm": 1.4884109497070312, "learning_rate": 9.830508474576272e-07, "loss": -0.0137, "num_tokens": 1330620.0, "reward": 0.26978251338005066, "reward_std": 0.11548593640327454, "rewards/acc_reward/mean": 0.19038332998752594, "rewards/acc_reward/std": 0.31174740195274353, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 256.890625, "completions/mean_terminated_length": 252.84127807617188, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 1.8046875, "epoch": 0.02071563088512241, "frac_reward_zero_std": 0.625, "grad_norm": 1.1678308248519897, "learning_rate": 9.811676082862523e-07, "loss": 0.0051, "num_tokens": 1474925.0, "reward": 0.21130861341953278, "reward_std": 0.11159157752990723, "rewards/acc_reward/mean": 0.12367624044418335, "rewards/acc_reward/std": 0.28885576128959656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 224.640625, "completions/mean_terminated_length": 224.640625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 1.59375, "epoch": 0.022598870056497175, "frac_reward_zero_std": 0.25, "grad_norm": 1.6417983770370483, "learning_rate": 9.792843691148776e-07, "loss": -0.0083, "num_tokens": 1610966.0, "reward": 0.3159927725791931, "reward_std": 0.1229424774646759, "rewards/acc_reward/mean": 0.24172811210155487, "rewards/acc_reward/std": 0.31462714076042175, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 1.4921875, "epoch": 0.02448210922787194, "frac_reward_zero_std": 0.375, "grad_norm": 1.5047811269760132, "learning_rate": 9.774011299435027e-07, "loss": 0.0277, "num_tokens": 1755810.0, "reward": 0.2687425911426544, "reward_std": 0.14603173732757568, "rewards/acc_reward/mean": 0.1874917596578598, "rewards/acc_reward/std": 0.3036465644836426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 512.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 277.0625, "completions/mean_terminated_length": 257.15252685546875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 1.890625, "epoch": 0.026365348399246705, "frac_reward_zero_std": 0.375, "grad_norm": 1.3763067722320557, "learning_rate": 9.75517890772128e-07, "loss": 0.0409, "num_tokens": 1901926.0, "reward": 0.3287838101387024, "reward_std": 0.17755521833896637, "rewards/acc_reward/mean": 0.2542042136192322, "rewards/acc_reward/std": 0.383696049451828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 232.046875, "completions/mean_terminated_length": 227.60317993164062, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 1.59375, "epoch": 0.02824858757062147, "frac_reward_zero_std": 0.5, "grad_norm": 1.1948471069335938, "learning_rate": 9.736346516007531e-07, "loss": -0.0154, "num_tokens": 2028921.0, "reward": 0.31290093064308167, "reward_std": 0.08047251403331757, "rewards/acc_reward/mean": 0.23655660450458527, "rewards/acc_reward/std": 0.3165181875228882, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 207.6508026123047, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 1.734375, "epoch": 0.030131826741996232, "frac_reward_zero_std": 0.5, "grad_norm": 1.5261269807815552, "learning_rate": 9.717514124293785e-07, "loss": 0.09, "num_tokens": 2165179.0, "reward": 0.30339735746383667, "reward_std": 0.133845254778862, "rewards/acc_reward/mean": 0.22599703073501587, "rewards/acc_reward/std": 0.3543343245983124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 201.40625, "completions/mean_terminated_length": 201.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 1.4375, "epoch": 0.032015065913371, "frac_reward_zero_std": 0.375, "grad_norm": 1.5391112565994263, "learning_rate": 9.698681732580038e-07, "loss": 0.009, "num_tokens": 2296853.0, "reward": 0.1953125, "reward_std": 0.2162797451019287, "rewards/acc_reward/mean": 0.109375, "rewards/acc_reward/std": 0.3145764470100403, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 220.84375, "completions/mean_terminated_length": 216.2222442626953, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 1.7421875, "epoch": 0.03389830508474576, "frac_reward_zero_std": 0.625, "grad_norm": 1.1895616054534912, "learning_rate": 9.679849340866289e-07, "loss": 0.0424, "num_tokens": 2431011.0, "reward": 0.30048421025276184, "reward_std": 0.06477973610162735, "rewards/acc_reward/mean": 0.22276021540164948, "rewards/acc_reward/std": 0.31460005044937134, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 223.953125, "completions/mean_terminated_length": 223.953125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 1.9453125, "epoch": 0.035781544256120526, "frac_reward_zero_std": 0.25, "grad_norm": 1.671203374862671, "learning_rate": 9.661016949152542e-07, "loss": 0.0416, "num_tokens": 2571808.0, "reward": 0.42682912945747375, "reward_std": 0.13467340171337128, "rewards/acc_reward/mean": 0.3631434738636017, "rewards/acc_reward/std": 0.37232744693756104, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 224.140625, "completions/mean_terminated_length": 214.85482788085938, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 1.625, "epoch": 0.03766478342749529, "frac_reward_zero_std": 0.25, "grad_norm": 1.5360738039016724, "learning_rate": 9.642184557438793e-07, "loss": 0.0717, "num_tokens": 2711697.0, "reward": 0.3133315443992615, "reward_std": 0.22658446431159973, "rewards/acc_reward/mean": 0.23703500628471375, "rewards/acc_reward/std": 0.3616204857826233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 222.515625, "completions/mean_terminated_length": 213.1774139404297, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 1.7109375, "epoch": 0.03954802259887006, "frac_reward_zero_std": 0.125, "grad_norm": 1.8884894847869873, "learning_rate": 9.623352165725046e-07, "loss": 0.0024, "num_tokens": 2841242.0, "reward": 0.5347622036933899, "reward_std": 0.15656878054141998, "rewards/acc_reward/mean": 0.4830690622329712, "rewards/acc_reward/std": 0.31471821665763855, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 226.703125, "completions/mean_terminated_length": 222.1746063232422, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 1.84375, "epoch": 0.04143126177024482, "frac_reward_zero_std": 0.125, "grad_norm": 1.8899277448654175, "learning_rate": 9.6045197740113e-07, "loss": 0.0705, "num_tokens": 2963015.0, "reward": 0.43775349855422974, "reward_std": 0.2172819972038269, "rewards/acc_reward/mean": 0.3752816617488861, "rewards/acc_reward/std": 0.3713799715042114, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 220.3870849609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 1.7421875, "epoch": 0.04331450094161959, "frac_reward_zero_std": 0.25, "grad_norm": 1.6153416633605957, "learning_rate": 9.58568738229755e-07, "loss": -0.0054, "num_tokens": 3109447.0, "reward": 0.32855772972106934, "reward_std": 0.27425360679626465, "rewards/acc_reward/mean": 0.2556891441345215, "rewards/acc_reward/std": 0.40467146039009094, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 212.984375, "completions/mean_terminated_length": 212.984375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 1.484375, "epoch": 0.04519774011299435, "frac_reward_zero_std": 0.125, "grad_norm": 1.8740508556365967, "learning_rate": 9.566854990583804e-07, "loss": 0.0035, "num_tokens": 3255782.0, "reward": 0.4697909355163574, "reward_std": 0.18794436752796173, "rewards/acc_reward/mean": 0.41261494159698486, "rewards/acc_reward/std": 0.39874467253685, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 1.9375, "epoch": 0.047080979284369114, "frac_reward_zero_std": 0.0, "grad_norm": 1.9496995210647583, "learning_rate": 9.548022598870055e-07, "loss": 0.1125, "num_tokens": 3398398.0, "reward": 0.43061739206314087, "reward_std": 0.25355520844459534, "rewards/acc_reward/mean": 0.36735260486602783, "rewards/acc_reward/std": 0.3482816219329834, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 236.890625, "completions/mean_terminated_length": 236.890625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 1.765625, "epoch": 0.04896421845574388, "frac_reward_zero_std": 0.125, "grad_norm": 1.7656739950180054, "learning_rate": 9.529190207156308e-07, "loss": 0.0431, "num_tokens": 3534743.0, "reward": 0.49807009100914, "reward_std": 0.23242174088954926, "rewards/acc_reward/mean": 0.44230008125305176, "rewards/acc_reward/std": 0.38975948095321655, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 187.546875, "completions/mean_terminated_length": 187.546875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 1.78125, "epoch": 0.05084745762711865, "frac_reward_zero_std": 0.125, "grad_norm": 2.076228618621826, "learning_rate": 9.510357815442561e-07, "loss": 0.0356, "num_tokens": 3659290.0, "reward": 0.44223499298095703, "reward_std": 0.21930678188800812, "rewards/acc_reward/mean": 0.383733332157135, "rewards/acc_reward/std": 0.3709230422973633, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 209.65625, "completions/mean_terminated_length": 209.65625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 1.875, "epoch": 0.05273069679849341, "frac_reward_zero_std": 0.125, "grad_norm": 1.8770859241485596, "learning_rate": 9.491525423728813e-07, "loss": 0.0769, "num_tokens": 3796772.0, "reward": 0.41265854239463806, "reward_std": 0.3168802857398987, "rewards/acc_reward/mean": 0.34913450479507446, "rewards/acc_reward/std": 0.42575517296791077, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 2.140625, "epoch": 0.054613935969868174, "frac_reward_zero_std": 0.125, "grad_norm": 2.1020395755767822, "learning_rate": 9.472693032015065e-07, "loss": 0.0712, "num_tokens": 3925692.0, "reward": 0.41072791814804077, "reward_std": 0.3003063201904297, "rewards/acc_reward/mean": 0.34698933362960815, "rewards/acc_reward/std": 0.4590102732181549, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 1.9609375, "epoch": 0.05649717514124294, "frac_reward_zero_std": 0.125, "grad_norm": 2.122809410095215, "learning_rate": 9.453860640301318e-07, "loss": -0.0219, "num_tokens": 4063328.0, "reward": 0.4698576331138611, "reward_std": 0.30180490016937256, "rewards/acc_reward/mean": 0.41095292568206787, "rewards/acc_reward/std": 0.4547015130519867, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 185.6875, "completions/mean_terminated_length": 185.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 1.8359375, "epoch": 0.0583804143126177, "frac_reward_zero_std": 0.125, "grad_norm": 2.101128578186035, "learning_rate": 9.43502824858757e-07, "loss": 0.0325, "num_tokens": 4191876.0, "reward": 0.4844944179058075, "reward_std": 0.30700868368148804, "rewards/acc_reward/mean": 0.42895209789276123, "rewards/acc_reward/std": 0.4517141282558441, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 174.22222900390625, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 1.859375, "epoch": 0.060263653483992465, "frac_reward_zero_std": 0.0, "grad_norm": 2.2371370792388916, "learning_rate": 9.416195856873822e-07, "loss": -0.0023, "num_tokens": 4322148.0, "reward": 0.5825260281562805, "reward_std": 0.3706633746623993, "rewards/acc_reward/mean": 0.536139965057373, "rewards/acc_reward/std": 0.4555891752243042, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 188.6875, "completions/mean_terminated_length": 188.6875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.625, "epoch": 0.062146892655367235, "frac_reward_zero_std": 0.0, "grad_norm": 1.9997694492340088, "learning_rate": 9.397363465160075e-07, "loss": 0.0677, "num_tokens": 4460208.0, "reward": 0.6379782557487488, "reward_std": 0.284807950258255, "rewards/acc_reward/mean": 0.599489688873291, "rewards/acc_reward/std": 0.42208555340766907, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 2.28125, "epoch": 0.064030131826742, "frac_reward_zero_std": 0.0, "grad_norm": 2.249376058578491, "learning_rate": 9.378531073446327e-07, "loss": 0.0497, "num_tokens": 4598760.0, "reward": 0.6912428140640259, "reward_std": 0.350800096988678, "rewards/acc_reward/mean": 0.658672571182251, "rewards/acc_reward/std": 0.45664042234420776, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 171.703125, "completions/mean_terminated_length": 171.703125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 1.9609375, "epoch": 0.06591337099811675, "frac_reward_zero_std": 0.125, "grad_norm": 1.9761137962341309, "learning_rate": 9.359698681732579e-07, "loss": 0.0538, "num_tokens": 4729973.0, "reward": 0.7882718443870544, "reward_std": 0.24334368109703064, "rewards/acc_reward/mean": 0.7664825916290283, "rewards/acc_reward/std": 0.34395766258239746, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 165.21875, "completions/mean_terminated_length": 165.21875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 1.9375, "epoch": 0.06779661016949153, "frac_reward_zero_std": 0.0, "grad_norm": 2.3349790573120117, "learning_rate": 9.340866290018831e-07, "loss": 0.0651, "num_tokens": 4870371.0, "reward": 0.7641240358352661, "reward_std": 0.3322174549102783, "rewards/acc_reward/mean": 0.7396517395973206, "rewards/acc_reward/std": 0.409751832485199, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 164.234375, "completions/mean_terminated_length": 158.71429443359375, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 1.9765625, "epoch": 0.0696798493408663, "frac_reward_zero_std": 0.125, "grad_norm": 2.1828882694244385, "learning_rate": 9.322033898305083e-07, "loss": 0.0297, "num_tokens": 5002066.0, "reward": 0.7222064733505249, "reward_std": 0.2505956292152405, "rewards/acc_reward/mean": 0.6913405656814575, "rewards/acc_reward/std": 0.3850165903568268, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 184.1269989013672, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 2.203125, "epoch": 0.07156308851224105, "frac_reward_zero_std": 0.0, "grad_norm": 2.241468906402588, "learning_rate": 9.303201506591337e-07, "loss": 0.0657, "num_tokens": 5135842.0, "reward": 0.767461359500885, "reward_std": 0.23975765705108643, "rewards/acc_reward/mean": 0.74509596824646, "rewards/acc_reward/std": 0.3569471538066864, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 167.296875, "completions/mean_terminated_length": 161.82540893554688, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.21875, "epoch": 0.07344632768361582, "frac_reward_zero_std": 0.125, "grad_norm": 2.625629186630249, "learning_rate": 9.28436911487759e-07, "loss": 0.0755, "num_tokens": 5268213.0, "reward": 0.7769017219543457, "reward_std": 0.301779568195343, "rewards/acc_reward/mean": 0.7555853128433228, "rewards/acc_reward/std": 0.3923026919364929, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 165.77047729492188, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 2.34375, "epoch": 0.07532956685499058, "frac_reward_zero_std": 0.25, "grad_norm": 1.8064289093017578, "learning_rate": 9.265536723163842e-07, "loss": -0.0294, "num_tokens": 5400085.0, "reward": 0.8888776302337646, "reward_std": 0.2301492542028427, "rewards/acc_reward/mean": 0.8782668113708496, "rewards/acc_reward/std": 0.31358516216278076, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 170.578125, "completions/mean_terminated_length": 165.15872192382812, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 2.375, "epoch": 0.07721280602636535, "frac_reward_zero_std": 0.375, "grad_norm": 1.6493501663208008, "learning_rate": 9.246704331450094e-07, "loss": -0.034, "num_tokens": 5529746.0, "reward": 0.8990048170089722, "reward_std": 0.1494266837835312, "rewards/acc_reward/mean": 0.889519214630127, "rewards/acc_reward/std": 0.26512882113456726, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 158.859375, "completions/mean_terminated_length": 158.859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 2.4375, "epoch": 0.07909604519774012, "frac_reward_zero_std": 0.5, "grad_norm": 1.683705449104309, "learning_rate": 9.227871939736346e-07, "loss": 0.0056, "num_tokens": 5668777.0, "reward": 0.9233179092407227, "reward_std": 0.14834892749786377, "rewards/acc_reward/mean": 0.9165338277816772, "rewards/acc_reward/std": 0.2692948877811432, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 187.859375, "completions/mean_terminated_length": 187.859375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 2.46875, "epoch": 0.08097928436911488, "frac_reward_zero_std": 0.625, "grad_norm": 1.3769292831420898, "learning_rate": 9.209039548022598e-07, "loss": 0.0487, "num_tokens": 5800544.0, "reward": 0.9674270153045654, "reward_std": 0.048613592982292175, "rewards/acc_reward/mean": 0.9672800302505493, "rewards/acc_reward/std": 0.12451620399951935, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 164.546875, "completions/mean_terminated_length": 153.33871459960938, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 2.671875, "epoch": 0.08286252354048965, "frac_reward_zero_std": 0.75, "grad_norm": 1.0136916637420654, "learning_rate": 9.190207156308852e-07, "loss": -0.0441, "num_tokens": 5929459.0, "reward": 0.9583332538604736, "reward_std": 0.016743799671530724, "rewards/acc_reward/mean": 0.9571758508682251, "rewards/acc_reward/std": 0.0556831993162632, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 166.890625, "completions/mean_terminated_length": 166.890625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 2.25, "epoch": 0.0847457627118644, "frac_reward_zero_std": 0.125, "grad_norm": 2.0560693740844727, "learning_rate": 9.171374764595104e-07, "loss": -0.041, "num_tokens": 6061324.0, "reward": 0.9419167041778564, "reward_std": 0.08913865685462952, "rewards/acc_reward/mean": 0.9458796381950378, "rewards/acc_reward/std": 0.15155728161334991, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 184.484375, "completions/mean_terminated_length": 184.484375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 2.453125, "epoch": 0.08662900188323917, "frac_reward_zero_std": 0.5, "grad_norm": 1.4456909894943237, "learning_rate": 9.152542372881356e-07, "loss": 0.0865, "num_tokens": 6199275.0, "reward": 0.956250011920929, "reward_std": 0.1237436980009079, "rewards/acc_reward/mean": 0.953125, "rewards/acc_reward/std": 0.21304203569889069, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 182.65625, "completions/mean_terminated_length": 172.03225708007812, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 2.734375, "epoch": 0.08851224105461393, "frac_reward_zero_std": 0.875, "grad_norm": 0.9000487923622131, "learning_rate": 9.133709981167608e-07, "loss": 0.0061, "num_tokens": 6339829.0, "reward": 0.9662767648696899, "reward_std": 0.039774756878614426, "rewards/acc_reward/mean": 0.9625297784805298, "rewards/acc_reward/std": 0.12811994552612305, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 168.952392578125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 2.546875, "epoch": 0.0903954802259887, "frac_reward_zero_std": 0.75, "grad_norm": 1.0134769678115845, "learning_rate": 9.11487758945386e-07, "loss": -0.0181, "num_tokens": 6473609.0, "reward": 0.9744918346405029, "reward_std": 0.008838837035000324, "rewards/acc_reward/mean": 0.9751298427581787, "rewards/acc_reward/std": 0.04131903871893883, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 188.421875, "completions/mean_terminated_length": 177.98385620117188, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 2.625, "epoch": 0.09227871939736347, "frac_reward_zero_std": 0.625, "grad_norm": 1.7283865213394165, "learning_rate": 9.096045197740112e-07, "loss": 0.0749, "num_tokens": 6607932.0, "reward": 0.9752188920974731, "reward_std": 0.015992172062397003, "rewards/acc_reward/mean": 0.981145977973938, "rewards/acc_reward/std": 0.025175929069519043, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.27048972249031067, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 194.421875, "completions/mean_terminated_length": 194.421875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "entropy": 2.53125, "epoch": 0.09416195856873823, "frac_reward_zero_std": 0.75, "grad_norm": 1.1576530933380127, "learning_rate": 9.077212806026365e-07, "loss": 0.0122, "num_tokens": 6752119.0, "reward": 0.9867604374885559, "reward_std": 0.007432654500007629, "rewards/acc_reward/mean": 0.9870254993438721, "rewards/acc_reward/std": 0.017179692164063454, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 186.71875, "completions/mean_terminated_length": 186.71875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 2.78125, "epoch": 0.096045197740113, "frac_reward_zero_std": 0.125, "grad_norm": 2.141554355621338, "learning_rate": 9.058380414312617e-07, "loss": -0.0311, "num_tokens": 6890533.0, "reward": 0.9580291509628296, "reward_std": 0.0707060694694519, "rewards/acc_reward/mean": 0.9637823700904846, "rewards/acc_reward/std": 0.13111592829227448, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.29378482699394226, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 169.390625, "completions/mean_terminated_length": 163.952392578125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 2.421875, "epoch": 0.09792843691148775, "frac_reward_zero_std": 0.875, "grad_norm": 0.8654880523681641, "learning_rate": 9.03954802259887e-07, "loss": 0.0053, "num_tokens": 7027358.0, "reward": 0.9961555004119873, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9974644780158997, "rewards/acc_reward/std": 0.006761432159692049, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 176.078125, "completions/mean_terminated_length": 170.74603271484375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 2.609375, "epoch": 0.09981167608286252, "frac_reward_zero_std": 0.625, "grad_norm": 1.4219504594802856, "learning_rate": 9.020715630885122e-07, "loss": -0.0008, "num_tokens": 7162123.0, "reward": 0.9900810718536377, "reward_std": 0.021838055923581123, "rewards/acc_reward/mean": 0.9924511909484863, "rewards/acc_reward/std": 0.04345937818288803, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 165.71429443359375, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 2.453125, "epoch": 0.1016949152542373, "frac_reward_zero_std": 0.875, "grad_norm": 0.9306376576423645, "learning_rate": 9.001883239171374e-07, "loss": 0.0039, "num_tokens": 7294099.0, "reward": 0.9847477674484253, "reward_std": 0.005786377005279064, "rewards/acc_reward/mean": 0.986525297164917, "rewards/acc_reward/std": 0.02815171889960766, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 185.28125, "completions/mean_terminated_length": 185.28125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 2.59375, "epoch": 0.10357815442561205, "frac_reward_zero_std": 0.75, "grad_norm": 1.0906463861465454, "learning_rate": 8.983050847457627e-07, "loss": -0.0101, "num_tokens": 7421381.0, "reward": 0.9820950031280518, "reward_std": 0.013950306922197342, "rewards/acc_reward/mean": 0.9818416833877563, "rewards/acc_reward/std": 0.03777196630835533, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 189.296875, "completions/mean_terminated_length": 189.296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 2.71875, "epoch": 0.10546139359698682, "frac_reward_zero_std": 0.875, "grad_norm": 1.0221842527389526, "learning_rate": 8.964218455743879e-07, "loss": -0.0019, "num_tokens": 7550840.0, "reward": 0.9848359823226929, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9848872423171997, "rewards/acc_reward/std": 0.026450620964169502, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 204.6875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 2.609375, "epoch": 0.10734463276836158, "frac_reward_zero_std": 0.75, "grad_norm": 1.1911293268203735, "learning_rate": 8.945386064030131e-07, "loss": 0.0145, "num_tokens": 7693764.0, "reward": 0.9755189418792725, "reward_std": 0.04419417679309845, "rewards/acc_reward/mean": 0.9745348691940308, "rewards/acc_reward/std": 0.1250728815793991, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 210.546875, "completions/mean_terminated_length": 210.546875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 2.65625, "epoch": 0.10922787193973635, "frac_reward_zero_std": 0.875, "grad_norm": 0.7357279062271118, "learning_rate": 8.926553672316383e-07, "loss": 0.0164, "num_tokens": 7828263.0, "reward": 0.9803258180618286, "reward_std": 0.007434290833771229, "rewards/acc_reward/mean": 0.9781398177146912, "rewards/acc_reward/std": 0.03639683872461319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 154.046875, "completions/mean_terminated_length": 154.046875, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 2.5625, "epoch": 0.1111111111111111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.907721280602636e-07, "loss": 0.0, "num_tokens": 7968106.0, "reward": 0.9852668046951294, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9836297631263733, "rewards/acc_reward/std": 0.024946285411715508, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 177.296875, "completions/mean_terminated_length": 166.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 2.546875, "epoch": 0.11299435028248588, "frac_reward_zero_std": 0.875, "grad_norm": 1.5839695930480957, "learning_rate": 8.888888888888888e-07, "loss": 0.0779, "num_tokens": 8098157.0, "reward": 0.9858125448226929, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9859722852706909, "rewards/acc_reward/std": 0.019654173403978348, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 161.71429443359375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.765625, "epoch": 0.11487758945386065, "frac_reward_zero_std": 0.75, "grad_norm": 1.4451484680175781, "learning_rate": 8.870056497175141e-07, "loss": -0.0, "num_tokens": 8227641.0, "reward": 0.9645360708236694, "reward_std": 0.008838837035000324, "rewards/acc_reward/mean": 0.964067816734314, "rewards/acc_reward/std": 0.040923893451690674, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 181.734375, "completions/mean_terminated_length": 181.734375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "entropy": 2.625, "epoch": 0.1167608286252354, "frac_reward_zero_std": 0.875, "grad_norm": 0.3860294222831726, "learning_rate": 8.851224105461393e-07, "loss": -0.0431, "num_tokens": 8359496.0, "reward": 0.991644024848938, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9924516677856445, "rewards/acc_reward/std": 0.020128827542066574, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 207.34375, "completions/mean_terminated_length": 202.50794982910156, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 2.75, "epoch": 0.11864406779661017, "frac_reward_zero_std": 0.875, "grad_norm": 0.6225361227989197, "learning_rate": 8.832391713747645e-07, "loss": -0.0011, "num_tokens": 8500670.0, "reward": 0.9970052242279053, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9984085559844971, "rewards/acc_reward/std": 0.004243890754878521, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 164.25396728515625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 2.3125, "epoch": 0.12052730696798493, "frac_reward_zero_std": 0.625, "grad_norm": 1.537191390991211, "learning_rate": 8.813559322033897e-07, "loss": 0.0021, "num_tokens": 8631754.0, "reward": 0.9799120426177979, "reward_std": 0.013258256018161774, "rewards/acc_reward/mean": 0.982888400554657, "rewards/acc_reward/std": 0.022922541946172714, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 172.59375, "completions/mean_terminated_length": 172.59375, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 2.609375, "epoch": 0.1224105461393597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.79472693032015e-07, "loss": 0.0, "num_tokens": 8762784.0, "reward": 0.9871211051940918, "reward_std": 0.0, "rewards/acc_reward/mean": 0.985690176486969, "rewards/acc_reward/std": 0.017727959901094437, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 181.234375, "completions/mean_terminated_length": 181.234375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.453125, "epoch": 0.12429378531073447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.775894538606404e-07, "loss": 0.0, "num_tokens": 8887127.0, "reward": 0.9676999449729919, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9641109704971313, "rewards/acc_reward/std": 0.03724094480276108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 163.078125, "completions/mean_terminated_length": 157.53968811035156, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.296875, "epoch": 0.12617702448210924, "frac_reward_zero_std": 0.875, "grad_norm": 0.6480050086975098, "learning_rate": 8.757062146892656e-07, "loss": -0.0023, "num_tokens": 9024028.0, "reward": 0.9807539582252502, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9803516268730164, "rewards/acc_reward/std": 0.02998371422290802, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 150.859375, "completions/mean_terminated_length": 150.859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 2.375, "epoch": 0.128060263653484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.738229755178908e-07, "loss": 0.0, "num_tokens": 9149267.0, "reward": 1.0, "reward_std": 0.0, "rewards/acc_reward/mean": 1.0, "rewards/acc_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 156.34375, "completions/mean_terminated_length": 156.34375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 2.359375, "epoch": 0.12994350282485875, "frac_reward_zero_std": 0.75, "grad_norm": 1.1719481945037842, "learning_rate": 8.71939736346516e-07, "loss": -0.0147, "num_tokens": 9283337.0, "reward": 0.9746325016021729, "reward_std": 0.04419417679309845, "rewards/acc_reward/mean": 0.9735499620437622, "rewards/acc_reward/std": 0.1257747858762741, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 153.515625, "completions/mean_terminated_length": 153.515625, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "entropy": 2.53125, "epoch": 0.1318267419962335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.700564971751412e-07, "loss": 0.0, "num_tokens": 9422506.0, "reward": 0.9936791658401489, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9929767847061157, "rewards/acc_reward/std": 0.018728474155068398, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 159.90625, "completions/mean_terminated_length": 159.90625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.234375, "epoch": 0.1337099811676083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.681732580037664e-07, "loss": 0.0, "num_tokens": 9558724.0, "reward": 0.9724128246307373, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9693475365638733, "rewards/acc_reward/std": 0.049584269523620605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 171.421875, "completions/mean_terminated_length": 171.421875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 2.65625, "epoch": 0.13559322033898305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.662900188323917e-07, "loss": 0.0, "num_tokens": 9696479.0, "reward": 0.9782567620277405, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9758408665657043, "rewards/acc_reward/std": 0.035969410091638565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 146.640625, "completions/mean_terminated_length": 146.640625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 2.1875, "epoch": 0.1374764595103578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.64406779661017e-07, "loss": 0.0, "num_tokens": 9833768.0, "reward": 0.9815881252288818, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9795423150062561, "rewards/acc_reward/std": 0.03601390868425369, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 159.484375, "completions/mean_terminated_length": 153.88888549804688, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.40625, "epoch": 0.1393596986817326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.625235404896422e-07, "loss": 0.0, "num_tokens": 9965639.0, "reward": 0.9888086318969727, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9875651597976685, "rewards/acc_reward/std": 0.02026844024658203, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 159.8125, "completions/mean_terminated_length": 159.8125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 2.28125, "epoch": 0.14124293785310735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.606403013182674e-07, "loss": 0.0, "num_tokens": 10102131.0, "reward": 0.9853819608688354, "reward_std": 0.0, "rewards/acc_reward/mean": 0.983757734298706, "rewards/acc_reward/std": 0.023104524239897728, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 184.40625, "completions/mean_terminated_length": 173.8386993408203, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 2.390625, "epoch": 0.1431261770244821, "frac_reward_zero_std": 0.875, "grad_norm": 0.7094928026199341, "learning_rate": 8.587570621468926e-07, "loss": -0.0279, "num_tokens": 10234637.0, "reward": 0.9758157730102539, "reward_std": 0.009495548903942108, "rewards/acc_reward/mean": 0.973128616809845, "rewards/acc_reward/std": 0.04172620549798012, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 187.34375, "completions/mean_terminated_length": 187.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.484375, "epoch": 0.14500941619585686, "frac_reward_zero_std": 0.875, "grad_norm": 0.6446596384048462, "learning_rate": 8.568738229755178e-07, "loss": -0.0238, "num_tokens": 10380091.0, "reward": 0.9779398441314697, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9772248268127441, "rewards/acc_reward/std": 0.026913031935691833, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 161.046875, "completions/mean_terminated_length": 161.046875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 2.359375, "epoch": 0.14689265536723164, "frac_reward_zero_std": 0.875, "grad_norm": 0.9724075198173523, "learning_rate": 8.549905838041431e-07, "loss": 0.0269, "num_tokens": 10505822.0, "reward": 0.988434910774231, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9888859987258911, "rewards/acc_reward/std": 0.024498289451003075, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 2.375, "epoch": 0.1487758945386064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.531073446327683e-07, "loss": 0.0, "num_tokens": 10638682.0, "reward": 0.990105152130127, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9890057444572449, "rewards/acc_reward/std": 0.019625553861260414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 159.6190643310547, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 2.734375, "epoch": 0.15065913370998116, "frac_reward_zero_std": 0.875, "grad_norm": 0.6837416291236877, "learning_rate": 8.512241054613935e-07, "loss": -0.0191, "num_tokens": 10769954.0, "reward": 0.98872971534729, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9892135858535767, "rewards/acc_reward/std": 0.02876383066177368, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 184.4375, "completions/mean_terminated_length": 179.23809814453125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 2.625, "epoch": 0.15254237288135594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.493408662900188e-07, "loss": 0.0, "num_tokens": 10908782.0, "reward": 0.9871925115585327, "reward_std": 0.0, "rewards/acc_reward/mean": 0.985769510269165, "rewards/acc_reward/std": 0.02491430565714836, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 155.265625, "completions/mean_terminated_length": 155.265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 2.4375, "epoch": 0.1544256120527307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.47457627118644e-07, "loss": 0.0, "num_tokens": 11042303.0, "reward": 0.9902667999267578, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9891853332519531, "rewards/acc_reward/std": 0.015684949234128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 153.015625, "completions/mean_terminated_length": 153.015625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 2.5, "epoch": 0.15630885122410546, "frac_reward_zero_std": 0.875, "grad_norm": 0.6075497269630432, "learning_rate": 8.455743879472693e-07, "loss": -0.027, "num_tokens": 11173760.0, "reward": 0.9887193441390991, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.989202082157135, "rewards/acc_reward/std": 0.012971931137144566, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 142.296875, "completions/mean_terminated_length": 142.296875, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 2.265625, "epoch": 0.15819209039548024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.436911487758945e-07, "loss": 0.0, "num_tokens": 11310011.0, "reward": 0.9917968511581421, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9908854365348816, "rewards/acc_reward/std": 0.01598946936428547, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.265625, "epoch": 0.160075329566855, "frac_reward_zero_std": 0.875, "grad_norm": 0.6712620854377747, "learning_rate": 8.418079096045197e-07, "loss": -0.0066, "num_tokens": 11440675.0, "reward": 0.9767186641693115, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9758679866790771, "rewards/acc_reward/std": 0.04487896338105202, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 172.578125, "completions/mean_terminated_length": 172.578125, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 2.640625, "epoch": 0.16195856873822975, "frac_reward_zero_std": 0.875, "grad_norm": 0.770696222782135, "learning_rate": 8.399246704331449e-07, "loss": -0.0185, "num_tokens": 11569064.0, "reward": 0.9869691133499146, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9872573614120483, "rewards/acc_reward/std": 0.022249845787882805, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 172.6875, "completions/mean_terminated_length": 172.6875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 2.3125, "epoch": 0.1638418079096045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.380414312617701e-07, "loss": 0.0, "num_tokens": 11708700.0, "reward": 0.9803584814071655, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9781760573387146, "rewards/acc_reward/std": 0.02318427711725235, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 143.5483856201172, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 2.40625, "epoch": 0.1657250470809793, "frac_reward_zero_std": 0.875, "grad_norm": 0.9133210778236389, "learning_rate": 8.361581920903954e-07, "loss": 0.0155, "num_tokens": 11832608.0, "reward": 0.9537662267684937, "reward_std": 0.006725744344294071, "rewards/acc_reward/mean": 0.9486291408538818, "rewards/acc_reward/std": 0.06949032843112946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 170.796875, "completions/mean_terminated_length": 170.796875, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "entropy": 2.359375, "epoch": 0.16760828625235405, "frac_reward_zero_std": 0.75, "grad_norm": 0.9491801857948303, "learning_rate": 8.342749529190208e-07, "loss": -0.0359, "num_tokens": 11968843.0, "reward": 0.9640452265739441, "reward_std": 0.010205795988440514, "rewards/acc_reward/mean": 0.9652585983276367, "rewards/acc_reward/std": 0.046331144869327545, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21304203569889069, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 151.96875, "completions/mean_terminated_length": 151.96875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 2.3125, "epoch": 0.1694915254237288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.32391713747646e-07, "loss": 0.0, "num_tokens": 12106473.0, "reward": 0.9978047609329224, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9975607991218567, "rewards/acc_reward/std": 0.006504515651613474, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.484375, "epoch": 0.1713747645951036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.305084745762712e-07, "loss": 0.0, "num_tokens": 12241945.0, "reward": 0.9895379543304443, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9883755445480347, "rewards/acc_reward/std": 0.018741585314273834, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 165.765625, "completions/mean_terminated_length": 165.765625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 2.34375, "epoch": 0.17325800376647835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.286252354048964e-07, "loss": 0.0, "num_tokens": 12375658.0, "reward": 0.9758948087692261, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9732165336608887, "rewards/acc_reward/std": 0.04799450561404228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 195.265625, "completions/mean_terminated_length": 190.23809814453125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 2.46875, "epoch": 0.1751412429378531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.267419962335216e-07, "loss": 0.0, "num_tokens": 12513219.0, "reward": 0.9753564596176147, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9726182222366333, "rewards/acc_reward/std": 0.045861802995204926, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 2.1875, "epoch": 0.17702448210922786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.248587570621468e-07, "loss": 0.0, "num_tokens": 12648475.0, "reward": 0.9844207763671875, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9826897382736206, "rewards/acc_reward/std": 0.02190142311155796, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 187.34375, "completions/mean_terminated_length": 182.19049072265625, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 2.5625, "epoch": 0.17890772128060264, "frac_reward_zero_std": 0.75, "grad_norm": 1.2475651502609253, "learning_rate": 8.229755178907722e-07, "loss": -0.0212, "num_tokens": 12785969.0, "reward": 0.9898383617401123, "reward_std": 0.008838837035000324, "rewards/acc_reward/mean": 0.9921815395355225, "rewards/acc_reward/std": 0.015049039386212826, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 2.34375, "epoch": 0.1807909604519774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.210922787193974e-07, "loss": 0.0, "num_tokens": 12907345.0, "reward": 0.9843592047691345, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9826213121414185, "rewards/acc_reward/std": 0.03606174886226654, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 168.28125, "completions/mean_terminated_length": 168.28125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 2.171875, "epoch": 0.18267419962335216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.192090395480226e-07, "loss": 0.0, "num_tokens": 13033539.0, "reward": 0.9740588665008545, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9711765050888062, "rewards/acc_reward/std": 0.033579710870981216, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 164.03125, "completions/mean_terminated_length": 158.5079345703125, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 2.546875, "epoch": 0.18455743879472694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.173258003766478e-07, "loss": 0.0, "num_tokens": 13159461.0, "reward": 0.983502984046936, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9816700220108032, "rewards/acc_reward/std": 0.024904713034629822, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 174.546875, "completions/mean_terminated_length": 169.19049072265625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 2.140625, "epoch": 0.1864406779661017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.15442561205273e-07, "loss": 0.0, "num_tokens": 13287976.0, "reward": 0.9533977508544922, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9482197761535645, "rewards/acc_reward/std": 0.043142788112163544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 161.15625, "completions/mean_terminated_length": 155.58731079101562, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 2.25, "epoch": 0.18832391713747645, "frac_reward_zero_std": 0.875, "grad_norm": 0.5508601665496826, "learning_rate": 8.135593220338983e-07, "loss": -0.0178, "num_tokens": 13424754.0, "reward": 0.9814343452453613, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9811076521873474, "rewards/acc_reward/std": 0.01872284896671772, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 152.96875, "completions/mean_terminated_length": 152.96875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.40625, "epoch": 0.1902071563088512, "frac_reward_zero_std": 0.875, "grad_norm": 0.733428418636322, "learning_rate": 8.116760828625235e-07, "loss": -0.0236, "num_tokens": 13554768.0, "reward": 0.9719411730766296, "reward_std": 0.039774756878614426, "rewards/acc_reward/mean": 0.9688235521316528, "rewards/acc_reward/std": 0.1259699910879135, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 158.1587371826172, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 2.28125, "epoch": 0.192090395480226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.097928436911488e-07, "loss": 0.0, "num_tokens": 13684028.0, "reward": 0.9804370403289795, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9782633781433105, "rewards/acc_reward/std": 0.03396952524781227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 189.28125, "completions/mean_terminated_length": 178.8709716796875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 2.5625, "epoch": 0.19397363465160075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.07909604519774e-07, "loss": 0.0, "num_tokens": 13813006.0, "reward": 0.9762270450592041, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9735856056213379, "rewards/acc_reward/std": 0.05650464445352554, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.515625, "epoch": 0.1958568738229755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.060263653483992e-07, "loss": 0.0, "num_tokens": 13945954.0, "reward": 0.9901642799377441, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9890714883804321, "rewards/acc_reward/std": 0.014226200059056282, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 175.34375, "completions/mean_terminated_length": 175.34375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 2.515625, "epoch": 0.1977401129943503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.041431261770244e-07, "loss": 0.0, "num_tokens": 14080760.0, "reward": 0.9843112826347351, "reward_std": 0.0, "rewards/acc_reward/mean": 0.982568085193634, "rewards/acc_reward/std": 0.029767252504825592, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 162.34375, "completions/mean_terminated_length": 162.34375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 2.25, "epoch": 0.19962335216572505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.022598870056497e-07, "loss": 0.0, "num_tokens": 14202574.0, "reward": 0.9882901310920715, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9869890213012695, "rewards/acc_reward/std": 0.017871392890810966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 177.140625, "completions/mean_terminated_length": 177.140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 2.515625, "epoch": 0.2015065913370998, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 8.003766478342749e-07, "loss": 0.0, "num_tokens": 14339895.0, "reward": 0.9751439094543457, "reward_std": 0.0, "rewards/acc_reward/mean": 0.972382128238678, "rewards/acc_reward/std": 0.04191429913043976, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 182.734375, "completions/mean_terminated_length": 182.734375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 2.46875, "epoch": 0.2033898305084746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.984934086629001e-07, "loss": 0.0, "num_tokens": 14480414.0, "reward": 0.9581470489501953, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9534966945648193, "rewards/acc_reward/std": 0.06734198331832886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 2.203125, "epoch": 0.20527306967984935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.966101694915253e-07, "loss": 0.0, "num_tokens": 14604040.0, "reward": 0.9401878118515015, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9335420727729797, "rewards/acc_reward/std": 0.09204845130443573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 153.203125, "completions/mean_terminated_length": 153.203125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.28125, "epoch": 0.2071563088512241, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.947269303201506e-07, "loss": 0.0, "num_tokens": 14742229.0, "reward": 0.9852752685546875, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9836391806602478, "rewards/acc_reward/std": 0.04362887144088745, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 170.71875, "completions/mean_terminated_length": 170.71875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 2.375, "epoch": 0.20903954802259886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.928436911487758e-07, "loss": 0.0, "num_tokens": 14873859.0, "reward": 0.9933172464370728, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9925747513771057, "rewards/acc_reward/std": 0.010412875562906265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 168.53125, "completions/mean_terminated_length": 168.53125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 2.5, "epoch": 0.21092278719397364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.909604519774011e-07, "loss": 0.0, "num_tokens": 15006189.0, "reward": 0.9692245125770569, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9658049941062927, "rewards/acc_reward/std": 0.0648469477891922, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 169.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 2.3125, "epoch": 0.2128060263653484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.890772128060263e-07, "loss": 0.0, "num_tokens": 15144437.0, "reward": 0.9728338718414307, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9698153734207153, "rewards/acc_reward/std": 0.040937572717666626, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 158.234375, "completions/mean_terminated_length": 158.234375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 2.421875, "epoch": 0.21468926553672316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.871939736346515e-07, "loss": 0.0, "num_tokens": 15266348.0, "reward": 0.9900326728820801, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9889251589775085, "rewards/acc_reward/std": 0.011797359213232994, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 184.890625, "completions/mean_terminated_length": 179.69842529296875, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 2.609375, "epoch": 0.21657250470809794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.853107344632767e-07, "loss": 0.0, "num_tokens": 15406085.0, "reward": 0.9864563941955566, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9849514961242676, "rewards/acc_reward/std": 0.023492755368351936, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 179.328125, "completions/mean_terminated_length": 174.04762268066406, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.34375, "epoch": 0.2184557438794727, "frac_reward_zero_std": 0.875, "grad_norm": 0.8667036294937134, "learning_rate": 7.83427495291902e-07, "loss": -0.0178, "num_tokens": 15537066.0, "reward": 0.9839749336242676, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9839304685592651, "rewards/acc_reward/std": 0.025710513815283775, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 151.453125, "completions/mean_terminated_length": 151.453125, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 2.34375, "epoch": 0.22033898305084745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.815442561205274e-07, "loss": 0.0, "num_tokens": 15661063.0, "reward": 0.9836729168891907, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9818587899208069, "rewards/acc_reward/std": 0.03338773176074028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 173.40625, "completions/mean_terminated_length": 173.40625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 2.53125, "epoch": 0.2222222222222222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.796610169491526e-07, "loss": 0.0, "num_tokens": 15794785.0, "reward": 0.970079779624939, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9667552709579468, "rewards/acc_reward/std": 0.03764721751213074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 156.765625, "completions/mean_terminated_length": 156.765625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 2.171875, "epoch": 0.224105461393597, "frac_reward_zero_std": 0.875, "grad_norm": 1.2777234315872192, "learning_rate": 7.777777777777778e-07, "loss": 0.0405, "num_tokens": 15925402.0, "reward": 0.9777387380599976, "reward_std": 0.013079374097287655, "rewards/acc_reward/mean": 0.9752652645111084, "rewards/acc_reward/std": 0.04798175394535065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 203.3125, "completions/mean_terminated_length": 198.4127197265625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.65625, "epoch": 0.22598870056497175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.75894538606403e-07, "loss": 0.0, "num_tokens": 16073038.0, "reward": 0.9984081983566284, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9982313513755798, "rewards/acc_reward/std": 0.0047163767740130424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 204.984375, "completions/mean_terminated_length": 195.08062744140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 2.59375, "epoch": 0.2278719397363465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.740112994350282e-07, "loss": 0.0, "num_tokens": 16218901.0, "reward": 0.9989771842956543, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9988635778427124, "rewards/acc_reward/std": 0.0020075358916074038, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 2.328125, "epoch": 0.2297551789077213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.721280602636534e-07, "loss": 0.0, "num_tokens": 16352369.0, "reward": 0.988227367401123, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9869192838668823, "rewards/acc_reward/std": 0.026611221954226494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 167.890625, "completions/mean_terminated_length": 167.890625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 2.390625, "epoch": 0.23163841807909605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.702448210922787e-07, "loss": 0.0, "num_tokens": 16482378.0, "reward": 0.9788169860839844, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9764633178710938, "rewards/acc_reward/std": 0.03820019215345383, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 178.171875, "completions/mean_terminated_length": 178.171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.125, "epoch": 0.2335216572504708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.68361581920904e-07, "loss": 0.0, "num_tokens": 16619925.0, "reward": 0.9605213403701782, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9561347961425781, "rewards/acc_reward/std": 0.08166956156492233, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 168.3174591064453, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 2.390625, "epoch": 0.23540489642184556, "frac_reward_zero_std": 0.875, "grad_norm": 0.9748631119728088, "learning_rate": 7.664783427495292e-07, "loss": 0.0211, "num_tokens": 16749945.0, "reward": 0.9801042079925537, "reward_std": 0.039774756878614426, "rewards/acc_reward/mean": 0.9778935313224792, "rewards/acc_reward/std": 0.12537133693695068, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 167.48387145996094, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 2.5625, "epoch": 0.23728813559322035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.645951035781544e-07, "loss": 0.0, "num_tokens": 16879657.0, "reward": 0.9902485609054565, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9891650676727295, "rewards/acc_reward/std": 0.015085420571267605, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 162.15625, "completions/mean_terminated_length": 162.15625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 2.328125, "epoch": 0.2391713747645951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.627118644067796e-07, "loss": 0.0, "num_tokens": 17009299.0, "reward": 0.9965801239013672, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9962000846862793, "rewards/acc_reward/std": 0.010133087635040283, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 160.796875, "completions/mean_terminated_length": 160.796875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 2.28125, "epoch": 0.24105461393596986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.608286252354048e-07, "loss": 0.0, "num_tokens": 17146406.0, "reward": 0.9833929538726807, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9815477728843689, "rewards/acc_reward/std": 0.024218887090682983, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 2.484375, "epoch": 0.24293785310734464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.589453860640301e-07, "loss": 0.0, "num_tokens": 17274242.0, "reward": 0.9791369438171387, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9768187999725342, "rewards/acc_reward/std": 0.03209559619426727, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 169.58731079101562, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 2.359375, "epoch": 0.2448210922787194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.570621468926553e-07, "loss": 0.0, "num_tokens": 17409982.0, "reward": 0.9768315553665161, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9742573499679565, "rewards/acc_reward/std": 0.05567716062068939, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 181.52381896972656, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 2.421875, "epoch": 0.24670433145009416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.551789077212806e-07, "loss": 0.0, "num_tokens": 17545354.0, "reward": 0.9810404777526855, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9789338707923889, "rewards/acc_reward/std": 0.03438215330243111, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 155.46875, "completions/mean_terminated_length": 155.46875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.234375, "epoch": 0.24858757062146894, "frac_reward_zero_std": 0.875, "grad_norm": 0.42140984535217285, "learning_rate": 7.532956685499058e-07, "loss": 0.0881, "num_tokens": 17679648.0, "reward": 0.9783517122268677, "reward_std": 0.039774756878614426, "rewards/acc_reward/mean": 0.975946307182312, "rewards/acc_reward/std": 0.12442652136087418, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 180.71875, "completions/mean_terminated_length": 170.03225708007812, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 2.5625, "epoch": 0.2504708097928437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.51412429378531e-07, "loss": 0.0, "num_tokens": 17811718.0, "reward": 0.9774539470672607, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9749488830566406, "rewards/acc_reward/std": 0.04454605653882027, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 177.40625, "completions/mean_terminated_length": 172.09524536132812, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 2.21875, "epoch": 0.2523540489642185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.495291902071563e-07, "loss": 0.0, "num_tokens": 17944256.0, "reward": 0.9872071743011475, "reward_std": 0.0, "rewards/acc_reward/mean": 0.985785722732544, "rewards/acc_reward/std": 0.01633262448012829, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 2.375, "epoch": 0.2542372881355932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.476459510357815e-07, "loss": 0.0, "num_tokens": 18074344.0, "reward": 0.996717095375061, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9963523149490356, "rewards/acc_reward/std": 0.009727060794830322, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 168.921875, "completions/mean_terminated_length": 168.921875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 2.3125, "epoch": 0.256120527306968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.457627118644067e-07, "loss": 0.0, "num_tokens": 18202499.0, "reward": 0.9629114866256714, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9587906002998352, "rewards/acc_reward/std": 0.05813661590218544, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 152.171875, "completions/mean_terminated_length": 152.171875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.234375, "epoch": 0.2580037664783427, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.438794726930319e-07, "loss": 0.0, "num_tokens": 18325262.0, "reward": 0.9783220887184143, "reward_std": 0.0, "rewards/acc_reward/mean": 0.975913405418396, "rewards/acc_reward/std": 0.03493288531899452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.421875, "epoch": 0.2598870056497175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.419962335216571e-07, "loss": 0.0, "num_tokens": 18463394.0, "reward": 0.9732788801193237, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9703099131584167, "rewards/acc_reward/std": 0.02722685970366001, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 2.171875, "epoch": 0.2617702448210923, "frac_reward_zero_std": 0.875, "grad_norm": 1.0246717929840088, "learning_rate": 7.401129943502824e-07, "loss": 0.0013, "num_tokens": 18588066.0, "reward": 0.9853101968765259, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9854141473770142, "rewards/acc_reward/std": 0.01550329476594925, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 180.15625, "completions/mean_terminated_length": 174.88890075683594, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 2.4375, "epoch": 0.263653483992467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.382297551789078e-07, "loss": 0.0, "num_tokens": 18720780.0, "reward": 0.973831057548523, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9709233641624451, "rewards/acc_reward/std": 0.047117821872234344, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 190.109375, "completions/mean_terminated_length": 179.72579956054688, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.40625, "epoch": 0.2655367231638418, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.36346516007533e-07, "loss": 0.0, "num_tokens": 18857011.0, "reward": 0.9712393283843994, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9680436849594116, "rewards/acc_reward/std": 0.04809395968914032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 160.671875, "completions/mean_terminated_length": 160.671875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 2.140625, "epoch": 0.2674199623352166, "frac_reward_zero_std": 0.75, "grad_norm": 1.3321737051010132, "learning_rate": 7.344632768361582e-07, "loss": -0.0087, "num_tokens": 18989918.0, "reward": 0.9906257390975952, "reward_std": 0.008838837035000324, "rewards/acc_reward/mean": 0.9930564165115356, "rewards/acc_reward/std": 0.010120646096765995, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17536810040473938, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 147.78125, "completions/mean_terminated_length": 147.78125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.25, "epoch": 0.2693032015065913, "frac_reward_zero_std": 0.875, "grad_norm": 1.1365125179290771, "learning_rate": 7.325800376647834e-07, "loss": 0.0151, "num_tokens": 19115760.0, "reward": 0.9455662965774536, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9412541389465332, "rewards/acc_reward/std": 0.09196340292692184, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 161.796875, "completions/mean_terminated_length": 161.796875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 2.234375, "epoch": 0.2711864406779661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.306967984934086e-07, "loss": 0.0, "num_tokens": 19244779.0, "reward": 0.9764645099639893, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9738494753837585, "rewards/acc_reward/std": 0.043730027973651886, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 164.109375, "completions/mean_terminated_length": 158.58731079101562, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.28125, "epoch": 0.2730696798493409, "frac_reward_zero_std": 0.75, "grad_norm": 1.4404025077819824, "learning_rate": 7.288135593220338e-07, "loss": 0.0261, "num_tokens": 19387506.0, "reward": 0.9824453592300415, "reward_std": 0.017121607437729836, "rewards/acc_reward/mean": 0.9822309613227844, "rewards/acc_reward/std": 0.04615851864218712, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 161.578125, "completions/mean_terminated_length": 161.578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 2.21875, "epoch": 0.2749529190207156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.269303201506592e-07, "loss": 0.0, "num_tokens": 19526271.0, "reward": 0.9767446517944336, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9741606712341309, "rewards/acc_reward/std": 0.02946525067090988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 162.734375, "completions/mean_terminated_length": 162.734375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 2.296875, "epoch": 0.2768361581920904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.250470809792844e-07, "loss": 0.0, "num_tokens": 19656430.0, "reward": 0.9855356216430664, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9839285016059875, "rewards/acc_reward/std": 0.03281255066394806, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 152.265625, "completions/mean_terminated_length": 152.265625, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.328125, "epoch": 0.2787193973634652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.231638418079096e-07, "loss": 0.0, "num_tokens": 19791039.0, "reward": 0.9486349821090698, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9429277777671814, "rewards/acc_reward/std": 0.0684126690030098, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 138.96875, "completions/mean_terminated_length": 138.96875, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 2.046875, "epoch": 0.2806026365348399, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.212806026365348e-07, "loss": 0.0, "num_tokens": 19920157.0, "reward": 0.9899982213973999, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9888868927955627, "rewards/acc_reward/std": 0.014463546685874462, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 166.03173828125, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 2.1875, "epoch": 0.2824858757062147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.1939736346516e-07, "loss": 0.0, "num_tokens": 20054233.0, "reward": 0.9820305109024048, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9800339341163635, "rewards/acc_reward/std": 0.03505489602684975, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 154.09375, "completions/mean_terminated_length": 154.09375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 2.265625, "epoch": 0.2843691148775895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.175141242937853e-07, "loss": 0.0, "num_tokens": 20177839.0, "reward": 0.9695570468902588, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9661744832992554, "rewards/acc_reward/std": 0.051941804587841034, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 159.078125, "completions/mean_terminated_length": 159.078125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.21875, "epoch": 0.2862523540489642, "frac_reward_zero_std": 0.875, "grad_norm": 1.135265588760376, "learning_rate": 7.156308851224105e-07, "loss": 0.0085, "num_tokens": 20313964.0, "reward": 0.9910129308700562, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9917504787445068, "rewards/acc_reward/std": 0.015448656864464283, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 176.328125, "completions/mean_terminated_length": 176.328125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 2.546875, "epoch": 0.288135593220339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.137476459510358e-07, "loss": 0.0, "num_tokens": 20454113.0, "reward": 0.9844459295272827, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9827176928520203, "rewards/acc_reward/std": 0.018290938809514046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 154.140625, "completions/mean_terminated_length": 154.140625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 2.265625, "epoch": 0.2900188323917137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.11864406779661e-07, "loss": 0.0, "num_tokens": 20591402.0, "reward": 0.9614624977111816, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9571805596351624, "rewards/acc_reward/std": 0.06432777643203735, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 2.1875, "epoch": 0.2919020715630885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.099811676082862e-07, "loss": 0.0, "num_tokens": 20714630.0, "reward": 0.9820876717567444, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9800974130630493, "rewards/acc_reward/std": 0.037367500364780426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 173.15625, "completions/mean_terminated_length": 167.77780151367188, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 2.296875, "epoch": 0.2937853107344633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.080979284369114e-07, "loss": 0.0, "num_tokens": 20853136.0, "reward": 0.9967447519302368, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9963830709457397, "rewards/acc_reward/std": 0.009645064361393452, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 2.21875, "epoch": 0.295668549905838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.062146892655367e-07, "loss": 0.0, "num_tokens": 20979664.0, "reward": 0.9870873689651489, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9856526851654053, "rewards/acc_reward/std": 0.020136423408985138, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.234375, "epoch": 0.2975517890772128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.043314500941619e-07, "loss": 0.0, "num_tokens": 21116156.0, "reward": 0.9893269538879395, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9881410598754883, "rewards/acc_reward/std": 0.023672664538025856, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 512.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 177.203125, "completions/mean_terminated_length": 166.40322875976562, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.328125, "epoch": 0.2994350282485876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.024482109227871e-07, "loss": 0.0, "num_tokens": 21245321.0, "reward": 0.9947940111160278, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9942155480384827, "rewards/acc_reward/std": 0.0101578738540411, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.1875, "epoch": 0.3013182674199623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 7.005649717514124e-07, "loss": 0.0, "num_tokens": 21376237.0, "reward": 0.9860028624534607, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9844475984573364, "rewards/acc_reward/std": 0.019196392968297005, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 169.265625, "completions/mean_terminated_length": 163.82540893554688, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 2.359375, "epoch": 0.3032015065913371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.986817325800376e-07, "loss": 0.0, "num_tokens": 21516414.0, "reward": 0.9679353833198547, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9643726348876953, "rewards/acc_reward/std": 0.05980806425213814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 153.296875, "completions/mean_terminated_length": 153.296875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.09375, "epoch": 0.3050847457627119, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.967984934086629e-07, "loss": 0.0, "num_tokens": 21642289.0, "reward": 0.9967857003211975, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9964286088943481, "rewards/acc_reward/std": 0.009523809887468815, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 164.515625, "completions/mean_terminated_length": 164.515625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 2.171875, "epoch": 0.3069679849340866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.949152542372881e-07, "loss": 0.0, "num_tokens": 21782162.0, "reward": 0.9368254542350769, "reward_std": 0.0, "rewards/acc_reward/mean": 0.929806113243103, "rewards/acc_reward/std": 0.0714760348200798, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 168.53125, "completions/mean_terminated_length": 168.53125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 2.234375, "epoch": 0.3088512241054614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.930320150659133e-07, "loss": 0.0, "num_tokens": 21911732.0, "reward": 0.9614138603210449, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9571264982223511, "rewards/acc_reward/std": 0.08462820202112198, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 172.609375, "completions/mean_terminated_length": 172.609375, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 2.015625, "epoch": 0.3107344632768362, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.911487758945385e-07, "loss": 0.0, "num_tokens": 22043643.0, "reward": 0.9874755144119263, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9860839247703552, "rewards/acc_reward/std": 0.02431458979845047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 156.015625, "completions/mean_terminated_length": 156.015625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 2.234375, "epoch": 0.3126177024482109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.892655367231638e-07, "loss": 0.0, "num_tokens": 22167892.0, "reward": 0.9494754076004028, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9438616037368774, "rewards/acc_reward/std": 0.05841909348964691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 1.8828125, "epoch": 0.3145009416195857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.87382297551789e-07, "loss": 0.0, "num_tokens": 22302132.0, "reward": 0.9812496900558472, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9791662693023682, "rewards/acc_reward/std": 0.023800579831004143, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 157.4603271484375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 2.203125, "epoch": 0.3163841807909605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.854990583804144e-07, "loss": 0.0, "num_tokens": 22428188.0, "reward": 0.9839420318603516, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9821577668190002, "rewards/acc_reward/std": 0.03933866694569588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 160.859375, "completions/mean_terminated_length": 160.859375, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.28125, "epoch": 0.3182674199623352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.836158192090396e-07, "loss": 0.0, "num_tokens": 22559187.0, "reward": 0.9785705804824829, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9761895537376404, "rewards/acc_reward/std": 0.0258196871727705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 2.015625, "epoch": 0.32015065913371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.817325800376648e-07, "loss": 0.0, "num_tokens": 22686427.0, "reward": 0.9450536966323853, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9389485120773315, "rewards/acc_reward/std": 0.07760415226221085, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 136.40625, "completions/mean_terminated_length": 136.40625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 1.953125, "epoch": 0.3220338983050847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.7984934086629e-07, "loss": 0.0, "num_tokens": 22812501.0, "reward": 0.9613984823226929, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9571093916893005, "rewards/acc_reward/std": 0.07542510330677032, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 157.84375, "completions/mean_terminated_length": 157.84375, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 2.046875, "epoch": 0.3239171374764595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.779661016949152e-07, "loss": 0.0, "num_tokens": 22934187.0, "reward": 0.9912809729576111, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9903122186660767, "rewards/acc_reward/std": 0.020957766100764275, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 176.359375, "completions/mean_terminated_length": 176.359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.328125, "epoch": 0.3258003766478343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.760828625235404e-07, "loss": 0.0, "num_tokens": 23070498.0, "reward": 0.9652288556098938, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9613653421401978, "rewards/acc_reward/std": 0.06810762733221054, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 191.2698516845703, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 2.171875, "epoch": 0.327683615819209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.741996233521658e-07, "loss": 0.0, "num_tokens": 23202324.0, "reward": 0.9608478546142578, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9564976692199707, "rewards/acc_reward/std": 0.06687356531620026, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 150.796875, "completions/mean_terminated_length": 145.06350708007812, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 1.9609375, "epoch": 0.3295668549905838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.72316384180791e-07, "loss": 0.0, "num_tokens": 23333319.0, "reward": 0.9884651899337769, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9871835112571716, "rewards/acc_reward/std": 0.026243234053254128, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 165.828125, "completions/mean_terminated_length": 165.828125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 2.28125, "epoch": 0.3314500941619586, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.704331450094162e-07, "loss": 0.0, "num_tokens": 23460796.0, "reward": 0.9813232421875, "reward_std": 0.0, "rewards/acc_reward/mean": 0.979248046875, "rewards/acc_reward/std": 0.04185057431459427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 146.734375, "completions/mean_terminated_length": 146.734375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 2.09375, "epoch": 0.3333333333333333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.685499058380414e-07, "loss": 0.0, "num_tokens": 23584811.0, "reward": 0.9937499761581421, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9930555820465088, "rewards/acc_reward/std": 0.01851852796971798, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 161.828125, "completions/mean_terminated_length": 161.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 2.34375, "epoch": 0.3352165725047081, "frac_reward_zero_std": 0.875, "grad_norm": 1.0041779279708862, "learning_rate": 6.666666666666666e-07, "loss": 0.0029, "num_tokens": 23723072.0, "reward": 0.9907628893852234, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9914726614952087, "rewards/acc_reward/std": 0.014887169934809208, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 170.140625, "completions/mean_terminated_length": 170.140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 2.296875, "epoch": 0.3370998116760829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.647834274952919e-07, "loss": 0.0, "num_tokens": 23852745.0, "reward": 0.9886301755905151, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9873669147491455, "rewards/acc_reward/std": 0.02594558708369732, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 159.09375, "completions/mean_terminated_length": 159.09375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 2.203125, "epoch": 0.3389830508474576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.629001883239171e-07, "loss": 0.0, "num_tokens": 23976431.0, "reward": 0.9778439402580261, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9753821492195129, "rewards/acc_reward/std": 0.02938609942793846, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 155.328125, "completions/mean_terminated_length": 149.66668701171875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.265625, "epoch": 0.3408662900188324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.610169491525423e-07, "loss": 0.0, "num_tokens": 24103236.0, "reward": 0.9857558012008667, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9841731190681458, "rewards/acc_reward/std": 0.02304108813405037, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 1.921875, "epoch": 0.3427495291902072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.591337099811676e-07, "loss": 0.0, "num_tokens": 24240244.0, "reward": 0.9847475290298462, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9830528497695923, "rewards/acc_reward/std": 0.018334772437810898, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 153.484375, "completions/mean_terminated_length": 153.484375, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 2.09375, "epoch": 0.3446327683615819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.572504708097928e-07, "loss": 0.0, "num_tokens": 24373251.0, "reward": 0.9889024496078491, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9876693487167358, "rewards/acc_reward/std": 0.014938557520508766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 173.65625, "completions/mean_terminated_length": 168.2857208251953, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 2.34375, "epoch": 0.3465160075329567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.55367231638418e-07, "loss": 0.0, "num_tokens": 24507229.0, "reward": 0.9813482165336609, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9792758226394653, "rewards/acc_reward/std": 0.03865106776356697, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 165.078125, "completions/mean_terminated_length": 165.078125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 2.203125, "epoch": 0.3483992467043315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.534839924670433e-07, "loss": 0.0, "num_tokens": 24646554.0, "reward": 0.9556671380996704, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9507412910461426, "rewards/acc_reward/std": 0.07888054847717285, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 1.984375, "epoch": 0.3502824858757062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.516007532956685e-07, "loss": 0.0, "num_tokens": 24785782.0, "reward": 0.9671032428741455, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9634479880332947, "rewards/acc_reward/std": 0.06446754932403564, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 140.21875, "completions/mean_terminated_length": 140.21875, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 2.09375, "epoch": 0.352165725047081, "frac_reward_zero_std": 0.875, "grad_norm": 1.060726284980774, "learning_rate": 6.497175141242937e-07, "loss": 0.0088, "num_tokens": 24903940.0, "reward": 0.932281494140625, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9264933466911316, "rewards/acc_reward/std": 0.08563832193613052, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 150.15625, "completions/mean_terminated_length": 144.4127197265625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 2.015625, "epoch": 0.3540489642184557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.478342749529189e-07, "loss": 0.0, "num_tokens": 25035214.0, "reward": 0.9913280010223389, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9903644323348999, "rewards/acc_reward/std": 0.01791100949048996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 156.28125, "completions/mean_terminated_length": 156.28125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 2.109375, "epoch": 0.3559322033898305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.459510357815442e-07, "loss": 0.0, "num_tokens": 25168800.0, "reward": 0.9506161212921143, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9451290369033813, "rewards/acc_reward/std": 0.07419686019420624, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 138.859375, "completions/mean_terminated_length": 138.859375, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 1.8359375, "epoch": 0.3578154425612053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.440677966101694e-07, "loss": 0.0, "num_tokens": 25302271.0, "reward": 0.9889830350875854, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9877589344978333, "rewards/acc_reward/std": 0.013018240220844746, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 143.65625, "completions/mean_terminated_length": 143.65625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 2.203125, "epoch": 0.35969868173258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.421845574387948e-07, "loss": 0.0, "num_tokens": 25422569.0, "reward": 0.9856148958206177, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9840165376663208, "rewards/acc_reward/std": 0.020176060497760773, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 166.73016357421875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 2.015625, "epoch": 0.3615819209039548, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.4030131826742e-07, "loss": 0.0, "num_tokens": 25560809.0, "reward": 0.9829681515693665, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9810757637023926, "rewards/acc_reward/std": 0.0207473486661911, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 150.859375, "completions/mean_terminated_length": 150.859375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 1.9453125, "epoch": 0.3634651600753296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.384180790960452e-07, "loss": 0.0, "num_tokens": 25691888.0, "reward": 0.959905743598938, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9554507732391357, "rewards/acc_reward/std": 0.07276061922311783, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 2.171875, "epoch": 0.3653483992467043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.365348399246704e-07, "loss": 0.0, "num_tokens": 25819652.0, "reward": 0.9704650044441223, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9671833515167236, "rewards/acc_reward/std": 0.04358522593975067, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 145.09375, "completions/mean_terminated_length": 145.09375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 2.15625, "epoch": 0.3672316384180791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.346516007532956e-07, "loss": 0.0, "num_tokens": 25950602.0, "reward": 0.9926788806915283, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9918653964996338, "rewards/acc_reward/std": 0.011418163776397705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 138.3125, "completions/mean_terminated_length": 138.3125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 1.953125, "epoch": 0.3691148775894539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.32768361581921e-07, "loss": 0.0, "num_tokens": 26070558.0, "reward": 0.9680017232894897, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9644463658332825, "rewards/acc_reward/std": 0.051819488406181335, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 2.109375, "epoch": 0.3709981167608286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.308851224105462e-07, "loss": 0.0, "num_tokens": 26211378.0, "reward": 0.992339015007019, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9914877414703369, "rewards/acc_reward/std": 0.012202701531350613, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 150.03125, "completions/mean_terminated_length": 150.03125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 2.078125, "epoch": 0.3728813559322034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.290018832391714e-07, "loss": 0.0, "num_tokens": 26341204.0, "reward": 0.9656549692153931, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9618387818336487, "rewards/acc_reward/std": 0.03533172979950905, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 144.90625, "completions/mean_terminated_length": 144.90625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 2.046875, "epoch": 0.3747645951035782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.271186440677966e-07, "loss": 0.0, "num_tokens": 26460142.0, "reward": 0.9934231042861938, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9926923513412476, "rewards/acc_reward/std": 0.0151524618268013, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 156.90625, "completions/mean_terminated_length": 156.90625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 2.0625, "epoch": 0.3766478342749529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.252354048964218e-07, "loss": 0.0, "num_tokens": 26586088.0, "reward": 0.9820280075073242, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9800311326980591, "rewards/acc_reward/std": 0.018364734947681427, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 143.34375, "completions/mean_terminated_length": 143.34375, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 2.125, "epoch": 0.3785310734463277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.23352165725047e-07, "loss": 0.0, "num_tokens": 26717886.0, "reward": 1.0, "reward_std": 0.0, "rewards/acc_reward/mean": 1.0, "rewards/acc_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 175.90625, "completions/mean_terminated_length": 175.90625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "entropy": 2.140625, "epoch": 0.3804143126177024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.214689265536723e-07, "loss": 0.0, "num_tokens": 26854168.0, "reward": 0.9935948252677917, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9928831458091736, "rewards/acc_reward/std": 0.0077277072705328465, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.171875, "epoch": 0.3822975517890772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.195856873822976e-07, "loss": 0.0, "num_tokens": 26989488.0, "reward": 0.9852421283721924, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9836024045944214, "rewards/acc_reward/std": 0.01928338035941124, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 512.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 154.296875, "completions/mean_terminated_length": 148.61904907226562, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 2.1875, "epoch": 0.384180790960452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.177024482109228e-07, "loss": 0.0, "num_tokens": 27112987.0, "reward": 0.9929645657539368, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9921828508377075, "rewards/acc_reward/std": 0.02084571123123169, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 2.125, "epoch": 0.3860640301318267, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.15819209039548e-07, "loss": 0.0, "num_tokens": 27235691.0, "reward": 0.9827622175216675, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9808468818664551, "rewards/acc_reward/std": 0.027835894376039505, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 156.53125, "completions/mean_terminated_length": 156.53125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 2.09375, "epoch": 0.3879472693032015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.139359698681732e-07, "loss": 0.0, "num_tokens": 27372173.0, "reward": 0.9731494784355164, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9701660871505737, "rewards/acc_reward/std": 0.020826132968068123, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 164.34375, "completions/mean_terminated_length": 164.34375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 2.09375, "epoch": 0.3898305084745763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.120527306967984e-07, "loss": 0.0, "num_tokens": 27499555.0, "reward": 0.9984294176101685, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9982548952102661, "rewards/acc_reward/std": 0.0046535334549844265, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 157.484375, "completions/mean_terminated_length": 157.484375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 2.25, "epoch": 0.391713747645951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.101694915254237e-07, "loss": 0.0, "num_tokens": 27636098.0, "reward": 0.9711763262748718, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9679737091064453, "rewards/acc_reward/std": 0.029890142381191254, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 174.828125, "completions/mean_terminated_length": 174.828125, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "entropy": 2.046875, "epoch": 0.3935969868173258, "frac_reward_zero_std": 0.875, "grad_norm": 0.4382655620574951, "learning_rate": 6.082862523540489e-07, "loss": -0.0223, "num_tokens": 27777591.0, "reward": 0.9556211829185486, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9524263143539429, "rewards/acc_reward/std": 0.04751761257648468, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 139.09375, "completions/mean_terminated_length": 139.09375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 2.15625, "epoch": 0.3954802259887006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.064030131826741e-07, "loss": 0.0, "num_tokens": 27906397.0, "reward": 0.9859374761581421, "reward_std": 0.0, "rewards/acc_reward/mean": 0.984375, "rewards/acc_reward/std": 0.0416666679084301, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 1.8984375, "epoch": 0.3973634651600753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 6.045197740112994e-07, "loss": 0.0, "num_tokens": 28038757.0, "reward": 0.9920339584350586, "reward_std": 0.0, "rewards/acc_reward/mean": 0.9911487698554993, "rewards/acc_reward/std": 0.013452098704874516, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 171.578125, "completions/mean_terminated_length": 171.578125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 2.1875, "epoch": 0.3992467043314501, "frac_reward_zero_std": 0.875, "grad_norm": 1.4742763042449951, "learning_rate": 6.026365348399246e-07, "loss": 0.0537, "num_tokens": 28163002.0, "reward": 0.9887884259223938, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9892787933349609, "rewards/acc_reward/std": 0.023635946214199066, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 135.84375, "completions/mean_terminated_length": 135.84375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 1.921875, "epoch": 0.4011299435028249, "frac_reward_zero_std": 0.875, "grad_norm": 0.6700330972671509, "learning_rate": 6.007532956685499e-07, "loss": -0.0233, "num_tokens": 28279440.0, "reward": 0.9760647416114807, "reward_std": 0.004419418517500162, "rewards/acc_reward/mean": 0.9751413464546204, "rewards/acc_reward/std": 0.03155434504151344, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.125, "step": 213 } ], "logging_steps": 1.0, "max_steps": 531, "num_input_tokens_seen": 28279440, "num_train_epochs": 1, "save_steps": 213, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }