| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.4011299435028249, |
| "eval_steps": 500, |
| "global_step": 213, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 495.0, |
| "completions/mean_length": 290.59375, |
| "completions/mean_terminated_length": 271.83050537109375, |
| "completions/min_length": 99.0, |
| "completions/min_terminated_length": 99.0, |
| "entropy": 2.203125, |
| "epoch": 0.0018832391713747645, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8694089651107788, |
| "learning_rate": 1e-06, |
| "loss": -0.0086, |
| "num_tokens": 140262.0, |
| "reward": 0.206417053937912, |
| "reward_std": 0.12193800508975983, |
| "rewards/acc_reward/mean": 0.19810229539871216, |
| "rewards/acc_reward/std": 0.28156620264053345, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 504.0, |
| "completions/mean_length": 247.734375, |
| "completions/mean_terminated_length": 243.53970336914062, |
| "completions/min_length": 125.0, |
| "completions/min_terminated_length": 125.0, |
| "entropy": 1.6796875, |
| "epoch": 0.003766478342749529, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8876559734344482, |
| "learning_rate": 9.981167608286253e-07, |
| "loss": 0.0222, |
| "num_tokens": 273701.0, |
| "reward": 0.41670364141464233, |
| "reward_std": 0.150786355137825, |
| "rewards/acc_reward/mean": 0.42307350039482117, |
| "rewards/acc_reward/std": 0.29976290464401245, |
| "rewards/format_reward/mean": 0.359375, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 482.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 236.046875, |
| "completions/mean_terminated_length": 236.046875, |
| "completions/min_length": 86.0, |
| "completions/min_terminated_length": 86.0, |
| "entropy": 1.6640625, |
| "epoch": 0.005649717514124294, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.8232742547988892, |
| "learning_rate": 9.962335216572504e-07, |
| "loss": 0.0698, |
| "num_tokens": 396392.0, |
| "reward": 0.4056413769721985, |
| "reward_std": 0.17224639654159546, |
| "rewards/acc_reward/mean": 0.3864765465259552, |
| "rewards/acc_reward/std": 0.3097887337207794, |
| "rewards/format_reward/mean": 0.578125, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 389.0, |
| "completions/max_terminated_length": 389.0, |
| "completions/mean_length": 227.8125, |
| "completions/mean_terminated_length": 227.8125, |
| "completions/min_length": 110.0, |
| "completions/min_terminated_length": 110.0, |
| "entropy": 1.6484375, |
| "epoch": 0.007532956685499058, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0226917266845703, |
| "learning_rate": 9.943502824858757e-07, |
| "loss": -0.0298, |
| "num_tokens": 523036.0, |
| "reward": 0.3570261001586914, |
| "reward_std": 0.12964516878128052, |
| "rewards/acc_reward/mean": 0.33766794204711914, |
| "rewards/acc_reward/std": 0.35702335834503174, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 414.0, |
| "completions/max_terminated_length": 414.0, |
| "completions/mean_length": 214.515625, |
| "completions/mean_terminated_length": 214.515625, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 1.8515625, |
| "epoch": 0.009416195856873822, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.8740723133087158, |
| "learning_rate": 9.92467043314501e-07, |
| "loss": -0.0099, |
| "num_tokens": 649309.0, |
| "reward": 0.4160749912261963, |
| "reward_std": 0.13374584913253784, |
| "rewards/acc_reward/mean": 0.37723612785339355, |
| "rewards/acc_reward/std": 0.31442132592201233, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 470.0, |
| "completions/mean_length": 266.046875, |
| "completions/mean_terminated_length": 258.1129150390625, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 1.796875, |
| "epoch": 0.011299435028248588, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7314085960388184, |
| "learning_rate": 9.905838041431261e-07, |
| "loss": 0.1098, |
| "num_tokens": 788480.0, |
| "reward": 0.2147485464811325, |
| "reward_std": 0.1807367354631424, |
| "rewards/acc_reward/mean": 0.15006783604621887, |
| "rewards/acc_reward/std": 0.3153360188007355, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 506.0, |
| "completions/max_terminated_length": 506.0, |
| "completions/mean_length": 232.5625, |
| "completions/mean_terminated_length": 232.5625, |
| "completions/min_length": 134.0, |
| "completions/min_terminated_length": 134.0, |
| "entropy": 1.84375, |
| "epoch": 0.013182674199623353, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.3759692907333374, |
| "learning_rate": 9.887005649717514e-07, |
| "loss": 0.0142, |
| "num_tokens": 923788.0, |
| "reward": 0.21130093932151794, |
| "reward_std": 0.14664816856384277, |
| "rewards/acc_reward/mean": 0.12713992595672607, |
| "rewards/acc_reward/std": 0.2867279052734375, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 461.0, |
| "completions/max_terminated_length": 461.0, |
| "completions/mean_length": 256.796875, |
| "completions/mean_terminated_length": 256.796875, |
| "completions/min_length": 136.0, |
| "completions/min_terminated_length": 136.0, |
| "entropy": 1.6484375, |
| "epoch": 0.015065913370998116, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.6152563095092773, |
| "learning_rate": 9.868173258003765e-07, |
| "loss": 0.0757, |
| "num_tokens": 1062207.0, |
| "reward": 0.3737403452396393, |
| "reward_std": 0.14178410172462463, |
| "rewards/acc_reward/mean": 0.30589205026626587, |
| "rewards/acc_reward/std": 0.3208266496658325, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 436.0, |
| "completions/mean_length": 255.265625, |
| "completions/mean_terminated_length": 238.15000915527344, |
| "completions/min_length": 108.0, |
| "completions/min_terminated_length": 108.0, |
| "entropy": 1.765625, |
| "epoch": 0.01694915254237288, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.4234596490859985, |
| "learning_rate": 9.849340866290019e-07, |
| "loss": 0.0147, |
| "num_tokens": 1205848.0, |
| "reward": 0.3045212924480438, |
| "reward_std": 0.1969890296459198, |
| "rewards/acc_reward/mean": 0.22898197174072266, |
| "rewards/acc_reward/std": 0.3434719443321228, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 487.0, |
| "completions/max_terminated_length": 487.0, |
| "completions/mean_length": 224.8125, |
| "completions/mean_terminated_length": 224.8125, |
| "completions/min_length": 83.0, |
| "completions/min_terminated_length": 83.0, |
| "entropy": 1.8203125, |
| "epoch": 0.018832391713747645, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.4884109497070312, |
| "learning_rate": 9.830508474576272e-07, |
| "loss": -0.0137, |
| "num_tokens": 1330620.0, |
| "reward": 0.26978251338005066, |
| "reward_std": 0.11548593640327454, |
| "rewards/acc_reward/mean": 0.19038332998752594, |
| "rewards/acc_reward/std": 0.31174740195274353, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 484.0, |
| "completions/mean_length": 256.890625, |
| "completions/mean_terminated_length": 252.84127807617188, |
| "completions/min_length": 92.0, |
| "completions/min_terminated_length": 92.0, |
| "entropy": 1.8046875, |
| "epoch": 0.02071563088512241, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.1678308248519897, |
| "learning_rate": 9.811676082862523e-07, |
| "loss": 0.0051, |
| "num_tokens": 1474925.0, |
| "reward": 0.21130861341953278, |
| "reward_std": 0.11159157752990723, |
| "rewards/acc_reward/mean": 0.12367624044418335, |
| "rewards/acc_reward/std": 0.28885576128959656, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 438.0, |
| "completions/max_terminated_length": 438.0, |
| "completions/mean_length": 224.640625, |
| "completions/mean_terminated_length": 224.640625, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 1.59375, |
| "epoch": 0.022598870056497175, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.6417983770370483, |
| "learning_rate": 9.792843691148776e-07, |
| "loss": -0.0083, |
| "num_tokens": 1610966.0, |
| "reward": 0.3159927725791931, |
| "reward_std": 0.1229424774646759, |
| "rewards/acc_reward/mean": 0.24172811210155487, |
| "rewards/acc_reward/std": 0.31462714076042175, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 483.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 219.6875, |
| "completions/mean_terminated_length": 219.6875, |
| "completions/min_length": 80.0, |
| "completions/min_terminated_length": 80.0, |
| "entropy": 1.4921875, |
| "epoch": 0.02448210922787194, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 1.5047811269760132, |
| "learning_rate": 9.774011299435027e-07, |
| "loss": 0.0277, |
| "num_tokens": 1755810.0, |
| "reward": 0.2687425911426544, |
| "reward_std": 0.14603173732757568, |
| "rewards/acc_reward/mean": 0.1874917596578598, |
| "rewards/acc_reward/std": 0.3036465644836426, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 459.0, |
| "completions/mean_length": 277.0625, |
| "completions/mean_terminated_length": 257.15252685546875, |
| "completions/min_length": 142.0, |
| "completions/min_terminated_length": 142.0, |
| "entropy": 1.890625, |
| "epoch": 0.026365348399246705, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 1.3763067722320557, |
| "learning_rate": 9.75517890772128e-07, |
| "loss": 0.0409, |
| "num_tokens": 1901926.0, |
| "reward": 0.3287838101387024, |
| "reward_std": 0.17755521833896637, |
| "rewards/acc_reward/mean": 0.2542042136192322, |
| "rewards/acc_reward/std": 0.383696049451828, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 232.046875, |
| "completions/mean_terminated_length": 227.60317993164062, |
| "completions/min_length": 104.0, |
| "completions/min_terminated_length": 104.0, |
| "entropy": 1.59375, |
| "epoch": 0.02824858757062147, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.1948471069335938, |
| "learning_rate": 9.736346516007531e-07, |
| "loss": -0.0154, |
| "num_tokens": 2028921.0, |
| "reward": 0.31290093064308167, |
| "reward_std": 0.08047251403331757, |
| "rewards/acc_reward/mean": 0.23655660450458527, |
| "rewards/acc_reward/std": 0.3165181875228882, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 486.0, |
| "completions/mean_length": 212.40625, |
| "completions/mean_terminated_length": 207.6508026123047, |
| "completions/min_length": 83.0, |
| "completions/min_terminated_length": 83.0, |
| "entropy": 1.734375, |
| "epoch": 0.030131826741996232, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.5261269807815552, |
| "learning_rate": 9.717514124293785e-07, |
| "loss": 0.09, |
| "num_tokens": 2165179.0, |
| "reward": 0.30339735746383667, |
| "reward_std": 0.133845254778862, |
| "rewards/acc_reward/mean": 0.22599703073501587, |
| "rewards/acc_reward/std": 0.3543343245983124, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 482.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 201.40625, |
| "completions/mean_terminated_length": 201.40625, |
| "completions/min_length": 89.0, |
| "completions/min_terminated_length": 89.0, |
| "entropy": 1.4375, |
| "epoch": 0.032015065913371, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 1.5391112565994263, |
| "learning_rate": 9.698681732580038e-07, |
| "loss": 0.009, |
| "num_tokens": 2296853.0, |
| "reward": 0.1953125, |
| "reward_std": 0.2162797451019287, |
| "rewards/acc_reward/mean": 0.109375, |
| "rewards/acc_reward/std": 0.3145764470100403, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 220.84375, |
| "completions/mean_terminated_length": 216.2222442626953, |
| "completions/min_length": 110.0, |
| "completions/min_terminated_length": 110.0, |
| "entropy": 1.7421875, |
| "epoch": 0.03389830508474576, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.1895616054534912, |
| "learning_rate": 9.679849340866289e-07, |
| "loss": 0.0424, |
| "num_tokens": 2431011.0, |
| "reward": 0.30048421025276184, |
| "reward_std": 0.06477973610162735, |
| "rewards/acc_reward/mean": 0.22276021540164948, |
| "rewards/acc_reward/std": 0.31460005044937134, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 428.0, |
| "completions/max_terminated_length": 428.0, |
| "completions/mean_length": 223.953125, |
| "completions/mean_terminated_length": 223.953125, |
| "completions/min_length": 86.0, |
| "completions/min_terminated_length": 86.0, |
| "entropy": 1.9453125, |
| "epoch": 0.035781544256120526, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.671203374862671, |
| "learning_rate": 9.661016949152542e-07, |
| "loss": 0.0416, |
| "num_tokens": 2571808.0, |
| "reward": 0.42682912945747375, |
| "reward_std": 0.13467340171337128, |
| "rewards/acc_reward/mean": 0.3631434738636017, |
| "rewards/acc_reward/std": 0.37232744693756104, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 224.140625, |
| "completions/mean_terminated_length": 214.85482788085938, |
| "completions/min_length": 83.0, |
| "completions/min_terminated_length": 83.0, |
| "entropy": 1.625, |
| "epoch": 0.03766478342749529, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.5360738039016724, |
| "learning_rate": 9.642184557438793e-07, |
| "loss": 0.0717, |
| "num_tokens": 2711697.0, |
| "reward": 0.3133315443992615, |
| "reward_std": 0.22658446431159973, |
| "rewards/acc_reward/mean": 0.23703500628471375, |
| "rewards/acc_reward/std": 0.3616204857826233, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 398.0, |
| "completions/mean_length": 222.515625, |
| "completions/mean_terminated_length": 213.1774139404297, |
| "completions/min_length": 115.0, |
| "completions/min_terminated_length": 115.0, |
| "entropy": 1.7109375, |
| "epoch": 0.03954802259887006, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.8884894847869873, |
| "learning_rate": 9.623352165725046e-07, |
| "loss": 0.0024, |
| "num_tokens": 2841242.0, |
| "reward": 0.5347622036933899, |
| "reward_std": 0.15656878054141998, |
| "rewards/acc_reward/mean": 0.4830690622329712, |
| "rewards/acc_reward/std": 0.31471821665763855, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 485.0, |
| "completions/mean_length": 226.703125, |
| "completions/mean_terminated_length": 222.1746063232422, |
| "completions/min_length": 90.0, |
| "completions/min_terminated_length": 90.0, |
| "entropy": 1.84375, |
| "epoch": 0.04143126177024482, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.8899277448654175, |
| "learning_rate": 9.6045197740113e-07, |
| "loss": 0.0705, |
| "num_tokens": 2963015.0, |
| "reward": 0.43775349855422974, |
| "reward_std": 0.2172819972038269, |
| "rewards/acc_reward/mean": 0.3752816617488861, |
| "rewards/acc_reward/std": 0.3713799715042114, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 229.5, |
| "completions/mean_terminated_length": 220.3870849609375, |
| "completions/min_length": 77.0, |
| "completions/min_terminated_length": 77.0, |
| "entropy": 1.7421875, |
| "epoch": 0.04331450094161959, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.6153416633605957, |
| "learning_rate": 9.58568738229755e-07, |
| "loss": -0.0054, |
| "num_tokens": 3109447.0, |
| "reward": 0.32855772972106934, |
| "reward_std": 0.27425360679626465, |
| "rewards/acc_reward/mean": 0.2556891441345215, |
| "rewards/acc_reward/std": 0.40467146039009094, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 510.0, |
| "completions/max_terminated_length": 510.0, |
| "completions/mean_length": 212.984375, |
| "completions/mean_terminated_length": 212.984375, |
| "completions/min_length": 94.0, |
| "completions/min_terminated_length": 94.0, |
| "entropy": 1.484375, |
| "epoch": 0.04519774011299435, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.8740508556365967, |
| "learning_rate": 9.566854990583804e-07, |
| "loss": 0.0035, |
| "num_tokens": 3255782.0, |
| "reward": 0.4697909355163574, |
| "reward_std": 0.18794436752796173, |
| "rewards/acc_reward/mean": 0.41261494159698486, |
| "rewards/acc_reward/std": 0.39874467253685, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 483.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 237.375, |
| "completions/mean_terminated_length": 237.375, |
| "completions/min_length": 42.0, |
| "completions/min_terminated_length": 42.0, |
| "entropy": 1.9375, |
| "epoch": 0.047080979284369114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9496995210647583, |
| "learning_rate": 9.548022598870055e-07, |
| "loss": 0.1125, |
| "num_tokens": 3398398.0, |
| "reward": 0.43061739206314087, |
| "reward_std": 0.25355520844459534, |
| "rewards/acc_reward/mean": 0.36735260486602783, |
| "rewards/acc_reward/std": 0.3482816219329834, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 483.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 236.890625, |
| "completions/mean_terminated_length": 236.890625, |
| "completions/min_length": 86.0, |
| "completions/min_terminated_length": 86.0, |
| "entropy": 1.765625, |
| "epoch": 0.04896421845574388, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.7656739950180054, |
| "learning_rate": 9.529190207156308e-07, |
| "loss": 0.0431, |
| "num_tokens": 3534743.0, |
| "reward": 0.49807009100914, |
| "reward_std": 0.23242174088954926, |
| "rewards/acc_reward/mean": 0.44230008125305176, |
| "rewards/acc_reward/std": 0.38975948095321655, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 398.0, |
| "completions/max_terminated_length": 398.0, |
| "completions/mean_length": 187.546875, |
| "completions/mean_terminated_length": 187.546875, |
| "completions/min_length": 74.0, |
| "completions/min_terminated_length": 74.0, |
| "entropy": 1.78125, |
| "epoch": 0.05084745762711865, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.076228618621826, |
| "learning_rate": 9.510357815442561e-07, |
| "loss": 0.0356, |
| "num_tokens": 3659290.0, |
| "reward": 0.44223499298095703, |
| "reward_std": 0.21930678188800812, |
| "rewards/acc_reward/mean": 0.383733332157135, |
| "rewards/acc_reward/std": 0.3709230422973633, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 484.0, |
| "completions/max_terminated_length": 484.0, |
| "completions/mean_length": 209.65625, |
| "completions/mean_terminated_length": 209.65625, |
| "completions/min_length": 37.0, |
| "completions/min_terminated_length": 37.0, |
| "entropy": 1.875, |
| "epoch": 0.05273069679849341, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.8770859241485596, |
| "learning_rate": 9.491525423728813e-07, |
| "loss": 0.0769, |
| "num_tokens": 3796772.0, |
| "reward": 0.41265854239463806, |
| "reward_std": 0.3168802857398987, |
| "rewards/acc_reward/mean": 0.34913450479507446, |
| "rewards/acc_reward/std": 0.42575517296791077, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 385.0, |
| "completions/max_terminated_length": 385.0, |
| "completions/mean_length": 192.75, |
| "completions/mean_terminated_length": 192.75, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 2.140625, |
| "epoch": 0.054613935969868174, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.1020395755767822, |
| "learning_rate": 9.472693032015065e-07, |
| "loss": 0.0712, |
| "num_tokens": 3925692.0, |
| "reward": 0.41072791814804077, |
| "reward_std": 0.3003063201904297, |
| "rewards/acc_reward/mean": 0.34698933362960815, |
| "rewards/acc_reward/std": 0.4590102732181549, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 479.0, |
| "completions/max_terminated_length": 479.0, |
| "completions/mean_length": 189.5625, |
| "completions/mean_terminated_length": 189.5625, |
| "completions/min_length": 81.0, |
| "completions/min_terminated_length": 81.0, |
| "entropy": 1.9609375, |
| "epoch": 0.05649717514124294, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.122809410095215, |
| "learning_rate": 9.453860640301318e-07, |
| "loss": -0.0219, |
| "num_tokens": 4063328.0, |
| "reward": 0.4698576331138611, |
| "reward_std": 0.30180490016937256, |
| "rewards/acc_reward/mean": 0.41095292568206787, |
| "rewards/acc_reward/std": 0.4547015130519867, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 492.0, |
| "completions/max_terminated_length": 492.0, |
| "completions/mean_length": 185.6875, |
| "completions/mean_terminated_length": 185.6875, |
| "completions/min_length": 88.0, |
| "completions/min_terminated_length": 88.0, |
| "entropy": 1.8359375, |
| "epoch": 0.0583804143126177, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.101128578186035, |
| "learning_rate": 9.43502824858757e-07, |
| "loss": 0.0325, |
| "num_tokens": 4191876.0, |
| "reward": 0.4844944179058075, |
| "reward_std": 0.30700868368148804, |
| "rewards/acc_reward/mean": 0.42895209789276123, |
| "rewards/acc_reward/std": 0.4517141282558441, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 435.0, |
| "completions/mean_length": 179.5, |
| "completions/mean_terminated_length": 174.22222900390625, |
| "completions/min_length": 35.0, |
| "completions/min_terminated_length": 35.0, |
| "entropy": 1.859375, |
| "epoch": 0.060263653483992465, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2371370792388916, |
| "learning_rate": 9.416195856873822e-07, |
| "loss": -0.0023, |
| "num_tokens": 4322148.0, |
| "reward": 0.5825260281562805, |
| "reward_std": 0.3706633746623993, |
| "rewards/acc_reward/mean": 0.536139965057373, |
| "rewards/acc_reward/std": 0.4555891752243042, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 482.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 188.6875, |
| "completions/mean_terminated_length": 188.6875, |
| "completions/min_length": 41.0, |
| "completions/min_terminated_length": 41.0, |
| "entropy": 1.625, |
| "epoch": 0.062146892655367235, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9997694492340088, |
| "learning_rate": 9.397363465160075e-07, |
| "loss": 0.0677, |
| "num_tokens": 4460208.0, |
| "reward": 0.6379782557487488, |
| "reward_std": 0.284807950258255, |
| "rewards/acc_reward/mean": 0.599489688873291, |
| "rewards/acc_reward/std": 0.42208555340766907, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 479.0, |
| "completions/max_terminated_length": 479.0, |
| "completions/mean_length": 206.375, |
| "completions/mean_terminated_length": 206.375, |
| "completions/min_length": 76.0, |
| "completions/min_terminated_length": 76.0, |
| "entropy": 2.28125, |
| "epoch": 0.064030131826742, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.249376058578491, |
| "learning_rate": 9.378531073446327e-07, |
| "loss": 0.0497, |
| "num_tokens": 4598760.0, |
| "reward": 0.6912428140640259, |
| "reward_std": 0.350800096988678, |
| "rewards/acc_reward/mean": 0.658672571182251, |
| "rewards/acc_reward/std": 0.45664042234420776, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 482.0, |
| "completions/max_terminated_length": 482.0, |
| "completions/mean_length": 171.703125, |
| "completions/mean_terminated_length": 171.703125, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 1.9609375, |
| "epoch": 0.06591337099811675, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 1.9761137962341309, |
| "learning_rate": 9.359698681732579e-07, |
| "loss": 0.0538, |
| "num_tokens": 4729973.0, |
| "reward": 0.7882718443870544, |
| "reward_std": 0.24334368109703064, |
| "rewards/acc_reward/mean": 0.7664825916290283, |
| "rewards/acc_reward/std": 0.34395766258239746, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 165.21875, |
| "completions/mean_terminated_length": 165.21875, |
| "completions/min_length": 84.0, |
| "completions/min_terminated_length": 84.0, |
| "entropy": 1.9375, |
| "epoch": 0.06779661016949153, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.3349790573120117, |
| "learning_rate": 9.340866290018831e-07, |
| "loss": 0.0651, |
| "num_tokens": 4870371.0, |
| "reward": 0.7641240358352661, |
| "reward_std": 0.3322174549102783, |
| "rewards/acc_reward/mean": 0.7396517395973206, |
| "rewards/acc_reward/std": 0.409751832485199, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 398.0, |
| "completions/mean_length": 164.234375, |
| "completions/mean_terminated_length": 158.71429443359375, |
| "completions/min_length": 42.0, |
| "completions/min_terminated_length": 42.0, |
| "entropy": 1.9765625, |
| "epoch": 0.0696798493408663, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.1828882694244385, |
| "learning_rate": 9.322033898305083e-07, |
| "loss": 0.0297, |
| "num_tokens": 5002066.0, |
| "reward": 0.7222064733505249, |
| "reward_std": 0.2505956292152405, |
| "rewards/acc_reward/mean": 0.6913405656814575, |
| "rewards/acc_reward/std": 0.3850165903568268, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 189.25, |
| "completions/mean_terminated_length": 184.1269989013672, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 2.203125, |
| "epoch": 0.07156308851224105, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.241468906402588, |
| "learning_rate": 9.303201506591337e-07, |
| "loss": 0.0657, |
| "num_tokens": 5135842.0, |
| "reward": 0.767461359500885, |
| "reward_std": 0.23975765705108643, |
| "rewards/acc_reward/mean": 0.74509596824646, |
| "rewards/acc_reward/std": 0.3569471538066864, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 334.0, |
| "completions/mean_length": 167.296875, |
| "completions/mean_terminated_length": 161.82540893554688, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.21875, |
| "epoch": 0.07344632768361582, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.625629186630249, |
| "learning_rate": 9.28436911487759e-07, |
| "loss": 0.0755, |
| "num_tokens": 5268213.0, |
| "reward": 0.7769017219543457, |
| "reward_std": 0.301779568195343, |
| "rewards/acc_reward/mean": 0.7555853128433228, |
| "rewards/acc_reward/std": 0.3923026919364929, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 512.0, |
| "completions/mean_length": 182.0, |
| "completions/mean_terminated_length": 165.77047729492188, |
| "completions/min_length": 73.0, |
| "completions/min_terminated_length": 73.0, |
| "entropy": 2.34375, |
| "epoch": 0.07532956685499058, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 1.8064289093017578, |
| "learning_rate": 9.265536723163842e-07, |
| "loss": -0.0294, |
| "num_tokens": 5400085.0, |
| "reward": 0.8888776302337646, |
| "reward_std": 0.2301492542028427, |
| "rewards/acc_reward/mean": 0.8782668113708496, |
| "rewards/acc_reward/std": 0.31358516216278076, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 377.0, |
| "completions/mean_length": 170.578125, |
| "completions/mean_terminated_length": 165.15872192382812, |
| "completions/min_length": 46.0, |
| "completions/min_terminated_length": 46.0, |
| "entropy": 2.375, |
| "epoch": 0.07721280602636535, |
| "frac_reward_zero_std": 0.375, |
| "grad_norm": 1.6493501663208008, |
| "learning_rate": 9.246704331450094e-07, |
| "loss": -0.034, |
| "num_tokens": 5529746.0, |
| "reward": 0.8990048170089722, |
| "reward_std": 0.1494266837835312, |
| "rewards/acc_reward/mean": 0.889519214630127, |
| "rewards/acc_reward/std": 0.26512882113456726, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 393.0, |
| "completions/max_terminated_length": 393.0, |
| "completions/mean_length": 158.859375, |
| "completions/mean_terminated_length": 158.859375, |
| "completions/min_length": 71.0, |
| "completions/min_terminated_length": 71.0, |
| "entropy": 2.4375, |
| "epoch": 0.07909604519774012, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.683705449104309, |
| "learning_rate": 9.227871939736346e-07, |
| "loss": 0.0056, |
| "num_tokens": 5668777.0, |
| "reward": 0.9233179092407227, |
| "reward_std": 0.14834892749786377, |
| "rewards/acc_reward/mean": 0.9165338277816772, |
| "rewards/acc_reward/std": 0.2692948877811432, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 460.0, |
| "completions/max_terminated_length": 460.0, |
| "completions/mean_length": 187.859375, |
| "completions/mean_terminated_length": 187.859375, |
| "completions/min_length": 44.0, |
| "completions/min_terminated_length": 44.0, |
| "entropy": 2.46875, |
| "epoch": 0.08097928436911488, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.3769292831420898, |
| "learning_rate": 9.209039548022598e-07, |
| "loss": 0.0487, |
| "num_tokens": 5800544.0, |
| "reward": 0.9674270153045654, |
| "reward_std": 0.048613592982292175, |
| "rewards/acc_reward/mean": 0.9672800302505493, |
| "rewards/acc_reward/std": 0.12451620399951935, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 453.0, |
| "completions/mean_length": 164.546875, |
| "completions/mean_terminated_length": 153.33871459960938, |
| "completions/min_length": 31.0, |
| "completions/min_terminated_length": 31.0, |
| "entropy": 2.671875, |
| "epoch": 0.08286252354048965, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.0136916637420654, |
| "learning_rate": 9.190207156308852e-07, |
| "loss": -0.0441, |
| "num_tokens": 5929459.0, |
| "reward": 0.9583332538604736, |
| "reward_std": 0.016743799671530724, |
| "rewards/acc_reward/mean": 0.9571758508682251, |
| "rewards/acc_reward/std": 0.0556831993162632, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 294.0, |
| "completions/max_terminated_length": 294.0, |
| "completions/mean_length": 166.890625, |
| "completions/mean_terminated_length": 166.890625, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 2.25, |
| "epoch": 0.0847457627118644, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.0560693740844727, |
| "learning_rate": 9.171374764595104e-07, |
| "loss": -0.041, |
| "num_tokens": 6061324.0, |
| "reward": 0.9419167041778564, |
| "reward_std": 0.08913865685462952, |
| "rewards/acc_reward/mean": 0.9458796381950378, |
| "rewards/acc_reward/std": 0.15155728161334991, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 462.0, |
| "completions/max_terminated_length": 462.0, |
| "completions/mean_length": 184.484375, |
| "completions/mean_terminated_length": 184.484375, |
| "completions/min_length": 74.0, |
| "completions/min_terminated_length": 74.0, |
| "entropy": 2.453125, |
| "epoch": 0.08662900188323917, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 1.4456909894943237, |
| "learning_rate": 9.152542372881356e-07, |
| "loss": 0.0865, |
| "num_tokens": 6199275.0, |
| "reward": 0.956250011920929, |
| "reward_std": 0.1237436980009079, |
| "rewards/acc_reward/mean": 0.953125, |
| "rewards/acc_reward/std": 0.21304203569889069, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 395.0, |
| "completions/mean_length": 182.65625, |
| "completions/mean_terminated_length": 172.03225708007812, |
| "completions/min_length": 47.0, |
| "completions/min_terminated_length": 47.0, |
| "entropy": 2.734375, |
| "epoch": 0.08851224105461393, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.9000487923622131, |
| "learning_rate": 9.133709981167608e-07, |
| "loss": 0.0061, |
| "num_tokens": 6339829.0, |
| "reward": 0.9662767648696899, |
| "reward_std": 0.039774756878614426, |
| "rewards/acc_reward/mean": 0.9625297784805298, |
| "rewards/acc_reward/std": 0.12811994552612305, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 360.0, |
| "completions/mean_length": 174.3125, |
| "completions/mean_terminated_length": 168.952392578125, |
| "completions/min_length": 31.0, |
| "completions/min_terminated_length": 31.0, |
| "entropy": 2.546875, |
| "epoch": 0.0903954802259887, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.0134769678115845, |
| "learning_rate": 9.11487758945386e-07, |
| "loss": -0.0181, |
| "num_tokens": 6473609.0, |
| "reward": 0.9744918346405029, |
| "reward_std": 0.008838837035000324, |
| "rewards/acc_reward/mean": 0.9751298427581787, |
| "rewards/acc_reward/std": 0.04131903871893883, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 454.0, |
| "completions/mean_length": 188.421875, |
| "completions/mean_terminated_length": 177.98385620117188, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 2.625, |
| "epoch": 0.09227871939736347, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.7283865213394165, |
| "learning_rate": 9.096045197740112e-07, |
| "loss": 0.0749, |
| "num_tokens": 6607932.0, |
| "reward": 0.9752188920974731, |
| "reward_std": 0.015992172062397003, |
| "rewards/acc_reward/mean": 0.981145977973938, |
| "rewards/acc_reward/std": 0.025175929069519043, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 509.0, |
| "completions/max_terminated_length": 509.0, |
| "completions/mean_length": 194.421875, |
| "completions/mean_terminated_length": 194.421875, |
| "completions/min_length": 20.0, |
| "completions/min_terminated_length": 20.0, |
| "entropy": 2.53125, |
| "epoch": 0.09416195856873823, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.1576530933380127, |
| "learning_rate": 9.077212806026365e-07, |
| "loss": 0.0122, |
| "num_tokens": 6752119.0, |
| "reward": 0.9867604374885559, |
| "reward_std": 0.007432654500007629, |
| "rewards/acc_reward/mean": 0.9870254993438721, |
| "rewards/acc_reward/std": 0.017179692164063454, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 509.0, |
| "completions/max_terminated_length": 509.0, |
| "completions/mean_length": 186.71875, |
| "completions/mean_terminated_length": 186.71875, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 2.78125, |
| "epoch": 0.096045197740113, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 2.141554355621338, |
| "learning_rate": 9.058380414312617e-07, |
| "loss": -0.0311, |
| "num_tokens": 6890533.0, |
| "reward": 0.9580291509628296, |
| "reward_std": 0.0707060694694519, |
| "rewards/acc_reward/mean": 0.9637823700904846, |
| "rewards/acc_reward/std": 0.13111592829227448, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 307.0, |
| "completions/mean_length": 169.390625, |
| "completions/mean_terminated_length": 163.952392578125, |
| "completions/min_length": 32.0, |
| "completions/min_terminated_length": 32.0, |
| "entropy": 2.421875, |
| "epoch": 0.09792843691148775, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.8654880523681641, |
| "learning_rate": 9.03954802259887e-07, |
| "loss": 0.0053, |
| "num_tokens": 7027358.0, |
| "reward": 0.9961555004119873, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9974644780158997, |
| "rewards/acc_reward/std": 0.006761432159692049, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 464.0, |
| "completions/mean_length": 176.078125, |
| "completions/mean_terminated_length": 170.74603271484375, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 2.609375, |
| "epoch": 0.09981167608286252, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.4219504594802856, |
| "learning_rate": 9.020715630885122e-07, |
| "loss": -0.0008, |
| "num_tokens": 7162123.0, |
| "reward": 0.9900810718536377, |
| "reward_std": 0.021838055923581123, |
| "rewards/acc_reward/mean": 0.9924511909484863, |
| "rewards/acc_reward/std": 0.04345937818288803, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 428.0, |
| "completions/mean_length": 171.125, |
| "completions/mean_terminated_length": 165.71429443359375, |
| "completions/min_length": 43.0, |
| "completions/min_terminated_length": 43.0, |
| "entropy": 2.453125, |
| "epoch": 0.1016949152542373, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.9306376576423645, |
| "learning_rate": 9.001883239171374e-07, |
| "loss": 0.0039, |
| "num_tokens": 7294099.0, |
| "reward": 0.9847477674484253, |
| "reward_std": 0.005786377005279064, |
| "rewards/acc_reward/mean": 0.986525297164917, |
| "rewards/acc_reward/std": 0.02815171889960766, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 317.0, |
| "completions/max_terminated_length": 317.0, |
| "completions/mean_length": 185.28125, |
| "completions/mean_terminated_length": 185.28125, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 2.59375, |
| "epoch": 0.10357815442561205, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.0906463861465454, |
| "learning_rate": 8.983050847457627e-07, |
| "loss": -0.0101, |
| "num_tokens": 7421381.0, |
| "reward": 0.9820950031280518, |
| "reward_std": 0.013950306922197342, |
| "rewards/acc_reward/mean": 0.9818416833877563, |
| "rewards/acc_reward/std": 0.03777196630835533, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 512.0, |
| "completions/mean_length": 189.296875, |
| "completions/mean_terminated_length": 189.296875, |
| "completions/min_length": 47.0, |
| "completions/min_terminated_length": 47.0, |
| "entropy": 2.71875, |
| "epoch": 0.10546139359698682, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.0221842527389526, |
| "learning_rate": 8.964218455743879e-07, |
| "loss": -0.0019, |
| "num_tokens": 7550840.0, |
| "reward": 0.9848359823226929, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9848872423171997, |
| "rewards/acc_reward/std": 0.026450620964169502, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 412.0, |
| "completions/max_terminated_length": 412.0, |
| "completions/mean_length": 204.6875, |
| "completions/mean_terminated_length": 204.6875, |
| "completions/min_length": 76.0, |
| "completions/min_terminated_length": 76.0, |
| "entropy": 2.609375, |
| "epoch": 0.10734463276836158, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.1911293268203735, |
| "learning_rate": 8.945386064030131e-07, |
| "loss": 0.0145, |
| "num_tokens": 7693764.0, |
| "reward": 0.9755189418792725, |
| "reward_std": 0.04419417679309845, |
| "rewards/acc_reward/mean": 0.9745348691940308, |
| "rewards/acc_reward/std": 0.1250728815793991, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 452.0, |
| "completions/max_terminated_length": 452.0, |
| "completions/mean_length": 210.546875, |
| "completions/mean_terminated_length": 210.546875, |
| "completions/min_length": 87.0, |
| "completions/min_terminated_length": 87.0, |
| "entropy": 2.65625, |
| "epoch": 0.10922787193973635, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.7357279062271118, |
| "learning_rate": 8.926553672316383e-07, |
| "loss": 0.0164, |
| "num_tokens": 7828263.0, |
| "reward": 0.9803258180618286, |
| "reward_std": 0.007434290833771229, |
| "rewards/acc_reward/mean": 0.9781398177146912, |
| "rewards/acc_reward/std": 0.03639683872461319, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 289.0, |
| "completions/max_terminated_length": 289.0, |
| "completions/mean_length": 154.046875, |
| "completions/mean_terminated_length": 154.046875, |
| "completions/min_length": 48.0, |
| "completions/min_terminated_length": 48.0, |
| "entropy": 2.5625, |
| "epoch": 0.1111111111111111, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.907721280602636e-07, |
| "loss": 0.0, |
| "num_tokens": 7968106.0, |
| "reward": 0.9852668046951294, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9836297631263733, |
| "rewards/acc_reward/std": 0.024946285411715508, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 449.0, |
| "completions/mean_length": 177.296875, |
| "completions/mean_terminated_length": 166.5, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 2.546875, |
| "epoch": 0.11299435028248588, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.5839695930480957, |
| "learning_rate": 8.888888888888888e-07, |
| "loss": 0.0779, |
| "num_tokens": 8098157.0, |
| "reward": 0.9858125448226929, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9859722852706909, |
| "rewards/acc_reward/std": 0.019654173403978348, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 385.0, |
| "completions/mean_length": 167.1875, |
| "completions/mean_terminated_length": 161.71429443359375, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.765625, |
| "epoch": 0.11487758945386065, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.4451484680175781, |
| "learning_rate": 8.870056497175141e-07, |
| "loss": -0.0, |
| "num_tokens": 8227641.0, |
| "reward": 0.9645360708236694, |
| "reward_std": 0.008838837035000324, |
| "rewards/acc_reward/mean": 0.964067816734314, |
| "rewards/acc_reward/std": 0.040923893451690674, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 459.0, |
| "completions/max_terminated_length": 459.0, |
| "completions/mean_length": 181.734375, |
| "completions/mean_terminated_length": 181.734375, |
| "completions/min_length": 16.0, |
| "completions/min_terminated_length": 16.0, |
| "entropy": 2.625, |
| "epoch": 0.1167608286252354, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.3860294222831726, |
| "learning_rate": 8.851224105461393e-07, |
| "loss": -0.0431, |
| "num_tokens": 8359496.0, |
| "reward": 0.991644024848938, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9924516677856445, |
| "rewards/acc_reward/std": 0.020128827542066574, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 499.0, |
| "completions/mean_length": 207.34375, |
| "completions/mean_terminated_length": 202.50794982910156, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "entropy": 2.75, |
| "epoch": 0.11864406779661017, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6225361227989197, |
| "learning_rate": 8.832391713747645e-07, |
| "loss": -0.0011, |
| "num_tokens": 8500670.0, |
| "reward": 0.9970052242279053, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9984085559844971, |
| "rewards/acc_reward/std": 0.004243890754878521, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 369.0, |
| "completions/mean_length": 169.6875, |
| "completions/mean_terminated_length": 164.25396728515625, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "entropy": 2.3125, |
| "epoch": 0.12052730696798493, |
| "frac_reward_zero_std": 0.625, |
| "grad_norm": 1.537191390991211, |
| "learning_rate": 8.813559322033897e-07, |
| "loss": 0.0021, |
| "num_tokens": 8631754.0, |
| "reward": 0.9799120426177979, |
| "reward_std": 0.013258256018161774, |
| "rewards/acc_reward/mean": 0.982888400554657, |
| "rewards/acc_reward/std": 0.022922541946172714, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 500.0, |
| "completions/max_terminated_length": 500.0, |
| "completions/mean_length": 172.59375, |
| "completions/mean_terminated_length": 172.59375, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 2.609375, |
| "epoch": 0.1224105461393597, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.79472693032015e-07, |
| "loss": 0.0, |
| "num_tokens": 8762784.0, |
| "reward": 0.9871211051940918, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.985690176486969, |
| "rewards/acc_reward/std": 0.017727959901094437, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 405.0, |
| "completions/max_terminated_length": 405.0, |
| "completions/mean_length": 181.234375, |
| "completions/mean_terminated_length": 181.234375, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.453125, |
| "epoch": 0.12429378531073447, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.775894538606404e-07, |
| "loss": 0.0, |
| "num_tokens": 8887127.0, |
| "reward": 0.9676999449729919, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9641109704971313, |
| "rewards/acc_reward/std": 0.03724094480276108, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 163.078125, |
| "completions/mean_terminated_length": 157.53968811035156, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.296875, |
| "epoch": 0.12617702448210924, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6480050086975098, |
| "learning_rate": 8.757062146892656e-07, |
| "loss": -0.0023, |
| "num_tokens": 9024028.0, |
| "reward": 0.9807539582252502, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9803516268730164, |
| "rewards/acc_reward/std": 0.02998371422290802, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 382.0, |
| "completions/max_terminated_length": 382.0, |
| "completions/mean_length": 150.859375, |
| "completions/mean_terminated_length": 150.859375, |
| "completions/min_length": 48.0, |
| "completions/min_terminated_length": 48.0, |
| "entropy": 2.375, |
| "epoch": 0.128060263653484, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.738229755178908e-07, |
| "loss": 0.0, |
| "num_tokens": 9149267.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 1.0, |
| "rewards/acc_reward/std": 0.0, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 347.0, |
| "completions/max_terminated_length": 347.0, |
| "completions/mean_length": 156.34375, |
| "completions/mean_terminated_length": 156.34375, |
| "completions/min_length": 35.0, |
| "completions/min_terminated_length": 35.0, |
| "entropy": 2.359375, |
| "epoch": 0.12994350282485875, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.1719481945037842, |
| "learning_rate": 8.71939736346516e-07, |
| "loss": -0.0147, |
| "num_tokens": 9283337.0, |
| "reward": 0.9746325016021729, |
| "reward_std": 0.04419417679309845, |
| "rewards/acc_reward/mean": 0.9735499620437622, |
| "rewards/acc_reward/std": 0.1257747858762741, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 427.0, |
| "completions/max_terminated_length": 427.0, |
| "completions/mean_length": 153.515625, |
| "completions/mean_terminated_length": 153.515625, |
| "completions/min_length": 23.0, |
| "completions/min_terminated_length": 23.0, |
| "entropy": 2.53125, |
| "epoch": 0.1318267419962335, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.700564971751412e-07, |
| "loss": 0.0, |
| "num_tokens": 9422506.0, |
| "reward": 0.9936791658401489, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9929767847061157, |
| "rewards/acc_reward/std": 0.018728474155068398, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 311.0, |
| "completions/max_terminated_length": 311.0, |
| "completions/mean_length": 159.90625, |
| "completions/mean_terminated_length": 159.90625, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.234375, |
| "epoch": 0.1337099811676083, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.681732580037664e-07, |
| "loss": 0.0, |
| "num_tokens": 9558724.0, |
| "reward": 0.9724128246307373, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9693475365638733, |
| "rewards/acc_reward/std": 0.049584269523620605, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 393.0, |
| "completions/max_terminated_length": 393.0, |
| "completions/mean_length": 171.421875, |
| "completions/mean_terminated_length": 171.421875, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 2.65625, |
| "epoch": 0.13559322033898305, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.662900188323917e-07, |
| "loss": 0.0, |
| "num_tokens": 9696479.0, |
| "reward": 0.9782567620277405, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9758408665657043, |
| "rewards/acc_reward/std": 0.035969410091638565, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 355.0, |
| "completions/max_terminated_length": 355.0, |
| "completions/mean_length": 146.640625, |
| "completions/mean_terminated_length": 146.640625, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "entropy": 2.1875, |
| "epoch": 0.1374764595103578, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.64406779661017e-07, |
| "loss": 0.0, |
| "num_tokens": 9833768.0, |
| "reward": 0.9815881252288818, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9795423150062561, |
| "rewards/acc_reward/std": 0.03601390868425369, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 361.0, |
| "completions/mean_length": 159.484375, |
| "completions/mean_terminated_length": 153.88888549804688, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.40625, |
| "epoch": 0.1393596986817326, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.625235404896422e-07, |
| "loss": 0.0, |
| "num_tokens": 9965639.0, |
| "reward": 0.9888086318969727, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9875651597976685, |
| "rewards/acc_reward/std": 0.02026844024658203, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 368.0, |
| "completions/max_terminated_length": 368.0, |
| "completions/mean_length": 159.8125, |
| "completions/mean_terminated_length": 159.8125, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 2.28125, |
| "epoch": 0.14124293785310735, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.606403013182674e-07, |
| "loss": 0.0, |
| "num_tokens": 10102131.0, |
| "reward": 0.9853819608688354, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.983757734298706, |
| "rewards/acc_reward/std": 0.023104524239897728, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 456.0, |
| "completions/mean_length": 184.40625, |
| "completions/mean_terminated_length": 173.8386993408203, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 2.390625, |
| "epoch": 0.1431261770244821, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.7094928026199341, |
| "learning_rate": 8.587570621468926e-07, |
| "loss": -0.0279, |
| "num_tokens": 10234637.0, |
| "reward": 0.9758157730102539, |
| "reward_std": 0.009495548903942108, |
| "rewards/acc_reward/mean": 0.973128616809845, |
| "rewards/acc_reward/std": 0.04172620549798012, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 493.0, |
| "completions/max_terminated_length": 493.0, |
| "completions/mean_length": 187.34375, |
| "completions/mean_terminated_length": 187.34375, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.484375, |
| "epoch": 0.14500941619585686, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6446596384048462, |
| "learning_rate": 8.568738229755178e-07, |
| "loss": -0.0238, |
| "num_tokens": 10380091.0, |
| "reward": 0.9779398441314697, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9772248268127441, |
| "rewards/acc_reward/std": 0.026913031935691833, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 433.0, |
| "completions/max_terminated_length": 433.0, |
| "completions/mean_length": 161.046875, |
| "completions/mean_terminated_length": 161.046875, |
| "completions/min_length": 71.0, |
| "completions/min_terminated_length": 71.0, |
| "entropy": 2.359375, |
| "epoch": 0.14689265536723164, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.9724075198173523, |
| "learning_rate": 8.549905838041431e-07, |
| "loss": 0.0269, |
| "num_tokens": 10505822.0, |
| "reward": 0.988434910774231, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9888859987258911, |
| "rewards/acc_reward/std": 0.024498289451003075, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 446.0, |
| "completions/max_terminated_length": 446.0, |
| "completions/mean_length": 159.9375, |
| "completions/mean_terminated_length": 159.9375, |
| "completions/min_length": 40.0, |
| "completions/min_terminated_length": 40.0, |
| "entropy": 2.375, |
| "epoch": 0.1487758945386064, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.531073446327683e-07, |
| "loss": 0.0, |
| "num_tokens": 10638682.0, |
| "reward": 0.990105152130127, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9890057444572449, |
| "rewards/acc_reward/std": 0.019625553861260414, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 471.0, |
| "completions/mean_length": 165.125, |
| "completions/mean_terminated_length": 159.6190643310547, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 2.734375, |
| "epoch": 0.15065913370998116, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6837416291236877, |
| "learning_rate": 8.512241054613935e-07, |
| "loss": -0.0191, |
| "num_tokens": 10769954.0, |
| "reward": 0.98872971534729, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9892135858535767, |
| "rewards/acc_reward/std": 0.02876383066177368, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 413.0, |
| "completions/mean_length": 184.4375, |
| "completions/mean_terminated_length": 179.23809814453125, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "entropy": 2.625, |
| "epoch": 0.15254237288135594, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.493408662900188e-07, |
| "loss": 0.0, |
| "num_tokens": 10908782.0, |
| "reward": 0.9871925115585327, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.985769510269165, |
| "rewards/acc_reward/std": 0.02491430565714836, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 339.0, |
| "completions/max_terminated_length": 339.0, |
| "completions/mean_length": 155.265625, |
| "completions/mean_terminated_length": 155.265625, |
| "completions/min_length": 73.0, |
| "completions/min_terminated_length": 73.0, |
| "entropy": 2.4375, |
| "epoch": 0.1544256120527307, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.47457627118644e-07, |
| "loss": 0.0, |
| "num_tokens": 11042303.0, |
| "reward": 0.9902667999267578, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9891853332519531, |
| "rewards/acc_reward/std": 0.015684949234128, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 369.0, |
| "completions/max_terminated_length": 369.0, |
| "completions/mean_length": 153.015625, |
| "completions/mean_terminated_length": 153.015625, |
| "completions/min_length": 48.0, |
| "completions/min_terminated_length": 48.0, |
| "entropy": 2.5, |
| "epoch": 0.15630885122410546, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6075497269630432, |
| "learning_rate": 8.455743879472693e-07, |
| "loss": -0.027, |
| "num_tokens": 11173760.0, |
| "reward": 0.9887193441390991, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.989202082157135, |
| "rewards/acc_reward/std": 0.012971931137144566, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 297.0, |
| "completions/max_terminated_length": 297.0, |
| "completions/mean_length": 142.296875, |
| "completions/mean_terminated_length": 142.296875, |
| "completions/min_length": 38.0, |
| "completions/min_terminated_length": 38.0, |
| "entropy": 2.265625, |
| "epoch": 0.15819209039548024, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.436911487758945e-07, |
| "loss": 0.0, |
| "num_tokens": 11310011.0, |
| "reward": 0.9917968511581421, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9908854365348816, |
| "rewards/acc_reward/std": 0.01598946936428547, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 244.0, |
| "completions/max_terminated_length": 244.0, |
| "completions/mean_length": 138.125, |
| "completions/mean_terminated_length": 138.125, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.265625, |
| "epoch": 0.160075329566855, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6712620854377747, |
| "learning_rate": 8.418079096045197e-07, |
| "loss": -0.0066, |
| "num_tokens": 11440675.0, |
| "reward": 0.9767186641693115, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9758679866790771, |
| "rewards/acc_reward/std": 0.04487896338105202, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 451.0, |
| "completions/max_terminated_length": 451.0, |
| "completions/mean_length": 172.578125, |
| "completions/mean_terminated_length": 172.578125, |
| "completions/min_length": 48.0, |
| "completions/min_terminated_length": 48.0, |
| "entropy": 2.640625, |
| "epoch": 0.16195856873822975, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.770696222782135, |
| "learning_rate": 8.399246704331449e-07, |
| "loss": -0.0185, |
| "num_tokens": 11569064.0, |
| "reward": 0.9869691133499146, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9872573614120483, |
| "rewards/acc_reward/std": 0.022249845787882805, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 426.0, |
| "completions/max_terminated_length": 426.0, |
| "completions/mean_length": 172.6875, |
| "completions/mean_terminated_length": 172.6875, |
| "completions/min_length": 41.0, |
| "completions/min_terminated_length": 41.0, |
| "entropy": 2.3125, |
| "epoch": 0.1638418079096045, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.380414312617701e-07, |
| "loss": 0.0, |
| "num_tokens": 11708700.0, |
| "reward": 0.9803584814071655, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9781760573387146, |
| "rewards/acc_reward/std": 0.02318427711725235, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 276.0, |
| "completions/mean_length": 155.0625, |
| "completions/mean_terminated_length": 143.5483856201172, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 2.40625, |
| "epoch": 0.1657250470809793, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.9133210778236389, |
| "learning_rate": 8.361581920903954e-07, |
| "loss": 0.0155, |
| "num_tokens": 11832608.0, |
| "reward": 0.9537662267684937, |
| "reward_std": 0.006725744344294071, |
| "rewards/acc_reward/mean": 0.9486291408538818, |
| "rewards/acc_reward/std": 0.06949032843112946, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 420.0, |
| "completions/max_terminated_length": 420.0, |
| "completions/mean_length": 170.796875, |
| "completions/mean_terminated_length": 170.796875, |
| "completions/min_length": 22.0, |
| "completions/min_terminated_length": 22.0, |
| "entropy": 2.359375, |
| "epoch": 0.16760828625235405, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.9491801857948303, |
| "learning_rate": 8.342749529190208e-07, |
| "loss": -0.0359, |
| "num_tokens": 11968843.0, |
| "reward": 0.9640452265739441, |
| "reward_std": 0.010205795988440514, |
| "rewards/acc_reward/mean": 0.9652585983276367, |
| "rewards/acc_reward/std": 0.046331144869327545, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 275.0, |
| "completions/max_terminated_length": 275.0, |
| "completions/mean_length": 151.96875, |
| "completions/mean_terminated_length": 151.96875, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "entropy": 2.3125, |
| "epoch": 0.1694915254237288, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.32391713747646e-07, |
| "loss": 0.0, |
| "num_tokens": 12106473.0, |
| "reward": 0.9978047609329224, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9975607991218567, |
| "rewards/acc_reward/std": 0.006504515651613474, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 505.0, |
| "completions/max_terminated_length": 505.0, |
| "completions/mean_length": 170.75, |
| "completions/mean_terminated_length": 170.75, |
| "completions/min_length": 49.0, |
| "completions/min_terminated_length": 49.0, |
| "entropy": 2.484375, |
| "epoch": 0.1713747645951036, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.305084745762712e-07, |
| "loss": 0.0, |
| "num_tokens": 12241945.0, |
| "reward": 0.9895379543304443, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9883755445480347, |
| "rewards/acc_reward/std": 0.018741585314273834, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 324.0, |
| "completions/max_terminated_length": 324.0, |
| "completions/mean_length": 165.765625, |
| "completions/mean_terminated_length": 165.765625, |
| "completions/min_length": 77.0, |
| "completions/min_terminated_length": 77.0, |
| "entropy": 2.34375, |
| "epoch": 0.17325800376647835, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.286252354048964e-07, |
| "loss": 0.0, |
| "num_tokens": 12375658.0, |
| "reward": 0.9758948087692261, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9732165336608887, |
| "rewards/acc_reward/std": 0.04799450561404228, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 451.0, |
| "completions/mean_length": 195.265625, |
| "completions/mean_terminated_length": 190.23809814453125, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "entropy": 2.46875, |
| "epoch": 0.1751412429378531, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.267419962335216e-07, |
| "loss": 0.0, |
| "num_tokens": 12513219.0, |
| "reward": 0.9753564596176147, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9726182222366333, |
| "rewards/acc_reward/std": 0.045861802995204926, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 155.5, |
| "completions/mean_terminated_length": 155.5, |
| "completions/min_length": 77.0, |
| "completions/min_terminated_length": 77.0, |
| "entropy": 2.1875, |
| "epoch": 0.17702448210922786, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.248587570621468e-07, |
| "loss": 0.0, |
| "num_tokens": 12648475.0, |
| "reward": 0.9844207763671875, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9826897382736206, |
| "rewards/acc_reward/std": 0.02190142311155796, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 508.0, |
| "completions/mean_length": 187.34375, |
| "completions/mean_terminated_length": 182.19049072265625, |
| "completions/min_length": 30.0, |
| "completions/min_terminated_length": 30.0, |
| "entropy": 2.5625, |
| "epoch": 0.17890772128060264, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.2475651502609253, |
| "learning_rate": 8.229755178907722e-07, |
| "loss": -0.0212, |
| "num_tokens": 12785969.0, |
| "reward": 0.9898383617401123, |
| "reward_std": 0.008838837035000324, |
| "rewards/acc_reward/mean": 0.9921815395355225, |
| "rewards/acc_reward/std": 0.015049039386212826, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 368.0, |
| "completions/max_terminated_length": 368.0, |
| "completions/mean_length": 153.0, |
| "completions/mean_terminated_length": 153.0, |
| "completions/min_length": 45.0, |
| "completions/min_terminated_length": 45.0, |
| "entropy": 2.34375, |
| "epoch": 0.1807909604519774, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.210922787193974e-07, |
| "loss": 0.0, |
| "num_tokens": 12907345.0, |
| "reward": 0.9843592047691345, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9826213121414185, |
| "rewards/acc_reward/std": 0.03606174886226654, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 474.0, |
| "completions/max_terminated_length": 474.0, |
| "completions/mean_length": 168.28125, |
| "completions/mean_terminated_length": 168.28125, |
| "completions/min_length": 50.0, |
| "completions/min_terminated_length": 50.0, |
| "entropy": 2.171875, |
| "epoch": 0.18267419962335216, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.192090395480226e-07, |
| "loss": 0.0, |
| "num_tokens": 13033539.0, |
| "reward": 0.9740588665008545, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9711765050888062, |
| "rewards/acc_reward/std": 0.033579710870981216, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 367.0, |
| "completions/mean_length": 164.03125, |
| "completions/mean_terminated_length": 158.5079345703125, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "entropy": 2.546875, |
| "epoch": 0.18455743879472694, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.173258003766478e-07, |
| "loss": 0.0, |
| "num_tokens": 13159461.0, |
| "reward": 0.983502984046936, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9816700220108032, |
| "rewards/acc_reward/std": 0.024904713034629822, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 322.0, |
| "completions/mean_length": 174.546875, |
| "completions/mean_terminated_length": 169.19049072265625, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 2.140625, |
| "epoch": 0.1864406779661017, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.15442561205273e-07, |
| "loss": 0.0, |
| "num_tokens": 13287976.0, |
| "reward": 0.9533977508544922, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9482197761535645, |
| "rewards/acc_reward/std": 0.043142788112163544, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 382.0, |
| "completions/mean_length": 161.15625, |
| "completions/mean_terminated_length": 155.58731079101562, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 2.25, |
| "epoch": 0.18832391713747645, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.5508601665496826, |
| "learning_rate": 8.135593220338983e-07, |
| "loss": -0.0178, |
| "num_tokens": 13424754.0, |
| "reward": 0.9814343452453613, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9811076521873474, |
| "rewards/acc_reward/std": 0.01872284896671772, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 310.0, |
| "completions/max_terminated_length": 310.0, |
| "completions/mean_length": 152.96875, |
| "completions/mean_terminated_length": 152.96875, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.40625, |
| "epoch": 0.1902071563088512, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.733428418636322, |
| "learning_rate": 8.116760828625235e-07, |
| "loss": -0.0236, |
| "num_tokens": 13554768.0, |
| "reward": 0.9719411730766296, |
| "reward_std": 0.039774756878614426, |
| "rewards/acc_reward/mean": 0.9688235521316528, |
| "rewards/acc_reward/std": 0.1259699910879135, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 331.0, |
| "completions/mean_length": 163.6875, |
| "completions/mean_terminated_length": 158.1587371826172, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 2.28125, |
| "epoch": 0.192090395480226, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.097928436911488e-07, |
| "loss": 0.0, |
| "num_tokens": 13684028.0, |
| "reward": 0.9804370403289795, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9782633781433105, |
| "rewards/acc_reward/std": 0.03396952524781227, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 442.0, |
| "completions/mean_length": 189.28125, |
| "completions/mean_terminated_length": 178.8709716796875, |
| "completions/min_length": 43.0, |
| "completions/min_terminated_length": 43.0, |
| "entropy": 2.5625, |
| "epoch": 0.19397363465160075, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.07909604519774e-07, |
| "loss": 0.0, |
| "num_tokens": 13813006.0, |
| "reward": 0.9762270450592041, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9735856056213379, |
| "rewards/acc_reward/std": 0.05650464445352554, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 495.0, |
| "completions/max_terminated_length": 495.0, |
| "completions/mean_length": 176.3125, |
| "completions/mean_terminated_length": 176.3125, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.515625, |
| "epoch": 0.1958568738229755, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.060263653483992e-07, |
| "loss": 0.0, |
| "num_tokens": 13945954.0, |
| "reward": 0.9901642799377441, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9890714883804321, |
| "rewards/acc_reward/std": 0.014226200059056282, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 385.0, |
| "completions/max_terminated_length": 385.0, |
| "completions/mean_length": 175.34375, |
| "completions/mean_terminated_length": 175.34375, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 2.515625, |
| "epoch": 0.1977401129943503, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.041431261770244e-07, |
| "loss": 0.0, |
| "num_tokens": 14080760.0, |
| "reward": 0.9843112826347351, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.982568085193634, |
| "rewards/acc_reward/std": 0.029767252504825592, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 427.0, |
| "completions/max_terminated_length": 427.0, |
| "completions/mean_length": 162.34375, |
| "completions/mean_terminated_length": 162.34375, |
| "completions/min_length": 37.0, |
| "completions/min_terminated_length": 37.0, |
| "entropy": 2.25, |
| "epoch": 0.19962335216572505, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.022598870056497e-07, |
| "loss": 0.0, |
| "num_tokens": 14202574.0, |
| "reward": 0.9882901310920715, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9869890213012695, |
| "rewards/acc_reward/std": 0.017871392890810966, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 393.0, |
| "completions/max_terminated_length": 393.0, |
| "completions/mean_length": 177.140625, |
| "completions/mean_terminated_length": 177.140625, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 2.515625, |
| "epoch": 0.2015065913370998, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 8.003766478342749e-07, |
| "loss": 0.0, |
| "num_tokens": 14339895.0, |
| "reward": 0.9751439094543457, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.972382128238678, |
| "rewards/acc_reward/std": 0.04191429913043976, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 457.0, |
| "completions/max_terminated_length": 457.0, |
| "completions/mean_length": 182.734375, |
| "completions/mean_terminated_length": 182.734375, |
| "completions/min_length": 71.0, |
| "completions/min_terminated_length": 71.0, |
| "entropy": 2.46875, |
| "epoch": 0.2033898305084746, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.984934086629001e-07, |
| "loss": 0.0, |
| "num_tokens": 14480414.0, |
| "reward": 0.9581470489501953, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9534966945648193, |
| "rewards/acc_reward/std": 0.06734198331832886, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 315.0, |
| "completions/max_terminated_length": 315.0, |
| "completions/mean_length": 150.65625, |
| "completions/mean_terminated_length": 150.65625, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 2.203125, |
| "epoch": 0.20527306967984935, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.966101694915253e-07, |
| "loss": 0.0, |
| "num_tokens": 14604040.0, |
| "reward": 0.9401878118515015, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9335420727729797, |
| "rewards/acc_reward/std": 0.09204845130443573, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 350.0, |
| "completions/max_terminated_length": 350.0, |
| "completions/mean_length": 153.203125, |
| "completions/mean_terminated_length": 153.203125, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.28125, |
| "epoch": 0.2071563088512241, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.947269303201506e-07, |
| "loss": 0.0, |
| "num_tokens": 14742229.0, |
| "reward": 0.9852752685546875, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9836391806602478, |
| "rewards/acc_reward/std": 0.04362887144088745, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 336.0, |
| "completions/max_terminated_length": 336.0, |
| "completions/mean_length": 170.71875, |
| "completions/mean_terminated_length": 170.71875, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "entropy": 2.375, |
| "epoch": 0.20903954802259886, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.928436911487758e-07, |
| "loss": 0.0, |
| "num_tokens": 14873859.0, |
| "reward": 0.9933172464370728, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9925747513771057, |
| "rewards/acc_reward/std": 0.010412875562906265, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 349.0, |
| "completions/max_terminated_length": 349.0, |
| "completions/mean_length": 168.53125, |
| "completions/mean_terminated_length": 168.53125, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 2.5, |
| "epoch": 0.21092278719397364, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.909604519774011e-07, |
| "loss": 0.0, |
| "num_tokens": 15006189.0, |
| "reward": 0.9692245125770569, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9658049941062927, |
| "rewards/acc_reward/std": 0.0648469477891922, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 379.0, |
| "completions/max_terminated_length": 379.0, |
| "completions/mean_length": 169.125, |
| "completions/mean_terminated_length": 169.125, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 2.3125, |
| "epoch": 0.2128060263653484, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.890772128060263e-07, |
| "loss": 0.0, |
| "num_tokens": 15144437.0, |
| "reward": 0.9728338718414307, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9698153734207153, |
| "rewards/acc_reward/std": 0.040937572717666626, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 362.0, |
| "completions/max_terminated_length": 362.0, |
| "completions/mean_length": 158.234375, |
| "completions/mean_terminated_length": 158.234375, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 2.421875, |
| "epoch": 0.21468926553672316, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.871939736346515e-07, |
| "loss": 0.0, |
| "num_tokens": 15266348.0, |
| "reward": 0.9900326728820801, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9889251589775085, |
| "rewards/acc_reward/std": 0.011797359213232994, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 421.0, |
| "completions/mean_length": 184.890625, |
| "completions/mean_terminated_length": 179.69842529296875, |
| "completions/min_length": 35.0, |
| "completions/min_terminated_length": 35.0, |
| "entropy": 2.609375, |
| "epoch": 0.21657250470809794, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.853107344632767e-07, |
| "loss": 0.0, |
| "num_tokens": 15406085.0, |
| "reward": 0.9864563941955566, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9849514961242676, |
| "rewards/acc_reward/std": 0.023492755368351936, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 436.0, |
| "completions/mean_length": 179.328125, |
| "completions/mean_terminated_length": 174.04762268066406, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.34375, |
| "epoch": 0.2184557438794727, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.8667036294937134, |
| "learning_rate": 7.83427495291902e-07, |
| "loss": -0.0178, |
| "num_tokens": 15537066.0, |
| "reward": 0.9839749336242676, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9839304685592651, |
| "rewards/acc_reward/std": 0.025710513815283775, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 151.453125, |
| "completions/mean_terminated_length": 151.453125, |
| "completions/min_length": 37.0, |
| "completions/min_terminated_length": 37.0, |
| "entropy": 2.34375, |
| "epoch": 0.22033898305084745, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.815442561205274e-07, |
| "loss": 0.0, |
| "num_tokens": 15661063.0, |
| "reward": 0.9836729168891907, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9818587899208069, |
| "rewards/acc_reward/std": 0.03338773176074028, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 448.0, |
| "completions/max_terminated_length": 448.0, |
| "completions/mean_length": 173.40625, |
| "completions/mean_terminated_length": 173.40625, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 2.53125, |
| "epoch": 0.2222222222222222, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.796610169491526e-07, |
| "loss": 0.0, |
| "num_tokens": 15794785.0, |
| "reward": 0.970079779624939, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9667552709579468, |
| "rewards/acc_reward/std": 0.03764721751213074, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 353.0, |
| "completions/max_terminated_length": 353.0, |
| "completions/mean_length": 156.765625, |
| "completions/mean_terminated_length": 156.765625, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "entropy": 2.171875, |
| "epoch": 0.224105461393597, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.2777234315872192, |
| "learning_rate": 7.777777777777778e-07, |
| "loss": 0.0405, |
| "num_tokens": 15925402.0, |
| "reward": 0.9777387380599976, |
| "reward_std": 0.013079374097287655, |
| "rewards/acc_reward/mean": 0.9752652645111084, |
| "rewards/acc_reward/std": 0.04798175394535065, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 408.0, |
| "completions/mean_length": 203.3125, |
| "completions/mean_terminated_length": 198.4127197265625, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.65625, |
| "epoch": 0.22598870056497175, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.75894538606403e-07, |
| "loss": 0.0, |
| "num_tokens": 16073038.0, |
| "reward": 0.9984081983566284, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9982313513755798, |
| "rewards/acc_reward/std": 0.0047163767740130424, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 458.0, |
| "completions/mean_length": 204.984375, |
| "completions/mean_terminated_length": 195.08062744140625, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 2.59375, |
| "epoch": 0.2278719397363465, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.740112994350282e-07, |
| "loss": 0.0, |
| "num_tokens": 16218901.0, |
| "reward": 0.9989771842956543, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9988635778427124, |
| "rewards/acc_reward/std": 0.0020075358916074038, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 404.0, |
| "completions/max_terminated_length": 404.0, |
| "completions/mean_length": 174.4375, |
| "completions/mean_terminated_length": 174.4375, |
| "completions/min_length": 50.0, |
| "completions/min_terminated_length": 50.0, |
| "entropy": 2.328125, |
| "epoch": 0.2297551789077213, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.721280602636534e-07, |
| "loss": 0.0, |
| "num_tokens": 16352369.0, |
| "reward": 0.988227367401123, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9869192838668823, |
| "rewards/acc_reward/std": 0.026611221954226494, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 341.0, |
| "completions/max_terminated_length": 341.0, |
| "completions/mean_length": 167.890625, |
| "completions/mean_terminated_length": 167.890625, |
| "completions/min_length": 53.0, |
| "completions/min_terminated_length": 53.0, |
| "entropy": 2.390625, |
| "epoch": 0.23163841807909605, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.702448210922787e-07, |
| "loss": 0.0, |
| "num_tokens": 16482378.0, |
| "reward": 0.9788169860839844, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9764633178710938, |
| "rewards/acc_reward/std": 0.03820019215345383, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 321.0, |
| "completions/max_terminated_length": 321.0, |
| "completions/mean_length": 178.171875, |
| "completions/mean_terminated_length": 178.171875, |
| "completions/min_length": 49.0, |
| "completions/min_terminated_length": 49.0, |
| "entropy": 2.125, |
| "epoch": 0.2335216572504708, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.68361581920904e-07, |
| "loss": 0.0, |
| "num_tokens": 16619925.0, |
| "reward": 0.9605213403701782, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9561347961425781, |
| "rewards/acc_reward/std": 0.08166956156492233, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 320.0, |
| "completions/mean_length": 173.6875, |
| "completions/mean_terminated_length": 168.3174591064453, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "entropy": 2.390625, |
| "epoch": 0.23540489642184556, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.9748631119728088, |
| "learning_rate": 7.664783427495292e-07, |
| "loss": 0.0211, |
| "num_tokens": 16749945.0, |
| "reward": 0.9801042079925537, |
| "reward_std": 0.039774756878614426, |
| "rewards/acc_reward/mean": 0.9778935313224792, |
| "rewards/acc_reward/std": 0.12537133693695068, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 406.0, |
| "completions/mean_length": 178.25, |
| "completions/mean_terminated_length": 167.48387145996094, |
| "completions/min_length": 39.0, |
| "completions/min_terminated_length": 39.0, |
| "entropy": 2.5625, |
| "epoch": 0.23728813559322035, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.645951035781544e-07, |
| "loss": 0.0, |
| "num_tokens": 16879657.0, |
| "reward": 0.9902485609054565, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9891650676727295, |
| "rewards/acc_reward/std": 0.015085420571267605, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 162.15625, |
| "completions/mean_terminated_length": 162.15625, |
| "completions/min_length": 46.0, |
| "completions/min_terminated_length": 46.0, |
| "entropy": 2.328125, |
| "epoch": 0.2391713747645951, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.627118644067796e-07, |
| "loss": 0.0, |
| "num_tokens": 17009299.0, |
| "reward": 0.9965801239013672, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9962000846862793, |
| "rewards/acc_reward/std": 0.010133087635040283, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 343.0, |
| "completions/max_terminated_length": 343.0, |
| "completions/mean_length": 160.796875, |
| "completions/mean_terminated_length": 160.796875, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 2.28125, |
| "epoch": 0.24105461393596986, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.608286252354048e-07, |
| "loss": 0.0, |
| "num_tokens": 17146406.0, |
| "reward": 0.9833929538726807, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9815477728843689, |
| "rewards/acc_reward/std": 0.024218887090682983, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 436.0, |
| "completions/max_terminated_length": 436.0, |
| "completions/mean_length": 175.1875, |
| "completions/mean_terminated_length": 175.1875, |
| "completions/min_length": 56.0, |
| "completions/min_terminated_length": 56.0, |
| "entropy": 2.484375, |
| "epoch": 0.24293785310734464, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.589453860640301e-07, |
| "loss": 0.0, |
| "num_tokens": 17274242.0, |
| "reward": 0.9791369438171387, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9768187999725342, |
| "rewards/acc_reward/std": 0.03209559619426727, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 441.0, |
| "completions/mean_length": 174.9375, |
| "completions/mean_terminated_length": 169.58731079101562, |
| "completions/min_length": 72.0, |
| "completions/min_terminated_length": 72.0, |
| "entropy": 2.359375, |
| "epoch": 0.2448210922787194, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.570621468926553e-07, |
| "loss": 0.0, |
| "num_tokens": 17409982.0, |
| "reward": 0.9768315553665161, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9742573499679565, |
| "rewards/acc_reward/std": 0.05567716062068939, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 394.0, |
| "completions/mean_length": 186.6875, |
| "completions/mean_terminated_length": 181.52381896972656, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "entropy": 2.421875, |
| "epoch": 0.24670433145009416, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.551789077212806e-07, |
| "loss": 0.0, |
| "num_tokens": 17545354.0, |
| "reward": 0.9810404777526855, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9789338707923889, |
| "rewards/acc_reward/std": 0.03438215330243111, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 483.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 155.46875, |
| "completions/mean_terminated_length": 155.46875, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.234375, |
| "epoch": 0.24858757062146894, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.42140984535217285, |
| "learning_rate": 7.532956685499058e-07, |
| "loss": 0.0881, |
| "num_tokens": 17679648.0, |
| "reward": 0.9783517122268677, |
| "reward_std": 0.039774756878614426, |
| "rewards/acc_reward/mean": 0.975946307182312, |
| "rewards/acc_reward/std": 0.12442652136087418, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 370.0, |
| "completions/mean_length": 180.71875, |
| "completions/mean_terminated_length": 170.03225708007812, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 2.5625, |
| "epoch": 0.2504708097928437, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.51412429378531e-07, |
| "loss": 0.0, |
| "num_tokens": 17811718.0, |
| "reward": 0.9774539470672607, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9749488830566406, |
| "rewards/acc_reward/std": 0.04454605653882027, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 498.0, |
| "completions/mean_length": 177.40625, |
| "completions/mean_terminated_length": 172.09524536132812, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 2.21875, |
| "epoch": 0.2523540489642185, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.495291902071563e-07, |
| "loss": 0.0, |
| "num_tokens": 17944256.0, |
| "reward": 0.9872071743011475, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.985785722732544, |
| "rewards/acc_reward/std": 0.01633262448012829, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 388.0, |
| "completions/max_terminated_length": 388.0, |
| "completions/mean_length": 167.875, |
| "completions/mean_terminated_length": 167.875, |
| "completions/min_length": 75.0, |
| "completions/min_terminated_length": 75.0, |
| "entropy": 2.375, |
| "epoch": 0.2542372881355932, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.476459510357815e-07, |
| "loss": 0.0, |
| "num_tokens": 18074344.0, |
| "reward": 0.996717095375061, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9963523149490356, |
| "rewards/acc_reward/std": 0.009727060794830322, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 341.0, |
| "completions/max_terminated_length": 341.0, |
| "completions/mean_length": 168.921875, |
| "completions/mean_terminated_length": 168.921875, |
| "completions/min_length": 71.0, |
| "completions/min_terminated_length": 71.0, |
| "entropy": 2.3125, |
| "epoch": 0.256120527306968, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.457627118644067e-07, |
| "loss": 0.0, |
| "num_tokens": 18202499.0, |
| "reward": 0.9629114866256714, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9587906002998352, |
| "rewards/acc_reward/std": 0.05813661590218544, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 413.0, |
| "completions/max_terminated_length": 413.0, |
| "completions/mean_length": 152.171875, |
| "completions/mean_terminated_length": 152.171875, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.234375, |
| "epoch": 0.2580037664783427, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.438794726930319e-07, |
| "loss": 0.0, |
| "num_tokens": 18325262.0, |
| "reward": 0.9783220887184143, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.975913405418396, |
| "rewards/acc_reward/std": 0.03493288531899452, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 468.0, |
| "completions/max_terminated_length": 468.0, |
| "completions/mean_length": 182.3125, |
| "completions/mean_terminated_length": 182.3125, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.421875, |
| "epoch": 0.2598870056497175, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.419962335216571e-07, |
| "loss": 0.0, |
| "num_tokens": 18463394.0, |
| "reward": 0.9732788801193237, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9703099131584167, |
| "rewards/acc_reward/std": 0.02722685970366001, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 312.0, |
| "completions/max_terminated_length": 312.0, |
| "completions/mean_length": 137.0, |
| "completions/mean_terminated_length": 137.0, |
| "completions/min_length": 47.0, |
| "completions/min_terminated_length": 47.0, |
| "entropy": 2.171875, |
| "epoch": 0.2617702448210923, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.0246717929840088, |
| "learning_rate": 7.401129943502824e-07, |
| "loss": 0.0013, |
| "num_tokens": 18588066.0, |
| "reward": 0.9853101968765259, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9854141473770142, |
| "rewards/acc_reward/std": 0.01550329476594925, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 462.0, |
| "completions/mean_length": 180.15625, |
| "completions/mean_terminated_length": 174.88890075683594, |
| "completions/min_length": 74.0, |
| "completions/min_terminated_length": 74.0, |
| "entropy": 2.4375, |
| "epoch": 0.263653483992467, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.382297551789078e-07, |
| "loss": 0.0, |
| "num_tokens": 18720780.0, |
| "reward": 0.973831057548523, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9709233641624451, |
| "rewards/acc_reward/std": 0.047117821872234344, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 451.0, |
| "completions/mean_length": 190.109375, |
| "completions/mean_terminated_length": 179.72579956054688, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.40625, |
| "epoch": 0.2655367231638418, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.36346516007533e-07, |
| "loss": 0.0, |
| "num_tokens": 18857011.0, |
| "reward": 0.9712393283843994, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9680436849594116, |
| "rewards/acc_reward/std": 0.04809395968914032, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 410.0, |
| "completions/max_terminated_length": 410.0, |
| "completions/mean_length": 160.671875, |
| "completions/mean_terminated_length": 160.671875, |
| "completions/min_length": 72.0, |
| "completions/min_terminated_length": 72.0, |
| "entropy": 2.140625, |
| "epoch": 0.2674199623352166, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.3321737051010132, |
| "learning_rate": 7.344632768361582e-07, |
| "loss": -0.0087, |
| "num_tokens": 18989918.0, |
| "reward": 0.9906257390975952, |
| "reward_std": 0.008838837035000324, |
| "rewards/acc_reward/mean": 0.9930564165115356, |
| "rewards/acc_reward/std": 0.010120646096765995, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 352.0, |
| "completions/max_terminated_length": 352.0, |
| "completions/mean_length": 147.78125, |
| "completions/mean_terminated_length": 147.78125, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.25, |
| "epoch": 0.2693032015065913, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.1365125179290771, |
| "learning_rate": 7.325800376647834e-07, |
| "loss": 0.0151, |
| "num_tokens": 19115760.0, |
| "reward": 0.9455662965774536, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9412541389465332, |
| "rewards/acc_reward/std": 0.09196340292692184, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 335.0, |
| "completions/max_terminated_length": 335.0, |
| "completions/mean_length": 161.796875, |
| "completions/mean_terminated_length": 161.796875, |
| "completions/min_length": 53.0, |
| "completions/min_terminated_length": 53.0, |
| "entropy": 2.234375, |
| "epoch": 0.2711864406779661, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.306967984934086e-07, |
| "loss": 0.0, |
| "num_tokens": 19244779.0, |
| "reward": 0.9764645099639893, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9738494753837585, |
| "rewards/acc_reward/std": 0.043730027973651886, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 354.0, |
| "completions/mean_length": 164.109375, |
| "completions/mean_terminated_length": 158.58731079101562, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.28125, |
| "epoch": 0.2730696798493409, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.4404025077819824, |
| "learning_rate": 7.288135593220338e-07, |
| "loss": 0.0261, |
| "num_tokens": 19387506.0, |
| "reward": 0.9824453592300415, |
| "reward_std": 0.017121607437729836, |
| "rewards/acc_reward/mean": 0.9822309613227844, |
| "rewards/acc_reward/std": 0.04615851864218712, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 322.0, |
| "completions/max_terminated_length": 322.0, |
| "completions/mean_length": 161.578125, |
| "completions/mean_terminated_length": 161.578125, |
| "completions/min_length": 83.0, |
| "completions/min_terminated_length": 83.0, |
| "entropy": 2.21875, |
| "epoch": 0.2749529190207156, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.269303201506592e-07, |
| "loss": 0.0, |
| "num_tokens": 19526271.0, |
| "reward": 0.9767446517944336, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9741606712341309, |
| "rewards/acc_reward/std": 0.02946525067090988, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 299.0, |
| "completions/max_terminated_length": 299.0, |
| "completions/mean_length": 162.734375, |
| "completions/mean_terminated_length": 162.734375, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 2.296875, |
| "epoch": 0.2768361581920904, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.250470809792844e-07, |
| "loss": 0.0, |
| "num_tokens": 19656430.0, |
| "reward": 0.9855356216430664, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9839285016059875, |
| "rewards/acc_reward/std": 0.03281255066394806, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 328.0, |
| "completions/max_terminated_length": 328.0, |
| "completions/mean_length": 152.265625, |
| "completions/mean_terminated_length": 152.265625, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.328125, |
| "epoch": 0.2787193973634652, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.231638418079096e-07, |
| "loss": 0.0, |
| "num_tokens": 19791039.0, |
| "reward": 0.9486349821090698, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9429277777671814, |
| "rewards/acc_reward/std": 0.0684126690030098, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 335.0, |
| "completions/max_terminated_length": 335.0, |
| "completions/mean_length": 138.96875, |
| "completions/mean_terminated_length": 138.96875, |
| "completions/min_length": 44.0, |
| "completions/min_terminated_length": 44.0, |
| "entropy": 2.046875, |
| "epoch": 0.2806026365348399, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.212806026365348e-07, |
| "loss": 0.0, |
| "num_tokens": 19920157.0, |
| "reward": 0.9899982213973999, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9888868927955627, |
| "rewards/acc_reward/std": 0.014463546685874462, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 420.0, |
| "completions/mean_length": 171.4375, |
| "completions/mean_terminated_length": 166.03173828125, |
| "completions/min_length": 54.0, |
| "completions/min_terminated_length": 54.0, |
| "entropy": 2.1875, |
| "epoch": 0.2824858757062147, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.1939736346516e-07, |
| "loss": 0.0, |
| "num_tokens": 20054233.0, |
| "reward": 0.9820305109024048, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9800339341163635, |
| "rewards/acc_reward/std": 0.03505489602684975, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 353.0, |
| "completions/max_terminated_length": 353.0, |
| "completions/mean_length": 154.09375, |
| "completions/mean_terminated_length": 154.09375, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 2.265625, |
| "epoch": 0.2843691148775895, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.175141242937853e-07, |
| "loss": 0.0, |
| "num_tokens": 20177839.0, |
| "reward": 0.9695570468902588, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9661744832992554, |
| "rewards/acc_reward/std": 0.051941804587841034, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 317.0, |
| "completions/max_terminated_length": 317.0, |
| "completions/mean_length": 159.078125, |
| "completions/mean_terminated_length": 159.078125, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.21875, |
| "epoch": 0.2862523540489642, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.135265588760376, |
| "learning_rate": 7.156308851224105e-07, |
| "loss": 0.0085, |
| "num_tokens": 20313964.0, |
| "reward": 0.9910129308700562, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9917504787445068, |
| "rewards/acc_reward/std": 0.015448656864464283, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 152 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 384.0, |
| "completions/max_terminated_length": 384.0, |
| "completions/mean_length": 176.328125, |
| "completions/mean_terminated_length": 176.328125, |
| "completions/min_length": 44.0, |
| "completions/min_terminated_length": 44.0, |
| "entropy": 2.546875, |
| "epoch": 0.288135593220339, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.137476459510358e-07, |
| "loss": 0.0, |
| "num_tokens": 20454113.0, |
| "reward": 0.9844459295272827, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9827176928520203, |
| "rewards/acc_reward/std": 0.018290938809514046, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 153 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 447.0, |
| "completions/max_terminated_length": 447.0, |
| "completions/mean_length": 154.140625, |
| "completions/mean_terminated_length": 154.140625, |
| "completions/min_length": 46.0, |
| "completions/min_terminated_length": 46.0, |
| "entropy": 2.265625, |
| "epoch": 0.2900188323917137, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.11864406779661e-07, |
| "loss": 0.0, |
| "num_tokens": 20591402.0, |
| "reward": 0.9614624977111816, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9571805596351624, |
| "rewards/acc_reward/std": 0.06432777643203735, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 323.0, |
| "completions/max_terminated_length": 323.0, |
| "completions/mean_length": 151.9375, |
| "completions/mean_terminated_length": 151.9375, |
| "completions/min_length": 29.0, |
| "completions/min_terminated_length": 29.0, |
| "entropy": 2.1875, |
| "epoch": 0.2919020715630885, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.099811676082862e-07, |
| "loss": 0.0, |
| "num_tokens": 20714630.0, |
| "reward": 0.9820876717567444, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9800974130630493, |
| "rewards/acc_reward/std": 0.037367500364780426, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 436.0, |
| "completions/mean_length": 173.15625, |
| "completions/mean_terminated_length": 167.77780151367188, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 2.296875, |
| "epoch": 0.2937853107344633, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.080979284369114e-07, |
| "loss": 0.0, |
| "num_tokens": 20853136.0, |
| "reward": 0.9967447519302368, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9963830709457397, |
| "rewards/acc_reward/std": 0.009645064361393452, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 156 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 382.0, |
| "completions/max_terminated_length": 382.0, |
| "completions/mean_length": 158.5, |
| "completions/mean_terminated_length": 158.5, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "entropy": 2.21875, |
| "epoch": 0.295668549905838, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.062146892655367e-07, |
| "loss": 0.0, |
| "num_tokens": 20979664.0, |
| "reward": 0.9870873689651489, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9856526851654053, |
| "rewards/acc_reward/std": 0.020136423408985138, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 383.0, |
| "completions/max_terminated_length": 383.0, |
| "completions/mean_length": 164.1875, |
| "completions/mean_terminated_length": 164.1875, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.234375, |
| "epoch": 0.2975517890772128, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.043314500941619e-07, |
| "loss": 0.0, |
| "num_tokens": 21116156.0, |
| "reward": 0.9893269538879395, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9881410598754883, |
| "rewards/acc_reward/std": 0.023672664538025856, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 404.0, |
| "completions/mean_length": 177.203125, |
| "completions/mean_terminated_length": 166.40322875976562, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.328125, |
| "epoch": 0.2994350282485876, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.024482109227871e-07, |
| "loss": 0.0, |
| "num_tokens": 21245321.0, |
| "reward": 0.9947940111160278, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9942155480384827, |
| "rewards/acc_reward/std": 0.0101578738540411, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 159 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 427.0, |
| "completions/max_terminated_length": 427.0, |
| "completions/mean_length": 167.0625, |
| "completions/mean_terminated_length": 167.0625, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.1875, |
| "epoch": 0.3013182674199623, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.005649717514124e-07, |
| "loss": 0.0, |
| "num_tokens": 21376237.0, |
| "reward": 0.9860028624534607, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9844475984573364, |
| "rewards/acc_reward/std": 0.019196392968297005, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 338.0, |
| "completions/mean_length": 169.265625, |
| "completions/mean_terminated_length": 163.82540893554688, |
| "completions/min_length": 34.0, |
| "completions/min_terminated_length": 34.0, |
| "entropy": 2.359375, |
| "epoch": 0.3032015065913371, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.986817325800376e-07, |
| "loss": 0.0, |
| "num_tokens": 21516414.0, |
| "reward": 0.9679353833198547, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9643726348876953, |
| "rewards/acc_reward/std": 0.05980806425213814, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 161 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 348.0, |
| "completions/max_terminated_length": 348.0, |
| "completions/mean_length": 153.296875, |
| "completions/mean_terminated_length": 153.296875, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.09375, |
| "epoch": 0.3050847457627119, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.967984934086629e-07, |
| "loss": 0.0, |
| "num_tokens": 21642289.0, |
| "reward": 0.9967857003211975, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9964286088943481, |
| "rewards/acc_reward/std": 0.009523809887468815, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 162 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 327.0, |
| "completions/max_terminated_length": 327.0, |
| "completions/mean_length": 164.515625, |
| "completions/mean_terminated_length": 164.515625, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "entropy": 2.171875, |
| "epoch": 0.3069679849340866, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.949152542372881e-07, |
| "loss": 0.0, |
| "num_tokens": 21782162.0, |
| "reward": 0.9368254542350769, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.929806113243103, |
| "rewards/acc_reward/std": 0.0714760348200798, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 387.0, |
| "completions/max_terminated_length": 387.0, |
| "completions/mean_length": 168.53125, |
| "completions/mean_terminated_length": 168.53125, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 2.234375, |
| "epoch": 0.3088512241054614, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.930320150659133e-07, |
| "loss": 0.0, |
| "num_tokens": 21911732.0, |
| "reward": 0.9614138603210449, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9571264982223511, |
| "rewards/acc_reward/std": 0.08462820202112198, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 389.0, |
| "completions/max_terminated_length": 389.0, |
| "completions/mean_length": 172.609375, |
| "completions/mean_terminated_length": 172.609375, |
| "completions/min_length": 46.0, |
| "completions/min_terminated_length": 46.0, |
| "entropy": 2.015625, |
| "epoch": 0.3107344632768362, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.911487758945385e-07, |
| "loss": 0.0, |
| "num_tokens": 22043643.0, |
| "reward": 0.9874755144119263, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9860839247703552, |
| "rewards/acc_reward/std": 0.02431458979845047, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 468.0, |
| "completions/max_terminated_length": 468.0, |
| "completions/mean_length": 156.015625, |
| "completions/mean_terminated_length": 156.015625, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 2.234375, |
| "epoch": 0.3126177024482109, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.892655367231638e-07, |
| "loss": 0.0, |
| "num_tokens": 22167892.0, |
| "reward": 0.9494754076004028, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9438616037368774, |
| "rewards/acc_reward/std": 0.05841909348964691, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 285.0, |
| "completions/max_terminated_length": 285.0, |
| "completions/mean_length": 144.0, |
| "completions/mean_terminated_length": 144.0, |
| "completions/min_length": 51.0, |
| "completions/min_terminated_length": 51.0, |
| "entropy": 1.8828125, |
| "epoch": 0.3145009416195857, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.87382297551789e-07, |
| "loss": 0.0, |
| "num_tokens": 22302132.0, |
| "reward": 0.9812496900558472, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9791662693023682, |
| "rewards/acc_reward/std": 0.023800579831004143, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 167 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 309.0, |
| "completions/mean_length": 163.0, |
| "completions/mean_terminated_length": 157.4603271484375, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 2.203125, |
| "epoch": 0.3163841807909605, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.854990583804144e-07, |
| "loss": 0.0, |
| "num_tokens": 22428188.0, |
| "reward": 0.9839420318603516, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9821577668190002, |
| "rewards/acc_reward/std": 0.03933866694569588, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 388.0, |
| "completions/max_terminated_length": 388.0, |
| "completions/mean_length": 160.859375, |
| "completions/mean_terminated_length": 160.859375, |
| "completions/min_length": 49.0, |
| "completions/min_terminated_length": 49.0, |
| "entropy": 2.28125, |
| "epoch": 0.3182674199623352, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.836158192090396e-07, |
| "loss": 0.0, |
| "num_tokens": 22559187.0, |
| "reward": 0.9785705804824829, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9761895537376404, |
| "rewards/acc_reward/std": 0.0258196871727705, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 169 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 296.0, |
| "completions/max_terminated_length": 296.0, |
| "completions/mean_length": 162.125, |
| "completions/mean_terminated_length": 162.125, |
| "completions/min_length": 72.0, |
| "completions/min_terminated_length": 72.0, |
| "entropy": 2.015625, |
| "epoch": 0.32015065913371, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.817325800376648e-07, |
| "loss": 0.0, |
| "num_tokens": 22686427.0, |
| "reward": 0.9450536966323853, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9389485120773315, |
| "rewards/acc_reward/std": 0.07760415226221085, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 280.0, |
| "completions/max_terminated_length": 280.0, |
| "completions/mean_length": 136.40625, |
| "completions/mean_terminated_length": 136.40625, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "entropy": 1.953125, |
| "epoch": 0.3220338983050847, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.7984934086629e-07, |
| "loss": 0.0, |
| "num_tokens": 22812501.0, |
| "reward": 0.9613984823226929, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9571093916893005, |
| "rewards/acc_reward/std": 0.07542510330677032, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 316.0, |
| "completions/max_terminated_length": 316.0, |
| "completions/mean_length": 157.84375, |
| "completions/mean_terminated_length": 157.84375, |
| "completions/min_length": 34.0, |
| "completions/min_terminated_length": 34.0, |
| "entropy": 2.046875, |
| "epoch": 0.3239171374764595, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.779661016949152e-07, |
| "loss": 0.0, |
| "num_tokens": 22934187.0, |
| "reward": 0.9912809729576111, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9903122186660767, |
| "rewards/acc_reward/std": 0.020957766100764275, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 172 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 389.0, |
| "completions/max_terminated_length": 389.0, |
| "completions/mean_length": 176.359375, |
| "completions/mean_terminated_length": 176.359375, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.328125, |
| "epoch": 0.3258003766478343, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.760828625235404e-07, |
| "loss": 0.0, |
| "num_tokens": 23070498.0, |
| "reward": 0.9652288556098938, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9613653421401978, |
| "rewards/acc_reward/std": 0.06810762733221054, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 173 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 464.0, |
| "completions/mean_length": 196.28125, |
| "completions/mean_terminated_length": 191.2698516845703, |
| "completions/min_length": 77.0, |
| "completions/min_terminated_length": 77.0, |
| "entropy": 2.171875, |
| "epoch": 0.327683615819209, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.741996233521658e-07, |
| "loss": 0.0, |
| "num_tokens": 23202324.0, |
| "reward": 0.9608478546142578, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9564976692199707, |
| "rewards/acc_reward/std": 0.06687356531620026, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 296.0, |
| "completions/mean_length": 150.796875, |
| "completions/mean_terminated_length": 145.06350708007812, |
| "completions/min_length": 40.0, |
| "completions/min_terminated_length": 40.0, |
| "entropy": 1.9609375, |
| "epoch": 0.3295668549905838, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.72316384180791e-07, |
| "loss": 0.0, |
| "num_tokens": 23333319.0, |
| "reward": 0.9884651899337769, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9871835112571716, |
| "rewards/acc_reward/std": 0.026243234053254128, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 420.0, |
| "completions/max_terminated_length": 420.0, |
| "completions/mean_length": 165.828125, |
| "completions/mean_terminated_length": 165.828125, |
| "completions/min_length": 66.0, |
| "completions/min_terminated_length": 66.0, |
| "entropy": 2.28125, |
| "epoch": 0.3314500941619586, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.704331450094162e-07, |
| "loss": 0.0, |
| "num_tokens": 23460796.0, |
| "reward": 0.9813232421875, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.979248046875, |
| "rewards/acc_reward/std": 0.04185057431459427, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 176 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 384.0, |
| "completions/max_terminated_length": 384.0, |
| "completions/mean_length": 146.734375, |
| "completions/mean_terminated_length": 146.734375, |
| "completions/min_length": 50.0, |
| "completions/min_terminated_length": 50.0, |
| "entropy": 2.09375, |
| "epoch": 0.3333333333333333, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.685499058380414e-07, |
| "loss": 0.0, |
| "num_tokens": 23584811.0, |
| "reward": 0.9937499761581421, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9930555820465088, |
| "rewards/acc_reward/std": 0.01851852796971798, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 177 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 436.0, |
| "completions/max_terminated_length": 436.0, |
| "completions/mean_length": 161.828125, |
| "completions/mean_terminated_length": 161.828125, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 2.34375, |
| "epoch": 0.3352165725047081, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.0041779279708862, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0029, |
| "num_tokens": 23723072.0, |
| "reward": 0.9907628893852234, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9914726614952087, |
| "rewards/acc_reward/std": 0.014887169934809208, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 447.0, |
| "completions/max_terminated_length": 447.0, |
| "completions/mean_length": 170.140625, |
| "completions/mean_terminated_length": 170.140625, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 2.296875, |
| "epoch": 0.3370998116760829, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.647834274952919e-07, |
| "loss": 0.0, |
| "num_tokens": 23852745.0, |
| "reward": 0.9886301755905151, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9873669147491455, |
| "rewards/acc_reward/std": 0.02594558708369732, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 179 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 399.0, |
| "completions/max_terminated_length": 399.0, |
| "completions/mean_length": 159.09375, |
| "completions/mean_terminated_length": 159.09375, |
| "completions/min_length": 65.0, |
| "completions/min_terminated_length": 65.0, |
| "entropy": 2.203125, |
| "epoch": 0.3389830508474576, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.629001883239171e-07, |
| "loss": 0.0, |
| "num_tokens": 23976431.0, |
| "reward": 0.9778439402580261, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9753821492195129, |
| "rewards/acc_reward/std": 0.02938609942793846, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 366.0, |
| "completions/mean_length": 155.328125, |
| "completions/mean_terminated_length": 149.66668701171875, |
| "completions/min_length": 49.0, |
| "completions/min_terminated_length": 49.0, |
| "entropy": 2.265625, |
| "epoch": 0.3408662900188324, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.610169491525423e-07, |
| "loss": 0.0, |
| "num_tokens": 24103236.0, |
| "reward": 0.9857558012008667, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9841731190681458, |
| "rewards/acc_reward/std": 0.02304108813405037, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 181 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 379.0, |
| "completions/max_terminated_length": 379.0, |
| "completions/mean_length": 157.25, |
| "completions/mean_terminated_length": 157.25, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "entropy": 1.921875, |
| "epoch": 0.3427495291902072, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.591337099811676e-07, |
| "loss": 0.0, |
| "num_tokens": 24240244.0, |
| "reward": 0.9847475290298462, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9830528497695923, |
| "rewards/acc_reward/std": 0.018334772437810898, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 353.0, |
| "completions/max_terminated_length": 353.0, |
| "completions/mean_length": 153.484375, |
| "completions/mean_terminated_length": 153.484375, |
| "completions/min_length": 45.0, |
| "completions/min_terminated_length": 45.0, |
| "entropy": 2.09375, |
| "epoch": 0.3446327683615819, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.572504708097928e-07, |
| "loss": 0.0, |
| "num_tokens": 24373251.0, |
| "reward": 0.9889024496078491, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9876693487167358, |
| "rewards/acc_reward/std": 0.014938557520508766, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 183 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 365.0, |
| "completions/mean_length": 173.65625, |
| "completions/mean_terminated_length": 168.2857208251953, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 2.34375, |
| "epoch": 0.3465160075329567, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.55367231638418e-07, |
| "loss": 0.0, |
| "num_tokens": 24507229.0, |
| "reward": 0.9813482165336609, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9792758226394653, |
| "rewards/acc_reward/std": 0.03865106776356697, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 350.0, |
| "completions/max_terminated_length": 350.0, |
| "completions/mean_length": 165.078125, |
| "completions/mean_terminated_length": 165.078125, |
| "completions/min_length": 63.0, |
| "completions/min_terminated_length": 63.0, |
| "entropy": 2.203125, |
| "epoch": 0.3483992467043315, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.534839924670433e-07, |
| "loss": 0.0, |
| "num_tokens": 24646554.0, |
| "reward": 0.9556671380996704, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9507412910461426, |
| "rewards/acc_reward/std": 0.07888054847717285, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 285.0, |
| "completions/max_terminated_length": 285.0, |
| "completions/mean_length": 154.4375, |
| "completions/mean_terminated_length": 154.4375, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 1.984375, |
| "epoch": 0.3502824858757062, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.516007532956685e-07, |
| "loss": 0.0, |
| "num_tokens": 24785782.0, |
| "reward": 0.9671032428741455, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9634479880332947, |
| "rewards/acc_reward/std": 0.06446754932403564, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 186 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 309.0, |
| "completions/max_terminated_length": 309.0, |
| "completions/mean_length": 140.21875, |
| "completions/mean_terminated_length": 140.21875, |
| "completions/min_length": 49.0, |
| "completions/min_terminated_length": 49.0, |
| "entropy": 2.09375, |
| "epoch": 0.352165725047081, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.060726284980774, |
| "learning_rate": 6.497175141242937e-07, |
| "loss": 0.0088, |
| "num_tokens": 24903940.0, |
| "reward": 0.932281494140625, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9264933466911316, |
| "rewards/acc_reward/std": 0.08563832193613052, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 460.0, |
| "completions/mean_length": 150.15625, |
| "completions/mean_terminated_length": 144.4127197265625, |
| "completions/min_length": 60.0, |
| "completions/min_terminated_length": 60.0, |
| "entropy": 2.015625, |
| "epoch": 0.3540489642184557, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.478342749529189e-07, |
| "loss": 0.0, |
| "num_tokens": 25035214.0, |
| "reward": 0.9913280010223389, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9903644323348999, |
| "rewards/acc_reward/std": 0.01791100949048996, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 188 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 493.0, |
| "completions/max_terminated_length": 493.0, |
| "completions/mean_length": 156.28125, |
| "completions/mean_terminated_length": 156.28125, |
| "completions/min_length": 74.0, |
| "completions/min_terminated_length": 74.0, |
| "entropy": 2.109375, |
| "epoch": 0.3559322033898305, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.459510357815442e-07, |
| "loss": 0.0, |
| "num_tokens": 25168800.0, |
| "reward": 0.9506161212921143, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9451290369033813, |
| "rewards/acc_reward/std": 0.07419686019420624, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 189 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 319.0, |
| "completions/max_terminated_length": 319.0, |
| "completions/mean_length": 138.859375, |
| "completions/mean_terminated_length": 138.859375, |
| "completions/min_length": 58.0, |
| "completions/min_terminated_length": 58.0, |
| "entropy": 1.8359375, |
| "epoch": 0.3578154425612053, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.440677966101694e-07, |
| "loss": 0.0, |
| "num_tokens": 25302271.0, |
| "reward": 0.9889830350875854, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9877589344978333, |
| "rewards/acc_reward/std": 0.013018240220844746, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 250.0, |
| "completions/max_terminated_length": 250.0, |
| "completions/mean_length": 143.65625, |
| "completions/mean_terminated_length": 143.65625, |
| "completions/min_length": 70.0, |
| "completions/min_terminated_length": 70.0, |
| "entropy": 2.203125, |
| "epoch": 0.35969868173258, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.421845574387948e-07, |
| "loss": 0.0, |
| "num_tokens": 25422569.0, |
| "reward": 0.9856148958206177, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9840165376663208, |
| "rewards/acc_reward/std": 0.020176060497760773, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 493.0, |
| "completions/mean_length": 172.125, |
| "completions/mean_terminated_length": 166.73016357421875, |
| "completions/min_length": 61.0, |
| "completions/min_terminated_length": 61.0, |
| "entropy": 2.015625, |
| "epoch": 0.3615819209039548, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.4030131826742e-07, |
| "loss": 0.0, |
| "num_tokens": 25560809.0, |
| "reward": 0.9829681515693665, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9810757637023926, |
| "rewards/acc_reward/std": 0.0207473486661911, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 192 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 359.0, |
| "completions/max_terminated_length": 359.0, |
| "completions/mean_length": 150.859375, |
| "completions/mean_terminated_length": 150.859375, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 1.9453125, |
| "epoch": 0.3634651600753296, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.384180790960452e-07, |
| "loss": 0.0, |
| "num_tokens": 25691888.0, |
| "reward": 0.959905743598938, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9554507732391357, |
| "rewards/acc_reward/std": 0.07276061922311783, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 329.0, |
| "completions/max_terminated_length": 329.0, |
| "completions/mean_length": 164.3125, |
| "completions/mean_terminated_length": 164.3125, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "entropy": 2.171875, |
| "epoch": 0.3653483992467043, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.365348399246704e-07, |
| "loss": 0.0, |
| "num_tokens": 25819652.0, |
| "reward": 0.9704650044441223, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9671833515167236, |
| "rewards/acc_reward/std": 0.04358522593975067, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 194 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 288.0, |
| "completions/max_terminated_length": 288.0, |
| "completions/mean_length": 145.09375, |
| "completions/mean_terminated_length": 145.09375, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 2.15625, |
| "epoch": 0.3672316384180791, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.346516007532956e-07, |
| "loss": 0.0, |
| "num_tokens": 25950602.0, |
| "reward": 0.9926788806915283, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9918653964996338, |
| "rewards/acc_reward/std": 0.011418163776397705, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 195 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 405.0, |
| "completions/max_terminated_length": 405.0, |
| "completions/mean_length": 138.3125, |
| "completions/mean_terminated_length": 138.3125, |
| "completions/min_length": 69.0, |
| "completions/min_terminated_length": 69.0, |
| "entropy": 1.953125, |
| "epoch": 0.3691148775894539, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.32768361581921e-07, |
| "loss": 0.0, |
| "num_tokens": 26070558.0, |
| "reward": 0.9680017232894897, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9644463658332825, |
| "rewards/acc_reward/std": 0.051819488406181335, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 196 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 368.0, |
| "completions/max_terminated_length": 368.0, |
| "completions/mean_length": 164.3125, |
| "completions/mean_terminated_length": 164.3125, |
| "completions/min_length": 68.0, |
| "completions/min_terminated_length": 68.0, |
| "entropy": 2.109375, |
| "epoch": 0.3709981167608286, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.308851224105462e-07, |
| "loss": 0.0, |
| "num_tokens": 26211378.0, |
| "reward": 0.992339015007019, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9914877414703369, |
| "rewards/acc_reward/std": 0.012202701531350613, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 301.0, |
| "completions/max_terminated_length": 301.0, |
| "completions/mean_length": 150.03125, |
| "completions/mean_terminated_length": 150.03125, |
| "completions/min_length": 57.0, |
| "completions/min_terminated_length": 57.0, |
| "entropy": 2.078125, |
| "epoch": 0.3728813559322034, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.290018832391714e-07, |
| "loss": 0.0, |
| "num_tokens": 26341204.0, |
| "reward": 0.9656549692153931, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9618387818336487, |
| "rewards/acc_reward/std": 0.03533172979950905, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 198 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 349.0, |
| "completions/max_terminated_length": 349.0, |
| "completions/mean_length": 144.90625, |
| "completions/mean_terminated_length": 144.90625, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 2.046875, |
| "epoch": 0.3747645951035782, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.271186440677966e-07, |
| "loss": 0.0, |
| "num_tokens": 26460142.0, |
| "reward": 0.9934231042861938, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9926923513412476, |
| "rewards/acc_reward/std": 0.0151524618268013, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 199 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 340.0, |
| "completions/max_terminated_length": 340.0, |
| "completions/mean_length": 156.90625, |
| "completions/mean_terminated_length": 156.90625, |
| "completions/min_length": 62.0, |
| "completions/min_terminated_length": 62.0, |
| "entropy": 2.0625, |
| "epoch": 0.3766478342749529, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.252354048964218e-07, |
| "loss": 0.0, |
| "num_tokens": 26586088.0, |
| "reward": 0.9820280075073242, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9800311326980591, |
| "rewards/acc_reward/std": 0.018364734947681427, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 200 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 468.0, |
| "completions/max_terminated_length": 468.0, |
| "completions/mean_length": 143.34375, |
| "completions/mean_terminated_length": 143.34375, |
| "completions/min_length": 37.0, |
| "completions/min_terminated_length": 37.0, |
| "entropy": 2.125, |
| "epoch": 0.3785310734463277, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.23352165725047e-07, |
| "loss": 0.0, |
| "num_tokens": 26717886.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 1.0, |
| "rewards/acc_reward/std": 0.0, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 201 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 409.0, |
| "completions/max_terminated_length": 409.0, |
| "completions/mean_length": 175.90625, |
| "completions/mean_terminated_length": 175.90625, |
| "completions/min_length": 28.0, |
| "completions/min_terminated_length": 28.0, |
| "entropy": 2.140625, |
| "epoch": 0.3804143126177024, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.214689265536723e-07, |
| "loss": 0.0, |
| "num_tokens": 26854168.0, |
| "reward": 0.9935948252677917, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9928831458091736, |
| "rewards/acc_reward/std": 0.0077277072705328465, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 202 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 396.0, |
| "completions/max_terminated_length": 396.0, |
| "completions/mean_length": 165.25, |
| "completions/mean_terminated_length": 165.25, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.171875, |
| "epoch": 0.3822975517890772, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.195856873822976e-07, |
| "loss": 0.0, |
| "num_tokens": 26989488.0, |
| "reward": 0.9852421283721924, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9836024045944214, |
| "rewards/acc_reward/std": 0.01928338035941124, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 203 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 512.0, |
| "completions/max_terminated_length": 261.0, |
| "completions/mean_length": 154.296875, |
| "completions/mean_terminated_length": 148.61904907226562, |
| "completions/min_length": 46.0, |
| "completions/min_terminated_length": 46.0, |
| "entropy": 2.1875, |
| "epoch": 0.384180790960452, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.177024482109228e-07, |
| "loss": 0.0, |
| "num_tokens": 27112987.0, |
| "reward": 0.9929645657539368, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9921828508377075, |
| "rewards/acc_reward/std": 0.02084571123123169, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 204 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 342.0, |
| "completions/max_terminated_length": 342.0, |
| "completions/mean_length": 147.5, |
| "completions/mean_terminated_length": 147.5, |
| "completions/min_length": 50.0, |
| "completions/min_terminated_length": 50.0, |
| "entropy": 2.125, |
| "epoch": 0.3860640301318267, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.15819209039548e-07, |
| "loss": 0.0, |
| "num_tokens": 27235691.0, |
| "reward": 0.9827622175216675, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9808468818664551, |
| "rewards/acc_reward/std": 0.027835894376039505, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 205 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 367.0, |
| "completions/max_terminated_length": 367.0, |
| "completions/mean_length": 156.53125, |
| "completions/mean_terminated_length": 156.53125, |
| "completions/min_length": 67.0, |
| "completions/min_terminated_length": 67.0, |
| "entropy": 2.09375, |
| "epoch": 0.3879472693032015, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.139359698681732e-07, |
| "loss": 0.0, |
| "num_tokens": 27372173.0, |
| "reward": 0.9731494784355164, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9701660871505737, |
| "rewards/acc_reward/std": 0.020826132968068123, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 206 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 409.0, |
| "completions/max_terminated_length": 409.0, |
| "completions/mean_length": 164.34375, |
| "completions/mean_terminated_length": 164.34375, |
| "completions/min_length": 78.0, |
| "completions/min_terminated_length": 78.0, |
| "entropy": 2.09375, |
| "epoch": 0.3898305084745763, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.120527306967984e-07, |
| "loss": 0.0, |
| "num_tokens": 27499555.0, |
| "reward": 0.9984294176101685, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9982548952102661, |
| "rewards/acc_reward/std": 0.0046535334549844265, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 207 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 416.0, |
| "completions/max_terminated_length": 416.0, |
| "completions/mean_length": 157.484375, |
| "completions/mean_terminated_length": 157.484375, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 2.25, |
| "epoch": 0.391713747645951, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.101694915254237e-07, |
| "loss": 0.0, |
| "num_tokens": 27636098.0, |
| "reward": 0.9711763262748718, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9679737091064453, |
| "rewards/acc_reward/std": 0.029890142381191254, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 208 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 408.0, |
| "completions/max_terminated_length": 408.0, |
| "completions/mean_length": 174.828125, |
| "completions/mean_terminated_length": 174.828125, |
| "completions/min_length": 27.0, |
| "completions/min_terminated_length": 27.0, |
| "entropy": 2.046875, |
| "epoch": 0.3935969868173258, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.4382655620574951, |
| "learning_rate": 6.082862523540489e-07, |
| "loss": -0.0223, |
| "num_tokens": 27777591.0, |
| "reward": 0.9556211829185486, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9524263143539429, |
| "rewards/acc_reward/std": 0.04751761257648468, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 209 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 384.0, |
| "completions/max_terminated_length": 384.0, |
| "completions/mean_length": 139.09375, |
| "completions/mean_terminated_length": 139.09375, |
| "completions/min_length": 52.0, |
| "completions/min_terminated_length": 52.0, |
| "entropy": 2.15625, |
| "epoch": 0.3954802259887006, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.064030131826741e-07, |
| "loss": 0.0, |
| "num_tokens": 27906397.0, |
| "reward": 0.9859374761581421, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.984375, |
| "rewards/acc_reward/std": 0.0416666679084301, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 363.0, |
| "completions/max_terminated_length": 363.0, |
| "completions/mean_length": 144.625, |
| "completions/mean_terminated_length": 144.625, |
| "completions/min_length": 41.0, |
| "completions/min_terminated_length": 41.0, |
| "entropy": 1.8984375, |
| "epoch": 0.3973634651600753, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 6.045197740112994e-07, |
| "loss": 0.0, |
| "num_tokens": 28038757.0, |
| "reward": 0.9920339584350586, |
| "reward_std": 0.0, |
| "rewards/acc_reward/mean": 0.9911487698554993, |
| "rewards/acc_reward/std": 0.013452098704874516, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 211 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 455.0, |
| "completions/max_terminated_length": 455.0, |
| "completions/mean_length": 171.578125, |
| "completions/mean_terminated_length": 171.578125, |
| "completions/min_length": 59.0, |
| "completions/min_terminated_length": 59.0, |
| "entropy": 2.1875, |
| "epoch": 0.3992467043314501, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.4742763042449951, |
| "learning_rate": 6.026365348399246e-07, |
| "loss": 0.0537, |
| "num_tokens": 28163002.0, |
| "reward": 0.9887884259223938, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9892787933349609, |
| "rewards/acc_reward/std": 0.023635946214199066, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 212 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 381.0, |
| "completions/max_terminated_length": 381.0, |
| "completions/mean_length": 135.84375, |
| "completions/mean_terminated_length": 135.84375, |
| "completions/min_length": 55.0, |
| "completions/min_terminated_length": 55.0, |
| "entropy": 1.921875, |
| "epoch": 0.4011299435028249, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.6700330972671509, |
| "learning_rate": 6.007532956685499e-07, |
| "loss": -0.0233, |
| "num_tokens": 28279440.0, |
| "reward": 0.9760647416114807, |
| "reward_std": 0.004419418517500162, |
| "rewards/acc_reward/mean": 0.9751413464546204, |
| "rewards/acc_reward/std": 0.03155434504151344, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 213 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 531, |
| "num_input_tokens_seen": 28279440, |
| "num_train_epochs": 1, |
| "save_steps": 213, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|