diff --git "a/checkpoint-4491/trainer_state.json" "b/checkpoint-4491/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4491/trainer_state.json" @@ -0,0 +1,2401 @@ +{ + "best_global_step": 4491, + "best_metric": 0.41118884086608887, + "best_model_checkpoint": "models/grpo_toxic_qwen/checkpoint-4491", + "epoch": 0.9996661101836394, + "eval_steps": 2696, + "global_step": 4491, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 119.59375, + "completions/mean_terminated_length": 51.142860412597656, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 0.00022259321090706732, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.030076026916504, + "kl": 3.605344500101637e-05, + "learning_rate": 0.0, + "loss": -0.0286, + "num_tokens": 9462.0, + "reward": -6.696479797363281, + "reward_std": 2.205897808074951, + "rewards/RewardModelWrapper/mean": -6.696479797363281, + "rewards/RewardModelWrapper/std": 2.596616506576538, + "step": 1 + }, + { + "clip_ratio/high_max": 0.00045590819666228654, + "clip_ratio/high_mean": 0.00045590819666228654, + "clip_ratio/low_mean": 9.893491918848333e-05, + "clip_ratio/low_min": 9.893491918848333e-05, + "clip_ratio/region_mean": 0.0005548431188205485, + "completions/clipped_ratio": 0.91015625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 115.375, + "completions/mean_length": 124.541015625, + "completions/mean_terminated_length": 88.15992164611816, + "completions/min_length": 53.8125, + "completions/min_terminated_length": 53.8125, + "epoch": 0.011129660545353366, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.771069526672363, + "kl": 0.0014390296781742536, + "learning_rate": 7.350000000000001e-07, + "loss": -0.0097, + "num_tokens": 164224.0, + "reward": -6.273432105779648, + "reward_std": 2.3787402510643005, + "rewards/RewardModelWrapper/mean": -6.273432105779648, + "rewards/RewardModelWrapper/std": 3.4789108261466026, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0075913356387172825, + "clip_ratio/high_mean": 0.0075913356387172825, + "clip_ratio/low_mean": 0.003807623453612905, + "clip_ratio/low_min": 0.003807623453612905, + "clip_ratio/region_mean": 0.011398959086218383, + "completions/clipped_ratio": 0.8915441176470589, + "completions/max_length": 128.0, + "completions/max_terminated_length": 110.17647058823529, + "completions/mean_length": 123.2251838235294, + "completions/mean_terminated_length": 81.49435559441062, + "completions/min_length": 44.470588235294116, + "completions/min_terminated_length": 44.470588235294116, + "epoch": 0.022259321090706732, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.885649681091309, + "kl": 0.019224860495887695, + "learning_rate": 1.485e-06, + "loss": -0.0113, + "num_tokens": 327613.0, + "reward": -5.39674503663007, + "reward_std": 2.7843008882859173, + "rewards/RewardModelWrapper/mean": -5.39674503663007, + "rewards/RewardModelWrapper/std": 3.8948283475988053, + "step": 100 + }, + { + "clip_ratio/high_max": 0.01675744824227877, + "clip_ratio/high_mean": 0.01675744824227877, + "clip_ratio/low_mean": 0.012073511610215065, + "clip_ratio/low_min": 0.012073511610215065, + "clip_ratio/region_mean": 0.028830959817860276, + "completions/clipped_ratio": 0.9091796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 107.3125, + "completions/mean_length": 124.3720703125, + "completions/mean_terminated_length": 81.7018609046936, + "completions/min_length": 54.6875, + "completions/min_terminated_length": 46.6875, + "epoch": 0.0333889816360601, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.0026164054870605, + "kl": 0.04671986572444439, + "learning_rate": 2.235e-06, + "loss": 0.0052, + "num_tokens": 482354.0, + "reward": -5.547765076160431, + "reward_std": 2.73693485558033, + "rewards/RewardModelWrapper/mean": -5.547765076160431, + "rewards/RewardModelWrapper/std": 3.441145323216915, + "step": 150 + }, + { + "clip_ratio/high_max": 0.02414312065928243, + "clip_ratio/high_mean": 0.02414312065928243, + "clip_ratio/low_mean": 0.017463966414215975, + "clip_ratio/low_min": 0.017463966414215975, + "clip_ratio/region_mean": 0.04160708721727133, + "completions/clipped_ratio": 0.9200367647058824, + "completions/max_length": 128.0, + "completions/max_terminated_length": 119.6470588235294, + "completions/mean_length": 125.29503676470588, + "completions/mean_terminated_length": 94.28872680664062, + "completions/min_length": 65.58823529411765, + "completions/min_terminated_length": 65.58823529411765, + "epoch": 0.044518642181413465, + "frac_reward_zero_std": 0.007352941176470588, + "grad_norm": 3.9181442260742188, + "kl": 0.0877579689398408, + "learning_rate": 2.97e-06, + "loss": 0.0105, + "num_tokens": 648123.0, + "reward": -4.304881698944989, + "reward_std": 3.38148234872257, + "rewards/RewardModelWrapper/mean": -4.304881698944989, + "rewards/RewardModelWrapper/std": 4.617817443959853, + "step": 200 + }, + { + "clip_ratio/high_max": 0.029861916538793595, + "clip_ratio/high_mean": 0.029861916538793595, + "clip_ratio/low_mean": 0.023766413825796917, + "clip_ratio/low_min": 0.023766413825796917, + "clip_ratio/region_mean": 0.05362833026330918, + "completions/clipped_ratio": 0.9172794117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 116.0, + "completions/mean_length": 124.69761029411765, + "completions/mean_terminated_length": 89.32857289033778, + "completions/min_length": 53.23529411764706, + "completions/min_terminated_length": 53.23529411764706, + "epoch": 0.05564830272676683, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.056563377380371, + "kl": 0.14030909642577172, + "learning_rate": 2.9836102890962895e-06, + "loss": 0.0228, + "num_tokens": 813050.0, + "reward": -4.152413817013011, + "reward_std": 3.3082274689393887, + "rewards/RewardModelWrapper/mean": -4.152413817013011, + "rewards/RewardModelWrapper/std": 4.367143616956823, + "step": 250 + }, + { + "clip_ratio/high_max": 0.030323101801332086, + "clip_ratio/high_mean": 0.030323101801332086, + "clip_ratio/low_mean": 0.021581946768565105, + "clip_ratio/low_min": 0.021581946768565105, + "clip_ratio/region_mean": 0.051905048433691266, + "completions/clipped_ratio": 0.9248046875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 116.1875, + "completions/mean_length": 125.275390625, + "completions/mean_terminated_length": 89.46597385406494, + "completions/min_length": 56.5625, + "completions/min_terminated_length": 56.5625, + "epoch": 0.0667779632721202, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.983767032623291, + "kl": 0.1569075232744217, + "learning_rate": 2.966537673571591e-06, + "loss": 0.0317, + "num_tokens": 969156.0, + "reward": -3.388260453939438, + "reward_std": 3.2063718885183334, + "rewards/RewardModelWrapper/mean": -3.388260453939438, + "rewards/RewardModelWrapper/std": 4.789341554045677, + "step": 300 + }, + { + "clip_ratio/high_max": 0.027688504084944724, + "clip_ratio/high_mean": 0.027688504084944724, + "clip_ratio/low_mean": 0.019530020136153327, + "clip_ratio/low_min": 0.019530020136153327, + "clip_ratio/region_mean": 0.04721852412912995, + "completions/clipped_ratio": 0.9191176470588235, + "completions/max_length": 128.0, + "completions/max_terminated_length": 120.05882352941177, + "completions/mean_length": 124.81709558823529, + "completions/mean_terminated_length": 90.29201911477482, + "completions/min_length": 52.64705882352941, + "completions/min_terminated_length": 52.64705882352941, + "epoch": 0.07790762381747357, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5877625942230225, + "kl": 0.1643798241391778, + "learning_rate": 2.9494650580468926e-06, + "loss": 0.0293, + "num_tokens": 1134229.0, + "reward": -3.141713114345775, + "reward_std": 3.3975463895236744, + "rewards/RewardModelWrapper/mean": -3.141713114345775, + "rewards/RewardModelWrapper/std": 4.820348431082333, + "step": 350 + }, + { + "clip_ratio/high_max": 0.028169492546003313, + "clip_ratio/high_mean": 0.028169492546003313, + "clip_ratio/low_mean": 0.019790295051643626, + "clip_ratio/low_min": 0.019790295051643626, + "clip_ratio/region_mean": 0.04795978774316609, + "completions/clipped_ratio": 0.9310661764705882, + "completions/max_length": 128.0, + "completions/max_terminated_length": 108.11764705882354, + "completions/mean_length": 125.64889705882354, + "completions/mean_terminated_length": 87.01379753561581, + "completions/min_length": 62.1764705882353, + "completions/min_terminated_length": 54.64705882352941, + "epoch": 0.08903728436282693, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.695188999176025, + "kl": 0.30343326754868033, + "learning_rate": 2.933416799453676e-06, + "loss": 0.0748, + "num_tokens": 1300167.0, + "reward": -3.474738233229693, + "reward_std": 3.482299538219676, + "rewards/RewardModelWrapper/mean": -3.474738233229693, + "rewards/RewardModelWrapper/std": 4.745730189716115, + "step": 400 + }, + { + "clip_ratio/high_max": 0.029925933612976224, + "clip_ratio/high_mean": 0.029925933612976224, + "clip_ratio/low_mean": 0.019293442433699966, + "clip_ratio/low_min": 0.019293442433699966, + "clip_ratio/region_mean": 0.04921937589067966, + "completions/clipped_ratio": 0.943359375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 110.9375, + "completions/mean_length": 126.4140625, + "completions/mean_terminated_length": 95.56250047683716, + "completions/min_length": 79.5, + "completions/min_terminated_length": 71.5, + "epoch": 0.1001669449081803, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.636467218399048, + "kl": 0.19514246992766857, + "learning_rate": 2.9163441839289777e-06, + "loss": 0.0415, + "num_tokens": 1457135.0, + "reward": -3.0616072714328766, + "reward_std": 3.3436961472034454, + "rewards/RewardModelWrapper/mean": -3.0616072714328766, + "rewards/RewardModelWrapper/std": 4.945626050233841, + "step": 450 + }, + { + "clip_ratio/high_max": 0.027343249125406147, + "clip_ratio/high_mean": 0.027343249125406147, + "clip_ratio/low_mean": 0.01768903057440184, + "clip_ratio/low_min": 0.01768903057440184, + "clip_ratio/region_mean": 0.04503227963577956, + "completions/clipped_ratio": 0.9393382352941176, + "completions/max_length": 128.0, + "completions/max_terminated_length": 109.47058823529412, + "completions/mean_length": 126.07444852941177, + "completions/mean_terminated_length": 92.27339037726907, + "completions/min_length": 73.88235294117646, + "completions/min_terminated_length": 66.3529411764706, + "epoch": 0.11129660545353366, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.624467134475708, + "kl": 0.19471221148967743, + "learning_rate": 2.8992715684042796e-06, + "loss": 0.0459, + "num_tokens": 1623608.0, + "reward": -3.0403577299679028, + "reward_std": 3.5023320422453037, + "rewards/RewardModelWrapper/mean": -3.0403577299679028, + "rewards/RewardModelWrapper/std": 4.758344790514777, + "step": 500 + }, + { + "clip_ratio/high_max": 0.026099461197154596, + "clip_ratio/high_mean": 0.026099461197154596, + "clip_ratio/low_mean": 0.01860616845311597, + "clip_ratio/low_min": 0.01860616845311597, + "clip_ratio/region_mean": 0.04470562972594053, + "completions/clipped_ratio": 0.9209558823529411, + "completions/max_length": 128.0, + "completions/max_terminated_length": 111.3529411764706, + "completions/mean_length": 125.32536764705883, + "completions/mean_terminated_length": 86.94334905287799, + "completions/min_length": 54.64705882352941, + "completions/min_terminated_length": 47.11764705882353, + "epoch": 0.12242626599888703, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.117679119110107, + "kl": 0.1926309671998024, + "learning_rate": 2.882198952879581e-06, + "loss": 0.0424, + "num_tokens": 1789042.0, + "reward": -3.364777831470265, + "reward_std": 3.6073132402756634, + "rewards/RewardModelWrapper/mean": -3.364777831470265, + "rewards/RewardModelWrapper/std": 4.984979461221134, + "step": 550 + }, + { + "clip_ratio/high_max": 0.027654693657532335, + "clip_ratio/high_mean": 0.027654693657532335, + "clip_ratio/low_mean": 0.01964853117824532, + "clip_ratio/low_min": 0.01964853117824532, + "clip_ratio/region_mean": 0.047303224778734144, + "completions/clipped_ratio": 0.8984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 124.5, + "completions/mean_length": 124.734375, + "completions/mean_terminated_length": 98.57239484786987, + "completions/min_length": 61.25, + "completions/min_terminated_length": 61.25, + "epoch": 0.1335559265442404, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.241142511367798, + "kl": 0.211693402081728, + "learning_rate": 2.865126337354883e-06, + "loss": 0.0498, + "num_tokens": 1944610.0, + "reward": -2.732655808329582, + "reward_std": 3.617541193962097, + "rewards/RewardModelWrapper/mean": -2.732655808329582, + "rewards/RewardModelWrapper/std": 4.809614151716232, + "step": 600 + }, + { + "clip_ratio/high_max": 0.027527469391934574, + "clip_ratio/high_mean": 0.027527469391934574, + "clip_ratio/low_mean": 0.019259323065634815, + "clip_ratio/low_min": 0.019259323065634815, + "clip_ratio/region_mean": 0.046786792553029956, + "completions/clipped_ratio": 0.8933823529411765, + "completions/max_length": 128.0, + "completions/max_terminated_length": 121.47058823529412, + "completions/mean_length": 124.16727941176471, + "completions/mean_terminated_length": 96.0017848295324, + "completions/min_length": 59.23529411764706, + "completions/min_terminated_length": 59.23529411764706, + "epoch": 0.14468558708959378, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.524644374847412, + "kl": 0.2376550894230604, + "learning_rate": 2.8480537218301847e-06, + "loss": 0.0528, + "num_tokens": 2109128.0, + "reward": -1.8917093557469986, + "reward_std": 3.8112815267899456, + "rewards/RewardModelWrapper/mean": -1.8917093557469986, + "rewards/RewardModelWrapper/std": 5.167453260982738, + "step": 650 + }, + { + "clip_ratio/high_max": 0.027425415357574822, + "clip_ratio/high_mean": 0.027425415357574822, + "clip_ratio/low_mean": 0.01982414353871718, + "clip_ratio/low_min": 0.01982414353871718, + "clip_ratio/region_mean": 0.04724955870769918, + "completions/clipped_ratio": 0.8602941176470589, + "completions/max_length": 128.0, + "completions/max_terminated_length": 121.05882352941177, + "completions/mean_length": 123.19117647058823, + "completions/mean_terminated_length": 94.26595889820771, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.15581524763494714, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.250056743621826, + "kl": 0.22009442321956157, + "learning_rate": 2.830981106305486e-06, + "loss": 0.044, + "num_tokens": 2272320.0, + "reward": -2.427665850695442, + "reward_std": 3.78492192661061, + "rewards/RewardModelWrapper/mean": -2.427665850695442, + "rewards/RewardModelWrapper/std": 4.859750719631419, + "step": 700 + }, + { + "clip_ratio/high_max": 0.02454757507191971, + "clip_ratio/high_mean": 0.02454757507191971, + "clip_ratio/low_mean": 0.0160788345040055, + "clip_ratio/low_min": 0.0160788345040055, + "clip_ratio/region_mean": 0.04062640947755426, + "completions/clipped_ratio": 0.8837890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 117.25, + "completions/mean_length": 123.8193359375, + "completions/mean_terminated_length": 92.61992502212524, + "completions/min_length": 56.125, + "completions/min_terminated_length": 56.125, + "epoch": 0.1669449081803005, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.143310070037842, + "kl": 0.2187542901188135, + "learning_rate": 2.8139084907807877e-06, + "loss": 0.0458, + "num_tokens": 2426567.0, + "reward": -2.639084130525589, + "reward_std": 4.0981148183345795, + "rewards/RewardModelWrapper/mean": -2.639084130525589, + "rewards/RewardModelWrapper/std": 5.267414927482605, + "step": 750 + }, + { + "clip_ratio/high_max": 0.023827595426701008, + "clip_ratio/high_mean": 0.023827595426701008, + "clip_ratio/low_mean": 0.01665229408070445, + "clip_ratio/low_min": 0.01665229408070445, + "clip_ratio/region_mean": 0.04047988944686949, + "completions/clipped_ratio": 0.9172794117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 117.41176470588235, + "completions/mean_length": 124.52849264705883, + "completions/mean_terminated_length": 86.05495004092946, + "completions/min_length": 53.294117647058826, + "completions/min_terminated_length": 53.294117647058826, + "epoch": 0.17807456872565386, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.82859206199646, + "kl": 0.2341267079859972, + "learning_rate": 2.7968358752560893e-06, + "loss": 0.0495, + "num_tokens": 2591422.0, + "reward": -1.696559471242568, + "reward_std": 4.100044530980727, + "rewards/RewardModelWrapper/mean": -1.696559471242568, + "rewards/RewardModelWrapper/std": 5.4215626155628875, + "step": 800 + }, + { + "clip_ratio/high_max": 0.025062179565429686, + "clip_ratio/high_mean": 0.025062179565429686, + "clip_ratio/low_mean": 0.018277215642156078, + "clip_ratio/low_min": 0.018277215642156078, + "clip_ratio/region_mean": 0.04333939506206661, + "completions/clipped_ratio": 0.9292279411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 113.58823529411765, + "completions/mean_length": 125.45588235294117, + "completions/mean_terminated_length": 91.19166834214154, + "completions/min_length": 58.94117647058823, + "completions/min_terminated_length": 58.94117647058823, + "epoch": 0.18920422927100725, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.91913366317749, + "kl": 0.22518106378614902, + "learning_rate": 2.779763259731391e-06, + "loss": 0.0531, + "num_tokens": 2757254.0, + "reward": -0.3187214837354772, + "reward_std": 5.127424436457017, + "rewards/RewardModelWrapper/mean": -0.3187214837354772, + "rewards/RewardModelWrapper/std": 5.87655990263995, + "step": 850 + }, + { + "clip_ratio/high_max": 0.02293195443926379, + "clip_ratio/high_mean": 0.02293195443926379, + "clip_ratio/low_mean": 0.017691890239948407, + "clip_ratio/low_min": 0.017691890239948407, + "clip_ratio/region_mean": 0.040623844610527156, + "completions/clipped_ratio": 0.9091796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 119.875, + "completions/mean_length": 124.306640625, + "completions/mean_terminated_length": 88.5287561416626, + "completions/min_length": 47.5, + "completions/min_terminated_length": 47.5, + "epoch": 0.2003338898163606, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8705227375030518, + "kl": 0.23920009069144726, + "learning_rate": 2.7626906442066923e-06, + "loss": 0.0608, + "num_tokens": 2912304.0, + "reward": -0.5163702219724655, + "reward_std": 5.298731863498688, + "rewards/RewardModelWrapper/mean": -0.5163702219724655, + "rewards/RewardModelWrapper/std": 5.84825000166893, + "step": 900 + }, + { + "clip_ratio/high_max": 0.02397001946810633, + "clip_ratio/high_mean": 0.02397001946810633, + "clip_ratio/low_mean": 0.016966249566758053, + "clip_ratio/low_min": 0.016966249566758053, + "clip_ratio/region_mean": 0.040936269152443854, + "completions/clipped_ratio": 0.9053308823529411, + "completions/max_length": 128.0, + "completions/max_terminated_length": 112.17647058823529, + "completions/mean_length": 124.65533088235294, + "completions/mean_terminated_length": 90.35452988568474, + "completions/min_length": 57.64705882352941, + "completions/min_terminated_length": 57.64705882352941, + "epoch": 0.21146355036171396, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.009652137756348, + "kl": 0.28723155200481415, + "learning_rate": 2.7456180286819943e-06, + "loss": 0.0623, + "num_tokens": 3077033.0, + "reward": 0.3360518918317907, + "reward_std": 5.125342537375057, + "rewards/RewardModelWrapper/mean": 0.3360518918317907, + "rewards/RewardModelWrapper/std": 5.78782990399529, + "step": 950 + }, + { + "clip_ratio/high_max": 0.025908510715235023, + "clip_ratio/high_mean": 0.025908510715235023, + "clip_ratio/low_mean": 0.017599179263343104, + "clip_ratio/low_min": 0.017599179263343104, + "clip_ratio/region_mean": 0.04350769010838121, + "completions/clipped_ratio": 0.9172794117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 121.41176470588235, + "completions/mean_length": 125.03125, + "completions/mean_terminated_length": 94.79173772475299, + "completions/min_length": 61.88235294117647, + "completions/min_terminated_length": 61.88235294117647, + "epoch": 0.22259321090706732, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.159637928009033, + "kl": 0.35069699488580225, + "learning_rate": 2.728545413157296e-06, + "loss": 0.0847, + "num_tokens": 3242123.0, + "reward": 1.627946559120627, + "reward_std": 4.790118554059197, + "rewards/RewardModelWrapper/mean": 1.627946559120627, + "rewards/RewardModelWrapper/std": 5.393552022821763, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.024083305108360945, + "clip_ratio/high_mean": 0.024083305108360945, + "clip_ratio/low_mean": 0.013416973181592766, + "clip_ratio/low_min": 0.013416973181592766, + "clip_ratio/region_mean": 0.0375002783536911, + "completions/clipped_ratio": 0.9267578125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 112.8125, + "completions/mean_length": 125.0126953125, + "completions/mean_terminated_length": 87.55602884292603, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.2337228714524207, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.147945880889893, + "kl": 0.3470122530311346, + "learning_rate": 2.7114727976325973e-06, + "loss": 0.089, + "num_tokens": 3397872.0, + "reward": 0.24295206367969513, + "reward_std": 5.033507749438286, + "rewards/RewardModelWrapper/mean": 0.24295206367969513, + "rewards/RewardModelWrapper/std": 5.808434098958969, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.024642590049188583, + "clip_ratio/high_mean": 0.024642590049188583, + "clip_ratio/low_mean": 0.013819608901976609, + "clip_ratio/low_min": 0.013819608901976609, + "clip_ratio/region_mean": 0.0384621987817809, + "completions/clipped_ratio": 0.9264705882352942, + "completions/max_length": 128.0, + "completions/max_terminated_length": 112.70588235294117, + "completions/mean_length": 125.2408088235294, + "completions/mean_terminated_length": 90.793908960679, + "completions/min_length": 65.17647058823529, + "completions/min_terminated_length": 65.17647058823529, + "epoch": 0.24485253199777407, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4217491149902344, + "kl": 0.377669473439455, + "learning_rate": 2.694400182107899e-06, + "loss": 0.0977, + "num_tokens": 3563110.0, + "reward": 0.4081239700317383, + "reward_std": 5.032071225783405, + "rewards/RewardModelWrapper/mean": 0.4081239700317383, + "rewards/RewardModelWrapper/std": 5.893623436198515, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.022081555526237934, + "clip_ratio/high_mean": 0.022081555526237934, + "clip_ratio/low_mean": 0.015956819643906783, + "clip_ratio/low_min": 0.015956819643906783, + "clip_ratio/region_mean": 0.038038374953903255, + "completions/clipped_ratio": 0.9365808823529411, + "completions/max_length": 128.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 125.22426470588235, + "completions/mean_terminated_length": 73.54575303021599, + "completions/min_length": 63.94117647058823, + "completions/min_terminated_length": 48.88235294117647, + "epoch": 0.25598219254312743, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.748858451843262, + "kl": 0.42920990511775015, + "learning_rate": 2.677327566583201e-06, + "loss": 0.1117, + "num_tokens": 3728050.0, + "reward": 1.730754810221055, + "reward_std": 4.819248423856847, + "rewards/RewardModelWrapper/mean": 1.730754810221055, + "rewards/RewardModelWrapper/std": 5.504547006943646, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.02080470887827687, + "clip_ratio/high_mean": 0.02080470887827687, + "clip_ratio/low_mean": 0.01475024281651713, + "clip_ratio/low_min": 0.01475024281651713, + "clip_ratio/region_mean": 0.03555495172040537, + "completions/clipped_ratio": 0.962890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 98.625, + "completions/mean_length": 126.416015625, + "completions/mean_terminated_length": 81.55208349227905, + "completions/min_length": 72.375, + "completions/min_terminated_length": 64.375, + "epoch": 0.2671118530884808, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4883859157562256, + "kl": 0.42720063477754594, + "learning_rate": 2.6602549510585024e-06, + "loss": 0.1195, + "num_tokens": 3884876.0, + "reward": 1.9971511512994766, + "reward_std": 4.786640420556068, + "rewards/RewardModelWrapper/mean": 1.9971511512994766, + "rewards/RewardModelWrapper/std": 5.766968697309494, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.023233274864032864, + "clip_ratio/high_mean": 0.023233274864032864, + "clip_ratio/low_mean": 0.01158983559376793, + "clip_ratio/low_min": 0.01158983559376793, + "clip_ratio/region_mean": 0.03482311038998887, + "completions/clipped_ratio": 0.9347426470588235, + "completions/max_length": 128.0, + "completions/max_terminated_length": 114.17647058823529, + "completions/mean_length": 125.17738970588235, + "completions/mean_terminated_length": 89.91648954503677, + "completions/min_length": 64.29411764705883, + "completions/min_terminated_length": 64.29411764705883, + "epoch": 0.27824151363383415, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.5916192531585693, + "kl": 0.3887044958770275, + "learning_rate": 2.643182335533804e-06, + "loss": 0.1015, + "num_tokens": 4050013.0, + "reward": 0.8245974989498362, + "reward_std": 5.001701130586512, + "rewards/RewardModelWrapper/mean": 0.8245974989498362, + "rewards/RewardModelWrapper/std": 5.80830400130328, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.020529154643882067, + "clip_ratio/high_mean": 0.020529154643882067, + "clip_ratio/low_mean": 0.015356352158414665, + "clip_ratio/low_min": 0.015356352158414665, + "clip_ratio/region_mean": 0.03588550680316985, + "completions/clipped_ratio": 0.9512867647058824, + "completions/max_length": 128.0, + "completions/max_terminated_length": 100.76470588235294, + "completions/mean_length": 125.86305147058823, + "completions/mean_terminated_length": 81.50539353314568, + "completions/min_length": 63.470588235294116, + "completions/min_terminated_length": 55.94117647058823, + "epoch": 0.28937117417918756, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.05077600479126, + "kl": 0.4404847612977028, + "learning_rate": 2.6261097200091054e-06, + "loss": 0.1208, + "num_tokens": 4215832.0, + "reward": 2.3118093013763428, + "reward_std": 4.841920866685755, + "rewards/RewardModelWrapper/mean": 2.3118093013763428, + "rewards/RewardModelWrapper/std": 5.525171279907227, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.024388792894314976, + "clip_ratio/high_mean": 0.024388792894314976, + "clip_ratio/low_mean": 0.015166401157330256, + "clip_ratio/low_min": 0.015166401157330256, + "clip_ratio/region_mean": 0.039555194084532556, + "completions/clipped_ratio": 0.9345703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 111.5, + "completions/mean_length": 125.7822265625, + "completions/mean_terminated_length": 88.69479322433472, + "completions/min_length": 64.0625, + "completions/min_terminated_length": 56.0625, + "epoch": 0.3005008347245409, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.280036926269531, + "kl": 0.4519739609956741, + "learning_rate": 2.609037104484407e-06, + "loss": 0.1237, + "num_tokens": 4372361.0, + "reward": 2.7925052791833878, + "reward_std": 4.665284767746925, + "rewards/RewardModelWrapper/mean": 2.7925052791833878, + "rewards/RewardModelWrapper/std": 5.4118489027023315, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.023544567436911166, + "clip_ratio/high_mean": 0.023544567436911166, + "clip_ratio/low_mean": 0.01299051069712732, + "clip_ratio/low_min": 0.01299051069712732, + "clip_ratio/region_mean": 0.03653507822658866, + "completions/clipped_ratio": 0.9292279411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 111.82352941176471, + "completions/mean_length": 124.65165441176471, + "completions/mean_terminated_length": 82.68410469503964, + "completions/min_length": 46.8235294117647, + "completions/min_terminated_length": 46.8235294117647, + "epoch": 0.3116304952698943, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7401034832000732, + "kl": 0.4922306627035141, + "learning_rate": 2.591964488959709e-06, + "loss": 0.1315, + "num_tokens": 4537846.0, + "reward": 2.862899471731747, + "reward_std": 4.948802695554845, + "rewards/RewardModelWrapper/mean": 2.862899471731747, + "rewards/RewardModelWrapper/std": 5.503605421851663, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.025161673842230812, + "clip_ratio/high_mean": 0.025161673842230812, + "clip_ratio/low_mean": 0.012126781771657989, + "clip_ratio/low_min": 0.012126781771657989, + "clip_ratio/region_mean": 0.037288455746602264, + "completions/clipped_ratio": 0.9292279411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 117.17647058823529, + "completions/mean_length": 125.05882352941177, + "completions/mean_terminated_length": 90.7926357493681, + "completions/min_length": 62.11764705882353, + "completions/min_terminated_length": 62.11764705882353, + "epoch": 0.32276015581524764, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.467922687530518, + "kl": 0.46787314653396606, + "learning_rate": 2.5748918734350105e-06, + "loss": 0.1207, + "num_tokens": 4702966.0, + "reward": 1.3220273045932545, + "reward_std": 4.946936158572926, + "rewards/RewardModelWrapper/mean": 1.3220273045932545, + "rewards/RewardModelWrapper/std": 5.816557715920841, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.025372368972748516, + "clip_ratio/high_mean": 0.025372368972748516, + "clip_ratio/low_mean": 0.01208616121119121, + "clip_ratio/low_min": 0.01208616121119121, + "clip_ratio/region_mean": 0.03745853026397526, + "completions/clipped_ratio": 0.9599609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 88.5625, + "completions/mean_length": 126.3486328125, + "completions/mean_terminated_length": 74.44687557220459, + "completions/min_length": 73.875, + "completions/min_terminated_length": 57.875, + "epoch": 0.333889816360601, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.3412301540374756, + "kl": 0.5128468088805676, + "learning_rate": 2.557819257910312e-06, + "loss": 0.1397, + "num_tokens": 4860219.0, + "reward": 2.0575065165758133, + "reward_std": 5.155221775174141, + "rewards/RewardModelWrapper/mean": 2.0575065165758133, + "rewards/RewardModelWrapper/std": 5.655524164438248, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.023876634621992708, + "clip_ratio/high_mean": 0.023876634621992708, + "clip_ratio/low_mean": 0.013062482952955179, + "clip_ratio/low_min": 0.013062482952955179, + "clip_ratio/region_mean": 0.03693911746609956, + "completions/clipped_ratio": 0.9604779411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 84.47058823529412, + "completions/mean_length": 126.6001838235294, + "completions/mean_terminated_length": 73.35098131965188, + "completions/min_length": 92.58823529411765, + "completions/min_terminated_length": 62.470588235294116, + "epoch": 0.34501947690595436, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4934329986572266, + "kl": 0.5125479310750961, + "learning_rate": 2.5407466423856135e-06, + "loss": 0.139, + "num_tokens": 5027488.0, + "reward": 2.7399597448461197, + "reward_std": 4.745976616354549, + "rewards/RewardModelWrapper/mean": 2.7399597448461197, + "rewards/RewardModelWrapper/std": 5.300730144276338, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.02330939914332703, + "clip_ratio/high_mean": 0.02330939914332703, + "clip_ratio/low_mean": 0.009550860303861555, + "clip_ratio/low_min": 0.009550860303861555, + "clip_ratio/region_mean": 0.032860259409062564, + "completions/clipped_ratio": 0.9641544117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 95.70588235294117, + "completions/mean_length": 126.61397058823529, + "completions/mean_terminated_length": 81.39460844152114, + "completions/min_length": 81.88235294117646, + "completions/min_terminated_length": 66.82352941176471, + "epoch": 0.3561491374513077, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.760233402252197, + "kl": 0.5104251652956009, + "learning_rate": 2.523674026860915e-06, + "loss": 0.1396, + "num_tokens": 5193996.0, + "reward": 1.9389969741596895, + "reward_std": 5.14070810991175, + "rewards/RewardModelWrapper/mean": 1.9389969741596895, + "rewards/RewardModelWrapper/std": 5.778058921589571, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.023669966620218474, + "clip_ratio/high_mean": 0.023669966620218474, + "clip_ratio/low_mean": 0.012192065346171147, + "clip_ratio/low_min": 0.012192065346171147, + "clip_ratio/region_mean": 0.035862031998112796, + "completions/clipped_ratio": 0.9599609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 105.625, + "completions/mean_length": 126.4326171875, + "completions/mean_terminated_length": 91.0947916507721, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "epoch": 0.3672787979966611, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.496079206466675, + "kl": 0.5115452679991722, + "learning_rate": 2.5066014113362166e-06, + "loss": 0.1421, + "num_tokens": 5350823.0, + "reward": 2.4139109551906586, + "reward_std": 4.767535001039505, + "rewards/RewardModelWrapper/mean": 2.4139109551906586, + "rewards/RewardModelWrapper/std": 5.584080070257187, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.026008948455564677, + "clip_ratio/high_mean": 0.026008948455564677, + "clip_ratio/low_mean": 0.008926556244841777, + "clip_ratio/low_min": 0.008926556244841777, + "clip_ratio/region_mean": 0.0349355046171695, + "completions/clipped_ratio": 0.9604779411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 105.94117647058823, + "completions/mean_length": 126.34926470588235, + "completions/mean_terminated_length": 89.24902052037856, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.3784084585420145, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9608895778656006, + "kl": 0.5186072036623954, + "learning_rate": 2.489528795811518e-06, + "loss": 0.1427, + "num_tokens": 5517627.0, + "reward": 1.1372435163049137, + "reward_std": 5.190363294938031, + "rewards/RewardModelWrapper/mean": 1.1372435163049137, + "rewards/RewardModelWrapper/std": 5.777948155122645, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.022060031631262973, + "clip_ratio/high_mean": 0.022060031631262973, + "clip_ratio/low_mean": 0.012312272182898596, + "clip_ratio/low_min": 0.012312272182898596, + "clip_ratio/region_mean": 0.03437230377923697, + "completions/clipped_ratio": 0.9632352941176471, + "completions/max_length": 128.0, + "completions/max_terminated_length": 102.82352941176471, + "completions/mean_length": 126.49540441176471, + "completions/mean_terminated_length": 88.83088302612305, + "completions/min_length": 76.3529411764706, + "completions/min_terminated_length": 68.82352941176471, + "epoch": 0.38953811908736785, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.0580315589904785, + "kl": 0.5839428542554379, + "learning_rate": 2.4724561802868197e-06, + "loss": 0.1614, + "num_tokens": 5684102.0, + "reward": 2.689769050654243, + "reward_std": 4.625988932216869, + "rewards/RewardModelWrapper/mean": 2.689769050654243, + "rewards/RewardModelWrapper/std": 5.245194827809053, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.02739144684630446, + "clip_ratio/high_mean": 0.02739144684630446, + "clip_ratio/low_mean": 0.012341015862475616, + "clip_ratio/low_min": 0.012341015862475616, + "clip_ratio/region_mean": 0.03973246271605604, + "completions/clipped_ratio": 0.95703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 103.5625, + "completions/mean_length": 126.060546875, + "completions/mean_terminated_length": 83.71354246139526, + "completions/min_length": 61.3125, + "completions/min_terminated_length": 61.3125, + "epoch": 0.4006677796327212, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.872232437133789, + "kl": 0.5619081328809261, + "learning_rate": 2.4553835647621216e-06, + "loss": 0.1509, + "num_tokens": 5840596.0, + "reward": 2.3125159442424774, + "reward_std": 4.9210382997989655, + "rewards/RewardModelWrapper/mean": 2.3125159442424774, + "rewards/RewardModelWrapper/std": 5.377374470233917, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.024034175912383944, + "clip_ratio/high_mean": 0.024034175912383944, + "clip_ratio/low_mean": 0.009776253007003107, + "clip_ratio/low_min": 0.009776253007003107, + "clip_ratio/region_mean": 0.03381042889552191, + "completions/clipped_ratio": 0.9549632352941176, + "completions/max_length": 128.0, + "completions/max_terminated_length": 101.41176470588235, + "completions/mean_length": 125.99724264705883, + "completions/mean_terminated_length": 81.53921688304229, + "completions/min_length": 65.23529411764706, + "completions/min_terminated_length": 57.705882352941174, + "epoch": 0.41179744017807457, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8941831588745117, + "kl": 0.615523195117712, + "learning_rate": 2.4383109492374236e-06, + "loss": 0.1677, + "num_tokens": 6007113.0, + "reward": 2.1543740524965176, + "reward_std": 5.060079883126652, + "rewards/RewardModelWrapper/mean": 2.1543740524965176, + "rewards/RewardModelWrapper/std": 5.475296539418838, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.022410094959195704, + "clip_ratio/high_mean": 0.022410094959195704, + "clip_ratio/low_mean": 0.012868442094186321, + "clip_ratio/low_min": 0.012868442094186321, + "clip_ratio/region_mean": 0.035278537014964965, + "completions/clipped_ratio": 0.9604779411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 91.94117647058823, + "completions/mean_length": 126.30238970588235, + "completions/mean_terminated_length": 79.55490246941062, + "completions/min_length": 71.88235294117646, + "completions/min_terminated_length": 64.3529411764706, + "epoch": 0.42292710072342793, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.654233932495117, + "kl": 0.6534902662038803, + "learning_rate": 2.421238333712725e-06, + "loss": 0.1843, + "num_tokens": 6174378.0, + "reward": 2.3925238006255207, + "reward_std": 4.879058487275067, + "rewards/RewardModelWrapper/mean": 2.3925238006255207, + "rewards/RewardModelWrapper/std": 5.418860211091883, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.021921868621138856, + "clip_ratio/high_mean": 0.021921868621138856, + "clip_ratio/low_mean": 0.011612088698893786, + "clip_ratio/low_min": 0.011612088698893786, + "clip_ratio/region_mean": 0.03353395750047639, + "completions/clipped_ratio": 0.94140625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 107.3125, + "completions/mean_length": 125.32421875, + "completions/mean_terminated_length": 82.90129089355469, + "completions/min_length": 54.5, + "completions/min_terminated_length": 54.5, + "epoch": 0.4340567612687813, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.8473002910614014, + "kl": 0.6599007929861546, + "learning_rate": 2.4041657181880266e-06, + "loss": 0.1769, + "num_tokens": 6330166.0, + "reward": 2.618502587080002, + "reward_std": 4.749881863594055, + "rewards/RewardModelWrapper/mean": 2.618502587080002, + "rewards/RewardModelWrapper/std": 5.46898752450943, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.02155641552293673, + "clip_ratio/high_mean": 0.02155641552293673, + "clip_ratio/low_mean": 0.009289601502241568, + "clip_ratio/low_min": 0.009289601502241568, + "clip_ratio/region_mean": 0.030846016986761243, + "completions/clipped_ratio": 0.9568014705882353, + "completions/max_length": 128.0, + "completions/max_terminated_length": 101.88235294117646, + "completions/mean_length": 126.37040441176471, + "completions/mean_terminated_length": 83.31176578297334, + "completions/min_length": 72.88235294117646, + "completions/min_terminated_length": 65.3529411764706, + "epoch": 0.44518642181413465, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8109655380249023, + "kl": 0.7074493160843849, + "learning_rate": 2.387093102663328e-06, + "loss": 0.1974, + "num_tokens": 6496761.0, + "reward": 3.343541706309599, + "reward_std": 4.7992883710300225, + "rewards/RewardModelWrapper/mean": 3.343541706309599, + "rewards/RewardModelWrapper/std": 5.472757451674518, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.027084801244782283, + "clip_ratio/high_mean": 0.027084801244782283, + "clip_ratio/low_mean": 0.006253871699154843, + "clip_ratio/low_min": 0.006253871699154843, + "clip_ratio/region_mean": 0.03333867286099121, + "completions/clipped_ratio": 0.9503676470588235, + "completions/max_length": 128.0, + "completions/max_terminated_length": 94.88235294117646, + "completions/mean_length": 125.38786764705883, + "completions/mean_terminated_length": 66.87544497321633, + "completions/min_length": 58.1764705882353, + "completions/min_terminated_length": 43.11764705882353, + "epoch": 0.45631608235948806, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.87650203704834, + "kl": 0.6578731602430343, + "learning_rate": 2.3700204871386297e-06, + "loss": 0.1804, + "num_tokens": 6662615.0, + "reward": 1.8048853032729204, + "reward_std": 5.33220240649055, + "rewards/RewardModelWrapper/mean": 1.8048853032729204, + "rewards/RewardModelWrapper/std": 5.8003731334910675, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.022799394286703318, + "clip_ratio/high_mean": 0.022799394286703318, + "clip_ratio/low_mean": 0.008315351814671886, + "clip_ratio/low_min": 0.008315351814671886, + "clip_ratio/region_mean": 0.03111474617384374, + "completions/clipped_ratio": 0.9609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 87.0625, + "completions/mean_length": 125.9619140625, + "completions/mean_terminated_length": 69.97916746139526, + "completions/min_length": 70.25, + "completions/min_terminated_length": 54.25, + "epoch": 0.4674457429048414, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.432864189147949, + "kl": 0.7079492492973805, + "learning_rate": 2.3529478716139312e-06, + "loss": 0.1956, + "num_tokens": 6819392.0, + "reward": 2.50741083920002, + "reward_std": 5.116829484701157, + "rewards/RewardModelWrapper/mean": 2.50741083920002, + "rewards/RewardModelWrapper/std": 5.797209560871124, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.022400263713207094, + "clip_ratio/high_mean": 0.022400263713207094, + "clip_ratio/low_mean": 0.008116541813942604, + "clip_ratio/low_min": 0.008116541813942604, + "clip_ratio/region_mean": 0.030516805413644762, + "completions/clipped_ratio": 0.9347426470588235, + "completions/max_length": 128.0, + "completions/max_terminated_length": 100.88235294117646, + "completions/mean_length": 125.04779411764706, + "completions/mean_terminated_length": 75.39313866110409, + "completions/min_length": 51.1764705882353, + "completions/min_terminated_length": 43.64705882352941, + "epoch": 0.4785754034501948, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.323696136474609, + "kl": 0.7352806448936462, + "learning_rate": 2.3358752560892328e-06, + "loss": 0.1983, + "num_tokens": 6984364.0, + "reward": 2.347130256540635, + "reward_std": 5.3196556708391975, + "rewards/RewardModelWrapper/mean": 2.347130256540635, + "rewards/RewardModelWrapper/std": 5.771211035111371, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.021681377917993815, + "clip_ratio/high_mean": 0.021681377917993815, + "clip_ratio/low_mean": 0.01035779433674179, + "clip_ratio/low_min": 0.01035779433674179, + "clip_ratio/region_mean": 0.03203917214414105, + "completions/clipped_ratio": 0.9466911764705882, + "completions/max_length": 128.0, + "completions/max_terminated_length": 109.29411764705883, + "completions/mean_length": 125.82996323529412, + "completions/mean_terminated_length": 90.63718593821807, + "completions/min_length": 71.23529411764706, + "completions/min_terminated_length": 71.23529411764706, + "epoch": 0.48970506399554814, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.768919944763184, + "kl": 0.7503913494944573, + "learning_rate": 2.3188026405645343e-06, + "loss": 0.2126, + "num_tokens": 7150483.0, + "reward": 2.530949129777796, + "reward_std": 5.144188319935518, + "rewards/RewardModelWrapper/mean": 2.530949129777796, + "rewards/RewardModelWrapper/std": 5.636184664333568, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.02238714267965406, + "clip_ratio/high_mean": 0.02238714267965406, + "clip_ratio/low_mean": 0.008641490781737957, + "clip_ratio/low_min": 0.008641490781737957, + "clip_ratio/region_mean": 0.031028633578680454, + "completions/clipped_ratio": 0.927734375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 97.6875, + "completions/mean_length": 125.126953125, + "completions/mean_terminated_length": 79.61108827590942, + "completions/min_length": 60.875, + "completions/min_terminated_length": 52.875, + "epoch": 0.5008347245409015, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.544048309326172, + "kl": 0.8463270646333695, + "learning_rate": 2.3017300250398363e-06, + "loss": 0.234, + "num_tokens": 7305749.0, + "reward": 3.097047299146652, + "reward_std": 5.260514736175537, + "rewards/RewardModelWrapper/mean": 3.097047299146652, + "rewards/RewardModelWrapper/std": 5.711855351924896, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0228908458375372, + "clip_ratio/high_mean": 0.0228908458375372, + "clip_ratio/low_mean": 0.009188006882905029, + "clip_ratio/low_min": 0.009188006882905029, + "clip_ratio/region_mean": 0.03207885263953358, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 95.58823529411765, + "completions/mean_length": 126.76011029411765, + "completions/mean_terminated_length": 83.82843219532685, + "completions/min_length": 76.82352941176471, + "completions/min_terminated_length": 69.29411764705883, + "epoch": 0.5119643850862549, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.981594562530518, + "kl": 0.8844366371631622, + "learning_rate": 2.284657409515138e-06, + "loss": 0.256, + "num_tokens": 7472592.0, + "reward": 3.071570908322054, + "reward_std": 5.142256512361414, + "rewards/RewardModelWrapper/mean": 3.071570908322054, + "rewards/RewardModelWrapper/std": 5.772335641524371, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0240246270573698, + "clip_ratio/high_mean": 0.0240246270573698, + "clip_ratio/low_mean": 0.0069138467891025355, + "clip_ratio/low_min": 0.0069138467891025355, + "clip_ratio/region_mean": 0.03093847391428426, + "completions/clipped_ratio": 0.9448529411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 104.76470588235294, + "completions/mean_length": 125.68014705882354, + "completions/mean_terminated_length": 84.08382460650276, + "completions/min_length": 64.76470588235294, + "completions/min_terminated_length": 57.23529411764706, + "epoch": 0.5230940456316082, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.83440637588501, + "kl": 0.9639204081892967, + "learning_rate": 2.2675847939904393e-06, + "loss": 0.2686, + "num_tokens": 7637628.0, + "reward": 2.8617815410389618, + "reward_std": 5.505871576421401, + "rewards/RewardModelWrapper/mean": 2.8617815410389618, + "rewards/RewardModelWrapper/std": 5.944927496068618, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.02327756991609931, + "clip_ratio/high_mean": 0.02327756991609931, + "clip_ratio/low_mean": 0.011412573783891275, + "clip_ratio/low_min": 0.011412573783891275, + "clip_ratio/region_mean": 0.03469014364061877, + "completions/clipped_ratio": 0.9521484375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 112.875, + "completions/mean_length": 126.125, + "completions/mean_terminated_length": 91.77031326293945, + "completions/min_length": 70.3125, + "completions/min_terminated_length": 70.3125, + "epoch": 0.5342237061769616, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.203379154205322, + "kl": 1.0460551810264587, + "learning_rate": 2.250512178465741e-06, + "loss": 0.299, + "num_tokens": 7793988.0, + "reward": 3.6445817947387695, + "reward_std": 5.2445206344127655, + "rewards/RewardModelWrapper/mean": 3.6445817947387695, + "rewards/RewardModelWrapper/std": 5.754371851682663, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.025683601254131647, + "clip_ratio/high_mean": 0.025683601254131647, + "clip_ratio/low_mean": 0.007094714913982898, + "clip_ratio/low_min": 0.007094714913982898, + "clip_ratio/region_mean": 0.032778316254261884, + "completions/clipped_ratio": 0.9310661764705882, + "completions/max_length": 128.0, + "completions/max_terminated_length": 113.58823529411765, + "completions/mean_length": 125.19117647058823, + "completions/mean_terminated_length": 87.4491610807531, + "completions/min_length": 52.588235294117645, + "completions/min_terminated_length": 52.588235294117645, + "epoch": 0.5453533667223149, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.791234970092773, + "kl": 1.0378785887360573, + "learning_rate": 2.233439562941043e-06, + "loss": 0.2908, + "num_tokens": 7958828.0, + "reward": 2.4743111414067886, + "reward_std": 5.666090853074017, + "rewards/RewardModelWrapper/mean": 2.4743111414067886, + "rewards/RewardModelWrapper/std": 6.052795522353229, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.022869902374222876, + "clip_ratio/high_mean": 0.022869902374222876, + "clip_ratio/low_mean": 0.010338929877325426, + "clip_ratio/low_min": 0.010338929877325426, + "clip_ratio/region_mean": 0.03320883221458644, + "completions/clipped_ratio": 0.9448529411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 106.6470588235294, + "completions/mean_length": 125.87040441176471, + "completions/mean_terminated_length": 87.59656883688534, + "completions/min_length": 75.76470588235294, + "completions/min_terminated_length": 68.23529411764706, + "epoch": 0.5564830272676683, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.271297931671143, + "kl": 1.135116419494152, + "learning_rate": 2.2163669474163444e-06, + "loss": 0.3229, + "num_tokens": 8125183.0, + "reward": 2.681288887472714, + "reward_std": 5.512399000280044, + "rewards/RewardModelWrapper/mean": 2.681288887472714, + "rewards/RewardModelWrapper/std": 6.263462291044347, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.024373745566699655, + "clip_ratio/high_mean": 0.024373745566699655, + "clip_ratio/low_mean": 0.007875631948991213, + "clip_ratio/low_min": 0.007875631948991213, + "clip_ratio/region_mean": 0.032249377460684625, + "completions/clipped_ratio": 0.962890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 99.3125, + "completions/mean_length": 126.6416015625, + "completions/mean_terminated_length": 84.72916746139526, + "completions/min_length": 75.75, + "completions/min_terminated_length": 67.75, + "epoch": 0.5676126878130217, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.9515485763549805, + "kl": 1.1067718014121055, + "learning_rate": 2.199294331891646e-06, + "loss": 0.3174, + "num_tokens": 8282648.0, + "reward": 2.5410157814621925, + "reward_std": 5.60416579246521, + "rewards/RewardModelWrapper/mean": 2.5410157814621925, + "rewards/RewardModelWrapper/std": 6.249917358160019, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.021070915756281464, + "clip_ratio/high_mean": 0.021070915756281464, + "clip_ratio/low_mean": 0.010609990251832641, + "clip_ratio/low_min": 0.010609990251832641, + "clip_ratio/region_mean": 0.03168090590508655, + "completions/clipped_ratio": 0.9613970588235294, + "completions/max_length": 128.0, + "completions/max_terminated_length": 89.23529411764706, + "completions/mean_length": 126.42463235294117, + "completions/mean_terminated_length": 78.3034320158117, + "completions/min_length": 81.52941176470588, + "completions/min_terminated_length": 66.47058823529412, + "epoch": 0.5787423483583751, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.162291049957275, + "kl": 1.2047522097826004, + "learning_rate": 2.1822217163669474e-06, + "loss": 0.3462, + "num_tokens": 8449478.0, + "reward": 3.0924135095932903, + "reward_std": 5.470459377064424, + "rewards/RewardModelWrapper/mean": 3.0924135095932903, + "rewards/RewardModelWrapper/std": 6.024646282196045, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.02261253957170993, + "clip_ratio/high_mean": 0.02261253957170993, + "clip_ratio/low_mean": 0.008833104789373466, + "clip_ratio/low_min": 0.008833104789373466, + "clip_ratio/region_mean": 0.0314456443907693, + "completions/clipped_ratio": 0.9494485294117647, + "completions/max_length": 128.0, + "completions/max_terminated_length": 94.29411764705883, + "completions/mean_length": 125.73161764705883, + "completions/mean_terminated_length": 72.89117723352769, + "completions/min_length": 66.52941176470588, + "completions/min_terminated_length": 51.470588235294116, + "epoch": 0.5898720089037285, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.318300247192383, + "kl": 1.2395999401807785, + "learning_rate": 2.165149100842249e-06, + "loss": 0.3561, + "num_tokens": 8615194.0, + "reward": 2.5635701067307415, + "reward_std": 5.7780221490299, + "rewards/RewardModelWrapper/mean": 2.5635701067307415, + "rewards/RewardModelWrapper/std": 6.476823947008918, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.02310706490650773, + "clip_ratio/high_mean": 0.02310706490650773, + "clip_ratio/low_mean": 0.008465991305129136, + "clip_ratio/low_min": 0.008465991305129136, + "clip_ratio/region_mean": 0.03157305620610714, + "completions/clipped_ratio": 0.9462890625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 107.625, + "completions/mean_length": 125.8720703125, + "completions/mean_terminated_length": 88.43675756454468, + "completions/min_length": 65.875, + "completions/min_terminated_length": 65.875, + "epoch": 0.6010016694490818, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.8498454093933105, + "kl": 1.2743730303645133, + "learning_rate": 2.148076485317551e-06, + "loss": 0.3642, + "num_tokens": 8771759.0, + "reward": 3.0489635169506073, + "reward_std": 5.676127910614014, + "rewards/RewardModelWrapper/mean": 3.0489635169506073, + "rewards/RewardModelWrapper/std": 6.18413832783699, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.017379222289891912, + "clip_ratio/high_mean": 0.017379222289891912, + "clip_ratio/low_mean": 0.012123786294832826, + "clip_ratio/low_min": 0.012123786294832826, + "clip_ratio/region_mean": 0.029503008612664416, + "completions/clipped_ratio": 0.9613970588235294, + "completions/max_length": 128.0, + "completions/max_terminated_length": 87.82352941176471, + "completions/mean_length": 126.33823529411765, + "completions/mean_terminated_length": 70.85490282844094, + "completions/min_length": 75.94117647058823, + "completions/min_terminated_length": 53.35294117647059, + "epoch": 0.6121313299944352, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.328779697418213, + "kl": 1.5358505266904832, + "learning_rate": 2.1320282267243345e-06, + "loss": 0.44, + "num_tokens": 8938623.0, + "reward": 4.035378414041856, + "reward_std": 5.26687082122354, + "rewards/RewardModelWrapper/mean": 4.035378414041856, + "rewards/RewardModelWrapper/std": 6.010923722211053, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.02342768482863903, + "clip_ratio/high_mean": 0.02342768482863903, + "clip_ratio/low_mean": 0.007425281075702514, + "clip_ratio/low_min": 0.007425281075702514, + "clip_ratio/region_mean": 0.03085296612116508, + "completions/clipped_ratio": 0.9476102941176471, + "completions/max_length": 128.0, + "completions/max_terminated_length": 102.82352941176471, + "completions/mean_length": 125.83272058823529, + "completions/mean_terminated_length": 83.69166744456572, + "completions/min_length": 70.94117647058823, + "completions/min_terminated_length": 63.411764705882355, + "epoch": 0.6232609905397886, + "frac_reward_zero_std": 0.0, + "grad_norm": 13.94502067565918, + "kl": 1.3090015414357186, + "learning_rate": 2.114955611199636e-06, + "loss": 0.3749, + "num_tokens": 9104641.0, + "reward": 3.50168057049022, + "reward_std": 5.636927548576804, + "rewards/RewardModelWrapper/mean": 3.50168057049022, + "rewards/RewardModelWrapper/std": 6.223201779758229, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.023096702507464217, + "clip_ratio/high_mean": 0.023096702507464217, + "clip_ratio/low_mean": 0.01079344226163812, + "clip_ratio/low_min": 0.01079344226163812, + "clip_ratio/region_mean": 0.033890144524630156, + "completions/clipped_ratio": 0.947265625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 111.125, + "completions/mean_length": 126.0732421875, + "completions/mean_terminated_length": 91.28541803359985, + "completions/min_length": 68.25, + "completions/min_terminated_length": 68.25, + "epoch": 0.6343906510851419, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.740921974182129, + "kl": 1.2168986845016478, + "learning_rate": 2.0978829956749376e-06, + "loss": 0.3468, + "num_tokens": 9260988.0, + "reward": 2.903833270072937, + "reward_std": 5.634722024202347, + "rewards/RewardModelWrapper/mean": 2.903833270072937, + "rewards/RewardModelWrapper/std": 6.182769417762756, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.021131394968833775, + "clip_ratio/high_mean": 0.021131394968833775, + "clip_ratio/low_mean": 0.00905259191960795, + "clip_ratio/low_min": 0.00905259191960795, + "clip_ratio/region_mean": 0.03018398679094389, + "completions/clipped_ratio": 0.9549632352941176, + "completions/max_length": 128.0, + "completions/max_terminated_length": 96.94117647058823, + "completions/mean_length": 125.89613970588235, + "completions/mean_terminated_length": 78.88333488913143, + "completions/min_length": 67.82352941176471, + "completions/min_terminated_length": 60.294117647058826, + "epoch": 0.6455203116304953, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.610988616943359, + "kl": 1.332914224267006, + "learning_rate": 2.080810380150239e-06, + "loss": 0.3851, + "num_tokens": 9427003.0, + "reward": 3.4099216741674088, + "reward_std": 5.599381278542912, + "rewards/RewardModelWrapper/mean": 3.4099216741674088, + "rewards/RewardModelWrapper/std": 6.283486815059886, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.024977084384299814, + "clip_ratio/high_mean": 0.024977084384299814, + "clip_ratio/low_mean": 0.009850850635266396, + "clip_ratio/low_min": 0.009850850635266396, + "clip_ratio/region_mean": 0.034827935132198035, + "completions/clipped_ratio": 0.9430147058823529, + "completions/max_length": 128.0, + "completions/max_terminated_length": 105.11764705882354, + "completions/mean_length": 125.54503676470588, + "completions/mean_terminated_length": 85.04131810805377, + "completions/min_length": 64.41176470588235, + "completions/min_terminated_length": 64.41176470588235, + "epoch": 0.6566499721758486, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.673405170440674, + "kl": 1.34111887216568, + "learning_rate": 2.0637377646255406e-06, + "loss": 0.3787, + "num_tokens": 9592884.0, + "reward": 3.767064431134392, + "reward_std": 5.628603626700008, + "rewards/RewardModelWrapper/mean": 3.767064431134392, + "rewards/RewardModelWrapper/std": 6.238466964048498, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.019235485673416406, + "clip_ratio/high_mean": 0.019235485673416406, + "clip_ratio/low_mean": 0.008951259328168816, + "clip_ratio/low_min": 0.008951259328168816, + "clip_ratio/region_mean": 0.02818674497772008, + "completions/clipped_ratio": 0.958984375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 93.625, + "completions/mean_length": 126.48046875, + "completions/mean_terminated_length": 79.1166672706604, + "completions/min_length": 76.125, + "completions/min_terminated_length": 60.125, + "epoch": 0.667779632721202, + "frac_reward_zero_std": 0.0, + "grad_norm": Infinity, + "kl": 1.6551162710785865, + "learning_rate": 2.0470066014113363e-06, + "loss": 0.4809, + "num_tokens": 9750288.0, + "reward": 3.3632944226264954, + "reward_std": 5.644728451967239, + "rewards/RewardModelWrapper/mean": 3.3632944226264954, + "rewards/RewardModelWrapper/std": 6.475361466407776, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.021347561194561424, + "clip_ratio/high_mean": 0.021347561194561424, + "clip_ratio/low_mean": 0.012039180095889605, + "clip_ratio/low_min": 0.012039180095889605, + "clip_ratio/region_mean": 0.03338674116646871, + "completions/clipped_ratio": 0.9641544117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 84.29411764705883, + "completions/mean_length": 126.47426470588235, + "completions/mean_terminated_length": 67.78921643425437, + "completions/min_length": 73.29411764705883, + "completions/min_terminated_length": 50.705882352941174, + "epoch": 0.6789092932665554, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.413740158081055, + "kl": 1.3862884595990181, + "learning_rate": 2.030275438197132e-06, + "loss": 0.4017, + "num_tokens": 9917180.0, + "reward": 3.722391970017377, + "reward_std": 5.822299059699564, + "rewards/RewardModelWrapper/mean": 3.722391970017377, + "rewards/RewardModelWrapper/std": 6.463091822231517, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.018999405660433694, + "clip_ratio/high_mean": 0.018999405660433694, + "clip_ratio/low_mean": 0.010441597908793484, + "clip_ratio/low_min": 0.010441597908793484, + "clip_ratio/region_mean": 0.029441003524698316, + "completions/clipped_ratio": 0.9586397058823529, + "completions/max_length": 128.0, + "completions/max_terminated_length": 112.29411764705883, + "completions/mean_length": 126.4623161764706, + "completions/mean_terminated_length": 90.0686279745663, + "completions/min_length": 63.94117647058823, + "completions/min_terminated_length": 63.94117647058823, + "epoch": 0.6900389538119087, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.30611801147461, + "kl": 1.3920823442935943, + "learning_rate": 2.0132028226724335e-06, + "loss": 0.4035, + "num_tokens": 10083867.0, + "reward": 3.71955924875596, + "reward_std": 5.790389762205236, + "rewards/RewardModelWrapper/mean": 3.71955924875596, + "rewards/RewardModelWrapper/std": 6.5407993653241325, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.02239516925183125, + "clip_ratio/high_mean": 0.02239516925183125, + "clip_ratio/low_mean": 0.010940310020523612, + "clip_ratio/low_min": 0.010940310020523612, + "clip_ratio/region_mean": 0.03333547928952612, + "completions/clipped_ratio": 0.955078125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 99.875, + "completions/mean_length": 126.2353515625, + "completions/mean_terminated_length": 82.07812547683716, + "completions/min_length": 70.75, + "completions/min_terminated_length": 62.75, + "epoch": 0.7011686143572621, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.098145484924316, + "kl": 1.332285776436329, + "learning_rate": 1.996130207147735e-06, + "loss": 0.3818, + "num_tokens": 10240948.0, + "reward": 3.675293631851673, + "reward_std": 5.620851904153824, + "rewards/RewardModelWrapper/mean": 3.675293631851673, + "rewards/RewardModelWrapper/std": 6.339143455028534, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.017545219952007755, + "clip_ratio/high_mean": 0.017545219952007755, + "clip_ratio/low_mean": 0.006160206313361414, + "clip_ratio/low_min": 0.006160206313361414, + "clip_ratio/region_mean": 0.023705426228698343, + "completions/clipped_ratio": 0.9540441176470589, + "completions/max_length": 128.0, + "completions/max_terminated_length": 105.05882352941177, + "completions/mean_length": 126.03216911764706, + "completions/mean_terminated_length": 81.38235316557042, + "completions/min_length": 66.52941176470588, + "completions/min_terminated_length": 59.0, + "epoch": 0.7122982749026154, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.446596622467041, + "kl": 1.3602485132217408, + "learning_rate": 1.9790575916230366e-06, + "loss": 0.3915, + "num_tokens": 10407047.0, + "reward": 3.461222396177404, + "reward_std": 5.5388546831467576, + "rewards/RewardModelWrapper/mean": 3.461222396177404, + "rewards/RewardModelWrapper/std": 6.420014409457936, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.01795817382866517, + "clip_ratio/high_mean": 0.01795817382866517, + "clip_ratio/low_mean": 0.008432389081281143, + "clip_ratio/low_min": 0.008432389081281143, + "clip_ratio/region_mean": 0.026390562802553176, + "completions/clipped_ratio": 0.9669117647058824, + "completions/max_length": 128.0, + "completions/max_terminated_length": 78.94117647058823, + "completions/mean_length": 126.28033088235294, + "completions/mean_terminated_length": 61.84313740449793, + "completions/min_length": 68.05882352941177, + "completions/min_terminated_length": 45.470588235294116, + "epoch": 0.7234279354479688, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.92909812927246, + "kl": 1.4322891801595687, + "learning_rate": 1.9619849760983386e-06, + "loss": 0.4131, + "num_tokens": 10573736.0, + "reward": 3.6829915467430565, + "reward_std": 5.790671881507425, + "rewards/RewardModelWrapper/mean": 3.6829915467430565, + "rewards/RewardModelWrapper/std": 6.5448582032147575, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.01961003711214289, + "clip_ratio/high_mean": 0.01961003711214289, + "clip_ratio/low_mean": 0.010123618032957893, + "clip_ratio/low_min": 0.010123618032957893, + "clip_ratio/region_mean": 0.02973365513375029, + "completions/clipped_ratio": 0.9736328125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 89.75, + "completions/mean_length": 127.12890625, + "completions/mean_terminated_length": 78.04687547683716, + "completions/min_length": 88.3125, + "completions/min_terminated_length": 64.3125, + "epoch": 0.7345575959933222, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4893033504486084, + "kl": 1.3992696887254714, + "learning_rate": 1.94491236057364e-06, + "loss": 0.407, + "num_tokens": 10732252.0, + "reward": 4.159975051879883, + "reward_std": 5.596900701522827, + "rewards/RewardModelWrapper/mean": 4.159975051879883, + "rewards/RewardModelWrapper/std": 6.406121611595154, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.01751380935544148, + "clip_ratio/high_mean": 0.01751380935544148, + "clip_ratio/low_mean": 0.006701366908382625, + "clip_ratio/low_min": 0.006701366908382625, + "clip_ratio/region_mean": 0.02421517624054104, + "completions/clipped_ratio": 0.9632352941176471, + "completions/max_length": 128.0, + "completions/max_terminated_length": 89.11764705882354, + "completions/mean_length": 126.52113970588235, + "completions/mean_terminated_length": 75.78823538387523, + "completions/min_length": 73.58823529411765, + "completions/min_terminated_length": 58.529411764705884, + "epoch": 0.7456872565386756, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.024867057800293, + "kl": 1.48705244243145, + "learning_rate": 1.9278397450489416e-06, + "loss": 0.4302, + "num_tokens": 10898899.0, + "reward": 4.022455299601836, + "reward_std": 6.009893417358398, + "rewards/RewardModelWrapper/mean": 4.022455299601836, + "rewards/RewardModelWrapper/std": 6.55277754278744, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.020205343069974332, + "clip_ratio/high_mean": 0.020205343069974332, + "clip_ratio/low_mean": 0.008244332130707334, + "clip_ratio/low_min": 0.008244332130707334, + "clip_ratio/region_mean": 0.028449675207957624, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 83.52941176470588, + "completions/mean_length": 125.67463235294117, + "completions/mean_terminated_length": 65.16414619894589, + "completions/min_length": 67.76470588235294, + "completions/min_terminated_length": 45.1764705882353, + "epoch": 0.756816917084029, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.548982620239258, + "kl": 1.4358280056715012, + "learning_rate": 1.910767129524243e-06, + "loss": 0.4127, + "num_tokens": 11065097.0, + "reward": 3.6614036700304817, + "reward_std": 5.941182669471292, + "rewards/RewardModelWrapper/mean": 3.6614036700304817, + "rewards/RewardModelWrapper/std": 6.68101375243243, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.018692465843632818, + "clip_ratio/high_mean": 0.018692465843632818, + "clip_ratio/low_mean": 0.008573709986812901, + "clip_ratio/low_min": 0.008573709986812901, + "clip_ratio/region_mean": 0.02726617576321587, + "completions/clipped_ratio": 0.9541015625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 98.8125, + "completions/mean_length": 126.1083984375, + "completions/mean_terminated_length": 77.7172622680664, + "completions/min_length": 63.125, + "completions/min_terminated_length": 55.125, + "epoch": 0.7679465776293823, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.6161601543426514, + "kl": 1.4051894819736481, + "learning_rate": 1.8936945139995447e-06, + "loss": 0.4055, + "num_tokens": 11221336.0, + "reward": 2.737824946641922, + "reward_std": 6.139679282903671, + "rewards/RewardModelWrapper/mean": 2.737824946641922, + "rewards/RewardModelWrapper/std": 6.881059348583221, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.019909201117698103, + "clip_ratio/high_mean": 0.019909201117698103, + "clip_ratio/low_mean": 0.009944785697734914, + "clip_ratio/low_min": 0.009944785697734914, + "clip_ratio/region_mean": 0.029853986804373563, + "completions/clipped_ratio": 0.9733455882352942, + "completions/max_length": 128.0, + "completions/max_terminated_length": 91.88235294117646, + "completions/mean_length": 127.20036764705883, + "completions/mean_terminated_length": 86.0049025591682, + "completions/min_length": 94.52941176470588, + "completions/min_terminated_length": 79.47058823529412, + "epoch": 0.7790762381747357, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.78251314163208, + "kl": 1.4502297604084016, + "learning_rate": 1.8766218984748462e-06, + "loss": 0.4266, + "num_tokens": 11389018.0, + "reward": 4.499273047727697, + "reward_std": 5.489500326268813, + "rewards/RewardModelWrapper/mean": 4.499273047727697, + "rewards/RewardModelWrapper/std": 6.2598629839280076, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.01706919132906478, + "clip_ratio/high_mean": 0.01706919132906478, + "clip_ratio/low_mean": 0.007432717043848243, + "clip_ratio/low_min": 0.007432717043848243, + "clip_ratio/region_mean": 0.024501908438978717, + "completions/clipped_ratio": 0.9568014705882353, + "completions/max_length": 128.0, + "completions/max_terminated_length": 91.47058823529412, + "completions/mean_length": 126.015625, + "completions/mean_terminated_length": 74.68823646096622, + "completions/min_length": 61.76470588235294, + "completions/min_terminated_length": 54.23529411764706, + "epoch": 0.7902058987200891, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.431344509124756, + "kl": 1.4172208327054978, + "learning_rate": 1.859549282950148e-06, + "loss": 0.4053, + "num_tokens": 11555371.0, + "reward": 3.9203204547657684, + "reward_std": 5.879987856921027, + "rewards/RewardModelWrapper/mean": 3.9203204547657684, + "rewards/RewardModelWrapper/std": 6.654794917387121, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.017978638106724246, + "clip_ratio/high_mean": 0.017978638106724246, + "clip_ratio/low_mean": 0.008542120530910325, + "clip_ratio/low_min": 0.008542120530910325, + "clip_ratio/region_mean": 0.02652075860532932, + "completions/clipped_ratio": 0.95703125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 105.0625, + "completions/mean_length": 126.1298828125, + "completions/mean_terminated_length": 88.37500047683716, + "completions/min_length": 68.9375, + "completions/min_terminated_length": 68.9375, + "epoch": 0.8013355592654424, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.334597110748291, + "kl": 1.357377045750618, + "learning_rate": 1.8424766674254495e-06, + "loss": 0.39, + "num_tokens": 11712544.0, + "reward": 3.334804505109787, + "reward_std": 6.004520118236542, + "rewards/RewardModelWrapper/mean": 3.334804505109787, + "rewards/RewardModelWrapper/std": 6.608620345592499, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.01815531796310097, + "clip_ratio/high_mean": 0.01815531796310097, + "clip_ratio/low_mean": 0.00551853927434422, + "clip_ratio/low_min": 0.00551853927434422, + "clip_ratio/region_mean": 0.023673857206013053, + "completions/clipped_ratio": 0.9586397058823529, + "completions/max_length": 128.0, + "completions/max_terminated_length": 106.0, + "completions/mean_length": 126.57444852941177, + "completions/mean_terminated_length": 91.43627570657169, + "completions/min_length": 73.05882352941177, + "completions/min_terminated_length": 73.05882352941177, + "epoch": 0.8124652198107958, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.32098913192749, + "kl": 1.3787575218081474, + "learning_rate": 1.825404051900751e-06, + "loss": 0.406, + "num_tokens": 11879329.0, + "reward": 4.733595371246338, + "reward_std": 5.286141087027157, + "rewards/RewardModelWrapper/mean": 4.733595371246338, + "rewards/RewardModelWrapper/std": 6.089175813338336, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.018292159989941867, + "clip_ratio/high_mean": 0.018292159989941867, + "clip_ratio/low_mean": 0.00964461057272274, + "clip_ratio/low_min": 0.00964461057272274, + "clip_ratio/region_mean": 0.027936770617961883, + "completions/clipped_ratio": 0.9347426470588235, + "completions/max_length": 128.0, + "completions/max_terminated_length": 100.94117647058823, + "completions/mean_length": 125.18014705882354, + "completions/mean_terminated_length": 79.63531673655791, + "completions/min_length": 62.8235294117647, + "completions/min_terminated_length": 55.294117647058826, + "epoch": 0.8235948803561491, + "frac_reward_zero_std": 0.0, + "grad_norm": 11.81167984008789, + "kl": 1.3568527114391327, + "learning_rate": 1.8083314363760528e-06, + "loss": 0.3856, + "num_tokens": 12044285.0, + "reward": 3.853144645690918, + "reward_std": 5.8185105744530174, + "rewards/RewardModelWrapper/mean": 3.853144645690918, + "rewards/RewardModelWrapper/std": 6.648196416742661, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.020421573969069868, + "clip_ratio/high_mean": 0.020421573969069868, + "clip_ratio/low_mean": 0.006358395353017841, + "clip_ratio/low_min": 0.006358395353017841, + "clip_ratio/region_mean": 0.02677996931830421, + "completions/clipped_ratio": 0.966796875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 85.4375, + "completions/mean_length": 126.6142578125, + "completions/mean_terminated_length": 75.74791765213013, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "epoch": 0.8347245409015025, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.919222593307495, + "kl": 1.4361850446462632, + "learning_rate": 1.7912588208513545e-06, + "loss": 0.4195, + "num_tokens": 12201530.0, + "reward": 4.583240419626236, + "reward_std": 5.661596119403839, + "rewards/RewardModelWrapper/mean": 4.583240419626236, + "rewards/RewardModelWrapper/std": 6.355997741222382, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.01899803020292893, + "clip_ratio/high_mean": 0.01899803020292893, + "clip_ratio/low_mean": 0.005853212493821047, + "clip_ratio/low_min": 0.005853212493821047, + "clip_ratio/region_mean": 0.02485124268569052, + "completions/clipped_ratio": 0.9485294117647058, + "completions/max_length": 128.0, + "completions/max_terminated_length": 106.41176470588235, + "completions/mean_length": 125.8529411764706, + "completions/mean_terminated_length": 86.47465066348805, + "completions/min_length": 63.11764705882353, + "completions/min_terminated_length": 63.11764705882353, + "epoch": 0.8458542014468559, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.776216983795166, + "kl": 1.4694108253717422, + "learning_rate": 1.7741862053266563e-06, + "loss": 0.4258, + "num_tokens": 12367226.0, + "reward": 4.691084188573501, + "reward_std": 5.392301559448242, + "rewards/RewardModelWrapper/mean": 4.691084188573501, + "rewards/RewardModelWrapper/std": 6.076854313121123, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.020900118886493145, + "clip_ratio/high_mean": 0.020900118886493145, + "clip_ratio/low_mean": 0.008081750934943557, + "clip_ratio/low_min": 0.008081750934943557, + "clip_ratio/region_mean": 0.028981869909912347, + "completions/clipped_ratio": 0.9733455882352942, + "completions/max_length": 128.0, + "completions/max_terminated_length": 76.76470588235294, + "completions/mean_length": 126.63051470588235, + "completions/mean_terminated_length": 62.98235298605526, + "completions/min_length": 81.41176470588235, + "completions/min_terminated_length": 51.294117647058826, + "epoch": 0.8569838619922092, + "frac_reward_zero_std": 0.0, + "grad_norm": 6.310102462768555, + "kl": 1.3774869224429132, + "learning_rate": 1.7571135898019578e-06, + "loss": 0.398, + "num_tokens": 12534040.0, + "reward": 3.871620360542746, + "reward_std": 5.696767147849588, + "rewards/RewardModelWrapper/mean": 3.871620360542746, + "rewards/RewardModelWrapper/std": 6.582426996792064, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.021299479028675704, + "clip_ratio/high_mean": 0.021299479028675704, + "clip_ratio/low_mean": 0.0075305427008424885, + "clip_ratio/low_min": 0.0075305427008424885, + "clip_ratio/region_mean": 0.028830021731555463, + "completions/clipped_ratio": 0.9619140625, + "completions/max_length": 128.0, + "completions/max_terminated_length": 85.8125, + "completions/mean_length": 126.1943359375, + "completions/mean_terminated_length": 71.39270901679993, + "completions/min_length": 71.8125, + "completions/min_terminated_length": 55.8125, + "epoch": 0.8681135225375626, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.930108547210693, + "kl": 1.3843135032057763, + "learning_rate": 1.7400409742772593e-06, + "loss": 0.3964, + "num_tokens": 12690615.0, + "reward": 3.086591437458992, + "reward_std": 6.208359390497208, + "rewards/RewardModelWrapper/mean": 3.086591437458992, + "rewards/RewardModelWrapper/std": 6.8491051197052, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.018020967768970875, + "clip_ratio/high_mean": 0.018020967768970875, + "clip_ratio/low_mean": 0.006037966601434163, + "clip_ratio/low_min": 0.006037966601434163, + "clip_ratio/region_mean": 0.024058934384956956, + "completions/clipped_ratio": 0.953125, + "completions/max_length": 128.0, + "completions/max_terminated_length": 103.76470588235294, + "completions/mean_length": 126.11305147058823, + "completions/mean_terminated_length": 84.38186331356273, + "completions/min_length": 66.11764705882354, + "completions/min_terminated_length": 58.588235294117645, + "epoch": 0.8792431830829159, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.74282169342041, + "kl": 1.432164865732193, + "learning_rate": 1.7229683587525609e-06, + "loss": 0.4134, + "num_tokens": 12857386.0, + "reward": 3.5356551899629483, + "reward_std": 5.877786804648006, + "rewards/RewardModelWrapper/mean": 3.5356551899629483, + "rewards/RewardModelWrapper/std": 6.742880484637092, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.016392124033300207, + "clip_ratio/high_mean": 0.016392124033300207, + "clip_ratio/low_mean": 0.00735437709663529, + "clip_ratio/low_min": 0.00735437709663529, + "clip_ratio/region_mean": 0.02374650107929483, + "completions/clipped_ratio": 0.9669117647058824, + "completions/max_length": 128.0, + "completions/max_terminated_length": 95.0, + "completions/mean_length": 126.86764705882354, + "completions/mean_terminated_length": 82.9313735961914, + "completions/min_length": 84.29411764705883, + "completions/min_terminated_length": 69.23529411764706, + "epoch": 0.8903728436282693, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.2612786293029785, + "kl": 1.4968037492036819, + "learning_rate": 1.7058957432278626e-06, + "loss": 0.4371, + "num_tokens": 13025050.0, + "reward": 3.9833039676441864, + "reward_std": 5.820403575897217, + "rewards/RewardModelWrapper/mean": 3.9833039676441864, + "rewards/RewardModelWrapper/std": 6.59747979220222, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.013542763022705913, + "clip_ratio/high_mean": 0.013542763022705913, + "clip_ratio/low_mean": 0.007844352710526437, + "clip_ratio/low_min": 0.007844352710526437, + "clip_ratio/region_mean": 0.021387115789111705, + "completions/clipped_ratio": 0.9755859375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 82.0, + "completions/mean_length": 127.052734375, + "completions/mean_terminated_length": 69.609375, + "completions/min_length": 89.8125, + "completions/min_terminated_length": 57.8125, + "epoch": 0.9015025041736227, + "frac_reward_zero_std": 0.0, + "grad_norm": 9.22656536102295, + "kl": 1.486895147562027, + "learning_rate": 1.6888231277031642e-06, + "loss": 0.4339, + "num_tokens": 13182896.0, + "reward": 3.8604883551597595, + "reward_std": 5.920006081461906, + "rewards/RewardModelWrapper/mean": 3.8604883551597595, + "rewards/RewardModelWrapper/std": 6.682152062654495, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.017551230599638076, + "clip_ratio/high_mean": 0.017551230599638076, + "clip_ratio/low_mean": 0.006257881603378337, + "clip_ratio/low_min": 0.006257881603378337, + "clip_ratio/region_mean": 0.023809112217277287, + "completions/clipped_ratio": 0.9604779411764706, + "completions/max_length": 128.0, + "completions/max_terminated_length": 103.47058823529412, + "completions/mean_length": 126.76011029411765, + "completions/mean_terminated_length": 90.72815165800206, + "completions/min_length": 81.47058823529412, + "completions/min_terminated_length": 73.94117647058823, + "epoch": 0.9126321647189761, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.342564105987549, + "kl": 1.479159579873085, + "learning_rate": 1.6717505121784657e-06, + "loss": 0.434, + "num_tokens": 13349539.0, + "reward": 3.891232869204353, + "reward_std": 5.906516776365392, + "rewards/RewardModelWrapper/mean": 3.891232869204353, + "rewards/RewardModelWrapper/std": 6.87522164513083, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.017731820455519482, + "clip_ratio/high_mean": 0.017731820455519482, + "clip_ratio/low_mean": 0.0037902081329957583, + "clip_ratio/low_min": 0.0037902081329957583, + "clip_ratio/region_mean": 0.021522028532344847, + "completions/clipped_ratio": 0.9632352941176471, + "completions/max_length": 128.0, + "completions/max_terminated_length": 102.11764705882354, + "completions/mean_length": 126.3373161764706, + "completions/mean_terminated_length": 80.36274584601907, + "completions/min_length": 61.588235294117645, + "completions/min_terminated_length": 54.05882352941177, + "epoch": 0.9237618252643295, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.095427513122559, + "kl": 1.5771301573514938, + "learning_rate": 1.6546778966537674e-06, + "loss": 0.4627, + "num_tokens": 13516058.0, + "reward": 4.4532030890969665, + "reward_std": 5.776824221891515, + "rewards/RewardModelWrapper/mean": 4.4532030890969665, + "rewards/RewardModelWrapper/std": 6.367258969475241, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.018110398813150824, + "clip_ratio/high_mean": 0.018110398813150824, + "clip_ratio/low_mean": 0.006745649516233243, + "clip_ratio/low_min": 0.006745649516233243, + "clip_ratio/region_mean": 0.024856048391666264, + "completions/clipped_ratio": 0.9609375, + "completions/max_length": 128.0, + "completions/max_terminated_length": 93.9375, + "completions/mean_length": 126.3310546875, + "completions/mean_terminated_length": 81.33363127708435, + "completions/min_length": 74.75, + "completions/min_terminated_length": 66.75, + "epoch": 0.9348914858096828, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.7098264694213867, + "kl": 1.4337137299776077, + "learning_rate": 1.637605281129069e-06, + "loss": 0.4139, + "num_tokens": 13673549.0, + "reward": 3.717156395316124, + "reward_std": 5.887754291296005, + "rewards/RewardModelWrapper/mean": 3.717156395316124, + "rewards/RewardModelWrapper/std": 6.542896807193756, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.01609679988003336, + "clip_ratio/high_mean": 0.01609679988003336, + "clip_ratio/low_mean": 0.006251108425203711, + "clip_ratio/low_min": 0.006251108425203711, + "clip_ratio/region_mean": 0.022347908235387876, + "completions/clipped_ratio": 0.9466911764705882, + "completions/max_length": 128.0, + "completions/max_terminated_length": 108.3529411764706, + "completions/mean_length": 126.234375, + "completions/mean_terminated_length": 89.88039308435776, + "completions/min_length": 68.6470588235294, + "completions/min_terminated_length": 61.11764705882353, + "epoch": 0.9460211463550362, + "frac_reward_zero_std": 0.0, + "grad_norm": 4.461524963378906, + "kl": 1.4435530692338943, + "learning_rate": 1.6205326656043705e-06, + "loss": 0.4174, + "num_tokens": 13839820.0, + "reward": 3.5667920813840976, + "reward_std": 5.679576621336095, + "rewards/RewardModelWrapper/mean": 3.5667920813840976, + "rewards/RewardModelWrapper/std": 6.743907311383416, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.01591621272964403, + "clip_ratio/high_mean": 0.01591621272964403, + "clip_ratio/low_mean": 0.005297647488187067, + "clip_ratio/low_min": 0.005297647488187067, + "clip_ratio/region_mean": 0.021213860225398094, + "completions/clipped_ratio": 0.9669117647058824, + "completions/max_length": 128.0, + "completions/max_terminated_length": 100.76470588235294, + "completions/mean_length": 126.63602941176471, + "completions/mean_terminated_length": 87.36666780359604, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.9571508069003896, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.245085716247559, + "kl": 1.4910035210847854, + "learning_rate": 1.603460050079672e-06, + "loss": 0.4312, + "num_tokens": 14007144.0, + "reward": 4.034057981827679, + "reward_std": 5.743304505067713, + "rewards/RewardModelWrapper/mean": 4.034057981827679, + "rewards/RewardModelWrapper/std": 6.6319817094241875, + "step": 4300 + }, + { + "clip_ratio/high_max": 0.0162072420923505, + "clip_ratio/high_mean": 0.0162072420923505, + "clip_ratio/low_mean": 0.00646918074140558, + "clip_ratio/low_min": 0.00646918074140558, + "clip_ratio/region_mean": 0.022676422880031168, + "completions/clipped_ratio": 0.9560546875, + "completions/max_length": 128.0, + "completions/max_terminated_length": 97.0, + "completions/mean_length": 126.1572265625, + "completions/mean_terminated_length": 80.61093807220459, + "completions/min_length": 69.625, + "completions/min_terminated_length": 61.625, + "epoch": 0.9682804674457429, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.9271068572998047, + "kl": 1.5564635121822357, + "learning_rate": 1.5863874345549738e-06, + "loss": 0.4481, + "num_tokens": 14163497.0, + "reward": 4.418118596076965, + "reward_std": 5.663649529218674, + "rewards/RewardModelWrapper/mean": 4.418118596076965, + "rewards/RewardModelWrapper/std": 6.5488221347332, + "step": 4350 + }, + { + "clip_ratio/high_max": 0.015229720452334733, + "clip_ratio/high_mean": 0.015229720452334733, + "clip_ratio/low_mean": 0.005334880515874829, + "clip_ratio/low_min": 0.005334880515874829, + "clip_ratio/region_mean": 0.020564600981306285, + "completions/clipped_ratio": 0.9411764705882353, + "completions/max_length": 128.0, + "completions/max_terminated_length": 101.0, + "completions/mean_length": 124.7876838235294, + "completions/mean_terminated_length": 74.75882474113914, + "completions/min_length": 50.35294117647059, + "completions/min_terminated_length": 50.35294117647059, + "epoch": 0.9794101279910963, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.419699192047119, + "kl": 1.4550988680124284, + "learning_rate": 1.5693148190302755e-06, + "loss": 0.4187, + "num_tokens": 14328034.0, + "reward": 3.752244500552907, + "reward_std": 5.818949124392341, + "rewards/RewardModelWrapper/mean": 3.752244500552907, + "rewards/RewardModelWrapper/std": 6.797629524679745, + "step": 4400 + }, + { + "clip_ratio/high_max": 0.018021058345912024, + "clip_ratio/high_mean": 0.018021058345912024, + "clip_ratio/low_mean": 0.0030438171711284667, + "clip_ratio/low_min": 0.0030438171711284667, + "clip_ratio/region_mean": 0.021064875536831097, + "completions/clipped_ratio": 0.9549632352941176, + "completions/max_length": 128.0, + "completions/max_terminated_length": 96.23529411764706, + "completions/mean_length": 126.15900735294117, + "completions/mean_terminated_length": 79.46218647676356, + "completions/min_length": 68.05882352941177, + "completions/min_terminated_length": 60.529411764705884, + "epoch": 0.9905397885364496, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.946506977081299, + "kl": 1.4544246417284012, + "learning_rate": 1.5522422035055773e-06, + "loss": 0.4204, + "num_tokens": 14494631.0, + "reward": 3.687691057429594, + "reward_std": 5.869795238270479, + "rewards/RewardModelWrapper/mean": 3.687691057429594, + "rewards/RewardModelWrapper/std": 6.839460316826315, + "step": 4450 + }, + { + "epoch": 0.9996661101836394, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.9438291139240507, + "eval_completions/max_length": 128.0, + "eval_completions/max_terminated_length": 53.129746835443036, + "eval_completions/mean_length": 125.40446993670886, + "eval_completions/mean_terminated_length": 48.271835556513146, + "eval_completions/min_length": 96.05696202531645, + "eval_completions/min_terminated_length": 43.39873417721519, + "eval_frac_reward_zero_std": 0.0, + "eval_kl": 1.4363023352019395, + "eval_loss": 0.41118884086608887, + "eval_num_tokens": 14622004.0, + "eval_reward": 3.463206129738047, + "eval_reward_std": 6.040495253722124, + "eval_rewards/RewardModelWrapper/mean": 3.463206129738047, + "eval_rewards/RewardModelWrapper/std": 6.557550964476187, + "eval_runtime": 1430.6223, + "eval_samples_per_second": 0.441, + "eval_steps_per_second": 0.028, + "step": 4491 + } + ], + "logging_steps": 50, + "max_steps": 8986, + "num_input_tokens_seen": 14622004, + "num_train_epochs": 2, + "save_steps": 2696, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}