diff --git "a/checkpoint-100/trainer_state.json" "b/checkpoint-100/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-100/trainer_state.json" @@ -0,0 +1,3434 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8565310492505354, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 425.75, + "completions/max_terminated_length": 407.25, + "completions/mean_length": 224.9140625, + "completions/mean_terminated_length": 222.6020851135254, + "completions/min_length": 100.75, + "completions/min_terminated_length": 100.75, + "entropy": 0.3943025507032871, + "epoch": 0.008565310492505354, + "frac_reward_zero_std": 0.859375, + "grad_norm": 2.234375, + "learning_rate": 0.0, + "loss": -0.0682, + "num_tokens": 40707.0, + "reward": 0.109375, + "reward_std": 0.13258252362720668, + "rewards/correctness_reward_func/mean": 0.046875, + "rewards/correctness_reward_func/std": 0.1875, + "rewards/int_reward_func/mean": 0.015625, + "rewards/int_reward_func/std": 0.0625, + "rewards/soft_format_reward_func/mean": 0.015625, + "rewards/soft_format_reward_func/std": 0.05259781517088413, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.03125, + "rewards/xmlcount_reward_func/std": 0.09922334365546703, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 454.625, + "completions/max_terminated_length": 431.875, + "completions/mean_length": 240.2421875, + "completions/mean_terminated_length": 236.11719131469727, + "completions/min_length": 120.125, + "completions/min_terminated_length": 120.125, + "entropy": 0.4173264354467392, + "epoch": 0.017130620985010708, + "frac_reward_zero_std": 0.625, + "grad_norm": 4.46875, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.0373, + "num_tokens": 84076.0, + "reward": 0.2431640625, + "reward_std": 0.30797815788537264, + "rewards/correctness_reward_func/mean": 0.078125, + "rewards/correctness_reward_func/std": 0.3125, + "rewards/int_reward_func/mean": 0.0390625, + "rewards/int_reward_func/std": 0.13456955552101135, + "rewards/soft_format_reward_func/mean": 0.046875, + "rewards/soft_format_reward_func/std": 0.12433474138379097, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.0791015625, + "rewards/xmlcount_reward_func/std": 0.1614172589033842, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 379.375, + "completions/max_terminated_length": 372.125, + "completions/mean_length": 209.1484375, + "completions/mean_terminated_length": 207.07291793823242, + "completions/min_length": 90.5, + "completions/min_terminated_length": 90.5, + "entropy": 0.39367850497365, + "epoch": 0.02569593147751606, + "frac_reward_zero_std": 0.796875, + "grad_norm": 3.0, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0145, + "num_tokens": 122753.0, + "reward": 0.1796875, + "reward_std": 0.17677669739350677, + "rewards/correctness_reward_func/mean": 0.078125, + "rewards/correctness_reward_func/std": 0.2257782220840454, + "rewards/int_reward_func/mean": 0.0234375, + "rewards/int_reward_func/std": 0.07206955552101135, + "rewards/soft_format_reward_func/mean": 0.02734375, + "rewards/soft_format_reward_func/std": 0.07779237069189548, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.05078125, + "rewards/xmlcount_reward_func/std": 0.11597390845417976, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.75, + "completions/max_terminated_length": 381.75, + "completions/mean_length": 205.140625, + "completions/mean_terminated_length": 205.140625, + "completions/min_length": 82.5, + "completions/min_terminated_length": 82.5, + "entropy": 0.42507942393422127, + "epoch": 0.034261241970021415, + "frac_reward_zero_std": 0.609375, + "grad_norm": 4.5, + "learning_rate": 5e-06, + "loss": -0.1276, + "num_tokens": 161051.0, + "reward": 0.2958984375, + "reward_std": 0.3052160106599331, + "rewards/correctness_reward_func/mean": 0.078125, + "rewards/correctness_reward_func/std": 0.2257782220840454, + "rewards/int_reward_func/mean": 0.03125, + "rewards/int_reward_func/std": 0.09341737069189548, + "rewards/soft_format_reward_func/mean": 0.07421875, + "rewards/soft_format_reward_func/std": 0.17638970352709293, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.1123046875, + "rewards/xmlcount_reward_func/std": 0.1925698984414339, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.25, + "completions/max_terminated_length": 383.25, + "completions/mean_length": 207.828125, + "completions/mean_terminated_length": 207.828125, + "completions/min_length": 74.5, + "completions/min_terminated_length": 74.5, + "entropy": 0.4761287160217762, + "epoch": 0.042826552462526764, + "frac_reward_zero_std": 0.34375, + "grad_norm": 6.28125, + "learning_rate": 6.666666666666667e-06, + "loss": -0.0135, + "num_tokens": 199521.0, + "reward": 1.1474609375, + "reward_std": 0.5952402763068676, + "rewards/correctness_reward_func/mean": 0.359375, + "rewards/correctness_reward_func/std": 0.7196519374847412, + "rewards/int_reward_func/mean": 0.12109375, + "rewards/int_reward_func/std": 0.20104984939098358, + "rewards/soft_format_reward_func/mean": 0.3203125, + "rewards/soft_format_reward_func/std": 0.23448428697884083, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.3466796875, + "rewards/xmlcount_reward_func/std": 0.20816493593156338, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 434.5, + "completions/max_terminated_length": 387.375, + "completions/mean_length": 231.28125, + "completions/mean_terminated_length": 226.61354446411133, + "completions/min_length": 118.25, + "completions/min_terminated_length": 118.25, + "entropy": 0.42499926686286926, + "epoch": 0.05139186295503212, + "frac_reward_zero_std": 0.578125, + "grad_norm": 3.265625, + "learning_rate": 8.333333333333334e-06, + "loss": 0.1364, + "num_tokens": 241131.0, + "reward": 1.4375, + "reward_std": 0.4833737723529339, + "rewards/correctness_reward_func/mean": 0.375, + "rewards/correctness_reward_func/std": 0.741176463663578, + "rewards/int_reward_func/mean": 0.13671875, + "rewards/int_reward_func/std": 0.2227986976504326, + "rewards/soft_format_reward_func/mean": 0.44921875, + "rewards/soft_format_reward_func/std": 0.1270910371094942, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.4765625, + "rewards/xmlcount_reward_func/std": 0.07140547037124634, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 453.0, + "completions/max_terminated_length": 433.125, + "completions/mean_length": 238.6796875, + "completions/mean_terminated_length": 234.18125343322754, + "completions/min_length": 96.375, + "completions/min_terminated_length": 96.375, + "entropy": 0.4198453910648823, + "epoch": 0.059957173447537475, + "frac_reward_zero_std": 0.421875, + "grad_norm": 4.15625, + "learning_rate": 1e-05, + "loss": 0.1899, + "num_tokens": 283686.0, + "reward": 1.7509765625, + "reward_std": 0.7305849269032478, + "rewards/correctness_reward_func/mean": 0.609375, + "rewards/correctness_reward_func/std": 0.9355916231870651, + "rewards/int_reward_func/mean": 0.1953125, + "rewards/int_reward_func/std": 0.2459700107574463, + "rewards/soft_format_reward_func/mean": 0.4609375, + "rewards/soft_format_reward_func/std": 0.1128891110420227, + "rewards/strict_format_reward_func/mean": 0.00390625, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.4814453125, + "rewards/xmlcount_reward_func/std": 0.046708236914128065, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.375, + "completions/max_terminated_length": 364.375, + "completions/mean_length": 220.34375, + "completions/mean_terminated_length": 220.34375, + "completions/min_length": 98.875, + "completions/min_terminated_length": 98.875, + "entropy": 0.42923642322421074, + "epoch": 0.06852248394004283, + "frac_reward_zero_std": 0.46875, + "grad_norm": 4.59375, + "learning_rate": 1.1666666666666668e-05, + "loss": -0.0768, + "num_tokens": 324156.0, + "reward": 2.1044921875, + "reward_std": 0.6339101828634739, + "rewards/correctness_reward_func/mean": 0.859375, + "rewards/correctness_reward_func/std": 0.9749292060732841, + "rewards/int_reward_func/mean": 0.3046875, + "rewards/int_reward_func/std": 0.23351078107953072, + "rewards/soft_format_reward_func/mean": 0.46484375, + "rewards/soft_format_reward_func/std": 0.08251741342246532, + "rewards/strict_format_reward_func/mean": 0.00390625, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.4716796875, + "rewards/xmlcount_reward_func/std": 0.06621513469144702, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 376.5, + "completions/max_terminated_length": 361.25, + "completions/mean_length": 216.578125, + "completions/mean_terminated_length": 212.43080520629883, + "completions/min_length": 96.875, + "completions/min_terminated_length": 96.875, + "entropy": 0.3928321301937103, + "epoch": 0.07708779443254818, + "frac_reward_zero_std": 0.59375, + "grad_norm": 3.5, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.0392, + "num_tokens": 363864.0, + "reward": 2.5830078125, + "reward_std": 0.40741503052413464, + "rewards/correctness_reward_func/mean": 1.203125, + "rewards/correctness_reward_func/std": 0.9139788597822189, + "rewards/int_reward_func/mean": 0.42578125, + "rewards/int_reward_func/std": 0.1317095011472702, + "rewards/soft_format_reward_func/mean": 0.46875, + "rewards/soft_format_reward_func/std": 0.09341737069189548, + "rewards/strict_format_reward_func/mean": 0.0078125, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.4775390625, + "rewards/xmlcount_reward_func/std": 0.06064485618844628, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 403.75, + "completions/max_terminated_length": 386.625, + "completions/mean_length": 210.6640625, + "completions/mean_terminated_length": 203.55908012390137, + "completions/min_length": 75.875, + "completions/min_terminated_length": 75.875, + "entropy": 0.3184557221829891, + "epoch": 0.08565310492505353, + "frac_reward_zero_std": 0.421875, + "grad_norm": 5.125, + "learning_rate": 1.5000000000000002e-05, + "loss": -0.0025, + "num_tokens": 402647.0, + "reward": 2.7490234375, + "reward_std": 0.7140121199190617, + "rewards/correctness_reward_func/mean": 1.375, + "rewards/correctness_reward_func/std": 0.9443820938467979, + "rewards/int_reward_func/mean": 0.453125, + "rewards/int_reward_func/std": 0.12136822193861008, + "rewards/soft_format_reward_func/mean": 0.4609375, + "rewards/soft_format_reward_func/std": 0.10298692621290684, + "rewards/strict_format_reward_func/mean": 0.03125, + "rewards/strict_format_reward_func/std": 0.10519563034176826, + "rewards/xmlcount_reward_func/mean": 0.4287109375, + "rewards/xmlcount_reward_func/std": 0.144282141700387, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 434.75, + "completions/max_terminated_length": 372.875, + "completions/mean_length": 238.5390625, + "completions/mean_terminated_length": 229.2265682220459, + "completions/min_length": 88.375, + "completions/min_terminated_length": 88.375, + "entropy": 0.3128885291516781, + "epoch": 0.09421841541755889, + "frac_reward_zero_std": 0.359375, + "grad_norm": 5.78125, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.2439, + "num_tokens": 445266.0, + "reward": 3.0341796875, + "reward_std": 0.5344732906669378, + "rewards/correctness_reward_func/mean": 1.421875, + "rewards/correctness_reward_func/std": 0.8460541293025017, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.09914018586277962, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.05259781517088413, + "rewards/strict_format_reward_func/mean": 0.203125, + "rewards/strict_format_reward_func/std": 0.2389280553907156, + "rewards/xmlcount_reward_func/mean": 0.4599609375, + "rewards/xmlcount_reward_func/std": 0.10840688459575176, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 373.125, + "completions/max_terminated_length": 353.5, + "completions/mean_length": 218.9140625, + "completions/mean_terminated_length": 216.4234390258789, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.2998475953936577, + "epoch": 0.10278372591006424, + "frac_reward_zero_std": 0.71875, + "grad_norm": 4.1875, + "learning_rate": 1.8333333333333333e-05, + "loss": -0.1385, + "num_tokens": 485929.0, + "reward": 3.484375, + "reward_std": 0.38393688201904297, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.7323416471481323, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.06877040676772594, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.025194555521011353, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.09341737069189548, + "rewards/xmlcount_reward_func/mean": 0.46484375, + "rewards/xmlcount_reward_func/std": 0.09572842810302973, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 370.125, + "completions/max_terminated_length": 333.5, + "completions/mean_length": 204.46875, + "completions/mean_terminated_length": 201.68333435058594, + "completions/min_length": 95.875, + "completions/min_terminated_length": 95.875, + "entropy": 0.1816732920706272, + "epoch": 0.11134903640256959, + "frac_reward_zero_std": 0.703125, + "grad_norm": 3.96875, + "learning_rate": 2e-05, + "loss": 0.2391, + "num_tokens": 523887.0, + "reward": 3.525390625, + "reward_std": 0.30659707519225776, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.7897166311740875, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.021347815170884132, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.4765625, + "rewards/strict_format_reward_func/std": 0.08384781517088413, + "rewards/xmlcount_reward_func/mean": 0.494140625, + "rewards/xmlcount_reward_func/std": 0.0234375, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 404.0, + "completions/max_terminated_length": 354.75, + "completions/mean_length": 187.515625, + "completions/mean_terminated_length": 179.75893211364746, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.15807979460805655, + "epoch": 0.11991434689507495, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.46875, + "learning_rate": 1.9995524322835035e-05, + "loss": -0.0269, + "num_tokens": 559761.0, + "reward": 3.4208984375, + "reward_std": 0.32731308368965983, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.8291211053729057, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.09341737069189548, + "rewards/soft_format_reward_func/mean": 0.47265625, + "rewards/soft_format_reward_func/std": 0.0660141110420227, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.07173692621290684, + "rewards/xmlcount_reward_func/mean": 0.4794921875, + "rewards/xmlcount_reward_func/std": 0.05074238684028387, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 349.625, + "completions/max_terminated_length": 338.5, + "completions/mean_length": 187.828125, + "completions/mean_terminated_length": 185.50208473205566, + "completions/min_length": 92.625, + "completions/min_terminated_length": 92.625, + "entropy": 0.17131240852177143, + "epoch": 0.1284796573875803, + "frac_reward_zero_std": 0.765625, + "grad_norm": 3.765625, + "learning_rate": 1.998210129767735e-05, + "loss": 0.2086, + "num_tokens": 596775.0, + "reward": 3.4423828125, + "reward_std": 0.3024538792669773, + "rewards/correctness_reward_func/mean": 1.484375, + "rewards/correctness_reward_func/std": 0.8679328411817551, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.05259781517088413, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.04081955552101135, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.02734375, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 401.375, + "completions/max_terminated_length": 382.125, + "completions/mean_length": 206.8984375, + "completions/mean_terminated_length": 202.10052299499512, + "completions/min_length": 102.25, + "completions/min_terminated_length": 102.25, + "entropy": 0.15633743070065975, + "epoch": 0.13704496788008566, + "frac_reward_zero_std": 0.765625, + "grad_norm": 3.4375, + "learning_rate": 1.9959742939952393e-05, + "loss": 0.0938, + "num_tokens": 635224.0, + "reward": 3.591796875, + "reward_std": 0.35631553269922733, + "rewards/correctness_reward_func/mean": 1.625, + "rewards/correctness_reward_func/std": 0.7407501488924026, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.046875, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.03125, + "rewards/strict_format_reward_func/mean": 0.4921875, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.494140625, + "rewards/xmlcount_reward_func/std": 0.0234375, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 360.0, + "completions/max_terminated_length": 330.125, + "completions/mean_length": 199.09375, + "completions/mean_terminated_length": 194.2984390258789, + "completions/min_length": 104.625, + "completions/min_terminated_length": 104.625, + "entropy": 0.19571769889444113, + "epoch": 0.145610278372591, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.9375, + "learning_rate": 1.9928469263418376e-05, + "loss": 0.0169, + "num_tokens": 671932.0, + "reward": 3.57421875, + "reward_std": 0.23754368349909782, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.6196783930063248, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.04081955552101135, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.03125, + "rewards/strict_format_reward_func/mean": 0.4921875, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.49609375, + "rewards/xmlcount_reward_func/std": 0.015625, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 363.5, + "completions/max_terminated_length": 352.125, + "completions/mean_length": 207.1640625, + "completions/mean_terminated_length": 204.95573043823242, + "completions/min_length": 96.75, + "completions/min_terminated_length": 96.75, + "entropy": 0.20056522078812122, + "epoch": 0.15417558886509636, + "frac_reward_zero_std": 0.765625, + "grad_norm": 5.84375, + "learning_rate": 1.9888308262251286e-05, + "loss": 0.0571, + "num_tokens": 710529.0, + "reward": 3.4853515625, + "reward_std": 0.3079781490378082, + "rewards/correctness_reward_func/mean": 1.578125, + "rewards/correctness_reward_func/std": 0.7429328411817551, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.08923800103366375, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.04081955552101135, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.07779237069189548, + "rewards/xmlcount_reward_func/mean": 0.4853515625, + "rewards/xmlcount_reward_func/std": 0.03691330552101135, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 242.7109375, + "completions/mean_terminated_length": 242.7109375, + "completions/min_length": 131.125, + "completions/min_terminated_length": 131.125, + "entropy": 0.18245846219360828, + "epoch": 0.16274089935760172, + "frac_reward_zero_std": 0.71875, + "grad_norm": 2.609375, + "learning_rate": 1.98392958859863e-05, + "loss": -0.0092, + "num_tokens": 753458.0, + "reward": 3.4609375, + "reward_std": 0.39774756878614426, + "rewards/correctness_reward_func/mean": 1.484375, + "rewards/correctness_reward_func/std": 0.8802329078316689, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.07206955552101135, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.5, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.5, + "rewards/xmlcount_reward_func/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 453.25, + "completions/max_terminated_length": 416.25, + "completions/mean_length": 240.1328125, + "completions/mean_terminated_length": 232.0755271911621, + "completions/min_length": 136.875, + "completions/min_terminated_length": 136.875, + "entropy": 0.1867619026452303, + "epoch": 0.17130620985010706, + "frac_reward_zero_std": 0.765625, + "grad_norm": 2.640625, + "learning_rate": 1.9781476007338058e-05, + "loss": 0.1299, + "num_tokens": 796607.0, + "reward": 3.5458984375, + "reward_std": 0.33283737674355507, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.7794546857476234, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.046875, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.046875, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.0625, + "rewards/xmlcount_reward_func/mean": 0.4912109375, + "rewards/xmlcount_reward_func/std": 0.03515625, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.625, + "completions/max_terminated_length": 364.625, + "completions/mean_length": 236.8828125, + "completions/mean_terminated_length": 236.8828125, + "completions/min_length": 135.25, + "completions/min_terminated_length": 135.25, + "entropy": 0.17888653837144375, + "epoch": 0.17987152034261242, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.40625, + "learning_rate": 1.9714900382928674e-05, + "loss": -0.0113, + "num_tokens": 839112.0, + "reward": 3.537109375, + "reward_std": 0.2568786293268204, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.7888757362961769, + "rewards/int_reward_func/mean": 0.49609375, + "rewards/int_reward_func/std": 0.015625, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.498046875, + "rewards/xmlcount_reward_func/std": 0.0078125, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 402.25, + "completions/max_terminated_length": 371.75, + "completions/mean_length": 232.953125, + "completions/mean_terminated_length": 226.11860275268555, + "completions/min_length": 142.125, + "completions/min_terminated_length": 142.125, + "entropy": 0.1563574131578207, + "epoch": 0.18843683083511778, + "frac_reward_zero_std": 0.78125, + "grad_norm": 2.328125, + "learning_rate": 1.9639628606958535e-05, + "loss": 0.0542, + "num_tokens": 881088.0, + "reward": 3.5693359375, + "reward_std": 0.30245387367904186, + "rewards/correctness_reward_func/mean": 1.625, + "rewards/correctness_reward_func/std": 0.7605545148253441, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.08384781517088413, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.03697281517088413, + "rewards/strict_format_reward_func/mean": 0.48828125, + "rewards/strict_format_reward_func/std": 0.03697281517088413, + "rewards/xmlcount_reward_func/mean": 0.4912109375, + "rewards/xmlcount_reward_func/std": 0.027729611843824387, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 446.75, + "completions/max_terminated_length": 402.875, + "completions/mean_length": 254.4765625, + "completions/mean_terminated_length": 242.60871124267578, + "completions/min_length": 135.75, + "completions/min_terminated_length": 135.75, + "entropy": 0.16357604414224625, + "epoch": 0.19700214132762311, + "frac_reward_zero_std": 0.6875, + "grad_norm": 3.65625, + "learning_rate": 1.955572805786141e-05, + "loss": 0.0195, + "num_tokens": 926107.0, + "reward": 3.341796875, + "reward_std": 0.4087961111217737, + "rewards/correctness_reward_func/mean": 1.46875, + "rewards/correctness_reward_func/std": 0.8502998873591423, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.08439540676772594, + "rewards/soft_format_reward_func/mean": 0.46875, + "rewards/soft_format_reward_func/std": 0.08054866641759872, + "rewards/strict_format_reward_func/mean": 0.4609375, + "rewards/strict_format_reward_func/std": 0.10189648158848286, + "rewards/xmlcount_reward_func/mean": 0.478515625, + "rewards/xmlcount_reward_func/std": 0.04778209747746587, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 403.5, + "completions/max_terminated_length": 380.875, + "completions/mean_length": 241.0703125, + "completions/mean_terminated_length": 239.03854370117188, + "completions/min_length": 119.25, + "completions/min_terminated_length": 119.25, + "entropy": 0.1647733524441719, + "epoch": 0.20556745182012848, + "frac_reward_zero_std": 0.6875, + "grad_norm": 7.75, + "learning_rate": 1.9463273837991643e-05, + "loss": -0.128, + "num_tokens": 968962.0, + "reward": 3.3740234375, + "reward_std": 0.48751697689294815, + "rewards/correctness_reward_func/mean": 1.484375, + "rewards/correctness_reward_func/std": 0.8504082337021828, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.0972641110420227, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.050389111042022705, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.0660141110420227, + "rewards/xmlcount_reward_func/mean": 0.4755859375, + "rewards/xmlcount_reward_func/std": 0.054295361042022705, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 408.75, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 255.03125, + "completions/mean_terminated_length": 253.1906280517578, + "completions/min_length": 156.5, + "completions/min_terminated_length": 156.5, + "entropy": 0.125497592613101, + "epoch": 0.21413276231263384, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.53125, + "learning_rate": 1.9362348706397374e-05, + "loss": -0.0327, + "num_tokens": 1013778.0, + "reward": 3.583984375, + "reward_std": 0.22373299859464169, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.7345243394374847, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.03697281517088413, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.4921875, + "rewards/strict_format_reward_func/std": 0.021347815170884132, + "rewards/xmlcount_reward_func/mean": 0.498046875, + "rewards/xmlcount_reward_func/std": 0.005336953792721033, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 403.125, + "completions/max_terminated_length": 384.875, + "completions/mean_length": 262.71875, + "completions/mean_terminated_length": 260.7333335876465, + "completions/min_length": 167.5, + "completions/min_terminated_length": 167.5, + "entropy": 0.18168668448925018, + "epoch": 0.22269807280513917, + "frac_reward_zero_std": 0.75, + "grad_norm": 13.0, + "learning_rate": 1.9253043004739967e-05, + "loss": 0.054, + "num_tokens": 1059274.0, + "reward": 3.5341796875, + "reward_std": 0.39360435120761395, + "rewards/correctness_reward_func/mean": 1.625, + "rewards/correctness_reward_func/std": 0.7649086192250252, + "rewards/int_reward_func/mean": 0.47265625, + "rewards/int_reward_func/std": 0.06116959825158119, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.049298666417598724, + "rewards/strict_format_reward_func/mean": 0.4765625, + "rewards/strict_format_reward_func/std": 0.049298666417598724, + "rewards/xmlcount_reward_func/mean": 0.4833984375, + "rewards/xmlcount_reward_func/std": 0.03711527772247791, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 467.5, + "completions/max_terminated_length": 456.625, + "completions/mean_length": 268.328125, + "completions/mean_terminated_length": 263.0275344848633, + "completions/min_length": 158.375, + "completions/min_terminated_length": 158.375, + "entropy": 0.13837805949151516, + "epoch": 0.23126338329764454, + "frac_reward_zero_std": 0.6875, + "grad_norm": 10.1875, + "learning_rate": 1.913545457642601e-05, + "loss": -0.0646, + "num_tokens": 1105884.0, + "reward": 3.2080078125, + "reward_std": 0.4985655229538679, + "rewards/correctness_reward_func/mean": 1.359375, + "rewards/correctness_reward_func/std": 0.91850346326828, + "rewards/int_reward_func/mean": 0.453125, + "rewards/int_reward_func/std": 0.11255648173391819, + "rewards/soft_format_reward_func/mean": 0.4609375, + "rewards/soft_format_reward_func/std": 0.09308474138379097, + "rewards/strict_format_reward_func/mean": 0.45703125, + "rewards/strict_format_reward_func/std": 0.09693148173391819, + "rewards/xmlcount_reward_func/mean": 0.4775390625, + "rewards/xmlcount_reward_func/std": 0.05502833751961589, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 449.5, + "completions/max_terminated_length": 401.75, + "completions/mean_length": 268.9375, + "completions/mean_terminated_length": 260.4345283508301, + "completions/min_length": 159.25, + "completions/min_terminated_length": 159.25, + "entropy": 0.08630623016506433, + "epoch": 0.2398286937901499, + "frac_reward_zero_std": 0.734375, + "grad_norm": 2.578125, + "learning_rate": 1.900968867902419e-05, + "loss": 0.0809, + "num_tokens": 1152072.0, + "reward": 3.59375, + "reward_std": 0.34802911058068275, + "rewards/correctness_reward_func/mean": 1.671875, + "rewards/correctness_reward_func/std": 0.5796433389186859, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.03125, + "rewards/soft_format_reward_func/mean": 0.47265625, + "rewards/soft_format_reward_func/std": 0.0796684455126524, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.0952934455126524, + "rewards/xmlcount_reward_func/mean": 0.48828125, + "rewards/xmlcount_reward_func/std": 0.03735560039058328, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 414.125, + "completions/max_terminated_length": 404.75, + "completions/mean_length": 265.9921875, + "completions/mean_terminated_length": 261.0090808868408, + "completions/min_length": 149.125, + "completions/min_terminated_length": 149.125, + "entropy": 0.08809430431574583, + "epoch": 0.24839400428265523, + "frac_reward_zero_std": 0.734375, + "grad_norm": 7.03125, + "learning_rate": 1.8875857890045544e-05, + "loss": -0.0093, + "num_tokens": 1198043.0, + "reward": 3.376953125, + "reward_std": 0.42813105694949627, + "rewards/correctness_reward_func/mean": 1.5, + "rewards/correctness_reward_func/std": 0.8573416471481323, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.10298692621290684, + "rewards/soft_format_reward_func/mean": 0.46875, + "rewards/soft_format_reward_func/std": 0.09341737069189548, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.09341737069189548, + "rewards/xmlcount_reward_func/mean": 0.478515625, + "rewards/xmlcount_reward_func/std": 0.060735128819942474, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 468.625, + "completions/max_terminated_length": 436.25, + "completions/mean_length": 267.0859375, + "completions/mean_terminated_length": 261.05000495910645, + "completions/min_length": 156.5, + "completions/min_terminated_length": 156.5, + "entropy": 0.1107498500496149, + "epoch": 0.2569593147751606, + "frac_reward_zero_std": 0.828125, + "grad_norm": 22.375, + "learning_rate": 1.87340820061713e-05, + "loss": 0.0968, + "num_tokens": 1244242.0, + "reward": 3.5576171875, + "reward_std": 0.2610218357294798, + "rewards/correctness_reward_func/mean": 1.671875, + "rewards/correctness_reward_func/std": 0.6637040823698044, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.11179866641759872, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.05920085124671459, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.07482585124671459, + "rewards/xmlcount_reward_func/mean": 0.4755859375, + "rewards/xmlcount_reward_func/std": 0.06549832038581371, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 465.625, + "completions/max_terminated_length": 430.875, + "completions/mean_length": 279.1953125, + "completions/mean_terminated_length": 266.19639587402344, + "completions/min_length": 156.5, + "completions/min_terminated_length": 156.5, + "entropy": 0.10201808018609881, + "epoch": 0.26552462526766596, + "frac_reward_zero_std": 0.734375, + "grad_norm": 20.375, + "learning_rate": 1.8584487936018663e-05, + "loss": 0.1084, + "num_tokens": 1292255.0, + "reward": 3.431640625, + "reward_std": 0.3204077649861574, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.7631078958511353, + "rewards/int_reward_func/mean": 0.47265625, + "rewards/int_reward_func/std": 0.07779237069189548, + "rewards/soft_format_reward_func/mean": 0.46484375, + "rewards/soft_format_reward_func/std": 0.09617366641759872, + "rewards/strict_format_reward_func/mean": 0.4609375, + "rewards/strict_format_reward_func/std": 0.10002040676772594, + "rewards/xmlcount_reward_func/mean": 0.470703125, + "rewards/xmlcount_reward_func/std": 0.0801805853843689, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 456.0, + "completions/max_terminated_length": 420.5, + "completions/mean_length": 273.09375, + "completions/mean_terminated_length": 265.13988304138184, + "completions/min_length": 162.625, + "completions/min_terminated_length": 162.625, + "entropy": 0.0854261638596654, + "epoch": 0.2740899357601713, + "frac_reward_zero_std": 0.640625, + "grad_norm": 6.15625, + "learning_rate": 1.8427209586540392e-05, + "loss": 0.1575, + "num_tokens": 1339511.0, + "reward": 3.3798828125, + "reward_std": 0.4875169713050127, + "rewards/correctness_reward_func/mean": 1.578125, + "rewards/correctness_reward_func/std": 0.80234594643116, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.09341737069189548, + "rewards/soft_format_reward_func/mean": 0.4375, + "rewards/soft_format_reward_func/std": 0.15228559263050556, + "rewards/strict_format_reward_func/mean": 0.43359375, + "rewards/strict_format_reward_func/std": 0.15613233298063278, + "rewards/xmlcount_reward_func/mean": 0.4619140625, + "rewards/xmlcount_reward_func/std": 0.10068135987967253, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 469.625, + "completions/max_terminated_length": 423.625, + "completions/mean_length": 274.953125, + "completions/mean_terminated_length": 265.3171920776367, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.08486761944368482, + "epoch": 0.2826552462526767, + "frac_reward_zero_std": 0.65625, + "grad_norm": 7.25, + "learning_rate": 1.826238774315995e-05, + "loss": 0.124, + "num_tokens": 1386753.0, + "reward": 3.3203125, + "reward_std": 0.5109951309859753, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.6808668300509453, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.08736192621290684, + "rewards/soft_format_reward_func/mean": 0.41796875, + "rewards/soft_format_reward_func/std": 0.17672233283519745, + "rewards/strict_format_reward_func/mean": 0.3984375, + "rewards/strict_format_reward_func/std": 0.19531385228037834, + "rewards/xmlcount_reward_func/mean": 0.4453125, + "rewards/xmlcount_reward_func/std": 0.11597495479509234, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 500.375, + "completions/max_terminated_length": 410.125, + "completions/mean_length": 295.1015625, + "completions/mean_terminated_length": 264.6992950439453, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.1147261718288064, + "epoch": 0.291220556745182, + "frac_reward_zero_std": 0.625, + "grad_norm": 10.4375, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.2837, + "num_tokens": 1436884.0, + "reward": 3.1953125, + "reward_std": 0.5910970717668533, + "rewards/correctness_reward_func/mean": 1.453125, + "rewards/correctness_reward_func/std": 0.9149020090699196, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.1109184455126524, + "rewards/soft_format_reward_func/mean": 0.453125, + "rewards/soft_format_reward_func/std": 0.13611300103366375, + "rewards/strict_format_reward_func/mean": 0.390625, + "rewards/strict_format_reward_func/std": 0.21258162707090378, + "rewards/xmlcount_reward_func/mean": 0.43359375, + "rewards/xmlcount_reward_func/std": 0.147721191868186, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 471.0, + "completions/max_terminated_length": 403.25, + "completions/mean_length": 256.2578125, + "completions/mean_terminated_length": 241.55090141296387, + "completions/min_length": 135.5, + "completions/min_terminated_length": 135.5, + "entropy": 0.06437111645936966, + "epoch": 0.29978586723768735, + "frac_reward_zero_std": 0.671875, + "grad_norm": 10.0625, + "learning_rate": 1.7910710346563417e-05, + "loss": 0.0329, + "num_tokens": 1481663.0, + "reward": 3.4833984375, + "reward_std": 0.53999756090343, + "rewards/correctness_reward_func/mean": 1.640625, + "rewards/correctness_reward_func/std": 0.778965063393116, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.12082063034176826, + "rewards/soft_format_reward_func/mean": 0.46484375, + "rewards/soft_format_reward_func/std": 0.12082063034176826, + "rewards/strict_format_reward_func/mean": 0.44921875, + "rewards/strict_format_reward_func/std": 0.14689540676772594, + "rewards/xmlcount_reward_func/mean": 0.4638671875, + "rewards/xmlcount_reward_func/std": 0.10555252991616726, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 457.875, + "completions/max_terminated_length": 417.625, + "completions/mean_length": 246.2265625, + "completions/mean_terminated_length": 232.80781745910645, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.09419436752796173, + "epoch": 0.3083511777301927, + "frac_reward_zero_std": 0.65625, + "grad_norm": 16.25, + "learning_rate": 1.7724169592245996e-05, + "loss": -0.0012, + "num_tokens": 1524892.0, + "reward": 3.3369140625, + "reward_std": 0.34941017907112837, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.7342507243156433, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.09341737069189548, + "rewards/soft_format_reward_func/mean": 0.453125, + "rewards/soft_format_reward_func/std": 0.1095899622887373, + "rewards/strict_format_reward_func/mean": 0.43359375, + "rewards/strict_format_reward_func/std": 0.14689540676772594, + "rewards/xmlcount_reward_func/mean": 0.4345703125, + "rewards/xmlcount_reward_func/std": 0.12757759355008602, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 466.875, + "completions/max_terminated_length": 341.625, + "completions/mean_length": 237.1953125, + "completions/mean_terminated_length": 214.32523155212402, + "completions/min_length": 102.5, + "completions/min_terminated_length": 102.5, + "entropy": 0.08123180363327265, + "epoch": 0.3169164882226981, + "frac_reward_zero_std": 0.75, + "grad_norm": 12.0625, + "learning_rate": 1.7530714660036112e-05, + "loss": 0.4256, + "num_tokens": 1566655.0, + "reward": 3.6396484375, + "reward_std": 0.23616261687129736, + "rewards/correctness_reward_func/mean": 1.765625, + "rewards/correctness_reward_func/std": 0.45028156042099, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.05259781517088413, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.0625, + "rewards/strict_format_reward_func/mean": 0.44140625, + "rewards/strict_format_reward_func/std": 0.15537451766431332, + "rewards/xmlcount_reward_func/mean": 0.4638671875, + "rewards/xmlcount_reward_func/std": 0.10096731083467603, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 446.875, + "completions/max_terminated_length": 403.125, + "completions/mean_length": 237.4453125, + "completions/mean_terminated_length": 232.93490028381348, + "completions/min_length": 120.625, + "completions/min_terminated_length": 120.625, + "entropy": 0.07567687798291445, + "epoch": 0.32548179871520344, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.8125, + "learning_rate": 1.7330518718298263e-05, + "loss": 0.1448, + "num_tokens": 1609482.0, + "reward": 3.5947265625, + "reward_std": 0.3079781401902437, + "rewards/correctness_reward_func/mean": 1.625, + "rewards/correctness_reward_func/std": 0.7266493514180183, + "rewards/int_reward_func/mean": 0.5, + "rewards/int_reward_func/std": 0.0, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.021347815170884132, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.05259781517088413, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.023663727566599846, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 412.75, + "completions/max_terminated_length": 407.25, + "completions/mean_length": 225.1953125, + "completions/mean_terminated_length": 223.21041870117188, + "completions/min_length": 123.125, + "completions/min_terminated_length": 123.125, + "entropy": 0.061285244300961494, + "epoch": 0.3340471092077088, + "frac_reward_zero_std": 0.84375, + "grad_norm": 6.0, + "learning_rate": 1.712376096951345e-05, + "loss": 0.1485, + "num_tokens": 1649845.0, + "reward": 3.748046875, + "reward_std": 0.17953883367590606, + "rewards/correctness_reward_func/mean": 1.765625, + "rewards/correctness_reward_func/std": 0.601259708404541, + "rewards/int_reward_func/mean": 0.5, + "rewards/int_reward_func/std": 0.0, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.486328125, + "rewards/xmlcount_reward_func/std": 0.04478531517088413, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 397.125, + "completions/max_terminated_length": 352.75, + "completions/mean_length": 211.8359375, + "completions/mean_terminated_length": 204.96146202087402, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.08425743412226439, + "epoch": 0.3426124197002141, + "frac_reward_zero_std": 0.796875, + "grad_norm": 9.6875, + "learning_rate": 1.691062648986865e-05, + "loss": 0.0101, + "num_tokens": 1689182.0, + "reward": 3.4814453125, + "reward_std": 0.3245509583503008, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.7279798686504364, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.09617366641759872, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.07206955552101135, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.10331955552101135, + "rewards/xmlcount_reward_func/mean": 0.4775390625, + "rewards/xmlcount_reward_func/std": 0.06816330552101135, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 415.625, + "completions/max_terminated_length": 373.375, + "completions/mean_length": 202.0078125, + "completions/mean_terminated_length": 192.4726963043213, + "completions/min_length": 73.75, + "completions/min_terminated_length": 73.75, + "entropy": 0.08351494651287794, + "epoch": 0.3511777301927195, + "frac_reward_zero_std": 0.78125, + "grad_norm": 7.1875, + "learning_rate": 1.6691306063588583e-05, + "loss": 0.0866, + "num_tokens": 1726993.0, + "reward": 3.4150390625, + "reward_std": 0.23063834570348263, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.7765756696462631, + "rewards/int_reward_func/mean": 0.453125, + "rewards/int_reward_func/std": 0.11146603710949421, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.046542370691895485, + "rewards/strict_format_reward_func/mean": 0.4609375, + "rewards/strict_format_reward_func/std": 0.10189648158848286, + "rewards/xmlcount_reward_func/mean": 0.4736328125, + "rewards/xmlcount_reward_func/std": 0.06877632485702634, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 393.0, + "completions/max_terminated_length": 353.875, + "completions/mean_length": 199.8359375, + "completions/mean_terminated_length": 192.48474884033203, + "completions/min_length": 87.5, + "completions/min_terminated_length": 87.5, + "entropy": 0.07358774542808533, + "epoch": 0.35974304068522484, + "frac_reward_zero_std": 0.765625, + "grad_norm": 6.28125, + "learning_rate": 1.6465996012157996e-05, + "loss": 0.2018, + "num_tokens": 1765048.0, + "reward": 3.466796875, + "reward_std": 0.3204077500849962, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.8384338021278381, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.03697281517088413, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.03697281517088413, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.08957063034176826, + "rewards/xmlcount_reward_func/mean": 0.486328125, + "rewards/xmlcount_reward_func/std": 0.045771504286676645, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 463.75, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 203.1796875, + "completions/mean_terminated_length": 177.74688339233398, + "completions/min_length": 76.625, + "completions/min_terminated_length": 76.625, + "entropy": 0.09338215924799442, + "epoch": 0.3683083511777302, + "frac_reward_zero_std": 0.671875, + "grad_norm": 14.1875, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.4962, + "num_tokens": 1802763.0, + "reward": 3.421875, + "reward_std": 0.39774755109101534, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.8366330787539482, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.03125, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.4453125, + "rewards/strict_format_reward_func/std": 0.1271837092936039, + "rewards/xmlcount_reward_func/mean": 0.453125, + "rewards/xmlcount_reward_func/std": 0.1128259189426899, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 434.5, + "completions/max_terminated_length": 338.5, + "completions/mean_length": 200.421875, + "completions/mean_terminated_length": 181.6392650604248, + "completions/min_length": 95.125, + "completions/min_terminated_length": 95.125, + "entropy": 0.09290066035464406, + "epoch": 0.37687366167023556, + "frac_reward_zero_std": 0.78125, + "grad_norm": 129.0, + "learning_rate": 1.599821894687914e-05, + "loss": 0.2028, + "num_tokens": 1840283.0, + "reward": 3.3876953125, + "reward_std": 0.27759466134011745, + "rewards/correctness_reward_func/mean": 1.515625, + "rewards/correctness_reward_func/std": 0.7650169730186462, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.025194555521011353, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.05644455552101135, + "rewards/strict_format_reward_func/mean": 0.4453125, + "rewards/strict_format_reward_func/std": 0.11806907318532467, + "rewards/xmlcount_reward_func/mean": 0.4580078125, + "rewards/xmlcount_reward_func/std": 0.10733805038034916, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 302.125, + "completions/max_terminated_length": 273.5, + "completions/mean_length": 161.5859375, + "completions/mean_terminated_length": 158.94635581970215, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "entropy": 0.07536831498146057, + "epoch": 0.3854389721627409, + "frac_reward_zero_std": 0.75, + "grad_norm": 52.5, + "learning_rate": 1.575617065685674e-05, + "loss": 0.0339, + "num_tokens": 1873008.0, + "reward": 3.439453125, + "reward_std": 0.4253689181059599, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.8102209344506264, + "rewards/int_reward_func/mean": 0.45703125, + "rewards/int_reward_func/std": 0.11861192621290684, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.050389111042022705, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.053145406767725945, + "rewards/xmlcount_reward_func/mean": 0.470703125, + "rewards/xmlcount_reward_func/std": 0.060957906767725945, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 444.5, + "completions/max_terminated_length": 266.75, + "completions/mean_length": 179.7578125, + "completions/mean_terminated_length": 152.03579235076904, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.06918492680415511, + "epoch": 0.39400428265524623, + "frac_reward_zero_std": 0.625, + "grad_norm": 24.125, + "learning_rate": 1.5508969814521026e-05, + "loss": 0.1968, + "num_tokens": 1907887.0, + "reward": 3.259765625, + "reward_std": 0.4695630930364132, + "rewards/correctness_reward_func/mean": 1.5, + "rewards/correctness_reward_func/std": 0.8375296071171761, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.07206955552101135, + "rewards/soft_format_reward_func/mean": 0.4609375, + "rewards/soft_format_reward_func/std": 0.08824022859334946, + "rewards/strict_format_reward_func/mean": 0.38671875, + "rewards/strict_format_reward_func/std": 0.20270179212093353, + "rewards/xmlcount_reward_func/mean": 0.435546875, + "rewards/xmlcount_reward_func/std": 0.12772688083350658, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 234.25, + "completions/mean_length": 361.9609375, + "completions/mean_terminated_length": 144.42629528045654, + "completions/min_length": 80.375, + "completions/min_terminated_length": 80.375, + "entropy": 0.06124910665675998, + "epoch": 0.4025695931477516, + "frac_reward_zero_std": 0.25, + "grad_norm": 20.0, + "learning_rate": 1.5256837698105047e-05, + "loss": 0.4458, + "num_tokens": 1965858.0, + "reward": 2.03125, + "reward_std": 0.770635899156332, + "rewards/correctness_reward_func/mean": 1.0625, + "rewards/correctness_reward_func/std": 0.978024922311306, + "rewards/int_reward_func/mean": 0.41015625, + "rewards/int_reward_func/std": 0.14781177043914795, + "rewards/soft_format_reward_func/mean": 0.34765625, + "rewards/soft_format_reward_func/std": 0.221479382365942, + "rewards/strict_format_reward_func/mean": 0.046875, + "rewards/strict_format_reward_func/std": 0.1095899622887373, + "rewards/xmlcount_reward_func/mean": 0.1640625, + "rewards/xmlcount_reward_func/std": 0.17734735272824764, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.78125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 174.375, + "completions/mean_length": 431.34375, + "completions/mean_terminated_length": 122.72916984558105, + "completions/min_length": 152.25, + "completions/min_terminated_length": 88.25, + "entropy": 0.06367563363164663, + "epoch": 0.41113490364025695, + "frac_reward_zero_std": 0.25, + "grad_norm": 20.75, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.5487, + "num_tokens": 2033422.0, + "reward": 1.4404296875, + "reward_std": 0.7416334673762321, + "rewards/correctness_reward_func/mean": 0.71875, + "rewards/correctness_reward_func/std": 0.9663743898272514, + "rewards/int_reward_func/mean": 0.30078125, + "rewards/int_reward_func/std": 0.23859525099396706, + "rewards/soft_format_reward_func/mean": 0.29296875, + "rewards/soft_format_reward_func/std": 0.23586604371666908, + "rewards/strict_format_reward_func/mean": 0.015625, + "rewards/strict_format_reward_func/std": 0.0625, + "rewards/xmlcount_reward_func/mean": 0.1123046875, + "rewards/xmlcount_reward_func/std": 0.16386567754670978, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6328125, + "completions/max_length": 512.0, + "completions/max_terminated_length": 230.375, + "completions/mean_length": 376.578125, + "completions/mean_terminated_length": 143.8458366394043, + "completions/min_length": 79.75, + "completions/min_terminated_length": 79.75, + "entropy": 0.06622787471860647, + "epoch": 0.4197002141327623, + "frac_reward_zero_std": 0.234375, + "grad_norm": 69.5, + "learning_rate": 1.4738686624729987e-05, + "loss": 0.9777, + "num_tokens": 2093936.0, + "reward": 1.595703125, + "reward_std": 0.7098689079284668, + "rewards/correctness_reward_func/mean": 0.796875, + "rewards/correctness_reward_func/std": 0.9754082411527634, + "rewards/int_reward_func/mean": 0.33984375, + "rewards/int_reward_func/std": 0.21231234446167946, + "rewards/soft_format_reward_func/mean": 0.28125, + "rewards/soft_format_reward_func/std": 0.23983315750956535, + "rewards/strict_format_reward_func/mean": 0.015625, + "rewards/strict_format_reward_func/std": 0.05259781517088413, + "rewards/xmlcount_reward_func/mean": 0.162109375, + "rewards/xmlcount_reward_func/std": 0.1913837492465973, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.640625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 213.5, + "completions/mean_length": 382.2734375, + "completions/mean_terminated_length": 144.93779945373535, + "completions/min_length": 91.75, + "completions/min_terminated_length": 91.75, + "entropy": 0.055487995967268944, + "epoch": 0.4282655246252677, + "frac_reward_zero_std": 0.265625, + "grad_norm": 21.75, + "learning_rate": 1.4473131483156326e-05, + "loss": 0.6731, + "num_tokens": 2154945.0, + "reward": 1.291015625, + "reward_std": 0.7789223082363605, + "rewards/correctness_reward_func/mean": 0.609375, + "rewards/correctness_reward_func/std": 0.9308035299181938, + "rewards/int_reward_func/mean": 0.25, + "rewards/int_reward_func/std": 0.2540716640651226, + "rewards/soft_format_reward_func/mean": 0.25, + "rewards/soft_format_reward_func/std": 0.24922906793653965, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.181640625, + "rewards/xmlcount_reward_func/std": 0.20568038523197174, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5859375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 302.25, + "completions/mean_length": 374.421875, + "completions/mean_terminated_length": 177.11860466003418, + "completions/min_length": 93.125, + "completions/min_terminated_length": 93.125, + "entropy": 0.05054905638098717, + "epoch": 0.43683083511777304, + "frac_reward_zero_std": 0.1875, + "grad_norm": 24.125, + "learning_rate": 1.4203572283095657e-05, + "loss": 1.1031, + "num_tokens": 2214589.0, + "reward": 1.5400390625, + "reward_std": 1.028895616531372, + "rewards/correctness_reward_func/mean": 0.765625, + "rewards/correctness_reward_func/std": 0.9744589924812317, + "rewards/int_reward_func/mean": 0.2734375, + "rewards/int_reward_func/std": 0.2540593519806862, + "rewards/soft_format_reward_func/mean": 0.2890625, + "rewards/soft_format_reward_func/std": 0.25303449109196663, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.2119140625, + "rewards/xmlcount_reward_func/std": 0.21405917219817638, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.7109375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 252.25, + "completions/mean_length": 412.453125, + "completions/mean_terminated_length": 173.9702386856079, + "completions/min_length": 113.25, + "completions/min_terminated_length": 113.25, + "entropy": 0.051852176897227764, + "epoch": 0.44539614561027835, + "frac_reward_zero_std": 0.1875, + "grad_norm": 17.75, + "learning_rate": 1.3930250316539237e-05, + "loss": 1.013, + "num_tokens": 2280025.0, + "reward": 0.9990234375, + "reward_std": 0.885264553129673, + "rewards/correctness_reward_func/mean": 0.4375, + "rewards/correctness_reward_func/std": 0.811570405960083, + "rewards/int_reward_func/mean": 0.15625, + "rewards/int_reward_func/std": 0.22798974812030792, + "rewards/soft_format_reward_func/mean": 0.22265625, + "rewards/soft_format_reward_func/std": 0.25226276740431786, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.1826171875, + "rewards/xmlcount_reward_func/std": 0.18229194171726704, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.515625, + "completions/max_length": 512.0, + "completions/max_terminated_length": 285.875, + "completions/mean_length": 351.171875, + "completions/mean_terminated_length": 178.26887321472168, + "completions/min_length": 108.875, + "completions/min_terminated_length": 108.875, + "entropy": 0.0576583961956203, + "epoch": 0.4539614561027837, + "frac_reward_zero_std": 0.359375, + "grad_norm": 21.125, + "learning_rate": 1.3653410243663953e-05, + "loss": 0.8749, + "num_tokens": 2337543.0, + "reward": 1.5859375, + "reward_std": 0.7292038351297379, + "rewards/correctness_reward_func/mean": 0.75, + "rewards/correctness_reward_func/std": 0.9716326966881752, + "rewards/int_reward_func/mean": 0.26953125, + "rewards/int_reward_func/std": 0.2501082383096218, + "rewards/soft_format_reward_func/mean": 0.29296875, + "rewards/soft_format_reward_func/std": 0.24548756889998913, + "rewards/strict_format_reward_func/mean": 0.0, + "rewards/strict_format_reward_func/std": 0.0, + "rewards/xmlcount_reward_func/mean": 0.2734375, + "rewards/xmlcount_reward_func/std": 0.20937122404575348, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2109375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 285.125, + "completions/mean_length": 242.625, + "completions/mean_terminated_length": 169.74182319641113, + "completions/min_length": 97.375, + "completions/min_terminated_length": 97.375, + "entropy": 0.06909441482275724, + "epoch": 0.4625267665952891, + "frac_reward_zero_std": 0.46875, + "grad_norm": 131.0, + "learning_rate": 1.3373299873828303e-05, + "loss": 0.9203, + "num_tokens": 2379831.0, + "reward": 2.27734375, + "reward_std": 0.7844465803354979, + "rewards/correctness_reward_func/mean": 1.125, + "rewards/correctness_reward_func/std": 0.9970766380429268, + "rewards/int_reward_func/mean": 0.37890625, + "rewards/int_reward_func/std": 0.2144309040158987, + "rewards/soft_format_reward_func/mean": 0.37890625, + "rewards/soft_format_reward_func/std": 0.21300501003861427, + "rewards/strict_format_reward_func/mean": 0.0078125, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.38671875, + "rewards/xmlcount_reward_func/std": 0.18055572640150785, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1484375, + "completions/max_length": 512.0, + "completions/max_terminated_length": 287.125, + "completions/mean_length": 227.8671875, + "completions/mean_terminated_length": 178.1862964630127, + "completions/min_length": 82.875, + "completions/min_terminated_length": 82.875, + "entropy": 0.0683035789988935, + "epoch": 0.47109207708779444, + "frac_reward_zero_std": 0.53125, + "grad_norm": 75.5, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.7331, + "num_tokens": 2421244.0, + "reward": 2.6162109375, + "reward_std": 0.5620946288108826, + "rewards/correctness_reward_func/mean": 1.375, + "rewards/correctness_reward_func/std": 0.9013157784938812, + "rewards/int_reward_func/mean": 0.41015625, + "rewards/int_reward_func/std": 0.18550433963537216, + "rewards/soft_format_reward_func/mean": 0.4140625, + "rewards/soft_format_reward_func/std": 0.17175541445612907, + "rewards/strict_format_reward_func/mean": 0.0078125, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.4091796875, + "rewards/xmlcount_reward_func/std": 0.15384871885180473, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 512.0, + "completions/max_terminated_length": 274.875, + "completions/mean_length": 230.6484375, + "completions/mean_terminated_length": 173.36169624328613, + "completions/min_length": 101.5, + "completions/min_terminated_length": 101.5, + "entropy": 0.06412349035963416, + "epoch": 0.4796573875802998, + "frac_reward_zero_std": 0.609375, + "grad_norm": 17.75, + "learning_rate": 1.2804273893060028e-05, + "loss": 0.8135, + "num_tokens": 2462929.0, + "reward": 2.8583984375, + "reward_std": 0.5731431804597378, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.8076042532920837, + "rewards/int_reward_func/mean": 0.43359375, + "rewards/int_reward_func/std": 0.15128782019019127, + "rewards/soft_format_reward_func/mean": 0.4453125, + "rewards/soft_format_reward_func/std": 0.12609326466917992, + "rewards/strict_format_reward_func/mean": 0.01953125, + "rewards/strict_format_reward_func/std": 0.06822281517088413, + "rewards/xmlcount_reward_func/mean": 0.4130859375, + "rewards/xmlcount_reward_func/std": 0.14727921038866043, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 481.625, + "completions/max_terminated_length": 265.5, + "completions/mean_length": 213.640625, + "completions/mean_terminated_length": 158.69331169128418, + "completions/min_length": 87.125, + "completions/min_terminated_length": 87.125, + "entropy": 0.06292425934225321, + "epoch": 0.48822269807280516, + "frac_reward_zero_std": 0.46875, + "grad_norm": 15.0, + "learning_rate": 1.2515867637445088e-05, + "loss": 0.5508, + "num_tokens": 2502783.0, + "reward": 2.8046875, + "reward_std": 0.6159562915563583, + "rewards/correctness_reward_func/mean": 1.34375, + "rewards/correctness_reward_func/std": 0.9502372145652771, + "rewards/int_reward_func/mean": 0.41015625, + "rewards/int_reward_func/std": 0.17638970352709293, + "rewards/soft_format_reward_func/mean": 0.4296875, + "rewards/soft_format_reward_func/std": 0.15997907333076, + "rewards/strict_format_reward_func/mean": 0.25, + "rewards/strict_format_reward_func/std": 0.24410519748926163, + "rewards/xmlcount_reward_func/mean": 0.37109375, + "rewards/xmlcount_reward_func/std": 0.1568639986217022, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 439.75, + "completions/max_terminated_length": 258.375, + "completions/mean_length": 189.484375, + "completions/mean_terminated_length": 149.9307737350464, + "completions/min_length": 81.625, + "completions/min_terminated_length": 81.625, + "entropy": 0.07262948993593454, + "epoch": 0.49678800856531047, + "frac_reward_zero_std": 0.671875, + "grad_norm": 25.375, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.5996, + "num_tokens": 2539729.0, + "reward": 3.1650390625, + "reward_std": 0.42951212264597416, + "rewards/correctness_reward_func/mean": 1.40625, + "rewards/correctness_reward_func/std": 0.9186194837093353, + "rewards/int_reward_func/mean": 0.44921875, + "rewards/int_reward_func/std": 0.11343478411436081, + "rewards/soft_format_reward_func/mean": 0.44921875, + "rewards/soft_format_reward_func/std": 0.1270910371094942, + "rewards/strict_format_reward_func/mean": 0.421875, + "rewards/strict_format_reward_func/std": 0.15183541178703308, + "rewards/xmlcount_reward_func/mean": 0.4384765625, + "rewards/xmlcount_reward_func/std": 0.1242168415337801, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 428.125, + "completions/max_terminated_length": 271.75, + "completions/mean_length": 168.0390625, + "completions/mean_terminated_length": 153.7322940826416, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.06558680208399892, + "epoch": 0.5053533190578159, + "frac_reward_zero_std": 0.71875, + "grad_norm": 13.6875, + "learning_rate": 1.1932559177955533e-05, + "loss": 0.6108, + "num_tokens": 2573054.0, + "reward": 3.6171875, + "reward_std": 0.38669902086257935, + "rewards/correctness_reward_func/mean": 1.703125, + "rewards/correctness_reward_func/std": 0.7130631133913994, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.09914018586277962, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.046875, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.09947281517088413, + "rewards/xmlcount_reward_func/mean": 0.48828125, + "rewards/xmlcount_reward_func/std": 0.046875, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 354.25, + "completions/max_terminated_length": 267.375, + "completions/mean_length": 166.265625, + "completions/mean_terminated_length": 155.07939338684082, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, + "entropy": 0.0687381848692894, + "epoch": 0.5139186295503212, + "frac_reward_zero_std": 0.6875, + "grad_norm": 26.25, + "learning_rate": 1.1638179114151378e-05, + "loss": 0.2988, + "num_tokens": 2606004.0, + "reward": 3.314453125, + "reward_std": 0.43365532672032714, + "rewards/correctness_reward_func/mean": 1.4375, + "rewards/correctness_reward_func/std": 0.8874192461371422, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.10904237069189548, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.062167370691895485, + "rewards/strict_format_reward_func/mean": 0.46484375, + "rewards/strict_format_reward_func/std": 0.08923800103366375, + "rewards/xmlcount_reward_func/mean": 0.470703125, + "rewards/xmlcount_reward_func/std": 0.07348538748919964, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 394.875, + "completions/max_terminated_length": 299.5, + "completions/mean_length": 167.234375, + "completions/mean_terminated_length": 155.96860313415527, + "completions/min_length": 81.875, + "completions/min_terminated_length": 81.875, + "entropy": 0.06962216552346945, + "epoch": 0.5224839400428265, + "frac_reward_zero_std": 0.796875, + "grad_norm": 13.0625, + "learning_rate": 1.1342332658176556e-05, + "loss": 0.2338, + "num_tokens": 2639860.0, + "reward": 3.4140625, + "reward_std": 0.19334950856864452, + "rewards/correctness_reward_func/mean": 1.515625, + "rewards/correctness_reward_func/std": 0.8358171209692955, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.07394563034176826, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.07394563034176826, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.08351518586277962, + "rewards/xmlcount_reward_func/mean": 0.4765625, + "rewards/xmlcount_reward_func/std": 0.06315224710851908, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 390.375, + "completions/max_terminated_length": 261.25, + "completions/mean_length": 174.2734375, + "completions/mean_terminated_length": 157.4187536239624, + "completions/min_length": 87.5, + "completions/min_terminated_length": 87.5, + "entropy": 0.06758250948041677, + "epoch": 0.5310492505353319, + "frac_reward_zero_std": 0.671875, + "grad_norm": 11.0, + "learning_rate": 1.1045284632676535e-05, + "loss": 0.1924, + "num_tokens": 2674451.0, + "reward": 3.5439453125, + "reward_std": 0.4018907658755779, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.7834457755088806, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.03697281517088413, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.03125, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.0952934455126524, + "rewards/xmlcount_reward_func/mean": 0.4853515625, + "rewards/xmlcount_reward_func/std": 0.04692569188773632, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 269.5, + "completions/max_terminated_length": 241.5, + "completions/mean_length": 148.2890625, + "completions/mean_terminated_length": 145.62083435058594, + "completions/min_length": 89.625, + "completions/min_terminated_length": 89.625, + "entropy": 0.0636401055380702, + "epoch": 0.5396145610278372, + "frac_reward_zero_std": 0.8125, + "grad_norm": 8.125, + "learning_rate": 1.0747300935864245e-05, + "loss": 0.0794, + "num_tokens": 2705286.0, + "reward": 3.6806640625, + "reward_std": 0.2637839764356613, + "rewards/correctness_reward_func/mean": 1.703125, + "rewards/correctness_reward_func/std": 0.5994666591286659, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.042695630341768265, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.4970703125, + "rewards/xmlcount_reward_func/std": 0.01171875, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 282.875, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 154.0546875, + "completions/mean_terminated_length": 151.16770935058594, + "completions/min_length": 82.75, + "completions/min_terminated_length": 82.75, + "entropy": 0.06001953314989805, + "epoch": 0.5481798715203426, + "frac_reward_zero_std": 0.84375, + "grad_norm": 7.78125, + "learning_rate": 1.044864830350515e-05, + "loss": 0.045, + "num_tokens": 2737255.0, + "reward": 3.6259765625, + "reward_std": 0.1643470786511898, + "rewards/correctness_reward_func/mean": 1.65625, + "rewards/correctness_reward_func/std": 0.6852209344506264, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.021347815170884132, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.0625, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.02734375, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 323.5, + "completions/max_terminated_length": 268.75, + "completions/mean_length": 161.921875, + "completions/mean_terminated_length": 154.10967445373535, + "completions/min_length": 83.125, + "completions/min_terminated_length": 83.125, + "entropy": 0.06517814612016082, + "epoch": 0.556745182012848, + "frac_reward_zero_std": 0.796875, + "grad_norm": 7.8125, + "learning_rate": 1.0149594070152638e-05, + "loss": 0.0963, + "num_tokens": 2770597.0, + "reward": 3.3662109375, + "reward_std": 0.29969173669815063, + "rewards/correctness_reward_func/mean": 1.4375, + "rewards/correctness_reward_func/std": 0.8671257123351097, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.08054866641759872, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.03697281517088413, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.05259781517088413, + "rewards/xmlcount_reward_func/mean": 0.4873046875, + "rewards/xmlcount_reward_func/std": 0.04087906517088413, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 453.75, + "completions/max_terminated_length": 281.875, + "completions/mean_length": 192.828125, + "completions/mean_terminated_length": 151.42723655700684, + "completions/min_length": 82.125, + "completions/min_terminated_length": 82.125, + "entropy": 0.05498858401551843, + "epoch": 0.5653104925053534, + "frac_reward_zero_std": 0.796875, + "grad_norm": 10.5, + "learning_rate": 9.850405929847367e-06, + "loss": 0.2824, + "num_tokens": 2807611.0, + "reward": 3.3330078125, + "reward_std": 0.3135024200892076, + "rewards/correctness_reward_func/mean": 1.578125, + "rewards/correctness_reward_func/std": 0.8142120242118835, + "rewards/int_reward_func/mean": 0.4375, + "rewards/int_reward_func/std": 0.15119514800608158, + "rewards/soft_format_reward_func/mean": 0.4453125, + "rewards/soft_format_reward_func/std": 0.11994514800608158, + "rewards/strict_format_reward_func/mean": 0.43359375, + "rewards/strict_format_reward_func/std": 0.1569179631769657, + "rewards/xmlcount_reward_func/mean": 0.4384765625, + "rewards/xmlcount_reward_func/std": 0.15258620493113995, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 377.625, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 154.65625, + "completions/mean_terminated_length": 139.93846988677979, + "completions/min_length": 75.75, + "completions/min_terminated_length": 75.75, + "entropy": 0.0641864649951458, + "epoch": 0.5738758029978587, + "frac_reward_zero_std": 0.796875, + "grad_norm": 14.25, + "learning_rate": 9.551351696494854e-06, + "loss": 0.3309, + "num_tokens": 2839197.0, + "reward": 3.3779296875, + "reward_std": 0.321788830216974, + "rewards/correctness_reward_func/mean": 1.484375, + "rewards/correctness_reward_func/std": 0.8679328411817551, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.11664126068353653, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.058320630341768265, + "rewards/strict_format_reward_func/mean": 0.4765625, + "rewards/strict_format_reward_func/std": 0.0640434455126524, + "rewards/xmlcount_reward_func/mean": 0.4755859375, + "rewards/xmlcount_reward_func/std": 0.07196457590907812, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 324.125, + "completions/max_terminated_length": 250.75, + "completions/mean_length": 156.265625, + "completions/mean_terminated_length": 141.62864875793457, + "completions/min_length": 72.625, + "completions/min_terminated_length": 72.625, + "entropy": 0.05262026563286781, + "epoch": 0.582441113490364, + "frac_reward_zero_std": 0.875, + "grad_norm": 42.5, + "learning_rate": 9.252699064135759e-06, + "loss": 0.2027, + "num_tokens": 2870879.0, + "reward": 3.443359375, + "reward_std": 0.1905873753130436, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.697150319814682, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.09914018586277962, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.05644455552101135, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.07482585124671459, + "rewards/xmlcount_reward_func/mean": 0.478515625, + "rewards/xmlcount_reward_func/std": 0.05138835124671459, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 314.25, + "completions/max_terminated_length": 225.875, + "completions/mean_length": 140.921875, + "completions/mean_terminated_length": 131.4558048248291, + "completions/min_length": 65.25, + "completions/min_terminated_length": 65.25, + "entropy": 0.05961746862158179, + "epoch": 0.5910064239828694, + "frac_reward_zero_std": 0.734375, + "grad_norm": 29.5, + "learning_rate": 8.954715367323468e-06, + "loss": 0.2871, + "num_tokens": 2900613.0, + "reward": 3.34375, + "reward_std": 0.44194173626601696, + "rewards/correctness_reward_func/mean": 1.515625, + "rewards/correctness_reward_func/std": 0.8490326702594757, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.0783399622887373, + "rewards/soft_format_reward_func/mean": 0.4609375, + "rewards/soft_format_reward_func/std": 0.09120866656303406, + "rewards/strict_format_reward_func/mean": 0.4453125, + "rewards/strict_format_reward_func/std": 0.13093777745962143, + "rewards/xmlcount_reward_func/mean": 0.4609375, + "rewards/xmlcount_reward_func/std": 0.0871200654655695, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 289.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 149.90625, + "completions/mean_terminated_length": 147.21406364440918, + "completions/min_length": 84.25, + "completions/min_terminated_length": 84.25, + "entropy": 0.06772361230105162, + "epoch": 0.5995717344753747, + "frac_reward_zero_std": 0.84375, + "grad_norm": 6.15625, + "learning_rate": 8.657667341823449e-06, + "loss": 0.0863, + "num_tokens": 2931599.0, + "reward": 3.6689453125, + "reward_std": 0.20301698334515095, + "rewards/correctness_reward_func/mean": 1.6875, + "rewards/correctness_reward_func/std": 0.5998296737670898, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.03125, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.4970703125, + "rewards/xmlcount_reward_func/std": 0.01171875, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 149.4375, + "completions/mean_terminated_length": 149.4375, + "completions/min_length": 71.75, + "completions/min_terminated_length": 71.75, + "entropy": 0.06804852467030287, + "epoch": 0.6081370449678801, + "frac_reward_zero_std": 0.671875, + "grad_norm": 14.75, + "learning_rate": 8.361820885848623e-06, + "loss": -0.1228, + "num_tokens": 2963581.0, + "reward": 3.4091796875, + "reward_std": 0.4819926954805851, + "rewards/correctness_reward_func/mean": 1.46875, + "rewards/correctness_reward_func/std": 0.85780418664217, + "rewards/int_reward_func/mean": 0.48046875, + "rewards/int_reward_func/std": 0.05644455552101135, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.025194555521011353, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.04081955552101135, + "rewards/xmlcount_reward_func/mean": 0.4873046875, + "rewards/xmlcount_reward_func/std": 0.029100805521011353, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 276.25, + "completions/max_terminated_length": 237.375, + "completions/mean_length": 143.1640625, + "completions/mean_terminated_length": 140.2427101135254, + "completions/min_length": 73.125, + "completions/min_terminated_length": 73.125, + "entropy": 0.06070453533902764, + "epoch": 0.6167023554603854, + "frac_reward_zero_std": 0.78125, + "grad_norm": 8.9375, + "learning_rate": 8.06744082204447e-06, + "loss": 0.1705, + "num_tokens": 2993796.0, + "reward": 3.6064453125, + "reward_std": 0.3135024197399616, + "rewards/correctness_reward_func/mean": 1.640625, + "rewards/correctness_reward_func/std": 0.7274979203939438, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.07394563034176826, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.4921875, + "rewards/strict_format_reward_func/std": 0.03125, + "rewards/xmlcount_reward_func/mean": 0.4970703125, + "rewards/xmlcount_reward_func/std": 0.01171875, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 314.5, + "completions/max_terminated_length": 300.125, + "completions/mean_length": 152.578125, + "completions/mean_terminated_length": 147.3067741394043, + "completions/min_length": 77.5, + "completions/min_terminated_length": 77.5, + "entropy": 0.07308987434953451, + "epoch": 0.6252676659528907, + "frac_reward_zero_std": 0.78125, + "grad_norm": 7.65625, + "learning_rate": 7.774790660436857e-06, + "loss": 0.0711, + "num_tokens": 3025438.0, + "reward": 3.4619140625, + "reward_std": 0.2637839764356613, + "rewards/correctness_reward_func/mean": 1.515625, + "rewards/correctness_reward_func/std": 0.8590980246663094, + "rewards/int_reward_func/mean": 0.47265625, + "rewards/int_reward_func/std": 0.08957063034176826, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.03125, + "rewards/strict_format_reward_func/mean": 0.48828125, + "rewards/strict_format_reward_func/std": 0.03697281517088413, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.023821823298931122, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 305.0, + "completions/max_terminated_length": 273.75, + "completions/mean_length": 165.515625, + "completions/mean_terminated_length": 162.84323120117188, + "completions/min_length": 89.625, + "completions/min_terminated_length": 89.625, + "entropy": 0.06814676942303777, + "epoch": 0.6338329764453962, + "frac_reward_zero_std": 0.8125, + "grad_norm": 6.625, + "learning_rate": 7.484132362554915e-06, + "loss": 0.163, + "num_tokens": 3059668.0, + "reward": 3.490234375, + "reward_std": 0.22097087278962135, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.8542027324438095, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.0625, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.06822281517088413, + "rewards/xmlcount_reward_func/mean": 0.494140625, + "rewards/xmlcount_reward_func/std": 0.020961953792721033, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 289.625, + "completions/max_terminated_length": 228.75, + "completions/mean_length": 145.8671875, + "completions/mean_terminated_length": 140.4114589691162, + "completions/min_length": 80.75, + "completions/min_terminated_length": 80.75, + "entropy": 0.06801354000344872, + "epoch": 0.6423982869379015, + "frac_reward_zero_std": 0.8125, + "grad_norm": 24.875, + "learning_rate": 7.1957261069399745e-06, + "loss": 0.1365, + "num_tokens": 3089965.0, + "reward": 3.5849609375, + "reward_std": 0.26378397084772587, + "rewards/correctness_reward_func/mean": 1.671875, + "rewards/correctness_reward_func/std": 0.7019384130835533, + "rewards/int_reward_func/mean": 0.47265625, + "rewards/int_reward_func/std": 0.08769455552101135, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.025194555521011353, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.06492366641759872, + "rewards/xmlcount_reward_func/mean": 0.4794921875, + "rewards/xmlcount_reward_func/std": 0.05274027772247791, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.625, + "completions/max_terminated_length": 276.625, + "completions/mean_length": 151.796875, + "completions/mean_terminated_length": 151.796875, + "completions/min_length": 80.125, + "completions/min_terminated_length": 80.125, + "entropy": 0.06695270165801048, + "epoch": 0.6509635974304069, + "frac_reward_zero_std": 0.90625, + "grad_norm": 4.125, + "learning_rate": 6.909830056250527e-06, + "loss": -0.0215, + "num_tokens": 3121975.0, + "reward": 3.591796875, + "reward_std": 0.12153397500514984, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.6680332496762276, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.03125, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.4921875, + "rewards/strict_format_reward_func/std": 0.021347815170884132, + "rewards/xmlcount_reward_func/mean": 0.498046875, + "rewards/xmlcount_reward_func/std": 0.005336953792721033, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 334.375, + "completions/max_terminated_length": 308.625, + "completions/mean_length": 162.4765625, + "completions/mean_terminated_length": 157.2375030517578, + "completions/min_length": 79.25, + "completions/min_terminated_length": 79.25, + "entropy": 0.06329814763739705, + "epoch": 0.6595289079229122, + "frac_reward_zero_std": 0.796875, + "grad_norm": 19.25, + "learning_rate": 6.6267001261717015e-06, + "loss": 0.1003, + "num_tokens": 3154864.0, + "reward": 3.5283203125, + "reward_std": 0.3135024178773165, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.6435378566384315, + "rewards/int_reward_func/mean": 0.48046875, + "rewards/int_reward_func/std": 0.05644455552101135, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.025194555521011353, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.05644455552101135, + "rewards/xmlcount_reward_func/mean": 0.4853515625, + "rewards/xmlcount_reward_func/std": 0.03691330552101135, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 296.875, + "completions/max_terminated_length": 238.875, + "completions/mean_length": 151.8203125, + "completions/mean_terminated_length": 146.38125228881836, + "completions/min_length": 76.5, + "completions/min_terminated_length": 76.5, + "entropy": 0.05917734792456031, + "epoch": 0.6680942184154176, + "frac_reward_zero_std": 0.796875, + "grad_norm": 10.8125, + "learning_rate": 6.34658975633605e-06, + "loss": 0.1964, + "num_tokens": 3186189.0, + "reward": 3.5595703125, + "reward_std": 0.22511406615376472, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.8085274025797844, + "rewards/int_reward_func/mean": 0.48828125, + "rewards/int_reward_func/std": 0.046875, + "rewards/soft_format_reward_func/mean": 0.4921875, + "rewards/soft_format_reward_func/std": 0.03125, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.078125, + "rewards/xmlcount_reward_func/mean": 0.4892578125, + "rewards/xmlcount_reward_func/std": 0.04296875, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 340.0, + "completions/max_terminated_length": 328.75, + "completions/mean_length": 157.328125, + "completions/mean_terminated_length": 154.62604331970215, + "completions/min_length": 82.5, + "completions/min_terminated_length": 82.5, + "entropy": 0.07690681796520948, + "epoch": 0.6766595289079229, + "frac_reward_zero_std": 0.875, + "grad_norm": 38.0, + "learning_rate": 6.069749683460765e-06, + "loss": 0.0732, + "num_tokens": 3218069.0, + "reward": 3.439453125, + "reward_std": 0.21820873208343983, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.8027089610695839, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.07073915377259254, + "rewards/soft_format_reward_func/mean": 0.47265625, + "rewards/soft_format_reward_func/std": 0.0660141110420227, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.07173692621290684, + "rewards/xmlcount_reward_func/mean": 0.470703125, + "rewards/xmlcount_reward_func/std": 0.06738616153597832, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 296.25, + "completions/max_terminated_length": 270.25, + "completions/mean_length": 157.2421875, + "completions/mean_terminated_length": 148.20973587036133, + "completions/min_length": 79.875, + "completions/min_terminated_length": 79.875, + "entropy": 0.06810100981965661, + "epoch": 0.6852248394004282, + "frac_reward_zero_std": 0.859375, + "grad_norm": 13.0, + "learning_rate": 5.796427716904347e-06, + "loss": -0.0428, + "num_tokens": 3250002.0, + "reward": 3.4755859375, + "reward_std": 0.18920630402863026, + "rewards/correctness_reward_func/mean": 1.546875, + "rewards/correctness_reward_func/std": 0.8281612768769264, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.08351518586277962, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.025194555521011353, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.04081955552101135, + "rewards/xmlcount_reward_func/mean": 0.4873046875, + "rewards/xmlcount_reward_func/std": 0.029100805521011353, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 295.0, + "completions/max_terminated_length": 241.75, + "completions/mean_length": 148.1953125, + "completions/mean_terminated_length": 139.70900535583496, + "completions/min_length": 78.5, + "completions/min_terminated_length": 78.5, + "entropy": 0.06864482956007123, + "epoch": 0.6937901498929336, + "frac_reward_zero_std": 0.8125, + "grad_norm": 9.4375, + "learning_rate": 5.526868516843673e-06, + "loss": 0.3268, + "num_tokens": 3280593.0, + "reward": 3.451171875, + "reward_std": 0.2458300832659006, + "rewards/correctness_reward_func/mean": 1.484375, + "rewards/correctness_reward_func/std": 0.8385421559214592, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.021347815170884132, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.48828125, + "rewards/strict_format_reward_func/std": 0.03697281517088413, + "rewards/xmlcount_reward_func/mean": 0.490234375, + "rewards/xmlcount_reward_func/std": 0.030614666640758514, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 376.625, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 170.5625, + "completions/mean_terminated_length": 156.5812530517578, + "completions/min_length": 78.375, + "completions/min_terminated_length": 78.375, + "entropy": 0.05719508696347475, + "epoch": 0.702355460385439, + "frac_reward_zero_std": 0.796875, + "grad_norm": 21.75, + "learning_rate": 5.2613133752700145e-06, + "loss": 0.3157, + "num_tokens": 3314333.0, + "reward": 3.5078125, + "reward_std": 0.27345145121216774, + "rewards/correctness_reward_func/mean": 1.609375, + "rewards/correctness_reward_func/std": 0.6161131635308266, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.08923800103366375, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.05644455552101135, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.06789018586277962, + "rewards/xmlcount_reward_func/mean": 0.48046875, + "rewards/xmlcount_reward_func/std": 0.049400702118873596, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 353.0, + "completions/max_terminated_length": 253.125, + "completions/mean_length": 157.8515625, + "completions/mean_terminated_length": 149.17083740234375, + "completions/min_length": 74.875, + "completions/min_terminated_length": 74.875, + "entropy": 0.06622256385162473, + "epoch": 0.7109207708779444, + "frac_reward_zero_std": 0.8125, + "grad_norm": 15.875, + "learning_rate": 5.000000000000003e-06, + "loss": 0.292, + "num_tokens": 3346592.0, + "reward": 3.5224609375, + "reward_std": 0.3217888306826353, + "rewards/correctness_reward_func/mean": 1.625, + "rewards/correctness_reward_func/std": 0.654181070625782, + "rewards/int_reward_func/mean": 0.48046875, + "rewards/int_reward_func/std": 0.05644455552101135, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.05920085124671459, + "rewards/strict_format_reward_func/mean": 0.46484375, + "rewards/strict_format_reward_func/std": 0.10607585124671459, + "rewards/xmlcount_reward_func/mean": 0.4755859375, + "rewards/xmlcount_reward_func/std": 0.06840440817177296, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 409.75, + "completions/max_terminated_length": 250.875, + "completions/mean_length": 167.9375, + "completions/mean_terminated_length": 148.38951301574707, + "completions/min_length": 73.125, + "completions/min_terminated_length": 73.125, + "entropy": 0.05832461267709732, + "epoch": 0.7194860813704497, + "frac_reward_zero_std": 0.828125, + "grad_norm": 9.875, + "learning_rate": 4.743162301894952e-06, + "loss": 0.4018, + "num_tokens": 3380156.0, + "reward": 3.4765625, + "reward_std": 0.23201941419392824, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.8384527564048767, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.07394563034176826, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.05259781517088413, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.0952934455126524, + "rewards/xmlcount_reward_func/mean": 0.484375, + "rewards/xmlcount_reward_func/std": 0.05507336184382439, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 318.375, + "completions/max_terminated_length": 290.375, + "completions/mean_length": 158.484375, + "completions/mean_terminated_length": 152.8973217010498, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.07197121158242226, + "epoch": 0.728051391862955, + "frac_reward_zero_std": 0.78125, + "grad_norm": 6.625, + "learning_rate": 4.491030185478976e-06, + "loss": 0.2278, + "num_tokens": 3413046.0, + "reward": 3.6376953125, + "reward_std": 0.2803567871451378, + "rewards/correctness_reward_func/mean": 1.671875, + "rewards/correctness_reward_func/std": 0.6848579198122025, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.04081955552101135, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.04081955552101135, + "rewards/xmlcount_reward_func/mean": 0.4970703125, + "rewards/xmlcount_reward_func/std": 0.01171875, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 370.25, + "completions/max_terminated_length": 291.125, + "completions/mean_length": 175.3828125, + "completions/mean_terminated_length": 161.8246307373047, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "entropy": 0.06557085691019893, + "epoch": 0.7366167023554604, + "frac_reward_zero_std": 0.734375, + "grad_norm": 13.4375, + "learning_rate": 4.2438293431432665e-06, + "loss": 0.3556, + "num_tokens": 3447325.0, + "reward": 3.4638671875, + "reward_std": 0.30521600786596537, + "rewards/correctness_reward_func/mean": 1.5625, + "rewards/correctness_reward_func/std": 0.7508078292012215, + "rewards/int_reward_func/mean": 0.46875, + "rewards/int_reward_func/std": 0.06689241342246532, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.04081955552101135, + "rewards/strict_format_reward_func/mean": 0.46484375, + "rewards/strict_format_reward_func/std": 0.08627148158848286, + "rewards/xmlcount_reward_func/mean": 0.4833984375, + "rewards/xmlcount_reward_func/std": 0.03715440817177296, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 423.75, + "completions/max_terminated_length": 293.125, + "completions/mean_length": 168.0390625, + "completions/mean_terminated_length": 154.1963596343994, + "completions/min_length": 77.625, + "completions/min_terminated_length": 77.625, + "entropy": 0.06665381882339716, + "epoch": 0.7451820128479657, + "frac_reward_zero_std": 0.796875, + "grad_norm": 8.625, + "learning_rate": 4.001781053120863e-06, + "loss": 0.2807, + "num_tokens": 3481046.0, + "reward": 3.435546875, + "reward_std": 0.2762135900557041, + "rewards/correctness_reward_func/mean": 1.515625, + "rewards/correctness_reward_func/std": 0.7195080667734146, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.09011822193861008, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.046875, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.078125, + "rewards/xmlcount_reward_func/mean": 0.490234375, + "rewards/xmlcount_reward_func/std": 0.0390625, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 390.75, + "completions/max_terminated_length": 273.625, + "completions/mean_length": 171.125, + "completions/mean_terminated_length": 160.2234401702881, + "completions/min_length": 88.25, + "completions/min_terminated_length": 88.25, + "entropy": 0.060074358712881804, + "epoch": 0.7537473233404711, + "frac_reward_zero_std": 0.796875, + "grad_norm": 6.71875, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.3052, + "num_tokens": 3515172.0, + "reward": 3.5400390625, + "reward_std": 0.2748325187712908, + "rewards/correctness_reward_func/mean": 1.59375, + "rewards/correctness_reward_func/std": 0.7068260312080383, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.062167370691895485, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.078125, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.02734375, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 339.25, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 155.5546875, + "completions/mean_terminated_length": 143.96582794189453, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.06739555345848203, + "epoch": 0.7623126338329764, + "frac_reward_zero_std": 0.84375, + "grad_norm": 7.3125, + "learning_rate": 3.534003987842005e-06, + "loss": 0.0879, + "num_tokens": 3546821.0, + "reward": 3.49609375, + "reward_std": 0.14915533305611461, + "rewards/correctness_reward_func/mean": 1.578125, + "rewards/correctness_reward_func/std": 0.8041466698050499, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.11476518586277962, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.03697281517088413, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.05644455552101135, + "rewards/xmlcount_reward_func/mean": 0.48828125, + "rewards/xmlcount_reward_func/std": 0.03697281517088413, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 367.0, + "completions/max_terminated_length": 264.625, + "completions/mean_length": 164.6796875, + "completions/mean_terminated_length": 150.61161041259766, + "completions/min_length": 72.5, + "completions/min_terminated_length": 72.5, + "entropy": 0.06468326412141323, + "epoch": 0.7708779443254818, + "frac_reward_zero_std": 0.78125, + "grad_norm": 17.75, + "learning_rate": 3.308693936411421e-06, + "loss": 0.1794, + "num_tokens": 3580028.0, + "reward": 3.33203125, + "reward_std": 0.27621358446776867, + "rewards/correctness_reward_func/mean": 1.453125, + "rewards/correctness_reward_func/std": 0.8369380235671997, + "rewards/int_reward_func/mean": 0.45703125, + "rewards/int_reward_func/std": 0.11861192621290684, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.04081955552101135, + "rewards/strict_format_reward_func/mean": 0.4609375, + "rewards/strict_format_reward_func/std": 0.10298692621290684, + "rewards/xmlcount_reward_func/mean": 0.4765625, + "rewards/xmlcount_reward_func/std": 0.061410133726894855, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 376.125, + "completions/max_terminated_length": 301.625, + "completions/mean_length": 162.484375, + "completions/mean_terminated_length": 153.9395866394043, + "completions/min_length": 76.375, + "completions/min_terminated_length": 76.375, + "entropy": 0.062019561883062124, + "epoch": 0.7794432548179872, + "frac_reward_zero_std": 0.765625, + "grad_norm": 9.0, + "learning_rate": 3.089373510131354e-06, + "loss": 0.2742, + "num_tokens": 3613084.0, + "reward": 3.546875, + "reward_std": 0.2872621323913336, + "rewards/correctness_reward_func/mean": 1.578125, + "rewards/correctness_reward_func/std": 0.8067077249288559, + "rewards/int_reward_func/mean": 0.4921875, + "rewards/int_reward_func/std": 0.03125, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.484375, + "rewards/strict_format_reward_func/std": 0.05259781517088413, + "rewards/xmlcount_reward_func/mean": 0.49609375, + "rewards/xmlcount_reward_func/std": 0.013149453792721033, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 385.625, + "completions/max_terminated_length": 279.125, + "completions/mean_length": 182.7734375, + "completions/mean_terminated_length": 163.5002956390381, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.060410378966480494, + "epoch": 0.7880085653104925, + "frac_reward_zero_std": 0.796875, + "grad_norm": 15.125, + "learning_rate": 2.876239030486554e-06, + "loss": 0.2067, + "num_tokens": 3649339.0, + "reward": 3.423828125, + "reward_std": 0.31488349102437496, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.8038287088274956, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.10904237069189548, + "rewards/soft_format_reward_func/mean": 0.4765625, + "rewards/soft_format_reward_func/std": 0.07206955552101135, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.07779237069189548, + "rewards/xmlcount_reward_func/mean": 0.478515625, + "rewards/xmlcount_reward_func/std": 0.060735128819942474, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 425.5, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 188.359375, + "completions/mean_terminated_length": 175.17136001586914, + "completions/min_length": 89.5, + "completions/min_terminated_length": 89.5, + "entropy": 0.0746710579842329, + "epoch": 0.7965738758029979, + "frac_reward_zero_std": 0.78125, + "grad_norm": 8.125, + "learning_rate": 2.669481281701739e-06, + "loss": 0.3599, + "num_tokens": 3686281.0, + "reward": 3.3671875, + "reward_std": 0.3425048552453518, + "rewards/correctness_reward_func/mean": 1.453125, + "rewards/correctness_reward_func/std": 0.8768203780055046, + "rewards/int_reward_func/mean": 0.46484375, + "rewards/int_reward_func/std": 0.10904237069189548, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.0625, + "rewards/strict_format_reward_func/mean": 0.4765625, + "rewards/strict_format_reward_func/std": 0.08384781517088413, + "rewards/xmlcount_reward_func/mean": 0.48828125, + "rewards/xmlcount_reward_func/std": 0.04335307329893112, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 304.0, + "completions/max_terminated_length": 273.125, + "completions/mean_length": 151.4765625, + "completions/mean_terminated_length": 148.6171875, + "completions/min_length": 74.375, + "completions/min_terminated_length": 74.375, + "entropy": 0.06636462640017271, + "epoch": 0.8051391862955032, + "frac_reward_zero_std": 0.78125, + "grad_norm": 6.34375, + "learning_rate": 2.469285339963892e-06, + "loss": 0.0483, + "num_tokens": 3717830.0, + "reward": 3.517578125, + "reward_std": 0.32869415916502476, + "rewards/correctness_reward_func/mean": 1.53125, + "rewards/correctness_reward_func/std": 0.809794619679451, + "rewards/int_reward_func/mean": 0.49609375, + "rewards/int_reward_func/std": 0.015625, + "rewards/soft_format_reward_func/mean": 0.49609375, + "rewards/soft_format_reward_func/std": 0.015625, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.498046875, + "rewards/xmlcount_reward_func/std": 0.0078125, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 319.125, + "completions/max_terminated_length": 247.375, + "completions/mean_length": 158.1015625, + "completions/mean_terminated_length": 141.4641580581665, + "completions/min_length": 79.625, + "completions/min_terminated_length": 79.625, + "entropy": 0.06498824106529355, + "epoch": 0.8137044967880086, + "frac_reward_zero_std": 0.828125, + "grad_norm": 15.0625, + "learning_rate": 2.275830407754006e-06, + "loss": 0.2777, + "num_tokens": 3750065.0, + "reward": 3.41796875, + "reward_std": 0.2485922183841467, + "rewards/correctness_reward_func/mean": 1.5, + "rewards/correctness_reward_func/std": 0.842106930911541, + "rewards/int_reward_func/mean": 0.4765625, + "rewards/int_reward_func/std": 0.04554459825158119, + "rewards/soft_format_reward_func/mean": 0.48046875, + "rewards/soft_format_reward_func/std": 0.04357585124671459, + "rewards/strict_format_reward_func/mean": 0.4765625, + "rewards/strict_format_reward_func/std": 0.04554459825158119, + "rewards/xmlcount_reward_func/mean": 0.484375, + "rewards/xmlcount_reward_func/std": 0.03324815817177296, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 324.25, + "completions/max_terminated_length": 257.375, + "completions/mean_length": 161.9140625, + "completions/mean_terminated_length": 153.45208549499512, + "completions/min_length": 83.75, + "completions/min_terminated_length": 83.75, + "entropy": 0.06923280376940966, + "epoch": 0.8222698072805139, + "frac_reward_zero_std": 0.84375, + "grad_norm": 7.21875, + "learning_rate": 2.08928965343659e-06, + "loss": 0.0191, + "num_tokens": 3783210.0, + "reward": 3.6064453125, + "reward_std": 0.2085412573069334, + "rewards/correctness_reward_func/mean": 1.65625, + "rewards/correctness_reward_func/std": 0.7442077249288559, + "rewards/int_reward_func/mean": 0.48046875, + "rewards/int_reward_func/std": 0.06822281517088413, + "rewards/soft_format_reward_func/mean": 0.48828125, + "rewards/soft_format_reward_func/std": 0.03697281517088413, + "rewards/strict_format_reward_func/mean": 0.48828125, + "rewards/strict_format_reward_func/std": 0.03697281517088413, + "rewards/xmlcount_reward_func/mean": 0.4931640625, + "rewards/xmlcount_reward_func/std": 0.021456445567309856, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 294.0, + "completions/max_terminated_length": 224.5, + "completions/mean_length": 143.0703125, + "completions/mean_terminated_length": 137.4109401702881, + "completions/min_length": 66.125, + "completions/min_terminated_length": 66.125, + "entropy": 0.06562586454674602, + "epoch": 0.8308351177730193, + "frac_reward_zero_std": 0.90625, + "grad_norm": 4.4375, + "learning_rate": 1.9098300562505266e-06, + "loss": -0.0738, + "num_tokens": 3813537.0, + "reward": 3.6640625, + "reward_std": 0.14086892642080784, + "rewards/correctness_reward_func/mean": 1.734375, + "rewards/correctness_reward_func/std": 0.5065634250640869, + "rewards/int_reward_func/mean": 0.48046875, + "rewards/int_reward_func/std": 0.046542370691895485, + "rewards/soft_format_reward_func/mean": 0.484375, + "rewards/soft_format_reward_func/std": 0.04081955552101135, + "rewards/strict_format_reward_func/mean": 0.48046875, + "rewards/strict_format_reward_func/std": 0.05644455552101135, + "rewards/xmlcount_reward_func/mean": 0.484375, + "rewards/xmlcount_reward_func/std": 0.04081955552101135, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.75, + "completions/max_terminated_length": 262.75, + "completions/mean_length": 146.15625, + "completions/mean_terminated_length": 146.15625, + "completions/min_length": 77.625, + "completions/min_terminated_length": 77.625, + "entropy": 0.05933028785511851, + "epoch": 0.8394004282655246, + "frac_reward_zero_std": 0.828125, + "grad_norm": 8.5, + "learning_rate": 1.7376122568400533e-06, + "loss": -0.0062, + "num_tokens": 3844527.0, + "reward": 3.7138671875, + "reward_std": 0.21682766266167164, + "rewards/correctness_reward_func/mean": 1.734375, + "rewards/correctness_reward_func/std": 0.5849205926060677, + "rewards/int_reward_func/mean": 0.484375, + "rewards/int_reward_func/std": 0.05259781517088413, + "rewards/soft_format_reward_func/mean": 0.5, + "rewards/soft_format_reward_func/std": 0.0, + "rewards/strict_format_reward_func/mean": 0.49609375, + "rewards/strict_format_reward_func/std": 0.015625, + "rewards/xmlcount_reward_func/mean": 0.4990234375, + "rewards/xmlcount_reward_func/std": 0.00390625, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 298.125, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 152.359375, + "completions/mean_terminated_length": 149.32135486602783, + "completions/min_length": 76.125, + "completions/min_terminated_length": 76.125, + "entropy": 0.07287971116602421, + "epoch": 0.8479657387580299, + "frac_reward_zero_std": 0.78125, + "grad_norm": 35.5, + "learning_rate": 1.5727904134596084e-06, + "loss": 0.0803, + "num_tokens": 3875861.0, + "reward": 3.2978515625, + "reward_std": 0.3632208569906652, + "rewards/correctness_reward_func/mean": 1.421875, + "rewards/correctness_reward_func/std": 0.9180332496762276, + "rewards/int_reward_func/mean": 0.4609375, + "rewards/int_reward_func/std": 0.10298692621290684, + "rewards/soft_format_reward_func/mean": 0.47265625, + "rewards/soft_format_reward_func/std": 0.0660141110420227, + "rewards/strict_format_reward_func/mean": 0.47265625, + "rewards/strict_format_reward_func/std": 0.0660141110420227, + "rewards/xmlcount_reward_func/mean": 0.4697265625, + "rewards/xmlcount_reward_func/std": 0.06928502768278122, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 399.5, + "completions/max_terminated_length": 313.125, + "completions/mean_length": 182.84375, + "completions/mean_terminated_length": 160.92637634277344, + "completions/min_length": 84.375, + "completions/min_terminated_length": 84.375, + "entropy": 0.07499845931306481, + "epoch": 0.8565310492505354, + "frac_reward_zero_std": 0.734375, + "grad_norm": 15.375, + "learning_rate": 1.4155120639813392e-06, + "loss": 0.1946, + "num_tokens": 3911453.0, + "reward": 3.2890625, + "reward_std": 0.40603396110236645, + "rewards/correctness_reward_func/mean": 1.421875, + "rewards/correctness_reward_func/std": 0.9128188416361809, + "rewards/int_reward_func/mean": 0.44921875, + "rewards/int_reward_func/std": 0.10904237069189548, + "rewards/soft_format_reward_func/mean": 0.47265625, + "rewards/soft_format_reward_func/std": 0.06116959825158119, + "rewards/strict_format_reward_func/mean": 0.46875, + "rewards/strict_format_reward_func/std": 0.07679459825158119, + "rewards/xmlcount_reward_func/mean": 0.4765625, + "rewards/xmlcount_reward_func/std": 0.05072539113461971, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 117, + "num_input_tokens_seen": 3911453, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}