| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.25938735177865613, |
| "eval_steps": 500, |
| "global_step": 1050, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 176.71875, |
| "epoch": 0.00024703557312252963, |
| "grad_norm": 3.9802175845327943, |
| "kl": 0.0, |
| "learning_rate": 1e-06, |
| "loss": -0.0, |
| "reward": 1.6392865180969238, |
| "reward_std": 0.3521379828453064, |
| "rewards/accuracy_reward_stage2": 0.3434532582759857, |
| "rewards/format_reward_all_stage": 1.2958333492279053, |
| "scores/refine_times": 1.84375, |
| "step": 1 |
| }, |
| { |
| "completion_length": 105.453125, |
| "epoch": 0.0004940711462450593, |
| "grad_norm": 8.551880775801044, |
| "kl": 0.00066375732421875, |
| "learning_rate": 9.997529644268775e-07, |
| "loss": 0.0, |
| "reward": 1.9555529356002808, |
| "reward_std": 0.5750938653945923, |
| "rewards/accuracy_reward_stage2": 0.5399278998374939, |
| "rewards/format_reward_all_stage": 1.4156250953674316, |
| "scores/refine_times": 1.484375, |
| "step": 2 |
| }, |
| { |
| "completion_length": 147.21875, |
| "epoch": 0.0007411067193675889, |
| "grad_norm": 7.326960101035143, |
| "kl": 0.0022125244140625, |
| "learning_rate": 9.995059288537548e-07, |
| "loss": 0.0001, |
| "reward": 1.696824312210083, |
| "reward_std": 0.7795395851135254, |
| "rewards/accuracy_reward_stage2": 0.5530742406845093, |
| "rewards/format_reward_all_stage": 1.1437499523162842, |
| "scores/refine_times": 1.703125, |
| "step": 3 |
| }, |
| { |
| "completion_length": 208.84375, |
| "epoch": 0.0009881422924901185, |
| "grad_norm": 4.54458264965591, |
| "kl": 0.00144195556640625, |
| "learning_rate": 9.992588932806324e-07, |
| "loss": 0.0001, |
| "reward": 1.765295386314392, |
| "reward_std": 0.8180840611457825, |
| "rewards/accuracy_reward_stage2": 0.4215453863143921, |
| "rewards/format_reward_all_stage": 1.34375, |
| "scores/refine_times": 2.09375, |
| "step": 4 |
| }, |
| { |
| "completion_length": 187.09375, |
| "epoch": 0.0012351778656126482, |
| "grad_norm": 4.889281322875834, |
| "kl": 0.00213623046875, |
| "learning_rate": 9.9901185770751e-07, |
| "loss": 0.0001, |
| "reward": 1.936046838760376, |
| "reward_std": 0.6565989255905151, |
| "rewards/accuracy_reward_stage2": 0.44438010454177856, |
| "rewards/format_reward_all_stage": 1.4916666746139526, |
| "scores/refine_times": 1.875, |
| "step": 5 |
| }, |
| { |
| "completion_length": 144.9375, |
| "epoch": 0.0014822134387351778, |
| "grad_norm": 3.650312537503493, |
| "kl": 0.005859375, |
| "learning_rate": 9.987648221343872e-07, |
| "loss": 0.0002, |
| "reward": 2.015610694885254, |
| "reward_std": 0.5393483638763428, |
| "rewards/accuracy_reward_stage2": 0.648944079875946, |
| "rewards/format_reward_all_stage": 1.366666555404663, |
| "scores/refine_times": 1.6875, |
| "step": 6 |
| }, |
| { |
| "completion_length": 206.09375, |
| "epoch": 0.0017292490118577075, |
| "grad_norm": 3.88597528891217, |
| "kl": 0.006591796875, |
| "learning_rate": 9.985177865612648e-07, |
| "loss": 0.0003, |
| "reward": 1.4110958576202393, |
| "reward_std": 0.4156912863254547, |
| "rewards/accuracy_reward_stage2": 0.3126583695411682, |
| "rewards/format_reward_all_stage": 1.0984375476837158, |
| "scores/refine_times": 2.09375, |
| "step": 7 |
| }, |
| { |
| "completion_length": 190.78125, |
| "epoch": 0.001976284584980237, |
| "grad_norm": 3.7825448304674785, |
| "kl": 0.0047607421875, |
| "learning_rate": 9.982707509881423e-07, |
| "loss": 0.0002, |
| "reward": 1.8658943176269531, |
| "reward_std": 0.33927106857299805, |
| "rewards/accuracy_reward_stage2": 0.32214438915252686, |
| "rewards/format_reward_all_stage": 1.5437500476837158, |
| "scores/refine_times": 2.0625, |
| "step": 8 |
| }, |
| { |
| "completion_length": 245.578125, |
| "epoch": 0.002223320158102767, |
| "grad_norm": 3.272740004919561, |
| "kl": 0.00537109375, |
| "learning_rate": 9.980237154150196e-07, |
| "loss": 0.0002, |
| "reward": 1.6504933834075928, |
| "reward_std": 0.617614209651947, |
| "rewards/accuracy_reward_stage2": 0.34736835956573486, |
| "rewards/format_reward_all_stage": 1.303125023841858, |
| "scores/refine_times": 2.390625, |
| "step": 9 |
| }, |
| { |
| "completion_length": 190.5625, |
| "epoch": 0.0024703557312252965, |
| "grad_norm": 3.01022489508433, |
| "kl": 0.01092529296875, |
| "learning_rate": 9.977766798418972e-07, |
| "loss": 0.0004, |
| "reward": 2.478713035583496, |
| "reward_std": 0.38603711128234863, |
| "rewards/accuracy_reward_stage2": 0.663608968257904, |
| "rewards/format_reward_all_stage": 1.8151041269302368, |
| "scores/refine_times": 2.171875, |
| "step": 10 |
| }, |
| { |
| "completion_length": 155.546875, |
| "epoch": 0.002717391304347826, |
| "grad_norm": 3.9877146575128064, |
| "kl": 0.0174560546875, |
| "learning_rate": 9.975296442687747e-07, |
| "loss": 0.0007, |
| "reward": 2.293698787689209, |
| "reward_std": 0.36475634574890137, |
| "rewards/accuracy_reward_stage2": 0.5129696726799011, |
| "rewards/format_reward_all_stage": 1.7807291746139526, |
| "scores/refine_times": 1.5625, |
| "step": 11 |
| }, |
| { |
| "completion_length": 183.53125, |
| "epoch": 0.0029644268774703555, |
| "grad_norm": 3.396300273793604, |
| "kl": 0.014404296875, |
| "learning_rate": 9.972826086956523e-07, |
| "loss": 0.0006, |
| "reward": 2.3064093589782715, |
| "reward_std": 0.37103700637817383, |
| "rewards/accuracy_reward_stage2": 0.4647427201271057, |
| "rewards/format_reward_all_stage": 1.841666579246521, |
| "scores/refine_times": 1.859375, |
| "step": 12 |
| }, |
| { |
| "completion_length": 123.828125, |
| "epoch": 0.0032114624505928855, |
| "grad_norm": 6.558836134964146, |
| "kl": 0.12060546875, |
| "learning_rate": 9.970355731225296e-07, |
| "loss": 0.0048, |
| "reward": 2.1412835121154785, |
| "reward_std": 0.298098087310791, |
| "rewards/accuracy_reward_stage2": 0.4058668315410614, |
| "rewards/format_reward_all_stage": 1.7354166507720947, |
| "scores/refine_times": 1.265625, |
| "step": 13 |
| }, |
| { |
| "completion_length": 160.40625, |
| "epoch": 0.003458498023715415, |
| "grad_norm": 3.333202019037145, |
| "kl": 0.017333984375, |
| "learning_rate": 9.967885375494071e-07, |
| "loss": 0.0007, |
| "reward": 2.3971457481384277, |
| "reward_std": 0.4326985478401184, |
| "rewards/accuracy_reward_stage2": 0.5752710103988647, |
| "rewards/format_reward_all_stage": 1.821874976158142, |
| "scores/refine_times": 1.578125, |
| "step": 14 |
| }, |
| { |
| "completion_length": 138.5625, |
| "epoch": 0.0037055335968379445, |
| "grad_norm": 3.981989824534079, |
| "kl": 0.031982421875, |
| "learning_rate": 9.965415019762845e-07, |
| "loss": 0.0013, |
| "reward": 2.39411997795105, |
| "reward_std": 0.3097790479660034, |
| "rewards/accuracy_reward_stage2": 0.5972450375556946, |
| "rewards/format_reward_all_stage": 1.796875, |
| "scores/refine_times": 1.65625, |
| "step": 15 |
| }, |
| { |
| "completion_length": 126.546875, |
| "epoch": 0.003952569169960474, |
| "grad_norm": 4.685808632727751, |
| "kl": 0.024169921875, |
| "learning_rate": 9.96294466403162e-07, |
| "loss": 0.001, |
| "reward": 2.398407220840454, |
| "reward_std": 0.23343093693256378, |
| "rewards/accuracy_reward_stage2": 0.5484069585800171, |
| "rewards/format_reward_all_stage": 1.850000023841858, |
| "scores/refine_times": 1.25, |
| "step": 16 |
| }, |
| { |
| "completion_length": 121.65625, |
| "epoch": 0.004199604743083004, |
| "grad_norm": 4.846282709405759, |
| "kl": 0.0252685546875, |
| "learning_rate": 9.960474308300395e-07, |
| "loss": 0.001, |
| "reward": 2.297874689102173, |
| "reward_std": 0.37787097692489624, |
| "rewards/accuracy_reward_stage2": 0.4603745937347412, |
| "rewards/format_reward_all_stage": 1.837499976158142, |
| "scores/refine_times": 1.328125, |
| "step": 17 |
| }, |
| { |
| "completion_length": 130.1875, |
| "epoch": 0.004446640316205534, |
| "grad_norm": 3.377534489669681, |
| "kl": 0.021728515625, |
| "learning_rate": 9.958003952569169e-07, |
| "loss": 0.0009, |
| "reward": 2.5723018646240234, |
| "reward_std": 0.24451476335525513, |
| "rewards/accuracy_reward_stage2": 0.6816768050193787, |
| "rewards/format_reward_all_stage": 1.890625, |
| "scores/refine_times": 1.546875, |
| "step": 18 |
| }, |
| { |
| "completion_length": 124.125, |
| "epoch": 0.004693675889328063, |
| "grad_norm": 4.888604917487123, |
| "kl": 0.0260009765625, |
| "learning_rate": 9.955533596837944e-07, |
| "loss": 0.001, |
| "reward": 2.2013630867004395, |
| "reward_std": 0.43003690242767334, |
| "rewards/accuracy_reward_stage2": 0.4992799162864685, |
| "rewards/format_reward_all_stage": 1.7020832300186157, |
| "scores/refine_times": 1.265625, |
| "step": 19 |
| }, |
| { |
| "completion_length": 112.984375, |
| "epoch": 0.004940711462450593, |
| "grad_norm": 4.247112761669188, |
| "kl": 0.029296875, |
| "learning_rate": 9.95306324110672e-07, |
| "loss": 0.0012, |
| "reward": 2.458132028579712, |
| "reward_std": 0.45524460077285767, |
| "rewards/accuracy_reward_stage2": 0.6154236197471619, |
| "rewards/format_reward_all_stage": 1.8427083492279053, |
| "scores/refine_times": 1.421875, |
| "step": 20 |
| }, |
| { |
| "completion_length": 95.1875, |
| "epoch": 0.005187747035573123, |
| "grad_norm": 5.691563277271698, |
| "kl": 0.031494140625, |
| "learning_rate": 9.950592885375495e-07, |
| "loss": 0.0013, |
| "reward": 2.3042306900024414, |
| "reward_std": 0.20654383301734924, |
| "rewards/accuracy_reward_stage2": 0.30423077940940857, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 21 |
| }, |
| { |
| "completion_length": 77.0, |
| "epoch": 0.005434782608695652, |
| "grad_norm": 4.170049319339651, |
| "kl": 0.0517578125, |
| "learning_rate": 9.948122529644268e-07, |
| "loss": 0.0021, |
| "reward": 2.779543876647949, |
| "reward_std": 0.049611423164606094, |
| "rewards/accuracy_reward_stage2": 0.7795437574386597, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 22 |
| }, |
| { |
| "completion_length": 115.03125, |
| "epoch": 0.005681818181818182, |
| "grad_norm": 4.183045012317983, |
| "kl": 0.03662109375, |
| "learning_rate": 9.945652173913043e-07, |
| "loss": 0.0015, |
| "reward": 2.506112813949585, |
| "reward_std": 0.26072418689727783, |
| "rewards/accuracy_reward_stage2": 0.5842377543449402, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.34375, |
| "step": 23 |
| }, |
| { |
| "completion_length": 128.40625, |
| "epoch": 0.005928853754940711, |
| "grad_norm": 5.330982257283442, |
| "kl": 0.04345703125, |
| "learning_rate": 9.943181818181817e-07, |
| "loss": 0.0017, |
| "reward": 2.2350950241088867, |
| "reward_std": 0.4230126738548279, |
| "rewards/accuracy_reward_stage2": 0.494990736246109, |
| "rewards/format_reward_all_stage": 1.7401041984558105, |
| "scores/refine_times": 1.53125, |
| "step": 24 |
| }, |
| { |
| "completion_length": 124.375, |
| "epoch": 0.006175889328063241, |
| "grad_norm": 4.250442014523113, |
| "kl": 0.03564453125, |
| "learning_rate": 9.940711462450592e-07, |
| "loss": 0.0014, |
| "reward": 2.6479578018188477, |
| "reward_std": 0.10312794148921967, |
| "rewards/accuracy_reward_stage2": 0.6667079925537109, |
| "rewards/format_reward_all_stage": 1.9812500476837158, |
| "scores/refine_times": 1.390625, |
| "step": 25 |
| }, |
| { |
| "completion_length": 98.3125, |
| "epoch": 0.006422924901185771, |
| "grad_norm": 5.3796088493504195, |
| "kl": 0.0732421875, |
| "learning_rate": 9.938241106719368e-07, |
| "loss": 0.0029, |
| "reward": 2.4852514266967773, |
| "reward_std": 0.17395630478858948, |
| "rewards/accuracy_reward_stage2": 0.5102513432502747, |
| "rewards/format_reward_all_stage": 1.975000023841858, |
| "scores/refine_times": 1.125, |
| "step": 26 |
| }, |
| { |
| "completion_length": 95.625, |
| "epoch": 0.0066699604743083, |
| "grad_norm": 4.531359745756485, |
| "kl": 0.0556640625, |
| "learning_rate": 9.93577075098814e-07, |
| "loss": 0.0022, |
| "reward": 2.668905019760132, |
| "reward_std": 0.0784706324338913, |
| "rewards/accuracy_reward_stage2": 0.6689050197601318, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 27 |
| }, |
| { |
| "completion_length": 97.859375, |
| "epoch": 0.00691699604743083, |
| "grad_norm": 6.538585320360923, |
| "kl": 0.037841796875, |
| "learning_rate": 9.933300395256916e-07, |
| "loss": 0.0015, |
| "reward": 2.3841779232025146, |
| "reward_std": 0.26399004459381104, |
| "rewards/accuracy_reward_stage2": 0.40292802453041077, |
| "rewards/format_reward_all_stage": 1.9812500476837158, |
| "scores/refine_times": 1.21875, |
| "step": 28 |
| }, |
| { |
| "completion_length": 73.625, |
| "epoch": 0.00716403162055336, |
| "grad_norm": 3.595423200658827, |
| "kl": 0.06103515625, |
| "learning_rate": 9.930830039525692e-07, |
| "loss": 0.0025, |
| "reward": 2.5502519607543945, |
| "reward_std": 0.053210172802209854, |
| "rewards/accuracy_reward_stage2": 0.5502521395683289, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 29 |
| }, |
| { |
| "completion_length": 106.421875, |
| "epoch": 0.007411067193675889, |
| "grad_norm": 5.2183549887056975, |
| "kl": 0.05029296875, |
| "learning_rate": 9.928359683794467e-07, |
| "loss": 0.002, |
| "reward": 2.453716516494751, |
| "reward_std": 0.18897229433059692, |
| "rewards/accuracy_reward_stage2": 0.484966516494751, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.328125, |
| "step": 30 |
| }, |
| { |
| "completion_length": 63.25, |
| "epoch": 0.007658102766798419, |
| "grad_norm": 4.605415382726092, |
| "kl": 0.061279296875, |
| "learning_rate": 9.92588932806324e-07, |
| "loss": 0.0025, |
| "reward": 2.706653594970703, |
| "reward_std": 0.16327600181102753, |
| "rewards/accuracy_reward_stage2": 0.8316534757614136, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 31 |
| }, |
| { |
| "completion_length": 71.1875, |
| "epoch": 0.007905138339920948, |
| "grad_norm": 5.107542561053739, |
| "kl": 0.04345703125, |
| "learning_rate": 9.923418972332016e-07, |
| "loss": 0.0017, |
| "reward": 2.5002684593200684, |
| "reward_std": 0.07764653861522675, |
| "rewards/accuracy_reward_stage2": 0.5002684593200684, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 32 |
| }, |
| { |
| "completion_length": 87.984375, |
| "epoch": 0.008152173913043478, |
| "grad_norm": 5.276629249410866, |
| "kl": 0.05908203125, |
| "learning_rate": 9.920948616600791e-07, |
| "loss": 0.0024, |
| "reward": 2.567147970199585, |
| "reward_std": 0.20031458139419556, |
| "rewards/accuracy_reward_stage2": 0.582772970199585, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.171875, |
| "step": 33 |
| }, |
| { |
| "completion_length": 81.375, |
| "epoch": 0.008399209486166008, |
| "grad_norm": 6.680603340606892, |
| "kl": 0.05078125, |
| "learning_rate": 9.918478260869564e-07, |
| "loss": 0.002, |
| "reward": 2.356149435043335, |
| "reward_std": 0.3229230046272278, |
| "rewards/accuracy_reward_stage2": 0.4811493754386902, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 34 |
| }, |
| { |
| "completion_length": 66.25, |
| "epoch": 0.008646245059288538, |
| "grad_norm": 4.761831971688755, |
| "kl": 0.06787109375, |
| "learning_rate": 9.91600790513834e-07, |
| "loss": 0.0027, |
| "reward": 2.679305076599121, |
| "reward_std": 0.2129913717508316, |
| "rewards/accuracy_reward_stage2": 0.8043051362037659, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 35 |
| }, |
| { |
| "completion_length": 64.5625, |
| "epoch": 0.008893280632411068, |
| "grad_norm": 4.318337980431364, |
| "kl": 0.057861328125, |
| "learning_rate": 9.913537549407113e-07, |
| "loss": 0.0023, |
| "reward": 2.641396999359131, |
| "reward_std": 0.06798752397298813, |
| "rewards/accuracy_reward_stage2": 0.6413968801498413, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 36 |
| }, |
| { |
| "completion_length": 81.828125, |
| "epoch": 0.009140316205533596, |
| "grad_norm": 6.067477705808883, |
| "kl": 0.08447265625, |
| "learning_rate": 9.911067193675888e-07, |
| "loss": 0.0034, |
| "reward": 2.5635056495666504, |
| "reward_std": 0.1310747265815735, |
| "rewards/accuracy_reward_stage2": 0.5635056495666504, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 37 |
| }, |
| { |
| "completion_length": 68.625, |
| "epoch": 0.009387351778656126, |
| "grad_norm": 5.097067697325507, |
| "kl": 0.0498046875, |
| "learning_rate": 9.908596837944664e-07, |
| "loss": 0.002, |
| "reward": 2.657144784927368, |
| "reward_std": 0.14659970998764038, |
| "rewards/accuracy_reward_stage2": 0.7821449041366577, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 38 |
| }, |
| { |
| "completion_length": 82.484375, |
| "epoch": 0.009634387351778656, |
| "grad_norm": 5.382691839406121, |
| "kl": 0.05810546875, |
| "learning_rate": 9.90612648221344e-07, |
| "loss": 0.0023, |
| "reward": 2.6429476737976074, |
| "reward_std": 0.056348949670791626, |
| "rewards/accuracy_reward_stage2": 0.642947793006897, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 39 |
| }, |
| { |
| "completion_length": 69.4375, |
| "epoch": 0.009881422924901186, |
| "grad_norm": 5.662499218407379, |
| "kl": 0.05908203125, |
| "learning_rate": 9.903656126482212e-07, |
| "loss": 0.0024, |
| "reward": 2.6908767223358154, |
| "reward_std": 0.20418381690979004, |
| "rewards/accuracy_reward_stage2": 0.8158766627311707, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 40 |
| }, |
| { |
| "completion_length": 67.3125, |
| "epoch": 0.010128458498023716, |
| "grad_norm": 4.335602071251039, |
| "kl": 0.1328125, |
| "learning_rate": 9.901185770750988e-07, |
| "loss": 0.0053, |
| "reward": 2.38730525970459, |
| "reward_std": 0.0236817616969347, |
| "rewards/accuracy_reward_stage2": 0.38730525970458984, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 41 |
| }, |
| { |
| "completion_length": 73.0625, |
| "epoch": 0.010375494071146246, |
| "grad_norm": 4.859918176788424, |
| "kl": 0.06201171875, |
| "learning_rate": 9.898715415019763e-07, |
| "loss": 0.0025, |
| "reward": 2.676914691925049, |
| "reward_std": 0.2246585190296173, |
| "rewards/accuracy_reward_stage2": 0.801914632320404, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 42 |
| }, |
| { |
| "completion_length": 55.1875, |
| "epoch": 0.010622529644268774, |
| "grad_norm": 5.676790714973813, |
| "kl": 0.08740234375, |
| "learning_rate": 9.896245059288537e-07, |
| "loss": 0.0035, |
| "reward": 2.4531073570251465, |
| "reward_std": 0.14041699469089508, |
| "rewards/accuracy_reward_stage2": 0.45310738682746887, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 43 |
| }, |
| { |
| "completion_length": 64.1875, |
| "epoch": 0.010869565217391304, |
| "grad_norm": 4.692443081447903, |
| "kl": 0.07080078125, |
| "learning_rate": 9.893774703557312e-07, |
| "loss": 0.0028, |
| "reward": 2.5724921226501465, |
| "reward_std": 0.03363148868083954, |
| "rewards/accuracy_reward_stage2": 0.5724921226501465, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 44 |
| }, |
| { |
| "completion_length": 68.640625, |
| "epoch": 0.011116600790513834, |
| "grad_norm": 4.558828578106905, |
| "kl": 0.11376953125, |
| "learning_rate": 9.891304347826085e-07, |
| "loss": 0.0045, |
| "reward": 2.6344082355499268, |
| "reward_std": 0.07256568968296051, |
| "rewards/accuracy_reward_stage2": 0.6344083547592163, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 45 |
| }, |
| { |
| "completion_length": 61.3125, |
| "epoch": 0.011363636363636364, |
| "grad_norm": 2.223285366639568, |
| "kl": 0.09716796875, |
| "learning_rate": 9.88883399209486e-07, |
| "loss": 0.0039, |
| "reward": 2.777026414871216, |
| "reward_std": 0.004207036457955837, |
| "rewards/accuracy_reward_stage2": 0.7770264148712158, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 46 |
| }, |
| { |
| "completion_length": 56.9375, |
| "epoch": 0.011610671936758894, |
| "grad_norm": 4.425636565779575, |
| "kl": 0.111328125, |
| "learning_rate": 9.886363636363636e-07, |
| "loss": 0.0044, |
| "reward": 2.7204360961914062, |
| "reward_std": 0.07424846291542053, |
| "rewards/accuracy_reward_stage2": 0.7204362154006958, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 47 |
| }, |
| { |
| "completion_length": 57.3125, |
| "epoch": 0.011857707509881422, |
| "grad_norm": 3.8051294688279405, |
| "kl": 0.138671875, |
| "learning_rate": 9.883893280632411e-07, |
| "loss": 0.0055, |
| "reward": 2.5655875205993652, |
| "reward_std": 0.00956629030406475, |
| "rewards/accuracy_reward_stage2": 0.5655874013900757, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 48 |
| }, |
| { |
| "completion_length": 73.625, |
| "epoch": 0.012104743083003952, |
| "grad_norm": 4.088750742873491, |
| "kl": 0.10986328125, |
| "learning_rate": 9.881422924901185e-07, |
| "loss": 0.0044, |
| "reward": 2.7474145889282227, |
| "reward_std": 0.12706872820854187, |
| "rewards/accuracy_reward_stage2": 0.8099147081375122, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 82.171875, |
| "epoch": 0.012351778656126482, |
| "grad_norm": 4.328291131941699, |
| "kl": 0.12890625, |
| "learning_rate": 9.87895256916996e-07, |
| "loss": 0.0052, |
| "reward": 2.3505635261535645, |
| "reward_std": 0.19469095766544342, |
| "rewards/accuracy_reward_stage2": 0.538063645362854, |
| "rewards/format_reward_all_stage": 1.8125, |
| "scores/refine_times": 1.25, |
| "step": 50 |
| }, |
| { |
| "completion_length": 60.578125, |
| "epoch": 0.012598814229249012, |
| "grad_norm": 5.181949589681223, |
| "kl": 0.12255859375, |
| "learning_rate": 9.876482213438736e-07, |
| "loss": 0.0049, |
| "reward": 2.6888465881347656, |
| "reward_std": 0.09551382809877396, |
| "rewards/accuracy_reward_stage2": 0.6888466477394104, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 51 |
| }, |
| { |
| "completion_length": 62.3125, |
| "epoch": 0.012845849802371542, |
| "grad_norm": 5.337786602273433, |
| "kl": 0.1201171875, |
| "learning_rate": 9.874011857707509e-07, |
| "loss": 0.0048, |
| "reward": 2.4908738136291504, |
| "reward_std": 0.19815078377723694, |
| "rewards/accuracy_reward_stage2": 0.6158738136291504, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 52 |
| }, |
| { |
| "completion_length": 55.40625, |
| "epoch": 0.013092885375494072, |
| "grad_norm": 3.890124070261549, |
| "kl": 0.1103515625, |
| "learning_rate": 9.871541501976284e-07, |
| "loss": 0.0044, |
| "reward": 2.40610933303833, |
| "reward_std": 0.18748030066490173, |
| "rewards/accuracy_reward_stage2": 0.5311094522476196, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.15625, |
| "step": 53 |
| }, |
| { |
| "completion_length": 74.390625, |
| "epoch": 0.0133399209486166, |
| "grad_norm": 4.895191389307928, |
| "kl": 0.130859375, |
| "learning_rate": 9.86907114624506e-07, |
| "loss": 0.0053, |
| "reward": 2.4812636375427246, |
| "reward_std": 0.05580512434244156, |
| "rewards/accuracy_reward_stage2": 0.48126381635665894, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 54 |
| }, |
| { |
| "completion_length": 71.96875, |
| "epoch": 0.01358695652173913, |
| "grad_norm": 4.883363116264003, |
| "kl": 0.1279296875, |
| "learning_rate": 9.866600790513833e-07, |
| "loss": 0.0051, |
| "reward": 2.406097650527954, |
| "reward_std": 0.22400271892547607, |
| "rewards/accuracy_reward_stage2": 0.5935976505279541, |
| "rewards/format_reward_all_stage": 1.8125, |
| "scores/refine_times": 1.140625, |
| "step": 55 |
| }, |
| { |
| "completion_length": 85.546875, |
| "epoch": 0.01383399209486166, |
| "grad_norm": 3.868475183095971, |
| "kl": 0.11181640625, |
| "learning_rate": 9.864130434782608e-07, |
| "loss": 0.0045, |
| "reward": 2.4206976890563965, |
| "reward_std": 0.07123995572328568, |
| "rewards/accuracy_reward_stage2": 0.42694777250289917, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.40625, |
| "step": 56 |
| }, |
| { |
| "completion_length": 66.890625, |
| "epoch": 0.01408102766798419, |
| "grad_norm": 5.0953233667661095, |
| "kl": 0.0966796875, |
| "learning_rate": 9.861660079051384e-07, |
| "loss": 0.0039, |
| "reward": 2.7497503757476807, |
| "reward_std": 0.1626994013786316, |
| "rewards/accuracy_reward_stage2": 0.8747504949569702, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.078125, |
| "step": 57 |
| }, |
| { |
| "completion_length": 63.90625, |
| "epoch": 0.01432806324110672, |
| "grad_norm": 4.880185819937392, |
| "kl": 0.064453125, |
| "learning_rate": 9.85918972332016e-07, |
| "loss": 0.0026, |
| "reward": 2.616400957107544, |
| "reward_std": 0.10538823902606964, |
| "rewards/accuracy_reward_stage2": 0.6851509213447571, |
| "rewards/format_reward_all_stage": 1.931249976158142, |
| "scores/refine_times": 1.28125, |
| "step": 58 |
| }, |
| { |
| "completion_length": 57.1875, |
| "epoch": 0.01457509881422925, |
| "grad_norm": 1.809541273776412, |
| "kl": 0.0869140625, |
| "learning_rate": 9.856719367588932e-07, |
| "loss": 0.0035, |
| "reward": 2.748608112335205, |
| "reward_std": 0.009172855876386166, |
| "rewards/accuracy_reward_stage2": 0.7486082315444946, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 59 |
| }, |
| { |
| "completion_length": 73.890625, |
| "epoch": 0.014822134387351778, |
| "grad_norm": 4.654926557564929, |
| "kl": 0.111328125, |
| "learning_rate": 9.854249011857708e-07, |
| "loss": 0.0045, |
| "reward": 2.5640292167663574, |
| "reward_std": 0.03297269344329834, |
| "rewards/accuracy_reward_stage2": 0.5640289783477783, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 60 |
| }, |
| { |
| "completion_length": 86.78125, |
| "epoch": 0.015069169960474308, |
| "grad_norm": 4.5725168834876975, |
| "kl": 0.0712890625, |
| "learning_rate": 9.85177865612648e-07, |
| "loss": 0.0029, |
| "reward": 2.499513626098633, |
| "reward_std": 0.2176150381565094, |
| "rewards/accuracy_reward_stage2": 0.5849303603172302, |
| "rewards/format_reward_all_stage": 1.9145833253860474, |
| "scores/refine_times": 1.375, |
| "step": 61 |
| }, |
| { |
| "completion_length": 70.90625, |
| "epoch": 0.015316205533596838, |
| "grad_norm": 2.585985758788352, |
| "kl": 0.076171875, |
| "learning_rate": 9.849308300395256e-07, |
| "loss": 0.003, |
| "reward": 2.589341640472412, |
| "reward_std": 0.06622620671987534, |
| "rewards/accuracy_reward_stage2": 0.6424667239189148, |
| "rewards/format_reward_all_stage": 1.946874976158142, |
| "scores/refine_times": 1.171875, |
| "step": 62 |
| }, |
| { |
| "completion_length": 68.046875, |
| "epoch": 0.015563241106719368, |
| "grad_norm": 6.250671138658764, |
| "kl": 0.1904296875, |
| "learning_rate": 9.846837944664032e-07, |
| "loss": 0.0076, |
| "reward": 2.6439754962921143, |
| "reward_std": 0.31126198172569275, |
| "rewards/accuracy_reward_stage2": 0.8262671232223511, |
| "rewards/format_reward_all_stage": 1.8177083730697632, |
| "scores/refine_times": 1.078125, |
| "step": 63 |
| }, |
| { |
| "completion_length": 101.96875, |
| "epoch": 0.015810276679841896, |
| "grad_norm": 4.14719666980477, |
| "kl": 0.064453125, |
| "learning_rate": 9.844367588932805e-07, |
| "loss": 0.0026, |
| "reward": 2.47231388092041, |
| "reward_std": 0.32240432500839233, |
| "rewards/accuracy_reward_stage2": 0.7478347420692444, |
| "rewards/format_reward_all_stage": 1.7244791984558105, |
| "scores/refine_times": 1.390625, |
| "step": 64 |
| }, |
| { |
| "completion_length": 73.71875, |
| "epoch": 0.016057312252964428, |
| "grad_norm": 5.183498665871656, |
| "kl": 0.07861328125, |
| "learning_rate": 9.84189723320158e-07, |
| "loss": 0.0032, |
| "reward": 2.67384672164917, |
| "reward_std": 0.08753049373626709, |
| "rewards/accuracy_reward_stage2": 0.6738468408584595, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 65 |
| }, |
| { |
| "completion_length": 67.578125, |
| "epoch": 0.016304347826086956, |
| "grad_norm": 4.8680399656731845, |
| "kl": 0.05322265625, |
| "learning_rate": 9.839426877470356e-07, |
| "loss": 0.0021, |
| "reward": 2.583557605743408, |
| "reward_std": 0.12607493996620178, |
| "rewards/accuracy_reward_stage2": 0.5835577845573425, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 66 |
| }, |
| { |
| "completion_length": 66.875, |
| "epoch": 0.016551383399209488, |
| "grad_norm": 6.343983489288195, |
| "kl": 0.06982421875, |
| "learning_rate": 9.836956521739131e-07, |
| "loss": 0.0028, |
| "reward": 2.603722095489502, |
| "reward_std": 0.11146017163991928, |
| "rewards/accuracy_reward_stage2": 0.6037219762802124, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 67 |
| }, |
| { |
| "completion_length": 85.46875, |
| "epoch": 0.016798418972332016, |
| "grad_norm": 5.404842087971315, |
| "kl": 0.1083984375, |
| "learning_rate": 9.834486166007905e-07, |
| "loss": 0.0043, |
| "reward": 2.205897808074951, |
| "reward_std": 0.2572137117385864, |
| "rewards/accuracy_reward_stage2": 0.41527265310287476, |
| "rewards/format_reward_all_stage": 1.790624976158142, |
| "scores/refine_times": 1.25, |
| "step": 68 |
| }, |
| { |
| "completion_length": 88.125, |
| "epoch": 0.017045454545454544, |
| "grad_norm": 4.340005446967431, |
| "kl": 0.052734375, |
| "learning_rate": 9.83201581027668e-07, |
| "loss": 0.0021, |
| "reward": 2.445634126663208, |
| "reward_std": 0.36941787600517273, |
| "rewards/accuracy_reward_stage2": 0.6956342458724976, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.0, |
| "step": 69 |
| }, |
| { |
| "completion_length": 86.671875, |
| "epoch": 0.017292490118577076, |
| "grad_norm": 4.28977539978035, |
| "kl": 0.043701171875, |
| "learning_rate": 9.829545454545453e-07, |
| "loss": 0.0018, |
| "reward": 2.5991945266723633, |
| "reward_std": 0.10131914913654327, |
| "rewards/accuracy_reward_stage2": 0.5991945862770081, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 70 |
| }, |
| { |
| "completion_length": 70.0, |
| "epoch": 0.017539525691699604, |
| "grad_norm": 4.632147156699228, |
| "kl": 0.054443359375, |
| "learning_rate": 9.827075098814229e-07, |
| "loss": 0.0022, |
| "reward": 2.4850528240203857, |
| "reward_std": 0.15690843760967255, |
| "rewards/accuracy_reward_stage2": 0.6100528240203857, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 71 |
| }, |
| { |
| "completion_length": 74.0625, |
| "epoch": 0.017786561264822136, |
| "grad_norm": 7.003739986456345, |
| "kl": 0.06396484375, |
| "learning_rate": 9.824604743083004e-07, |
| "loss": 0.0026, |
| "reward": 2.482394218444824, |
| "reward_std": 0.38058096170425415, |
| "rewards/accuracy_reward_stage2": 0.6073942184448242, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 72 |
| }, |
| { |
| "completion_length": 65.375, |
| "epoch": 0.018033596837944664, |
| "grad_norm": 3.770333175418537, |
| "kl": 0.0634765625, |
| "learning_rate": 9.822134387351777e-07, |
| "loss": 0.0025, |
| "reward": 2.595181703567505, |
| "reward_std": 0.00595674104988575, |
| "rewards/accuracy_reward_stage2": 0.5951815247535706, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 73 |
| }, |
| { |
| "completion_length": 79.703125, |
| "epoch": 0.018280632411067192, |
| "grad_norm": 4.384564430068585, |
| "kl": 0.0634765625, |
| "learning_rate": 9.819664031620553e-07, |
| "loss": 0.0025, |
| "reward": 2.560758590698242, |
| "reward_std": 0.11315355449914932, |
| "rewards/accuracy_reward_stage2": 0.5795084834098816, |
| "rewards/format_reward_all_stage": 1.9812500476837158, |
| "scores/refine_times": 1.15625, |
| "step": 74 |
| }, |
| { |
| "completion_length": 81.9375, |
| "epoch": 0.018527667984189724, |
| "grad_norm": 6.555813557025346, |
| "kl": 0.051025390625, |
| "learning_rate": 9.817193675889328e-07, |
| "loss": 0.002, |
| "reward": 2.257451295852661, |
| "reward_std": 0.10627569258213043, |
| "rewards/accuracy_reward_stage2": 0.25745123624801636, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 75 |
| }, |
| { |
| "completion_length": 71.609375, |
| "epoch": 0.018774703557312252, |
| "grad_norm": 2.549306492315234, |
| "kl": 0.0634765625, |
| "learning_rate": 9.814723320158103e-07, |
| "loss": 0.0025, |
| "reward": 2.7328977584838867, |
| "reward_std": 0.06272567808628082, |
| "rewards/accuracy_reward_stage2": 0.7328977584838867, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 76 |
| }, |
| { |
| "completion_length": 78.375, |
| "epoch": 0.019021739130434784, |
| "grad_norm": 2.5658656287329453, |
| "kl": 0.07373046875, |
| "learning_rate": 9.812252964426877e-07, |
| "loss": 0.003, |
| "reward": 2.7386789321899414, |
| "reward_std": 0.021051663905382156, |
| "rewards/accuracy_reward_stage2": 0.7386791706085205, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 77 |
| }, |
| { |
| "completion_length": 99.03125, |
| "epoch": 0.019268774703557312, |
| "grad_norm": 4.112277800837744, |
| "kl": 0.06787109375, |
| "learning_rate": 9.809782608695652e-07, |
| "loss": 0.0027, |
| "reward": 2.516645908355713, |
| "reward_std": 0.11490845680236816, |
| "rewards/accuracy_reward_stage2": 0.5166457891464233, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 78 |
| }, |
| { |
| "completion_length": 68.75, |
| "epoch": 0.01951581027667984, |
| "grad_norm": 1.7167798311826008, |
| "kl": 0.07080078125, |
| "learning_rate": 9.807312252964425e-07, |
| "loss": 0.0028, |
| "reward": 2.8159339427948, |
| "reward_std": 0.0637812465429306, |
| "rewards/accuracy_reward_stage2": 0.8159340620040894, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 79 |
| }, |
| { |
| "completion_length": 75.578125, |
| "epoch": 0.019762845849802372, |
| "grad_norm": 5.315603208425231, |
| "kl": 0.06298828125, |
| "learning_rate": 9.8048418972332e-07, |
| "loss": 0.0025, |
| "reward": 2.4659554958343506, |
| "reward_std": 0.020968768745660782, |
| "rewards/accuracy_reward_stage2": 0.46595555543899536, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 80 |
| }, |
| { |
| "completion_length": 82.765625, |
| "epoch": 0.0200098814229249, |
| "grad_norm": 5.529728305997517, |
| "kl": 0.061767578125, |
| "learning_rate": 9.802371541501976e-07, |
| "loss": 0.0025, |
| "reward": 2.5143938064575195, |
| "reward_std": 0.24701416492462158, |
| "rewards/accuracy_reward_stage2": 0.5143939256668091, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 81 |
| }, |
| { |
| "completion_length": 72.25, |
| "epoch": 0.020256916996047432, |
| "grad_norm": 5.475938413894696, |
| "kl": 0.0634765625, |
| "learning_rate": 9.79990118577075e-07, |
| "loss": 0.0025, |
| "reward": 2.6759824752807617, |
| "reward_std": 0.2955029606819153, |
| "rewards/accuracy_reward_stage2": 0.8009825944900513, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 82 |
| }, |
| { |
| "completion_length": 70.84375, |
| "epoch": 0.02050395256916996, |
| "grad_norm": 6.444654069519576, |
| "kl": 0.08740234375, |
| "learning_rate": 9.797430830039525e-07, |
| "loss": 0.0035, |
| "reward": 2.538287878036499, |
| "reward_std": 0.12264619767665863, |
| "rewards/accuracy_reward_stage2": 0.5382877588272095, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 83 |
| }, |
| { |
| "completion_length": 71.3125, |
| "epoch": 0.020750988142292492, |
| "grad_norm": 3.595656141306728, |
| "kl": 0.09033203125, |
| "learning_rate": 9.7949604743083e-07, |
| "loss": 0.0036, |
| "reward": 2.7061915397644043, |
| "reward_std": 0.011706423945724964, |
| "rewards/accuracy_reward_stage2": 0.7061916589736938, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 84 |
| }, |
| { |
| "completion_length": 102.09375, |
| "epoch": 0.02099802371541502, |
| "grad_norm": 5.812639601627871, |
| "kl": 0.095703125, |
| "learning_rate": 9.792490118577076e-07, |
| "loss": 0.0038, |
| "reward": 2.2463674545288086, |
| "reward_std": 0.37936723232269287, |
| "rewards/accuracy_reward_stage2": 0.4276173710823059, |
| "rewards/format_reward_all_stage": 1.8187499046325684, |
| "scores/refine_times": 1.265625, |
| "step": 85 |
| }, |
| { |
| "completion_length": 82.578125, |
| "epoch": 0.021245059288537548, |
| "grad_norm": 5.621200937633171, |
| "kl": 0.07373046875, |
| "learning_rate": 9.79001976284585e-07, |
| "loss": 0.003, |
| "reward": 2.2440762519836426, |
| "reward_std": 0.26927846670150757, |
| "rewards/accuracy_reward_stage2": 0.3690761625766754, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 86 |
| }, |
| { |
| "completion_length": 92.609375, |
| "epoch": 0.02149209486166008, |
| "grad_norm": 5.197685430679634, |
| "kl": 0.07666015625, |
| "learning_rate": 9.787549407114624e-07, |
| "loss": 0.0031, |
| "reward": 2.726613759994507, |
| "reward_std": 0.14432454109191895, |
| "rewards/accuracy_reward_stage2": 0.7266137599945068, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 87 |
| }, |
| { |
| "completion_length": 91.125, |
| "epoch": 0.021739130434782608, |
| "grad_norm": 2.9714055097674095, |
| "kl": 0.0908203125, |
| "learning_rate": 9.7850790513834e-07, |
| "loss": 0.0036, |
| "reward": 2.7476940155029297, |
| "reward_std": 0.22233270108699799, |
| "rewards/accuracy_reward_stage2": 0.8883191347122192, |
| "rewards/format_reward_all_stage": 1.859375, |
| "scores/refine_times": 1.3125, |
| "step": 88 |
| }, |
| { |
| "completion_length": 95.90625, |
| "epoch": 0.02198616600790514, |
| "grad_norm": 4.966352277090883, |
| "kl": 0.083984375, |
| "learning_rate": 9.782608695652173e-07, |
| "loss": 0.0034, |
| "reward": 2.6650359630584717, |
| "reward_std": 0.04115064814686775, |
| "rewards/accuracy_reward_stage2": 0.6650359034538269, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 89 |
| }, |
| { |
| "completion_length": 91.6875, |
| "epoch": 0.022233201581027668, |
| "grad_norm": 3.475945047080351, |
| "kl": 0.061279296875, |
| "learning_rate": 9.780138339920948e-07, |
| "loss": 0.0024, |
| "reward": 2.6070141792297363, |
| "reward_std": 0.1261502206325531, |
| "rewards/accuracy_reward_stage2": 0.6070142388343811, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 90 |
| }, |
| { |
| "completion_length": 60.6875, |
| "epoch": 0.022480237154150196, |
| "grad_norm": 4.508462809688603, |
| "kl": 0.07373046875, |
| "learning_rate": 9.777667984189722e-07, |
| "loss": 0.0029, |
| "reward": 2.6952967643737793, |
| "reward_std": 0.08862090110778809, |
| "rewards/accuracy_reward_stage2": 0.6952967643737793, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 91 |
| }, |
| { |
| "completion_length": 85.203125, |
| "epoch": 0.022727272727272728, |
| "grad_norm": 6.403954189418028, |
| "kl": 0.068359375, |
| "learning_rate": 9.775197628458497e-07, |
| "loss": 0.0027, |
| "reward": 2.4433703422546387, |
| "reward_std": 0.20903919637203217, |
| "rewards/accuracy_reward_stage2": 0.5058705806732178, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 92 |
| }, |
| { |
| "completion_length": 77.125, |
| "epoch": 0.022974308300395256, |
| "grad_norm": 3.6283748627126804, |
| "kl": 0.068359375, |
| "learning_rate": 9.772727272727273e-07, |
| "loss": 0.0027, |
| "reward": 2.6447415351867676, |
| "reward_std": 0.08912975341081619, |
| "rewards/accuracy_reward_stage2": 0.6447416543960571, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 93 |
| }, |
| { |
| "completion_length": 95.125, |
| "epoch": 0.023221343873517788, |
| "grad_norm": 5.182312100272035, |
| "kl": 0.0712890625, |
| "learning_rate": 9.770256916996048e-07, |
| "loss": 0.0029, |
| "reward": 2.412106990814209, |
| "reward_std": 0.03851859271526337, |
| "rewards/accuracy_reward_stage2": 0.4121071696281433, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 94 |
| }, |
| { |
| "completion_length": 82.875, |
| "epoch": 0.023468379446640316, |
| "grad_norm": 4.024559027790135, |
| "kl": 0.055419921875, |
| "learning_rate": 9.767786561264821e-07, |
| "loss": 0.0022, |
| "reward": 2.6596741676330566, |
| "reward_std": 0.08960773050785065, |
| "rewards/accuracy_reward_stage2": 0.6596741676330566, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 95 |
| }, |
| { |
| "completion_length": 97.015625, |
| "epoch": 0.023715415019762844, |
| "grad_norm": 3.8218241857549216, |
| "kl": 0.06640625, |
| "learning_rate": 9.765316205533597e-07, |
| "loss": 0.0027, |
| "reward": 2.511441707611084, |
| "reward_std": 0.1077704057097435, |
| "rewards/accuracy_reward_stage2": 0.511441707611084, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 96 |
| }, |
| { |
| "completion_length": 84.578125, |
| "epoch": 0.023962450592885376, |
| "grad_norm": 5.000582952776722, |
| "kl": 0.07568359375, |
| "learning_rate": 9.762845849802372e-07, |
| "loss": 0.003, |
| "reward": 2.4915847778320312, |
| "reward_std": 0.1341104507446289, |
| "rewards/accuracy_reward_stage2": 0.4915849566459656, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 97 |
| }, |
| { |
| "completion_length": 63.15625, |
| "epoch": 0.024209486166007904, |
| "grad_norm": 5.040940426798209, |
| "kl": 0.09912109375, |
| "learning_rate": 9.760375494071145e-07, |
| "loss": 0.004, |
| "reward": 2.692129611968994, |
| "reward_std": 0.09403635561466217, |
| "rewards/accuracy_reward_stage2": 0.6921296119689941, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 98 |
| }, |
| { |
| "completion_length": 89.546875, |
| "epoch": 0.024456521739130436, |
| "grad_norm": 4.932061600457188, |
| "kl": 0.060302734375, |
| "learning_rate": 9.75790513833992e-07, |
| "loss": 0.0024, |
| "reward": 2.525324821472168, |
| "reward_std": 0.15960124135017395, |
| "rewards/accuracy_reward_stage2": 0.5878250598907471, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 99 |
| }, |
| { |
| "completion_length": 72.640625, |
| "epoch": 0.024703557312252964, |
| "grad_norm": 4.56806382193382, |
| "kl": 0.064453125, |
| "learning_rate": 9.755434782608694e-07, |
| "loss": 0.0026, |
| "reward": 2.7026538848876953, |
| "reward_std": 0.1532786637544632, |
| "rewards/accuracy_reward_stage2": 0.7729662656784058, |
| "rewards/format_reward_all_stage": 1.9296875, |
| "scores/refine_times": 1.125, |
| "step": 100 |
| }, |
| { |
| "completion_length": 84.03125, |
| "epoch": 0.024950592885375496, |
| "grad_norm": 3.7622549256413014, |
| "kl": 0.07861328125, |
| "learning_rate": 9.75296442687747e-07, |
| "loss": 0.0031, |
| "reward": 2.604541778564453, |
| "reward_std": 0.1364867091178894, |
| "rewards/accuracy_reward_stage2": 0.6107918620109558, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.1875, |
| "step": 101 |
| }, |
| { |
| "completion_length": 93.9375, |
| "epoch": 0.025197628458498024, |
| "grad_norm": 4.0252707546161774, |
| "kl": 0.08251953125, |
| "learning_rate": 9.750494071146245e-07, |
| "loss": 0.0033, |
| "reward": 2.581124782562256, |
| "reward_std": 0.07142534106969833, |
| "rewards/accuracy_reward_stage2": 0.5811247229576111, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 102 |
| }, |
| { |
| "completion_length": 84.109375, |
| "epoch": 0.025444664031620552, |
| "grad_norm": 2.979249409154134, |
| "kl": 0.06640625, |
| "learning_rate": 9.74802371541502e-07, |
| "loss": 0.0027, |
| "reward": 2.6580286026000977, |
| "reward_std": 0.07319141179323196, |
| "rewards/accuracy_reward_stage2": 0.6580287218093872, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 103 |
| }, |
| { |
| "completion_length": 61.125, |
| "epoch": 0.025691699604743084, |
| "grad_norm": 0.28783476698060495, |
| "kl": 0.061279296875, |
| "learning_rate": 9.745553359683793e-07, |
| "loss": 0.0024, |
| "reward": 2.686764717102051, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.6867647767066956, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 104 |
| }, |
| { |
| "completion_length": 113.265625, |
| "epoch": 0.025938735177865612, |
| "grad_norm": 4.594788362626608, |
| "kl": 0.05810546875, |
| "learning_rate": 9.743083003952569e-07, |
| "loss": 0.0023, |
| "reward": 2.4998667240142822, |
| "reward_std": 0.04374603182077408, |
| "rewards/accuracy_reward_stage2": 0.4998666048049927, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 105 |
| }, |
| { |
| "completion_length": 72.0625, |
| "epoch": 0.026185770750988144, |
| "grad_norm": 4.674866102803609, |
| "kl": 0.048583984375, |
| "learning_rate": 9.740612648221344e-07, |
| "loss": 0.002, |
| "reward": 2.6107826232910156, |
| "reward_std": 0.05181875079870224, |
| "rewards/accuracy_reward_stage2": 0.6107826828956604, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 106 |
| }, |
| { |
| "completion_length": 76.8125, |
| "epoch": 0.026432806324110672, |
| "grad_norm": 4.443840034303371, |
| "kl": 0.05224609375, |
| "learning_rate": 9.738142292490117e-07, |
| "loss": 0.0021, |
| "reward": 2.552403450012207, |
| "reward_std": 0.14651192724704742, |
| "rewards/accuracy_reward_stage2": 0.552403450012207, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 107 |
| }, |
| { |
| "completion_length": 81.765625, |
| "epoch": 0.0266798418972332, |
| "grad_norm": 2.0441554829485145, |
| "kl": 0.06298828125, |
| "learning_rate": 9.735671936758893e-07, |
| "loss": 0.0025, |
| "reward": 2.7436699867248535, |
| "reward_std": 0.037195369601249695, |
| "rewards/accuracy_reward_stage2": 0.7436702251434326, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 108 |
| }, |
| { |
| "completion_length": 128.015625, |
| "epoch": 0.026926877470355732, |
| "grad_norm": 3.968644058753578, |
| "kl": 0.08984375, |
| "learning_rate": 9.733201581027668e-07, |
| "loss": 0.0036, |
| "reward": 2.570418357849121, |
| "reward_std": 0.11060214787721634, |
| "rewards/accuracy_reward_stage2": 0.6329183578491211, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.40625, |
| "step": 109 |
| }, |
| { |
| "completion_length": 136.390625, |
| "epoch": 0.02717391304347826, |
| "grad_norm": 1.5281023437512695, |
| "kl": 0.050537109375, |
| "learning_rate": 9.730731225296442e-07, |
| "loss": 0.002, |
| "reward": 2.631211519241333, |
| "reward_std": 0.07405112683773041, |
| "rewards/accuracy_reward_stage2": 0.6312115788459778, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.390625, |
| "step": 110 |
| }, |
| { |
| "completion_length": 103.125, |
| "epoch": 0.027420948616600792, |
| "grad_norm": 4.3848043520379605, |
| "kl": 0.07275390625, |
| "learning_rate": 9.728260869565217e-07, |
| "loss": 0.0029, |
| "reward": 2.532392978668213, |
| "reward_std": 0.2085559368133545, |
| "rewards/accuracy_reward_stage2": 0.5948929190635681, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 111 |
| }, |
| { |
| "completion_length": 101.4375, |
| "epoch": 0.02766798418972332, |
| "grad_norm": 3.9823453743402126, |
| "kl": 0.05810546875, |
| "learning_rate": 9.72579051383399e-07, |
| "loss": 0.0023, |
| "reward": 2.5878045558929443, |
| "reward_std": 0.12208257615566254, |
| "rewards/accuracy_reward_stage2": 0.5940545797348022, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.203125, |
| "step": 112 |
| }, |
| { |
| "completion_length": 108.640625, |
| "epoch": 0.027915019762845848, |
| "grad_norm": 2.487090870664229, |
| "kl": 0.0341796875, |
| "learning_rate": 9.723320158102768e-07, |
| "loss": 0.0014, |
| "reward": 2.7555534839630127, |
| "reward_std": 0.015020077116787434, |
| "rewards/accuracy_reward_stage2": 0.7555533647537231, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 113 |
| }, |
| { |
| "completion_length": 149.234375, |
| "epoch": 0.02816205533596838, |
| "grad_norm": 1.5545707236666886, |
| "kl": 0.036376953125, |
| "learning_rate": 9.72084980237154e-07, |
| "loss": 0.0015, |
| "reward": 2.5128674507141113, |
| "reward_std": 0.08764077723026276, |
| "rewards/accuracy_reward_stage2": 0.5347423553466797, |
| "rewards/format_reward_all_stage": 1.978124976158142, |
| "scores/refine_times": 1.53125, |
| "step": 114 |
| }, |
| { |
| "completion_length": 111.0625, |
| "epoch": 0.028409090909090908, |
| "grad_norm": 3.9159999459644754, |
| "kl": 0.047607421875, |
| "learning_rate": 9.718379446640316e-07, |
| "loss": 0.0019, |
| "reward": 2.7607569694519043, |
| "reward_std": 0.22762958705425262, |
| "rewards/accuracy_reward_stage2": 0.7670071125030518, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.21875, |
| "step": 115 |
| }, |
| { |
| "completion_length": 91.890625, |
| "epoch": 0.02865612648221344, |
| "grad_norm": 2.314903676922173, |
| "kl": 0.037841796875, |
| "learning_rate": 9.71590909090909e-07, |
| "loss": 0.0015, |
| "reward": 2.563828945159912, |
| "reward_std": 0.0933101698756218, |
| "rewards/accuracy_reward_stage2": 0.6263290047645569, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 116 |
| }, |
| { |
| "completion_length": 119.109375, |
| "epoch": 0.028903162055335968, |
| "grad_norm": 4.047428002907611, |
| "kl": 0.04296875, |
| "learning_rate": 9.713438735177865e-07, |
| "loss": 0.0017, |
| "reward": 2.6038811206817627, |
| "reward_std": 0.20530042052268982, |
| "rewards/accuracy_reward_stage2": 0.6038811206817627, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 117 |
| }, |
| { |
| "completion_length": 107.71875, |
| "epoch": 0.0291501976284585, |
| "grad_norm": 3.3403630742196513, |
| "kl": 0.0400390625, |
| "learning_rate": 9.71096837944664e-07, |
| "loss": 0.0016, |
| "reward": 2.4648709297180176, |
| "reward_std": 0.13606658577919006, |
| "rewards/accuracy_reward_stage2": 0.46487098932266235, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 118 |
| }, |
| { |
| "completion_length": 104.828125, |
| "epoch": 0.029397233201581028, |
| "grad_norm": 3.9940498248823206, |
| "kl": 0.039306640625, |
| "learning_rate": 9.708498023715414e-07, |
| "loss": 0.0016, |
| "reward": 2.6417078971862793, |
| "reward_std": 0.0913887768983841, |
| "rewards/accuracy_reward_stage2": 0.6417078971862793, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 119 |
| }, |
| { |
| "completion_length": 97.1875, |
| "epoch": 0.029644268774703556, |
| "grad_norm": 3.73814994296708, |
| "kl": 0.042724609375, |
| "learning_rate": 9.70602766798419e-07, |
| "loss": 0.0017, |
| "reward": 2.562872886657715, |
| "reward_std": 0.07472334057092667, |
| "rewards/accuracy_reward_stage2": 0.5628727078437805, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 120 |
| }, |
| { |
| "completion_length": 132.296875, |
| "epoch": 0.029891304347826088, |
| "grad_norm": 2.9384630818108612, |
| "kl": 0.058349609375, |
| "learning_rate": 9.703557312252962e-07, |
| "loss": 0.0023, |
| "reward": 2.676942825317383, |
| "reward_std": 0.12600594758987427, |
| "rewards/accuracy_reward_stage2": 0.7378804087638855, |
| "rewards/format_reward_all_stage": 1.939062476158142, |
| "scores/refine_times": 1.546875, |
| "step": 121 |
| }, |
| { |
| "completion_length": 127.84375, |
| "epoch": 0.030138339920948616, |
| "grad_norm": 3.0365235817191767, |
| "kl": 0.05029296875, |
| "learning_rate": 9.70108695652174e-07, |
| "loss": 0.002, |
| "reward": 2.704944133758545, |
| "reward_std": 0.07274624705314636, |
| "rewards/accuracy_reward_stage2": 0.7257775664329529, |
| "rewards/format_reward_all_stage": 1.9791667461395264, |
| "scores/refine_times": 1.390625, |
| "step": 122 |
| }, |
| { |
| "completion_length": 100.734375, |
| "epoch": 0.030385375494071148, |
| "grad_norm": 3.5805778693833235, |
| "kl": 0.042236328125, |
| "learning_rate": 9.698616600790513e-07, |
| "loss": 0.0017, |
| "reward": 2.7929563522338867, |
| "reward_std": 0.12518826127052307, |
| "rewards/accuracy_reward_stage2": 0.803372859954834, |
| "rewards/format_reward_all_stage": 1.9895832538604736, |
| "scores/refine_times": 1.140625, |
| "step": 123 |
| }, |
| { |
| "completion_length": 79.609375, |
| "epoch": 0.030632411067193676, |
| "grad_norm": 1.9142569203178765, |
| "kl": 0.049560546875, |
| "learning_rate": 9.696146245059289e-07, |
| "loss": 0.002, |
| "reward": 2.949510335922241, |
| "reward_std": 0.01902618445456028, |
| "rewards/accuracy_reward_stage2": 0.9495103359222412, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 124 |
| }, |
| { |
| "completion_length": 115.65625, |
| "epoch": 0.030879446640316204, |
| "grad_norm": 3.192939893717315, |
| "kl": 0.049560546875, |
| "learning_rate": 9.693675889328062e-07, |
| "loss": 0.002, |
| "reward": 2.6586837768554688, |
| "reward_std": 0.07593096792697906, |
| "rewards/accuracy_reward_stage2": 0.6586835980415344, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 125 |
| }, |
| { |
| "completion_length": 115.65625, |
| "epoch": 0.031126482213438736, |
| "grad_norm": 4.448628078761415, |
| "kl": 0.06005859375, |
| "learning_rate": 9.691205533596837e-07, |
| "loss": 0.0024, |
| "reward": 2.4910385608673096, |
| "reward_std": 0.19780105352401733, |
| "rewards/accuracy_reward_stage2": 0.5014550685882568, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.28125, |
| "step": 126 |
| }, |
| { |
| "completion_length": 102.5625, |
| "epoch": 0.031373517786561264, |
| "grad_norm": 4.306073074264426, |
| "kl": 0.04736328125, |
| "learning_rate": 9.688735177865613e-07, |
| "loss": 0.0019, |
| "reward": 2.328153133392334, |
| "reward_std": 0.06073104217648506, |
| "rewards/accuracy_reward_stage2": 0.32815316319465637, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 127 |
| }, |
| { |
| "completion_length": 120.96875, |
| "epoch": 0.03162055335968379, |
| "grad_norm": 2.5626638879865524, |
| "kl": 0.05078125, |
| "learning_rate": 9.686264822134386e-07, |
| "loss": 0.002, |
| "reward": 2.7157416343688965, |
| "reward_std": 0.09564615786075592, |
| "rewards/accuracy_reward_stage2": 0.7157415151596069, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 128 |
| }, |
| { |
| "completion_length": 120.0, |
| "epoch": 0.03186758893280633, |
| "grad_norm": 2.872358125881173, |
| "kl": 0.061279296875, |
| "learning_rate": 9.683794466403161e-07, |
| "loss": 0.0025, |
| "reward": 2.68588924407959, |
| "reward_std": 0.03977838158607483, |
| "rewards/accuracy_reward_stage2": 0.6858893632888794, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 129 |
| }, |
| { |
| "completion_length": 122.8125, |
| "epoch": 0.032114624505928856, |
| "grad_norm": 4.061774924569719, |
| "kl": 0.064453125, |
| "learning_rate": 9.681324110671937e-07, |
| "loss": 0.0026, |
| "reward": 2.5638954639434814, |
| "reward_std": 0.1816643923521042, |
| "rewards/accuracy_reward_stage2": 0.5638953447341919, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.484375, |
| "step": 130 |
| }, |
| { |
| "completion_length": 84.875, |
| "epoch": 0.032361660079051384, |
| "grad_norm": 4.115640976165146, |
| "kl": 0.047119140625, |
| "learning_rate": 9.678853754940712e-07, |
| "loss": 0.0019, |
| "reward": 2.612516403198242, |
| "reward_std": 0.19447211921215057, |
| "rewards/accuracy_reward_stage2": 0.6125162839889526, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 131 |
| }, |
| { |
| "completion_length": 138.390625, |
| "epoch": 0.03260869565217391, |
| "grad_norm": 3.44122277510103, |
| "kl": 0.044189453125, |
| "learning_rate": 9.676383399209485e-07, |
| "loss": 0.0018, |
| "reward": 2.6405930519104004, |
| "reward_std": 0.10844551026821136, |
| "rewards/accuracy_reward_stage2": 0.6405929327011108, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.609375, |
| "step": 132 |
| }, |
| { |
| "completion_length": 114.34375, |
| "epoch": 0.03285573122529644, |
| "grad_norm": 3.3056082707747776, |
| "kl": 0.04638671875, |
| "learning_rate": 9.67391304347826e-07, |
| "loss": 0.0019, |
| "reward": 2.246976375579834, |
| "reward_std": 0.13054856657981873, |
| "rewards/accuracy_reward_stage2": 0.25322651863098145, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.25, |
| "step": 133 |
| }, |
| { |
| "completion_length": 91.25, |
| "epoch": 0.033102766798418976, |
| "grad_norm": 1.7128079673479029, |
| "kl": 0.048583984375, |
| "learning_rate": 9.671442687747036e-07, |
| "loss": 0.0019, |
| "reward": 2.833665370941162, |
| "reward_std": 0.010462751612067223, |
| "rewards/accuracy_reward_stage2": 0.8336653709411621, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 134 |
| }, |
| { |
| "completion_length": 113.765625, |
| "epoch": 0.033349802371541504, |
| "grad_norm": 3.3425383925164325, |
| "kl": 0.04052734375, |
| "learning_rate": 9.66897233201581e-07, |
| "loss": 0.0016, |
| "reward": 2.676530122756958, |
| "reward_std": 0.14146235585212708, |
| "rewards/accuracy_reward_stage2": 0.6827802062034607, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.359375, |
| "step": 135 |
| }, |
| { |
| "completion_length": 130.125, |
| "epoch": 0.03359683794466403, |
| "grad_norm": 1.1676928781318863, |
| "kl": 0.048583984375, |
| "learning_rate": 9.666501976284585e-07, |
| "loss": 0.0019, |
| "reward": 2.4324049949645996, |
| "reward_std": 0.10420753061771393, |
| "rewards/accuracy_reward_stage2": 0.44490504264831543, |
| "rewards/format_reward_all_stage": 1.9874999523162842, |
| "scores/refine_times": 1.484375, |
| "step": 136 |
| }, |
| { |
| "completion_length": 95.421875, |
| "epoch": 0.03384387351778656, |
| "grad_norm": 3.1237238960935723, |
| "kl": 0.047607421875, |
| "learning_rate": 9.664031620553358e-07, |
| "loss": 0.0019, |
| "reward": 2.6851418018341064, |
| "reward_std": 0.14544668793678284, |
| "rewards/accuracy_reward_stage2": 0.7163918614387512, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.328125, |
| "step": 137 |
| }, |
| { |
| "completion_length": 96.8125, |
| "epoch": 0.03409090909090909, |
| "grad_norm": 1.7079196696529952, |
| "kl": 0.041015625, |
| "learning_rate": 9.661561264822134e-07, |
| "loss": 0.0016, |
| "reward": 2.799861431121826, |
| "reward_std": 0.09301671385765076, |
| "rewards/accuracy_reward_stage2": 0.8061115145683289, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.3125, |
| "step": 138 |
| }, |
| { |
| "completion_length": 87.8125, |
| "epoch": 0.034337944664031624, |
| "grad_norm": 2.4549781302932483, |
| "kl": 0.040771484375, |
| "learning_rate": 9.65909090909091e-07, |
| "loss": 0.0016, |
| "reward": 2.8591620922088623, |
| "reward_std": 0.0772564709186554, |
| "rewards/accuracy_reward_stage2": 0.8591620922088623, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 139 |
| }, |
| { |
| "completion_length": 105.484375, |
| "epoch": 0.03458498023715415, |
| "grad_norm": 4.243346429014275, |
| "kl": 0.05615234375, |
| "learning_rate": 9.656620553359684e-07, |
| "loss": 0.0022, |
| "reward": 2.712839365005493, |
| "reward_std": 0.12247423827648163, |
| "rewards/accuracy_reward_stage2": 0.7232558727264404, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.328125, |
| "step": 140 |
| }, |
| { |
| "completion_length": 73.890625, |
| "epoch": 0.03483201581027668, |
| "grad_norm": 3.8646588914693787, |
| "kl": 0.05126953125, |
| "learning_rate": 9.654150197628458e-07, |
| "loss": 0.002, |
| "reward": 2.6789019107818604, |
| "reward_std": 0.10791897773742676, |
| "rewards/accuracy_reward_stage2": 0.6789019107818604, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 141 |
| }, |
| { |
| "completion_length": 102.359375, |
| "epoch": 0.03507905138339921, |
| "grad_norm": 4.4491213672022285, |
| "kl": 0.06787109375, |
| "learning_rate": 9.651679841897233e-07, |
| "loss": 0.0027, |
| "reward": 2.4613142013549805, |
| "reward_std": 0.12597447633743286, |
| "rewards/accuracy_reward_stage2": 0.47693929076194763, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.3125, |
| "step": 142 |
| }, |
| { |
| "completion_length": 110.609375, |
| "epoch": 0.035326086956521736, |
| "grad_norm": 2.987716235712274, |
| "kl": 0.045166015625, |
| "learning_rate": 9.649209486166008e-07, |
| "loss": 0.0018, |
| "reward": 2.57698392868042, |
| "reward_std": 0.08374475687742233, |
| "rewards/accuracy_reward_stage2": 0.5769840478897095, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 143 |
| }, |
| { |
| "completion_length": 103.328125, |
| "epoch": 0.03557312252964427, |
| "grad_norm": 3.350336752038281, |
| "kl": 0.04150390625, |
| "learning_rate": 9.646739130434782e-07, |
| "loss": 0.0017, |
| "reward": 2.5754241943359375, |
| "reward_std": 0.12323600053787231, |
| "rewards/accuracy_reward_stage2": 0.5754240155220032, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 144 |
| }, |
| { |
| "completion_length": 112.171875, |
| "epoch": 0.0358201581027668, |
| "grad_norm": 3.3004897234036346, |
| "kl": 0.07177734375, |
| "learning_rate": 9.644268774703557e-07, |
| "loss": 0.0029, |
| "reward": 2.7073974609375, |
| "reward_std": 0.1341802179813385, |
| "rewards/accuracy_reward_stage2": 0.7198973894119263, |
| "rewards/format_reward_all_stage": 1.9874999523162842, |
| "scores/refine_times": 1.5625, |
| "step": 145 |
| }, |
| { |
| "completion_length": 70.109375, |
| "epoch": 0.03606719367588933, |
| "grad_norm": 3.746695836459905, |
| "kl": 0.08251953125, |
| "learning_rate": 9.64179841897233e-07, |
| "loss": 0.0033, |
| "reward": 2.7705631256103516, |
| "reward_std": 0.07979334890842438, |
| "rewards/accuracy_reward_stage2": 0.7705631256103516, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 146 |
| }, |
| { |
| "completion_length": 68.84375, |
| "epoch": 0.036314229249011856, |
| "grad_norm": 4.993912255293353, |
| "kl": 0.064453125, |
| "learning_rate": 9.639328063241106e-07, |
| "loss": 0.0026, |
| "reward": 2.462231397628784, |
| "reward_std": 0.2417951375246048, |
| "rewards/accuracy_reward_stage2": 0.587231457233429, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 147 |
| }, |
| { |
| "completion_length": 89.359375, |
| "epoch": 0.036561264822134384, |
| "grad_norm": 1.2055002867444848, |
| "kl": 0.0751953125, |
| "learning_rate": 9.636857707509881e-07, |
| "loss": 0.003, |
| "reward": 2.6344380378723145, |
| "reward_std": 0.05059962347149849, |
| "rewards/accuracy_reward_stage2": 0.6344380974769592, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 148 |
| }, |
| { |
| "completion_length": 56.4375, |
| "epoch": 0.03680830039525692, |
| "grad_norm": 0.27507548410419386, |
| "kl": 0.062255859375, |
| "learning_rate": 9.634387351778657e-07, |
| "loss": 0.0025, |
| "reward": 2.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.75, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 149 |
| }, |
| { |
| "completion_length": 85.234375, |
| "epoch": 0.03705533596837945, |
| "grad_norm": 3.7285465705439877, |
| "kl": 0.06396484375, |
| "learning_rate": 9.63191699604743e-07, |
| "loss": 0.0026, |
| "reward": 2.714507579803467, |
| "reward_std": 0.07993803918361664, |
| "rewards/accuracy_reward_stage2": 0.7145076990127563, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 150 |
| }, |
| { |
| "completion_length": 69.53125, |
| "epoch": 0.037302371541501976, |
| "grad_norm": 2.7754541338531924, |
| "kl": 0.0625, |
| "learning_rate": 9.629446640316205e-07, |
| "loss": 0.0025, |
| "reward": 2.5262837409973145, |
| "reward_std": 0.015376383438706398, |
| "rewards/accuracy_reward_stage2": 0.5262836217880249, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 151 |
| }, |
| { |
| "completion_length": 59.703125, |
| "epoch": 0.037549407114624504, |
| "grad_norm": 3.6132525092697256, |
| "kl": 0.0849609375, |
| "learning_rate": 9.62697628458498e-07, |
| "loss": 0.0034, |
| "reward": 2.8007984161376953, |
| "reward_std": 0.05922838672995567, |
| "rewards/accuracy_reward_stage2": 0.8007984757423401, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 152 |
| }, |
| { |
| "completion_length": 73.09375, |
| "epoch": 0.03779644268774703, |
| "grad_norm": 4.7072408718848155, |
| "kl": 0.08447265625, |
| "learning_rate": 9.624505928853754e-07, |
| "loss": 0.0034, |
| "reward": 2.7883553504943848, |
| "reward_std": 0.10380949825048447, |
| "rewards/accuracy_reward_stage2": 0.78835529088974, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 153 |
| }, |
| { |
| "completion_length": 70.625, |
| "epoch": 0.03804347826086957, |
| "grad_norm": 4.015698810880644, |
| "kl": 0.064453125, |
| "learning_rate": 9.62203557312253e-07, |
| "loss": 0.0026, |
| "reward": 2.7374773025512695, |
| "reward_std": 0.07273007929325104, |
| "rewards/accuracy_reward_stage2": 0.7374772429466248, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 154 |
| }, |
| { |
| "completion_length": 80.09375, |
| "epoch": 0.038290513833992096, |
| "grad_norm": 4.619917963588498, |
| "kl": 0.06201171875, |
| "learning_rate": 9.619565217391305e-07, |
| "loss": 0.0025, |
| "reward": 2.6052746772766113, |
| "reward_std": 0.12084851413965225, |
| "rewards/accuracy_reward_stage2": 0.6901706457138062, |
| "rewards/format_reward_all_stage": 1.9151042699813843, |
| "scores/refine_times": 1.296875, |
| "step": 155 |
| }, |
| { |
| "completion_length": 68.359375, |
| "epoch": 0.038537549407114624, |
| "grad_norm": 4.5603936373643466, |
| "kl": 0.0859375, |
| "learning_rate": 9.617094861660078e-07, |
| "loss": 0.0034, |
| "reward": 2.6238441467285156, |
| "reward_std": 0.18369004130363464, |
| "rewards/accuracy_reward_stage2": 0.6238440275192261, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 156 |
| }, |
| { |
| "completion_length": 59.4375, |
| "epoch": 0.03878458498023715, |
| "grad_norm": 2.5329780180971126, |
| "kl": 0.07763671875, |
| "learning_rate": 9.614624505928853e-07, |
| "loss": 0.0031, |
| "reward": 2.679924964904785, |
| "reward_std": 0.07290495187044144, |
| "rewards/accuracy_reward_stage2": 0.6861748099327087, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.171875, |
| "step": 157 |
| }, |
| { |
| "completion_length": 68.046875, |
| "epoch": 0.03903162055335968, |
| "grad_norm": 5.443269730676339, |
| "kl": 0.0810546875, |
| "learning_rate": 9.612154150197627e-07, |
| "loss": 0.0033, |
| "reward": 2.567551612854004, |
| "reward_std": 0.1361953318119049, |
| "rewards/accuracy_reward_stage2": 0.5675517916679382, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 158 |
| }, |
| { |
| "completion_length": 57.9375, |
| "epoch": 0.039278656126482216, |
| "grad_norm": 9.030402072146797, |
| "kl": 0.0986328125, |
| "learning_rate": 9.609683794466402e-07, |
| "loss": 0.0039, |
| "reward": 2.3710129261016846, |
| "reward_std": 0.1857261210680008, |
| "rewards/accuracy_reward_stage2": 0.37101292610168457, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 159 |
| }, |
| { |
| "completion_length": 74.1875, |
| "epoch": 0.039525691699604744, |
| "grad_norm": 4.182121286336387, |
| "kl": 0.07861328125, |
| "learning_rate": 9.607213438735178e-07, |
| "loss": 0.0031, |
| "reward": 2.624070405960083, |
| "reward_std": 0.053435277193784714, |
| "rewards/accuracy_reward_stage2": 0.6240705251693726, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 160 |
| }, |
| { |
| "completion_length": 58.5625, |
| "epoch": 0.03977272727272727, |
| "grad_norm": 5.883898696205534, |
| "kl": 0.09228515625, |
| "learning_rate": 9.604743083003953e-07, |
| "loss": 0.0037, |
| "reward": 2.614293098449707, |
| "reward_std": 0.12666520476341248, |
| "rewards/accuracy_reward_stage2": 0.6142929792404175, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 161 |
| }, |
| { |
| "completion_length": 101.296875, |
| "epoch": 0.0400197628458498, |
| "grad_norm": 3.180252896906488, |
| "kl": 0.08642578125, |
| "learning_rate": 9.602272727272726e-07, |
| "loss": 0.0035, |
| "reward": 2.741650104522705, |
| "reward_std": 0.10522251576185226, |
| "rewards/accuracy_reward_stage2": 0.7479000091552734, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.484375, |
| "step": 162 |
| }, |
| { |
| "completion_length": 56.4375, |
| "epoch": 0.040266798418972335, |
| "grad_norm": 7.1537952621412355, |
| "kl": 0.0703125, |
| "learning_rate": 9.599802371541502e-07, |
| "loss": 0.0028, |
| "reward": 2.624744176864624, |
| "reward_std": 0.21947458386421204, |
| "rewards/accuracy_reward_stage2": 0.624744176864624, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 163 |
| }, |
| { |
| "completion_length": 68.203125, |
| "epoch": 0.040513833992094864, |
| "grad_norm": 5.366994523712258, |
| "kl": 0.08056640625, |
| "learning_rate": 9.597332015810277e-07, |
| "loss": 0.0032, |
| "reward": 2.524510622024536, |
| "reward_std": 0.12654046714305878, |
| "rewards/accuracy_reward_stage2": 0.5245106220245361, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 164 |
| }, |
| { |
| "completion_length": 68.6875, |
| "epoch": 0.04076086956521739, |
| "grad_norm": 3.7291692337053273, |
| "kl": 0.0625, |
| "learning_rate": 9.59486166007905e-07, |
| "loss": 0.0025, |
| "reward": 2.5265703201293945, |
| "reward_std": 0.16580721735954285, |
| "rewards/accuracy_reward_stage2": 0.5265705585479736, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 165 |
| }, |
| { |
| "completion_length": 73.15625, |
| "epoch": 0.04100790513833992, |
| "grad_norm": 2.735126529174995, |
| "kl": 0.0654296875, |
| "learning_rate": 9.592391304347826e-07, |
| "loss": 0.0026, |
| "reward": 2.75, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.75, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 166 |
| }, |
| { |
| "completion_length": 78.84375, |
| "epoch": 0.04125494071146245, |
| "grad_norm": 2.8700463474317703, |
| "kl": 0.07470703125, |
| "learning_rate": 9.5899209486166e-07, |
| "loss": 0.003, |
| "reward": 2.7168657779693604, |
| "reward_std": 0.035781100392341614, |
| "rewards/accuracy_reward_stage2": 0.7168656587600708, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 167 |
| }, |
| { |
| "completion_length": 78.0625, |
| "epoch": 0.041501976284584984, |
| "grad_norm": 4.890025569430008, |
| "kl": 0.0615234375, |
| "learning_rate": 9.587450592885376e-07, |
| "loss": 0.0025, |
| "reward": 2.6501522064208984, |
| "reward_std": 0.1081872433423996, |
| "rewards/accuracy_reward_stage2": 0.6501523852348328, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.109375, |
| "step": 168 |
| }, |
| { |
| "completion_length": 87.875, |
| "epoch": 0.04174901185770751, |
| "grad_norm": 4.04344669587095, |
| "kl": 0.058837890625, |
| "learning_rate": 9.58498023715415e-07, |
| "loss": 0.0023, |
| "reward": 2.6246044635772705, |
| "reward_std": 0.06840028613805771, |
| "rewards/accuracy_reward_stage2": 0.6246042847633362, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 169 |
| }, |
| { |
| "completion_length": 84.203125, |
| "epoch": 0.04199604743083004, |
| "grad_norm": 3.7630262620951216, |
| "kl": 0.0498046875, |
| "learning_rate": 9.582509881422925e-07, |
| "loss": 0.002, |
| "reward": 2.6967902183532715, |
| "reward_std": 0.03876494616270065, |
| "rewards/accuracy_reward_stage2": 0.6967902183532715, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 170 |
| }, |
| { |
| "completion_length": 65.3125, |
| "epoch": 0.04224308300395257, |
| "grad_norm": 3.921070473475162, |
| "kl": 0.060546875, |
| "learning_rate": 9.580039525691698e-07, |
| "loss": 0.0024, |
| "reward": 2.5751843452453613, |
| "reward_std": 0.04537220671772957, |
| "rewards/accuracy_reward_stage2": 0.5751842260360718, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 171 |
| }, |
| { |
| "completion_length": 86.046875, |
| "epoch": 0.042490118577075096, |
| "grad_norm": 2.6069307191323023, |
| "kl": 0.083984375, |
| "learning_rate": 9.577569169960474e-07, |
| "loss": 0.0034, |
| "reward": 2.6333060264587402, |
| "reward_std": 0.16853483021259308, |
| "rewards/accuracy_reward_stage2": 0.6489310264587402, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.21875, |
| "step": 172 |
| }, |
| { |
| "completion_length": 70.0, |
| "epoch": 0.04273715415019763, |
| "grad_norm": 4.409216803473999, |
| "kl": 0.046630859375, |
| "learning_rate": 9.57509881422925e-07, |
| "loss": 0.0019, |
| "reward": 2.7234153747558594, |
| "reward_std": 0.20152220129966736, |
| "rewards/accuracy_reward_stage2": 0.7234152555465698, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 173 |
| }, |
| { |
| "completion_length": 77.09375, |
| "epoch": 0.04298418972332016, |
| "grad_norm": 3.94030379937351, |
| "kl": 0.0751953125, |
| "learning_rate": 9.572628458498022e-07, |
| "loss": 0.003, |
| "reward": 2.7134218215942383, |
| "reward_std": 0.16080144047737122, |
| "rewards/accuracy_reward_stage2": 0.7134219408035278, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 174 |
| }, |
| { |
| "completion_length": 82.515625, |
| "epoch": 0.04323122529644269, |
| "grad_norm": 1.861547490586632, |
| "kl": 0.04541015625, |
| "learning_rate": 9.570158102766798e-07, |
| "loss": 0.0018, |
| "reward": 2.659064292907715, |
| "reward_std": 0.04930752515792847, |
| "rewards/accuracy_reward_stage2": 0.6590641140937805, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 175 |
| }, |
| { |
| "completion_length": 79.765625, |
| "epoch": 0.043478260869565216, |
| "grad_norm": 4.0254532203837305, |
| "kl": 0.054931640625, |
| "learning_rate": 9.567687747035573e-07, |
| "loss": 0.0022, |
| "reward": 2.5477709770202637, |
| "reward_std": 0.09522654861211777, |
| "rewards/accuracy_reward_stage2": 0.547771155834198, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 176 |
| }, |
| { |
| "completion_length": 80.625, |
| "epoch": 0.043725296442687744, |
| "grad_norm": 4.304267511221348, |
| "kl": 0.087890625, |
| "learning_rate": 9.565217391304349e-07, |
| "loss": 0.0035, |
| "reward": 2.5806241035461426, |
| "reward_std": 0.12464433908462524, |
| "rewards/accuracy_reward_stage2": 0.5868740081787109, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.125, |
| "step": 177 |
| }, |
| { |
| "completion_length": 97.328125, |
| "epoch": 0.04397233201581028, |
| "grad_norm": 4.304249554872454, |
| "kl": 0.0576171875, |
| "learning_rate": 9.562747035573122e-07, |
| "loss": 0.0023, |
| "reward": 2.5728487968444824, |
| "reward_std": 0.06610282510519028, |
| "rewards/accuracy_reward_stage2": 0.572848916053772, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 178 |
| }, |
| { |
| "completion_length": 73.125, |
| "epoch": 0.04421936758893281, |
| "grad_norm": 4.062110192757534, |
| "kl": 0.05712890625, |
| "learning_rate": 9.560276679841897e-07, |
| "loss": 0.0023, |
| "reward": 2.630199909210205, |
| "reward_std": 0.03811332583427429, |
| "rewards/accuracy_reward_stage2": 0.6302000284194946, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 179 |
| }, |
| { |
| "completion_length": 67.6875, |
| "epoch": 0.044466403162055336, |
| "grad_norm": 3.0927557325941843, |
| "kl": 0.054443359375, |
| "learning_rate": 9.55780632411067e-07, |
| "loss": 0.0022, |
| "reward": 2.8124020099639893, |
| "reward_std": 0.02681785449385643, |
| "rewards/accuracy_reward_stage2": 0.8124019503593445, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 180 |
| }, |
| { |
| "completion_length": 92.65625, |
| "epoch": 0.044713438735177864, |
| "grad_norm": 4.035197870543932, |
| "kl": 0.08642578125, |
| "learning_rate": 9.555335968379446e-07, |
| "loss": 0.0035, |
| "reward": 2.6102566719055176, |
| "reward_std": 0.24840806424617767, |
| "rewards/accuracy_reward_stage2": 0.6727566719055176, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.296875, |
| "step": 181 |
| }, |
| { |
| "completion_length": 77.75, |
| "epoch": 0.04496047430830039, |
| "grad_norm": 4.755395802023161, |
| "kl": 0.0654296875, |
| "learning_rate": 9.552865612648221e-07, |
| "loss": 0.0026, |
| "reward": 2.6376709938049316, |
| "reward_std": 0.142973855137825, |
| "rewards/accuracy_reward_stage2": 0.6376707553863525, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 182 |
| }, |
| { |
| "completion_length": 86.65625, |
| "epoch": 0.04520750988142293, |
| "grad_norm": 3.041899055085519, |
| "kl": 0.0419921875, |
| "learning_rate": 9.550395256916995e-07, |
| "loss": 0.0017, |
| "reward": 2.713804244995117, |
| "reward_std": 0.018256813287734985, |
| "rewards/accuracy_reward_stage2": 0.7138041257858276, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 183 |
| }, |
| { |
| "completion_length": 72.6875, |
| "epoch": 0.045454545454545456, |
| "grad_norm": 5.254526969633487, |
| "kl": 0.03369140625, |
| "learning_rate": 9.54792490118577e-07, |
| "loss": 0.0013, |
| "reward": 2.582467555999756, |
| "reward_std": 0.07811328768730164, |
| "rewards/accuracy_reward_stage2": 0.5824676752090454, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 184 |
| }, |
| { |
| "completion_length": 95.265625, |
| "epoch": 0.045701581027667984, |
| "grad_norm": 3.58376936857761, |
| "kl": 0.044921875, |
| "learning_rate": 9.545454545454546e-07, |
| "loss": 0.0018, |
| "reward": 2.671761989593506, |
| "reward_std": 0.09435681998729706, |
| "rewards/accuracy_reward_stage2": 0.6717619299888611, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 185 |
| }, |
| { |
| "completion_length": 79.25, |
| "epoch": 0.04594861660079051, |
| "grad_norm": 2.79408117557805, |
| "kl": 0.057861328125, |
| "learning_rate": 9.54298418972332e-07, |
| "loss": 0.0023, |
| "reward": 2.7370827198028564, |
| "reward_std": 0.011032424867153168, |
| "rewards/accuracy_reward_stage2": 0.7370827198028564, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 186 |
| }, |
| { |
| "completion_length": 81.1875, |
| "epoch": 0.04619565217391304, |
| "grad_norm": 3.070893637345864, |
| "kl": 0.0634765625, |
| "learning_rate": 9.540513833992094e-07, |
| "loss": 0.0025, |
| "reward": 2.6037180423736572, |
| "reward_std": 0.0603860504925251, |
| "rewards/accuracy_reward_stage2": 0.603718101978302, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 187 |
| }, |
| { |
| "completion_length": 83.90625, |
| "epoch": 0.046442687747035576, |
| "grad_norm": 3.9266085023435253, |
| "kl": 0.07470703125, |
| "learning_rate": 9.53804347826087e-07, |
| "loss": 0.003, |
| "reward": 2.3806803226470947, |
| "reward_std": 0.08710526674985886, |
| "rewards/accuracy_reward_stage2": 0.4431803524494171, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 188 |
| }, |
| { |
| "completion_length": 83.484375, |
| "epoch": 0.046689723320158104, |
| "grad_norm": 4.685942150283861, |
| "kl": 0.054443359375, |
| "learning_rate": 9.535573122529644e-07, |
| "loss": 0.0022, |
| "reward": 2.4631409645080566, |
| "reward_std": 0.21757805347442627, |
| "rewards/accuracy_reward_stage2": 0.46314099431037903, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 189 |
| }, |
| { |
| "completion_length": 72.4375, |
| "epoch": 0.04693675889328063, |
| "grad_norm": 1.8005226063530777, |
| "kl": 0.0634765625, |
| "learning_rate": 9.533102766798418e-07, |
| "loss": 0.0025, |
| "reward": 2.794851779937744, |
| "reward_std": 0.009321765042841434, |
| "rewards/accuracy_reward_stage2": 0.7948517203330994, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 190 |
| }, |
| { |
| "completion_length": 75.125, |
| "epoch": 0.04718379446640316, |
| "grad_norm": 1.9302369432589355, |
| "kl": 0.06396484375, |
| "learning_rate": 9.530632411067194e-07, |
| "loss": 0.0026, |
| "reward": 2.8036766052246094, |
| "reward_std": 0.018778154626488686, |
| "rewards/accuracy_reward_stage2": 0.8036764860153198, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 191 |
| }, |
| { |
| "completion_length": 74.671875, |
| "epoch": 0.04743083003952569, |
| "grad_norm": 0.6450107155642202, |
| "kl": 0.05224609375, |
| "learning_rate": 9.528162055335968e-07, |
| "loss": 0.0021, |
| "reward": 2.761014223098755, |
| "reward_std": 0.022562123835086823, |
| "rewards/accuracy_reward_stage2": 0.767264187335968, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.125, |
| "step": 192 |
| }, |
| { |
| "completion_length": 82.265625, |
| "epoch": 0.047677865612648224, |
| "grad_norm": 2.7725448937542465, |
| "kl": 0.06494140625, |
| "learning_rate": 9.525691699604743e-07, |
| "loss": 0.0026, |
| "reward": 2.734375, |
| "reward_std": 0.15981829166412354, |
| "rewards/accuracy_reward_stage2": 0.796875, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 193 |
| }, |
| { |
| "completion_length": 109.21875, |
| "epoch": 0.04792490118577075, |
| "grad_norm": 3.3462144334568045, |
| "kl": 0.083984375, |
| "learning_rate": 9.523221343873518e-07, |
| "loss": 0.0034, |
| "reward": 2.294902801513672, |
| "reward_std": 0.17553241550922394, |
| "rewards/accuracy_reward_stage2": 0.3730279803276062, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.3125, |
| "step": 194 |
| }, |
| { |
| "completion_length": 88.421875, |
| "epoch": 0.04817193675889328, |
| "grad_norm": 2.6171281677758076, |
| "kl": 0.043212890625, |
| "learning_rate": 9.520750988142292e-07, |
| "loss": 0.0017, |
| "reward": 2.856783628463745, |
| "reward_std": 0.11217740178108215, |
| "rewards/accuracy_reward_stage2": 0.8672002553939819, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.078125, |
| "step": 195 |
| }, |
| { |
| "completion_length": 95.296875, |
| "epoch": 0.04841897233201581, |
| "grad_norm": 4.039571176888931, |
| "kl": 0.0439453125, |
| "learning_rate": 9.518280632411066e-07, |
| "loss": 0.0018, |
| "reward": 2.6629042625427246, |
| "reward_std": 0.12791165709495544, |
| "rewards/accuracy_reward_stage2": 0.6629043817520142, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 196 |
| }, |
| { |
| "completion_length": 73.0, |
| "epoch": 0.048666007905138337, |
| "grad_norm": 5.175846675590277, |
| "kl": 0.07080078125, |
| "learning_rate": 9.515810276679841e-07, |
| "loss": 0.0028, |
| "reward": 2.671309232711792, |
| "reward_std": 0.13797426223754883, |
| "rewards/accuracy_reward_stage2": 0.671309232711792, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 197 |
| }, |
| { |
| "completion_length": 61.75, |
| "epoch": 0.04891304347826087, |
| "grad_norm": 4.218652136398897, |
| "kl": 0.05126953125, |
| "learning_rate": 9.513339920948616e-07, |
| "loss": 0.002, |
| "reward": 2.655543088912964, |
| "reward_std": 0.08075182139873505, |
| "rewards/accuracy_reward_stage2": 0.6555430293083191, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 198 |
| }, |
| { |
| "completion_length": 71.8125, |
| "epoch": 0.0491600790513834, |
| "grad_norm": 3.247173481906321, |
| "kl": 0.0517578125, |
| "learning_rate": 9.51086956521739e-07, |
| "loss": 0.0021, |
| "reward": 2.669358253479004, |
| "reward_std": 0.13766998052597046, |
| "rewards/accuracy_reward_stage2": 0.7943581342697144, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 199 |
| }, |
| { |
| "completion_length": 93.953125, |
| "epoch": 0.04940711462450593, |
| "grad_norm": 3.734532512883597, |
| "kl": 0.05615234375, |
| "learning_rate": 9.508399209486166e-07, |
| "loss": 0.0022, |
| "reward": 2.4314818382263184, |
| "reward_std": 0.09143616259098053, |
| "rewards/accuracy_reward_stage2": 0.49398165941238403, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 200 |
| }, |
| { |
| "completion_length": 96.40625, |
| "epoch": 0.049654150197628456, |
| "grad_norm": 2.5177431696993207, |
| "kl": 0.0576171875, |
| "learning_rate": 9.50592885375494e-07, |
| "loss": 0.0023, |
| "reward": 2.620957136154175, |
| "reward_std": 0.09463383257389069, |
| "rewards/accuracy_reward_stage2": 0.6209571361541748, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 201 |
| }, |
| { |
| "completion_length": 84.5625, |
| "epoch": 0.04990118577075099, |
| "grad_norm": 4.086803835650893, |
| "kl": 0.0625, |
| "learning_rate": 9.503458498023716e-07, |
| "loss": 0.0025, |
| "reward": 2.77097225189209, |
| "reward_std": 0.15935085713863373, |
| "rewards/accuracy_reward_stage2": 0.7709720730781555, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 202 |
| }, |
| { |
| "completion_length": 87.765625, |
| "epoch": 0.05014822134387352, |
| "grad_norm": 2.694826701394137, |
| "kl": 0.056884765625, |
| "learning_rate": 9.50098814229249e-07, |
| "loss": 0.0023, |
| "reward": 2.6979928016662598, |
| "reward_std": 0.054207898676395416, |
| "rewards/accuracy_reward_stage2": 0.6979928016662598, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 203 |
| }, |
| { |
| "completion_length": 74.0, |
| "epoch": 0.05039525691699605, |
| "grad_norm": 2.722697467089469, |
| "kl": 0.054931640625, |
| "learning_rate": 9.498517786561264e-07, |
| "loss": 0.0022, |
| "reward": 2.6437833309173584, |
| "reward_std": 0.042557310312986374, |
| "rewards/accuracy_reward_stage2": 0.6437833309173584, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 204 |
| }, |
| { |
| "completion_length": 81.5625, |
| "epoch": 0.050642292490118576, |
| "grad_norm": 2.0886379248475095, |
| "kl": 0.06591796875, |
| "learning_rate": 9.496047430830039e-07, |
| "loss": 0.0026, |
| "reward": 2.9196255207061768, |
| "reward_std": 0.005610581487417221, |
| "rewards/accuracy_reward_stage2": 0.9196255207061768, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 205 |
| }, |
| { |
| "completion_length": 91.390625, |
| "epoch": 0.050889328063241104, |
| "grad_norm": 3.6807652962283637, |
| "kl": 0.0703125, |
| "learning_rate": 9.493577075098814e-07, |
| "loss": 0.0028, |
| "reward": 2.5606374740600586, |
| "reward_std": 0.09435243904590607, |
| "rewards/accuracy_reward_stage2": 0.5606374740600586, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 206 |
| }, |
| { |
| "completion_length": 100.796875, |
| "epoch": 0.05113636363636364, |
| "grad_norm": 3.222119591626723, |
| "kl": 0.06103515625, |
| "learning_rate": 9.491106719367588e-07, |
| "loss": 0.0024, |
| "reward": 2.5677084922790527, |
| "reward_std": 0.12332375347614288, |
| "rewards/accuracy_reward_stage2": 0.578125, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.171875, |
| "step": 207 |
| }, |
| { |
| "completion_length": 96.5625, |
| "epoch": 0.05138339920948617, |
| "grad_norm": 4.428189055231432, |
| "kl": 0.080078125, |
| "learning_rate": 9.488636363636363e-07, |
| "loss": 0.0032, |
| "reward": 2.6954102516174316, |
| "reward_std": 0.10087625682353973, |
| "rewards/accuracy_reward_stage2": 0.6954102516174316, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 208 |
| }, |
| { |
| "completion_length": 85.1875, |
| "epoch": 0.051630434782608696, |
| "grad_norm": 5.422275433434783, |
| "kl": 0.060546875, |
| "learning_rate": 9.486166007905137e-07, |
| "loss": 0.0024, |
| "reward": 2.676901340484619, |
| "reward_std": 0.19727489352226257, |
| "rewards/accuracy_reward_stage2": 0.6769014596939087, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 209 |
| }, |
| { |
| "completion_length": 100.09375, |
| "epoch": 0.051877470355731224, |
| "grad_norm": 3.968883032154412, |
| "kl": 0.07421875, |
| "learning_rate": 9.483695652173913e-07, |
| "loss": 0.003, |
| "reward": 2.7428462505340576, |
| "reward_std": 0.1038169115781784, |
| "rewards/accuracy_reward_stage2": 0.7490963935852051, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.109375, |
| "step": 210 |
| }, |
| { |
| "completion_length": 88.46875, |
| "epoch": 0.05212450592885375, |
| "grad_norm": 3.4686533432761637, |
| "kl": 0.05908203125, |
| "learning_rate": 9.481225296442688e-07, |
| "loss": 0.0024, |
| "reward": 2.782975196838379, |
| "reward_std": 0.07370894402265549, |
| "rewards/accuracy_reward_stage2": 0.7829753160476685, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 211 |
| }, |
| { |
| "completion_length": 116.34375, |
| "epoch": 0.05237154150197629, |
| "grad_norm": 4.12293075856129, |
| "kl": 0.08544921875, |
| "learning_rate": 9.478754940711462e-07, |
| "loss": 0.0034, |
| "reward": 2.6525073051452637, |
| "reward_std": 0.06538750976324081, |
| "rewards/accuracy_reward_stage2": 0.6525071859359741, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 212 |
| }, |
| { |
| "completion_length": 90.5625, |
| "epoch": 0.052618577075098816, |
| "grad_norm": 5.499570840391799, |
| "kl": 0.158203125, |
| "learning_rate": 9.476284584980236e-07, |
| "loss": 0.0063, |
| "reward": 2.620041847229004, |
| "reward_std": 0.158025860786438, |
| "rewards/accuracy_reward_stage2": 0.6200418472290039, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 213 |
| }, |
| { |
| "completion_length": 101.046875, |
| "epoch": 0.052865612648221344, |
| "grad_norm": 2.8991649860690845, |
| "kl": 0.0517578125, |
| "learning_rate": 9.473814229249012e-07, |
| "loss": 0.0021, |
| "reward": 2.827629804611206, |
| "reward_std": 0.026947414502501488, |
| "rewards/accuracy_reward_stage2": 0.8276296854019165, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 214 |
| }, |
| { |
| "completion_length": 73.875, |
| "epoch": 0.05311264822134387, |
| "grad_norm": 2.3688254496479493, |
| "kl": 0.05029296875, |
| "learning_rate": 9.471343873517786e-07, |
| "loss": 0.002, |
| "reward": 2.40625, |
| "reward_std": 0.033407654613256454, |
| "rewards/accuracy_reward_stage2": 0.40625, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 215 |
| }, |
| { |
| "completion_length": 98.703125, |
| "epoch": 0.0533596837944664, |
| "grad_norm": 4.150945077897221, |
| "kl": 0.06201171875, |
| "learning_rate": 9.468873517786561e-07, |
| "loss": 0.0025, |
| "reward": 2.7674636840820312, |
| "reward_std": 0.1500820368528366, |
| "rewards/accuracy_reward_stage2": 0.7674636840820312, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 216 |
| }, |
| { |
| "completion_length": 80.0, |
| "epoch": 0.053606719367588936, |
| "grad_norm": 3.428179312141899, |
| "kl": 0.07421875, |
| "learning_rate": 9.466403162055335e-07, |
| "loss": 0.003, |
| "reward": 2.686192274093628, |
| "reward_std": 0.023959007114171982, |
| "rewards/accuracy_reward_stage2": 0.6861922740936279, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 217 |
| }, |
| { |
| "completion_length": 113.328125, |
| "epoch": 0.053853754940711464, |
| "grad_norm": 3.9383187627551157, |
| "kl": 0.048583984375, |
| "learning_rate": 9.463932806324109e-07, |
| "loss": 0.0019, |
| "reward": 2.5574302673339844, |
| "reward_std": 0.25754785537719727, |
| "rewards/accuracy_reward_stage2": 0.750138521194458, |
| "rewards/format_reward_all_stage": 1.8072917461395264, |
| "scores/refine_times": 1.25, |
| "step": 218 |
| }, |
| { |
| "completion_length": 131.21875, |
| "epoch": 0.05410079051383399, |
| "grad_norm": 3.309098889669061, |
| "kl": 0.049072265625, |
| "learning_rate": 9.461462450592886e-07, |
| "loss": 0.002, |
| "reward": 2.5466530323028564, |
| "reward_std": 0.20897339284420013, |
| "rewards/accuracy_reward_stage2": 0.6716530323028564, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.1875, |
| "step": 219 |
| }, |
| { |
| "completion_length": 86.078125, |
| "epoch": 0.05434782608695652, |
| "grad_norm": 4.946057628757396, |
| "kl": 0.0478515625, |
| "learning_rate": 9.45899209486166e-07, |
| "loss": 0.0019, |
| "reward": 2.5970003604888916, |
| "reward_std": 0.164507195353508, |
| "rewards/accuracy_reward_stage2": 0.597000241279602, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 220 |
| }, |
| { |
| "completion_length": 95.546875, |
| "epoch": 0.05459486166007905, |
| "grad_norm": 2.4617835123837835, |
| "kl": 0.052978515625, |
| "learning_rate": 9.456521739130434e-07, |
| "loss": 0.0021, |
| "reward": 2.8430728912353516, |
| "reward_std": 0.050867728888988495, |
| "rewards/accuracy_reward_stage2": 0.843072772026062, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 221 |
| }, |
| { |
| "completion_length": 85.625, |
| "epoch": 0.054841897233201584, |
| "grad_norm": 12.694258343723256, |
| "kl": 0.271484375, |
| "learning_rate": 9.454051383399209e-07, |
| "loss": 0.0109, |
| "reward": 2.516552448272705, |
| "reward_std": 0.07157387584447861, |
| "rewards/accuracy_reward_stage2": 0.5790524482727051, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.15625, |
| "step": 222 |
| }, |
| { |
| "completion_length": 121.484375, |
| "epoch": 0.05508893280632411, |
| "grad_norm": 3.2893095767759926, |
| "kl": 0.04833984375, |
| "learning_rate": 9.451581027667984e-07, |
| "loss": 0.0019, |
| "reward": 2.6593329906463623, |
| "reward_std": 0.21269533038139343, |
| "rewards/accuracy_reward_stage2": 0.7197496294975281, |
| "rewards/format_reward_all_stage": 1.9395833015441895, |
| "scores/refine_times": 1.46875, |
| "step": 223 |
| }, |
| { |
| "completion_length": 95.046875, |
| "epoch": 0.05533596837944664, |
| "grad_norm": 2.2737834746774084, |
| "kl": 0.0615234375, |
| "learning_rate": 9.449110671936758e-07, |
| "loss": 0.0025, |
| "reward": 2.78125, |
| "reward_std": 0.1246790662407875, |
| "rewards/accuracy_reward_stage2": 0.78125, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 224 |
| }, |
| { |
| "completion_length": 107.421875, |
| "epoch": 0.05558300395256917, |
| "grad_norm": 2.662768928485288, |
| "kl": 0.05615234375, |
| "learning_rate": 9.446640316205533e-07, |
| "loss": 0.0022, |
| "reward": 2.66047739982605, |
| "reward_std": 0.020462632179260254, |
| "rewards/accuracy_reward_stage2": 0.6604773998260498, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 225 |
| }, |
| { |
| "completion_length": 76.390625, |
| "epoch": 0.055830039525691696, |
| "grad_norm": 5.91370963368309, |
| "kl": 0.06591796875, |
| "learning_rate": 9.444169960474307e-07, |
| "loss": 0.0026, |
| "reward": 2.7473721504211426, |
| "reward_std": 0.16072911024093628, |
| "rewards/accuracy_reward_stage2": 0.7473721504211426, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 226 |
| }, |
| { |
| "completion_length": 94.015625, |
| "epoch": 0.05607707509881423, |
| "grad_norm": 4.076250713612817, |
| "kl": 0.04150390625, |
| "learning_rate": 9.441699604743083e-07, |
| "loss": 0.0017, |
| "reward": 2.6946887969970703, |
| "reward_std": 0.07815377414226532, |
| "rewards/accuracy_reward_stage2": 0.6946887969970703, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 227 |
| }, |
| { |
| "completion_length": 110.984375, |
| "epoch": 0.05632411067193676, |
| "grad_norm": 5.016186365433335, |
| "kl": 0.05224609375, |
| "learning_rate": 9.439229249011858e-07, |
| "loss": 0.0021, |
| "reward": 2.7810983657836914, |
| "reward_std": 0.1505921632051468, |
| "rewards/accuracy_reward_stage2": 0.7810983657836914, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 228 |
| }, |
| { |
| "completion_length": 88.296875, |
| "epoch": 0.05657114624505929, |
| "grad_norm": 4.303632516413331, |
| "kl": 0.05810546875, |
| "learning_rate": 9.436758893280632e-07, |
| "loss": 0.0023, |
| "reward": 2.471911907196045, |
| "reward_std": 0.03463466465473175, |
| "rewards/accuracy_reward_stage2": 0.4719120264053345, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 229 |
| }, |
| { |
| "completion_length": 89.75, |
| "epoch": 0.056818181818181816, |
| "grad_norm": 1.0465741722158528, |
| "kl": 0.05810546875, |
| "learning_rate": 9.434288537549407e-07, |
| "loss": 0.0023, |
| "reward": 2.6418185234069824, |
| "reward_std": 0.03043236769735813, |
| "rewards/accuracy_reward_stage2": 0.6418185234069824, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 230 |
| }, |
| { |
| "completion_length": 116.359375, |
| "epoch": 0.057065217391304345, |
| "grad_norm": 4.845711684305512, |
| "kl": 0.0634765625, |
| "learning_rate": 9.431818181818182e-07, |
| "loss": 0.0025, |
| "reward": 2.639404296875, |
| "reward_std": 0.13679620623588562, |
| "rewards/accuracy_reward_stage2": 0.6394043564796448, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 231 |
| }, |
| { |
| "completion_length": 98.125, |
| "epoch": 0.05731225296442688, |
| "grad_norm": 4.2105692164904465, |
| "kl": 0.07763671875, |
| "learning_rate": 9.429347826086956e-07, |
| "loss": 0.0031, |
| "reward": 2.609795570373535, |
| "reward_std": 0.08919590711593628, |
| "rewards/accuracy_reward_stage2": 0.6097957491874695, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 232 |
| }, |
| { |
| "completion_length": 92.203125, |
| "epoch": 0.05755928853754941, |
| "grad_norm": 1.250927159243389, |
| "kl": 0.059326171875, |
| "learning_rate": 9.426877470355731e-07, |
| "loss": 0.0024, |
| "reward": 2.8027873039245605, |
| "reward_std": 0.014970174990594387, |
| "rewards/accuracy_reward_stage2": 0.8027871251106262, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 233 |
| }, |
| { |
| "completion_length": 95.65625, |
| "epoch": 0.057806324110671936, |
| "grad_norm": 1.7812890613719796, |
| "kl": 0.0634765625, |
| "learning_rate": 9.424407114624505e-07, |
| "loss": 0.0025, |
| "reward": 2.804018020629883, |
| "reward_std": 0.04018682241439819, |
| "rewards/accuracy_reward_stage2": 0.8040179014205933, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 234 |
| }, |
| { |
| "completion_length": 108.96875, |
| "epoch": 0.058053359683794464, |
| "grad_norm": 3.4975869844535055, |
| "kl": 0.06103515625, |
| "learning_rate": 9.421936758893279e-07, |
| "loss": 0.0024, |
| "reward": 2.629866123199463, |
| "reward_std": 0.11631540954113007, |
| "rewards/accuracy_reward_stage2": 0.6559078693389893, |
| "rewards/format_reward_all_stage": 1.9739583730697632, |
| "scores/refine_times": 1.265625, |
| "step": 235 |
| }, |
| { |
| "completion_length": 147.921875, |
| "epoch": 0.058300395256917, |
| "grad_norm": 4.268984542856603, |
| "kl": 0.0556640625, |
| "learning_rate": 9.419466403162055e-07, |
| "loss": 0.0022, |
| "reward": 2.572082042694092, |
| "reward_std": 0.15964874625205994, |
| "rewards/accuracy_reward_stage2": 0.6017696857452393, |
| "rewards/format_reward_all_stage": 1.970312476158142, |
| "scores/refine_times": 1.53125, |
| "step": 236 |
| }, |
| { |
| "completion_length": 115.21875, |
| "epoch": 0.05854743083003953, |
| "grad_norm": 4.65588209600243, |
| "kl": 0.0546875, |
| "learning_rate": 9.41699604743083e-07, |
| "loss": 0.0022, |
| "reward": 2.402869701385498, |
| "reward_std": 0.23863497376441956, |
| "rewards/accuracy_reward_stage2": 0.5341198444366455, |
| "rewards/format_reward_all_stage": 1.868749976158142, |
| "scores/refine_times": 1.25, |
| "step": 237 |
| }, |
| { |
| "completion_length": 106.515625, |
| "epoch": 0.058794466403162056, |
| "grad_norm": 3.984260162520113, |
| "kl": 0.0703125, |
| "learning_rate": 9.414525691699604e-07, |
| "loss": 0.0028, |
| "reward": 2.614504098892212, |
| "reward_std": 0.06337256729602814, |
| "rewards/accuracy_reward_stage2": 0.6207541823387146, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.28125, |
| "step": 238 |
| }, |
| { |
| "completion_length": 86.296875, |
| "epoch": 0.059041501976284584, |
| "grad_norm": 3.916424716324547, |
| "kl": 0.06884765625, |
| "learning_rate": 9.412055335968379e-07, |
| "loss": 0.0028, |
| "reward": 2.776068687438965, |
| "reward_std": 0.0822492316365242, |
| "rewards/accuracy_reward_stage2": 0.7760688066482544, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 239 |
| }, |
| { |
| "completion_length": 100.875, |
| "epoch": 0.05928853754940711, |
| "grad_norm": 4.688640502699326, |
| "kl": 0.0556640625, |
| "learning_rate": 9.409584980237154e-07, |
| "loss": 0.0022, |
| "reward": 2.622417449951172, |
| "reward_std": 0.17500078678131104, |
| "rewards/accuracy_reward_stage2": 0.6286673545837402, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.171875, |
| "step": 240 |
| }, |
| { |
| "completion_length": 95.921875, |
| "epoch": 0.05953557312252965, |
| "grad_norm": 4.039431146320975, |
| "kl": 0.057373046875, |
| "learning_rate": 9.407114624505929e-07, |
| "loss": 0.0023, |
| "reward": 2.611854314804077, |
| "reward_std": 0.13111275434494019, |
| "rewards/accuracy_reward_stage2": 0.6118543148040771, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 241 |
| }, |
| { |
| "completion_length": 89.5625, |
| "epoch": 0.059782608695652176, |
| "grad_norm": 3.2997919352845786, |
| "kl": 0.050048828125, |
| "learning_rate": 9.404644268774703e-07, |
| "loss": 0.002, |
| "reward": 2.652923822402954, |
| "reward_std": 0.11734248697757721, |
| "rewards/accuracy_reward_stage2": 0.7102153897285461, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.078125, |
| "step": 242 |
| }, |
| { |
| "completion_length": 104.640625, |
| "epoch": 0.060029644268774704, |
| "grad_norm": 5.076461913642015, |
| "kl": 0.056396484375, |
| "learning_rate": 9.402173913043477e-07, |
| "loss": 0.0023, |
| "reward": 2.7080979347229004, |
| "reward_std": 0.04040906950831413, |
| "rewards/accuracy_reward_stage2": 0.7080979347229004, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 243 |
| }, |
| { |
| "completion_length": 77.375, |
| "epoch": 0.06027667984189723, |
| "grad_norm": 1.995536991642715, |
| "kl": 0.057373046875, |
| "learning_rate": 9.399703557312253e-07, |
| "loss": 0.0023, |
| "reward": 2.7579073905944824, |
| "reward_std": 0.0003560198238119483, |
| "rewards/accuracy_reward_stage2": 0.7579072713851929, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 244 |
| }, |
| { |
| "completion_length": 123.328125, |
| "epoch": 0.06052371541501976, |
| "grad_norm": 3.4556227655695357, |
| "kl": 0.046875, |
| "learning_rate": 9.397233201581027e-07, |
| "loss": 0.0019, |
| "reward": 2.779745578765869, |
| "reward_std": 0.10972259938716888, |
| "rewards/accuracy_reward_stage2": 0.7953706383705139, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.328125, |
| "step": 245 |
| }, |
| { |
| "completion_length": 98.265625, |
| "epoch": 0.060770750988142296, |
| "grad_norm": 3.8073316504950308, |
| "kl": 0.05419921875, |
| "learning_rate": 9.394762845849802e-07, |
| "loss": 0.0022, |
| "reward": 2.611069679260254, |
| "reward_std": 0.15946999192237854, |
| "rewards/accuracy_reward_stage2": 0.7360695600509644, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 246 |
| }, |
| { |
| "completion_length": 107.1875, |
| "epoch": 0.061017786561264824, |
| "grad_norm": 2.2530508679929206, |
| "kl": 0.05712890625, |
| "learning_rate": 9.392292490118577e-07, |
| "loss": 0.0023, |
| "reward": 2.6858139038085938, |
| "reward_std": 0.03068363480269909, |
| "rewards/accuracy_reward_stage2": 0.6858140230178833, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 247 |
| }, |
| { |
| "completion_length": 81.421875, |
| "epoch": 0.06126482213438735, |
| "grad_norm": 4.610246667877454, |
| "kl": 0.06689453125, |
| "learning_rate": 9.389822134387352e-07, |
| "loss": 0.0027, |
| "reward": 2.4801125526428223, |
| "reward_std": 0.12876607477664948, |
| "rewards/accuracy_reward_stage2": 0.49052929878234863, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.078125, |
| "step": 248 |
| }, |
| { |
| "completion_length": 133.15625, |
| "epoch": 0.06151185770750988, |
| "grad_norm": 3.656412135533561, |
| "kl": 0.05615234375, |
| "learning_rate": 9.387351778656126e-07, |
| "loss": 0.0022, |
| "reward": 2.7299704551696777, |
| "reward_std": 0.15535692870616913, |
| "rewards/accuracy_reward_stage2": 0.7612204551696777, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.46875, |
| "step": 249 |
| }, |
| { |
| "completion_length": 111.03125, |
| "epoch": 0.06175889328063241, |
| "grad_norm": 2.9745455486264785, |
| "kl": 0.068359375, |
| "learning_rate": 9.384881422924901e-07, |
| "loss": 0.0027, |
| "reward": 2.878418445587158, |
| "reward_std": 0.08700025826692581, |
| "rewards/accuracy_reward_stage2": 0.8784183263778687, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 250 |
| }, |
| { |
| "completion_length": 108.640625, |
| "epoch": 0.062005928853754944, |
| "grad_norm": 4.592700429840661, |
| "kl": 0.076171875, |
| "learning_rate": 9.382411067193675e-07, |
| "loss": 0.0031, |
| "reward": 2.640062093734741, |
| "reward_std": 0.13130618631839752, |
| "rewards/accuracy_reward_stage2": 0.6400620341300964, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 251 |
| }, |
| { |
| "completion_length": 93.375, |
| "epoch": 0.06225296442687747, |
| "grad_norm": 4.080049204812087, |
| "kl": 0.06201171875, |
| "learning_rate": 9.37994071146245e-07, |
| "loss": 0.0025, |
| "reward": 2.484543561935425, |
| "reward_std": 0.03272121399641037, |
| "rewards/accuracy_reward_stage2": 0.48454350233078003, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 252 |
| }, |
| { |
| "completion_length": 133.703125, |
| "epoch": 0.0625, |
| "grad_norm": 3.8766421314465203, |
| "kl": 0.06884765625, |
| "learning_rate": 9.377470355731225e-07, |
| "loss": 0.0028, |
| "reward": 2.6636319160461426, |
| "reward_std": 0.12454654276371002, |
| "rewards/accuracy_reward_stage2": 0.6792569160461426, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.453125, |
| "step": 253 |
| }, |
| { |
| "completion_length": 102.640625, |
| "epoch": 0.06274703557312253, |
| "grad_norm": 3.125502153944731, |
| "kl": 0.064453125, |
| "learning_rate": 9.374999999999999e-07, |
| "loss": 0.0026, |
| "reward": 2.661580801010132, |
| "reward_std": 0.0767604261636734, |
| "rewards/accuracy_reward_stage2": 0.6615808606147766, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 254 |
| }, |
| { |
| "completion_length": 100.875, |
| "epoch": 0.06299407114624506, |
| "grad_norm": 2.4047815253285703, |
| "kl": 0.08154296875, |
| "learning_rate": 9.372529644268774e-07, |
| "loss": 0.0033, |
| "reward": 2.870173692703247, |
| "reward_std": 0.04427599906921387, |
| "rewards/accuracy_reward_stage2": 0.8701735734939575, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 255 |
| }, |
| { |
| "completion_length": 102.046875, |
| "epoch": 0.06324110671936758, |
| "grad_norm": 3.9729431573321565, |
| "kl": 0.06982421875, |
| "learning_rate": 9.370059288537549e-07, |
| "loss": 0.0028, |
| "reward": 2.5646939277648926, |
| "reward_std": 0.03415513038635254, |
| "rewards/accuracy_reward_stage2": 0.5646939873695374, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 256 |
| }, |
| { |
| "completion_length": 87.359375, |
| "epoch": 0.06348814229249011, |
| "grad_norm": 2.8984209672527923, |
| "kl": 0.05712890625, |
| "learning_rate": 9.367588932806324e-07, |
| "loss": 0.0023, |
| "reward": 2.9039530754089355, |
| "reward_std": 0.048997893929481506, |
| "rewards/accuracy_reward_stage2": 0.9039530158042908, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 257 |
| }, |
| { |
| "completion_length": 81.203125, |
| "epoch": 0.06373517786561265, |
| "grad_norm": 3.197191357621096, |
| "kl": 0.0693359375, |
| "learning_rate": 9.365118577075099e-07, |
| "loss": 0.0028, |
| "reward": 2.474766254425049, |
| "reward_std": 0.017342764884233475, |
| "rewards/accuracy_reward_stage2": 0.4747660756111145, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 258 |
| }, |
| { |
| "completion_length": 96.390625, |
| "epoch": 0.06398221343873518, |
| "grad_norm": 3.5385421006445625, |
| "kl": 0.0654296875, |
| "learning_rate": 9.362648221343873e-07, |
| "loss": 0.0026, |
| "reward": 2.3430323600769043, |
| "reward_std": 0.12197308242321014, |
| "rewards/accuracy_reward_stage2": 0.3430321514606476, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 259 |
| }, |
| { |
| "completion_length": 76.875, |
| "epoch": 0.06422924901185771, |
| "grad_norm": 3.6584345558191624, |
| "kl": 0.068359375, |
| "learning_rate": 9.360177865612647e-07, |
| "loss": 0.0027, |
| "reward": 2.3733391761779785, |
| "reward_std": 0.041200902312994, |
| "rewards/accuracy_reward_stage2": 0.3733389973640442, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 260 |
| }, |
| { |
| "completion_length": 119.6875, |
| "epoch": 0.06447628458498024, |
| "grad_norm": 3.183150779739053, |
| "kl": 0.061767578125, |
| "learning_rate": 9.357707509881423e-07, |
| "loss": 0.0025, |
| "reward": 2.5673301219940186, |
| "reward_std": 0.09498921036720276, |
| "rewards/accuracy_reward_stage2": 0.5751426219940186, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.359375, |
| "step": 261 |
| }, |
| { |
| "completion_length": 144.78125, |
| "epoch": 0.06472332015810277, |
| "grad_norm": 3.550233651334434, |
| "kl": 0.06396484375, |
| "learning_rate": 9.355237154150197e-07, |
| "loss": 0.0026, |
| "reward": 2.5922813415527344, |
| "reward_std": 0.25461798906326294, |
| "rewards/accuracy_reward_stage2": 0.7172813415527344, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.609375, |
| "step": 262 |
| }, |
| { |
| "completion_length": 117.859375, |
| "epoch": 0.0649703557312253, |
| "grad_norm": 3.1484490414114945, |
| "kl": 0.060302734375, |
| "learning_rate": 9.352766798418971e-07, |
| "loss": 0.0024, |
| "reward": 2.6294641494750977, |
| "reward_std": 0.16589348018169403, |
| "rewards/accuracy_reward_stage2": 0.6294642686843872, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 263 |
| }, |
| { |
| "completion_length": 121.28125, |
| "epoch": 0.06521739130434782, |
| "grad_norm": 2.30350469441656, |
| "kl": 0.06982421875, |
| "learning_rate": 9.350296442687746e-07, |
| "loss": 0.0028, |
| "reward": 2.70902681350708, |
| "reward_std": 0.09053189307451248, |
| "rewards/accuracy_reward_stage2": 0.716839075088501, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.40625, |
| "step": 264 |
| }, |
| { |
| "completion_length": 93.109375, |
| "epoch": 0.06546442687747035, |
| "grad_norm": 5.426621994742126, |
| "kl": 0.0654296875, |
| "learning_rate": 9.347826086956522e-07, |
| "loss": 0.0026, |
| "reward": 2.4706788063049316, |
| "reward_std": 0.0817948505282402, |
| "rewards/accuracy_reward_stage2": 0.4706789255142212, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 265 |
| }, |
| { |
| "completion_length": 122.046875, |
| "epoch": 0.06571146245059288, |
| "grad_norm": 4.159572161738214, |
| "kl": 0.1298828125, |
| "learning_rate": 9.345355731225297e-07, |
| "loss": 0.0052, |
| "reward": 2.4963176250457764, |
| "reward_std": 0.1702859252691269, |
| "rewards/accuracy_reward_stage2": 0.5770467519760132, |
| "rewards/format_reward_all_stage": 1.9192708730697632, |
| "scores/refine_times": 1.4375, |
| "step": 266 |
| }, |
| { |
| "completion_length": 94.671875, |
| "epoch": 0.06595849802371541, |
| "grad_norm": 3.003493248206179, |
| "kl": 0.068359375, |
| "learning_rate": 9.342885375494071e-07, |
| "loss": 0.0027, |
| "reward": 2.7748560905456543, |
| "reward_std": 0.06840323656797409, |
| "rewards/accuracy_reward_stage2": 0.79048091173172, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.0625, |
| "step": 267 |
| }, |
| { |
| "completion_length": 111.65625, |
| "epoch": 0.06620553359683795, |
| "grad_norm": 3.0695130053036466, |
| "kl": 0.0703125, |
| "learning_rate": 9.340415019762845e-07, |
| "loss": 0.0028, |
| "reward": 2.609114170074463, |
| "reward_std": 0.11976105719804764, |
| "rewards/accuracy_reward_stage2": 0.6091140508651733, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 268 |
| }, |
| { |
| "completion_length": 128.21875, |
| "epoch": 0.06645256916996048, |
| "grad_norm": 3.7341130409363754, |
| "kl": 0.08349609375, |
| "learning_rate": 9.337944664031621e-07, |
| "loss": 0.0033, |
| "reward": 2.717738151550293, |
| "reward_std": 0.19703274965286255, |
| "rewards/accuracy_reward_stage2": 0.8271132707595825, |
| "rewards/format_reward_all_stage": 1.890625, |
| "scores/refine_times": 1.390625, |
| "step": 269 |
| }, |
| { |
| "completion_length": 122.96875, |
| "epoch": 0.06669960474308301, |
| "grad_norm": 3.5044994368102524, |
| "kl": 0.07958984375, |
| "learning_rate": 9.335474308300395e-07, |
| "loss": 0.0032, |
| "reward": 2.6777563095092773, |
| "reward_std": 0.08365817368030548, |
| "rewards/accuracy_reward_stage2": 0.7402562499046326, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.359375, |
| "step": 270 |
| }, |
| { |
| "completion_length": 145.765625, |
| "epoch": 0.06694664031620554, |
| "grad_norm": 1.1667471179803168, |
| "kl": 0.068359375, |
| "learning_rate": 9.333003952569169e-07, |
| "loss": 0.0027, |
| "reward": 2.629354953765869, |
| "reward_std": 0.06412048637866974, |
| "rewards/accuracy_reward_stage2": 0.6606047749519348, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.4375, |
| "step": 271 |
| }, |
| { |
| "completion_length": 103.90625, |
| "epoch": 0.06719367588932806, |
| "grad_norm": 4.510053925417986, |
| "kl": 0.072265625, |
| "learning_rate": 9.330533596837944e-07, |
| "loss": 0.0029, |
| "reward": 2.509962797164917, |
| "reward_std": 0.11727539449930191, |
| "rewards/accuracy_reward_stage2": 0.5099626779556274, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 272 |
| }, |
| { |
| "completion_length": 104.140625, |
| "epoch": 0.06744071146245059, |
| "grad_norm": 4.329434972997867, |
| "kl": 0.07861328125, |
| "learning_rate": 9.328063241106719e-07, |
| "loss": 0.0032, |
| "reward": 2.725374221801758, |
| "reward_std": 0.0988876223564148, |
| "rewards/accuracy_reward_stage2": 0.7253742218017578, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 273 |
| }, |
| { |
| "completion_length": 103.125, |
| "epoch": 0.06768774703557312, |
| "grad_norm": 1.6312530479379586, |
| "kl": 0.0771484375, |
| "learning_rate": 9.325592885375494e-07, |
| "loss": 0.0031, |
| "reward": 2.735292673110962, |
| "reward_std": 0.05476506054401398, |
| "rewards/accuracy_reward_stage2": 0.7509176731109619, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.234375, |
| "step": 274 |
| }, |
| { |
| "completion_length": 122.015625, |
| "epoch": 0.06793478260869565, |
| "grad_norm": 3.165395542950571, |
| "kl": 0.07470703125, |
| "learning_rate": 9.323122529644269e-07, |
| "loss": 0.003, |
| "reward": 2.4879045486450195, |
| "reward_std": 0.10839006304740906, |
| "rewards/accuracy_reward_stage2": 0.498320996761322, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.265625, |
| "step": 275 |
| }, |
| { |
| "completion_length": 104.796875, |
| "epoch": 0.06818181818181818, |
| "grad_norm": 2.9714488899285985, |
| "kl": 0.076171875, |
| "learning_rate": 9.320652173913043e-07, |
| "loss": 0.003, |
| "reward": 2.40559983253479, |
| "reward_std": 0.019545651972293854, |
| "rewards/accuracy_reward_stage2": 0.40559983253479004, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 276 |
| }, |
| { |
| "completion_length": 110.53125, |
| "epoch": 0.0684288537549407, |
| "grad_norm": 3.719685696298156, |
| "kl": 0.06298828125, |
| "learning_rate": 9.318181818181817e-07, |
| "loss": 0.0025, |
| "reward": 2.709977149963379, |
| "reward_std": 0.09086070209741592, |
| "rewards/accuracy_reward_stage2": 0.7672686576843262, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.28125, |
| "step": 277 |
| }, |
| { |
| "completion_length": 127.390625, |
| "epoch": 0.06867588932806325, |
| "grad_norm": 4.0171584662369675, |
| "kl": 0.07666015625, |
| "learning_rate": 9.315711462450593e-07, |
| "loss": 0.0031, |
| "reward": 2.358762264251709, |
| "reward_std": 0.09046010673046112, |
| "rewards/accuracy_reward_stage2": 0.3712621331214905, |
| "rewards/format_reward_all_stage": 1.9874999523162842, |
| "scores/refine_times": 1.375, |
| "step": 278 |
| }, |
| { |
| "completion_length": 96.125, |
| "epoch": 0.06892292490118578, |
| "grad_norm": 3.549293268141929, |
| "kl": 0.0859375, |
| "learning_rate": 9.313241106719367e-07, |
| "loss": 0.0034, |
| "reward": 2.6358108520507812, |
| "reward_std": 0.05308837443590164, |
| "rewards/accuracy_reward_stage2": 0.6514356136322021, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.25, |
| "step": 279 |
| }, |
| { |
| "completion_length": 111.5625, |
| "epoch": 0.0691699604743083, |
| "grad_norm": 3.426826676856291, |
| "kl": 0.0859375, |
| "learning_rate": 9.310770750988141e-07, |
| "loss": 0.0034, |
| "reward": 2.629148006439209, |
| "reward_std": 0.22529111802577972, |
| "rewards/accuracy_reward_stage2": 0.7541481256484985, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.34375, |
| "step": 280 |
| }, |
| { |
| "completion_length": 69.171875, |
| "epoch": 0.06941699604743083, |
| "grad_norm": 4.897619808608827, |
| "kl": 0.0888671875, |
| "learning_rate": 9.308300395256916e-07, |
| "loss": 0.0036, |
| "reward": 2.506143093109131, |
| "reward_std": 0.1796988546848297, |
| "rewards/accuracy_reward_stage2": 0.5061431527137756, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 281 |
| }, |
| { |
| "completion_length": 121.375, |
| "epoch": 0.06966403162055336, |
| "grad_norm": 3.280396954123298, |
| "kl": 0.0732421875, |
| "learning_rate": 9.305830039525691e-07, |
| "loss": 0.0029, |
| "reward": 2.4993786811828613, |
| "reward_std": 0.15204885601997375, |
| "rewards/accuracy_reward_stage2": 0.5566701889038086, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.34375, |
| "step": 282 |
| }, |
| { |
| "completion_length": 100.046875, |
| "epoch": 0.06991106719367589, |
| "grad_norm": 3.1470297174282957, |
| "kl": 0.05859375, |
| "learning_rate": 9.303359683794467e-07, |
| "loss": 0.0023, |
| "reward": 2.753185749053955, |
| "reward_std": 0.04245923087000847, |
| "rewards/accuracy_reward_stage2": 0.7531858086585999, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 283 |
| }, |
| { |
| "completion_length": 107.484375, |
| "epoch": 0.07015810276679842, |
| "grad_norm": 4.1904751636477195, |
| "kl": 0.06494140625, |
| "learning_rate": 9.300889328063241e-07, |
| "loss": 0.0026, |
| "reward": 2.5296671390533447, |
| "reward_std": 0.1889445036649704, |
| "rewards/accuracy_reward_stage2": 0.5973755717277527, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.328125, |
| "step": 284 |
| }, |
| { |
| "completion_length": 91.015625, |
| "epoch": 0.07040513833992094, |
| "grad_norm": 3.489184308124264, |
| "kl": 0.05517578125, |
| "learning_rate": 9.298418972332015e-07, |
| "loss": 0.0022, |
| "reward": 2.7620816230773926, |
| "reward_std": 0.030113043263554573, |
| "rewards/accuracy_reward_stage2": 0.7620817422866821, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 285 |
| }, |
| { |
| "completion_length": 100.1875, |
| "epoch": 0.07065217391304347, |
| "grad_norm": 3.275397968060478, |
| "kl": 0.06396484375, |
| "learning_rate": 9.295948616600791e-07, |
| "loss": 0.0026, |
| "reward": 2.6670258045196533, |
| "reward_std": 0.13048681616783142, |
| "rewards/accuracy_reward_stage2": 0.6670258641242981, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.53125, |
| "step": 286 |
| }, |
| { |
| "completion_length": 126.015625, |
| "epoch": 0.070899209486166, |
| "grad_norm": 2.6549645591790405, |
| "kl": 0.06103515625, |
| "learning_rate": 9.293478260869565e-07, |
| "loss": 0.0024, |
| "reward": 2.641623020172119, |
| "reward_std": 0.22553008794784546, |
| "rewards/accuracy_reward_stage2": 0.774956226348877, |
| "rewards/format_reward_all_stage": 1.8666666746139526, |
| "scores/refine_times": 1.546875, |
| "step": 287 |
| }, |
| { |
| "completion_length": 104.671875, |
| "epoch": 0.07114624505928854, |
| "grad_norm": 3.636209601421511, |
| "kl": 0.06640625, |
| "learning_rate": 9.291007905138339e-07, |
| "loss": 0.0027, |
| "reward": 2.666322946548462, |
| "reward_std": 0.06766237318515778, |
| "rewards/accuracy_reward_stage2": 0.6741354465484619, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.34375, |
| "step": 288 |
| }, |
| { |
| "completion_length": 106.640625, |
| "epoch": 0.07139328063241107, |
| "grad_norm": 4.503204838778139, |
| "kl": 0.0595703125, |
| "learning_rate": 9.288537549407114e-07, |
| "loss": 0.0024, |
| "reward": 2.3697216510772705, |
| "reward_std": 0.09033536165952682, |
| "rewards/accuracy_reward_stage2": 0.3697216212749481, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 289 |
| }, |
| { |
| "completion_length": 104.0, |
| "epoch": 0.0716403162055336, |
| "grad_norm": 3.3246057454714024, |
| "kl": 0.06591796875, |
| "learning_rate": 9.286067193675889e-07, |
| "loss": 0.0026, |
| "reward": 2.5884175300598145, |
| "reward_std": 0.11718533933162689, |
| "rewards/accuracy_reward_stage2": 0.594667375087738, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.296875, |
| "step": 290 |
| }, |
| { |
| "completion_length": 103.296875, |
| "epoch": 0.07188735177865613, |
| "grad_norm": 3.7899360784183123, |
| "kl": 0.057861328125, |
| "learning_rate": 9.283596837944663e-07, |
| "loss": 0.0023, |
| "reward": 2.4471476078033447, |
| "reward_std": 0.12509356439113617, |
| "rewards/accuracy_reward_stage2": 0.5096475481987, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.3125, |
| "step": 291 |
| }, |
| { |
| "completion_length": 131.578125, |
| "epoch": 0.07213438735177866, |
| "grad_norm": 3.623459949472801, |
| "kl": 0.06591796875, |
| "learning_rate": 9.281126482213438e-07, |
| "loss": 0.0026, |
| "reward": 2.4355835914611816, |
| "reward_std": 0.17318613827228546, |
| "rewards/accuracy_reward_stage2": 0.5147501230239868, |
| "rewards/format_reward_all_stage": 1.9208333492279053, |
| "scores/refine_times": 1.4375, |
| "step": 292 |
| }, |
| { |
| "completion_length": 83.125, |
| "epoch": 0.07238142292490118, |
| "grad_norm": 4.8813524232727366, |
| "kl": 0.072265625, |
| "learning_rate": 9.278656126482213e-07, |
| "loss": 0.0029, |
| "reward": 2.58034610748291, |
| "reward_std": 0.18158404529094696, |
| "rewards/accuracy_reward_stage2": 0.5907625555992126, |
| "rewards/format_reward_all_stage": 1.9895832538604736, |
| "scores/refine_times": 1.15625, |
| "step": 293 |
| }, |
| { |
| "completion_length": 90.375, |
| "epoch": 0.07262845849802371, |
| "grad_norm": 3.4214252970122123, |
| "kl": 0.068359375, |
| "learning_rate": 9.276185770750988e-07, |
| "loss": 0.0027, |
| "reward": 2.790419578552246, |
| "reward_std": 0.07727587223052979, |
| "rewards/accuracy_reward_stage2": 0.8154194355010986, |
| "rewards/format_reward_all_stage": 1.975000023841858, |
| "scores/refine_times": 1.171875, |
| "step": 294 |
| }, |
| { |
| "completion_length": 97.0625, |
| "epoch": 0.07287549407114624, |
| "grad_norm": 2.9700993517489906, |
| "kl": 0.08349609375, |
| "learning_rate": 9.273715415019763e-07, |
| "loss": 0.0033, |
| "reward": 2.4873194694519043, |
| "reward_std": 0.01997227966785431, |
| "rewards/accuracy_reward_stage2": 0.4873194694519043, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 295 |
| }, |
| { |
| "completion_length": 72.28125, |
| "epoch": 0.07312252964426877, |
| "grad_norm": 3.329369020437154, |
| "kl": 0.0654296875, |
| "learning_rate": 9.271245059288537e-07, |
| "loss": 0.0026, |
| "reward": 2.7289249897003174, |
| "reward_std": 0.08622775226831436, |
| "rewards/accuracy_reward_stage2": 0.7289249300956726, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 296 |
| }, |
| { |
| "completion_length": 89.90625, |
| "epoch": 0.07336956521739131, |
| "grad_norm": 2.664881556264298, |
| "kl": 0.06689453125, |
| "learning_rate": 9.268774703557312e-07, |
| "loss": 0.0027, |
| "reward": 2.809821605682373, |
| "reward_std": 0.042036011815071106, |
| "rewards/accuracy_reward_stage2": 0.8202384114265442, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.234375, |
| "step": 297 |
| }, |
| { |
| "completion_length": 96.640625, |
| "epoch": 0.07361660079051384, |
| "grad_norm": 5.559187183825259, |
| "kl": 0.078125, |
| "learning_rate": 9.266304347826086e-07, |
| "loss": 0.0031, |
| "reward": 2.4564993381500244, |
| "reward_std": 0.17395678162574768, |
| "rewards/accuracy_reward_stage2": 0.5398328304290771, |
| "rewards/format_reward_all_stage": 1.9166667461395264, |
| "scores/refine_times": 1.34375, |
| "step": 298 |
| }, |
| { |
| "completion_length": 87.515625, |
| "epoch": 0.07386363636363637, |
| "grad_norm": 4.704220127005476, |
| "kl": 0.0869140625, |
| "learning_rate": 9.263833992094861e-07, |
| "loss": 0.0035, |
| "reward": 2.6666579246520996, |
| "reward_std": 0.06254095584154129, |
| "rewards/accuracy_reward_stage2": 0.6666580438613892, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 299 |
| }, |
| { |
| "completion_length": 84.578125, |
| "epoch": 0.0741106719367589, |
| "grad_norm": 4.262104472530315, |
| "kl": 0.07666015625, |
| "learning_rate": 9.261363636363636e-07, |
| "loss": 0.0031, |
| "reward": 2.7805142402648926, |
| "reward_std": 0.08827356994152069, |
| "rewards/accuracy_reward_stage2": 0.7961392998695374, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.1875, |
| "step": 300 |
| }, |
| { |
| "completion_length": 77.40625, |
| "epoch": 0.07435770750988142, |
| "grad_norm": 4.096353557974333, |
| "kl": 0.10693359375, |
| "learning_rate": 9.25889328063241e-07, |
| "loss": 0.0043, |
| "reward": 2.542158603668213, |
| "reward_std": 0.06200522556900978, |
| "rewards/accuracy_reward_stage2": 0.5421587228775024, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 301 |
| }, |
| { |
| "completion_length": 78.78125, |
| "epoch": 0.07460474308300395, |
| "grad_norm": 4.001605680444497, |
| "kl": 0.0830078125, |
| "learning_rate": 9.256422924901185e-07, |
| "loss": 0.0033, |
| "reward": 2.708712339401245, |
| "reward_std": 0.006489424966275692, |
| "rewards/accuracy_reward_stage2": 0.7087122797966003, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 302 |
| }, |
| { |
| "completion_length": 81.09375, |
| "epoch": 0.07485177865612648, |
| "grad_norm": 3.3709487646989804, |
| "kl": 0.10546875, |
| "learning_rate": 9.253952569169961e-07, |
| "loss": 0.0042, |
| "reward": 2.5341782569885254, |
| "reward_std": 0.15571407973766327, |
| "rewards/accuracy_reward_stage2": 0.6591783761978149, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 303 |
| }, |
| { |
| "completion_length": 65.484375, |
| "epoch": 0.07509881422924901, |
| "grad_norm": 3.3434150332690695, |
| "kl": 0.072265625, |
| "learning_rate": 9.251482213438735e-07, |
| "loss": 0.0029, |
| "reward": 2.7836451530456543, |
| "reward_std": 0.04097224026918411, |
| "rewards/accuracy_reward_stage2": 0.7836451530456543, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 304 |
| }, |
| { |
| "completion_length": 75.140625, |
| "epoch": 0.07534584980237154, |
| "grad_norm": 4.073041421040428, |
| "kl": 0.06884765625, |
| "learning_rate": 9.24901185770751e-07, |
| "loss": 0.0028, |
| "reward": 2.7703256607055664, |
| "reward_std": 0.06903526932001114, |
| "rewards/accuracy_reward_stage2": 0.7703255414962769, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 305 |
| }, |
| { |
| "completion_length": 86.875, |
| "epoch": 0.07559288537549406, |
| "grad_norm": 4.9587612243478905, |
| "kl": 0.0986328125, |
| "learning_rate": 9.246541501976284e-07, |
| "loss": 0.0039, |
| "reward": 2.566884994506836, |
| "reward_std": 0.11869431287050247, |
| "rewards/accuracy_reward_stage2": 0.5825099945068359, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.25, |
| "step": 306 |
| }, |
| { |
| "completion_length": 78.734375, |
| "epoch": 0.07583992094861661, |
| "grad_norm": 5.010353844664205, |
| "kl": 0.09423828125, |
| "learning_rate": 9.244071146245059e-07, |
| "loss": 0.0038, |
| "reward": 2.725599527359009, |
| "reward_std": 0.09302526712417603, |
| "rewards/accuracy_reward_stage2": 0.7255995273590088, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 307 |
| }, |
| { |
| "completion_length": 69.375, |
| "epoch": 0.07608695652173914, |
| "grad_norm": 2.452183271447089, |
| "kl": 0.09228515625, |
| "learning_rate": 9.241600790513834e-07, |
| "loss": 0.0037, |
| "reward": 2.8343749046325684, |
| "reward_std": 0.07092030346393585, |
| "rewards/accuracy_reward_stage2": 0.8343750238418579, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 308 |
| }, |
| { |
| "completion_length": 72.84375, |
| "epoch": 0.07633399209486166, |
| "grad_norm": 6.377563585569854, |
| "kl": 0.1240234375, |
| "learning_rate": 9.239130434782608e-07, |
| "loss": 0.005, |
| "reward": 2.4583377838134766, |
| "reward_std": 0.15416155755519867, |
| "rewards/accuracy_reward_stage2": 0.5260462760925293, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.28125, |
| "step": 309 |
| }, |
| { |
| "completion_length": 63.6875, |
| "epoch": 0.07658102766798419, |
| "grad_norm": 3.40096591722913, |
| "kl": 0.1025390625, |
| "learning_rate": 9.236660079051382e-07, |
| "loss": 0.0041, |
| "reward": 2.791654348373413, |
| "reward_std": 0.04698639735579491, |
| "rewards/accuracy_reward_stage2": 0.7916543483734131, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 310 |
| }, |
| { |
| "completion_length": 69.328125, |
| "epoch": 0.07682806324110672, |
| "grad_norm": 2.5157220101442923, |
| "kl": 0.1376953125, |
| "learning_rate": 9.234189723320159e-07, |
| "loss": 0.0055, |
| "reward": 2.621166706085205, |
| "reward_std": 0.01371072232723236, |
| "rewards/accuracy_reward_stage2": 0.6211665868759155, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 311 |
| }, |
| { |
| "completion_length": 73.40625, |
| "epoch": 0.07707509881422925, |
| "grad_norm": 4.6095300665340195, |
| "kl": 0.1044921875, |
| "learning_rate": 9.231719367588933e-07, |
| "loss": 0.0042, |
| "reward": 2.7523388862609863, |
| "reward_std": 0.14407417178153992, |
| "rewards/accuracy_reward_stage2": 0.7679637670516968, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.140625, |
| "step": 312 |
| }, |
| { |
| "completion_length": 66.40625, |
| "epoch": 0.07732213438735178, |
| "grad_norm": 1.9319584273510186, |
| "kl": 0.11181640625, |
| "learning_rate": 9.229249011857707e-07, |
| "loss": 0.0045, |
| "reward": 2.7965087890625, |
| "reward_std": 0.010140997357666492, |
| "rewards/accuracy_reward_stage2": 0.7965086698532104, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 313 |
| }, |
| { |
| "completion_length": 63.0, |
| "epoch": 0.0775691699604743, |
| "grad_norm": 4.348923290801355, |
| "kl": 0.12158203125, |
| "learning_rate": 9.226778656126482e-07, |
| "loss": 0.0049, |
| "reward": 2.4945788383483887, |
| "reward_std": 0.027087727561593056, |
| "rewards/accuracy_reward_stage2": 0.49457883834838867, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 314 |
| }, |
| { |
| "completion_length": 73.53125, |
| "epoch": 0.07781620553359683, |
| "grad_norm": 4.488593163355316, |
| "kl": 0.1298828125, |
| "learning_rate": 9.224308300395256e-07, |
| "loss": 0.0052, |
| "reward": 2.7206943035125732, |
| "reward_std": 0.11720685660839081, |
| "rewards/accuracy_reward_stage2": 0.7206943035125732, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 315 |
| }, |
| { |
| "completion_length": 70.625, |
| "epoch": 0.07806324110671936, |
| "grad_norm": 6.557602468987205, |
| "kl": 0.119140625, |
| "learning_rate": 9.221837944664031e-07, |
| "loss": 0.0048, |
| "reward": 2.5555434226989746, |
| "reward_std": 0.14352944493293762, |
| "rewards/accuracy_reward_stage2": 0.5555435419082642, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 316 |
| }, |
| { |
| "completion_length": 91.765625, |
| "epoch": 0.0783102766798419, |
| "grad_norm": 3.0896623958939915, |
| "kl": 0.07666015625, |
| "learning_rate": 9.219367588932806e-07, |
| "loss": 0.0031, |
| "reward": 2.527045726776123, |
| "reward_std": 0.10674090683460236, |
| "rewards/accuracy_reward_stage2": 0.5473582744598389, |
| "rewards/format_reward_all_stage": 1.9796874523162842, |
| "scores/refine_times": 1.28125, |
| "step": 317 |
| }, |
| { |
| "completion_length": 65.8125, |
| "epoch": 0.07855731225296443, |
| "grad_norm": 4.73674356473737, |
| "kl": 0.125, |
| "learning_rate": 9.21689723320158e-07, |
| "loss": 0.005, |
| "reward": 2.770782947540283, |
| "reward_std": 0.07048628479242325, |
| "rewards/accuracy_reward_stage2": 0.7707828879356384, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 318 |
| }, |
| { |
| "completion_length": 78.328125, |
| "epoch": 0.07880434782608696, |
| "grad_norm": 4.422531982012408, |
| "kl": 0.12158203125, |
| "learning_rate": 9.214426877470354e-07, |
| "loss": 0.0049, |
| "reward": 2.5821871757507324, |
| "reward_std": 0.0276879221200943, |
| "rewards/accuracy_reward_stage2": 0.5821871161460876, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 319 |
| }, |
| { |
| "completion_length": 59.6875, |
| "epoch": 0.07905138339920949, |
| "grad_norm": 6.29720713319933, |
| "kl": 0.111328125, |
| "learning_rate": 9.211956521739131e-07, |
| "loss": 0.0045, |
| "reward": 2.796858310699463, |
| "reward_std": 0.11641418933868408, |
| "rewards/accuracy_reward_stage2": 0.7968584299087524, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 320 |
| }, |
| { |
| "completion_length": 62.125, |
| "epoch": 0.07929841897233202, |
| "grad_norm": 6.247716420446892, |
| "kl": 0.1015625, |
| "learning_rate": 9.209486166007905e-07, |
| "loss": 0.0041, |
| "reward": 2.609973430633545, |
| "reward_std": 0.14433087408542633, |
| "rewards/accuracy_reward_stage2": 0.6099736094474792, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 321 |
| }, |
| { |
| "completion_length": 44.1875, |
| "epoch": 0.07954545454545454, |
| "grad_norm": 2.1669285375452203, |
| "kl": 0.1279296875, |
| "learning_rate": 9.20701581027668e-07, |
| "loss": 0.0051, |
| "reward": 2.913513422012329, |
| "reward_std": 0.0015782499685883522, |
| "rewards/accuracy_reward_stage2": 0.9135133624076843, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 322 |
| }, |
| { |
| "completion_length": 54.5625, |
| "epoch": 0.07979249011857707, |
| "grad_norm": 5.141224866590071, |
| "kl": 0.12158203125, |
| "learning_rate": 9.204545454545454e-07, |
| "loss": 0.0049, |
| "reward": 2.738158702850342, |
| "reward_std": 0.031038541346788406, |
| "rewards/accuracy_reward_stage2": 0.7381587028503418, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 323 |
| }, |
| { |
| "completion_length": 60.625, |
| "epoch": 0.0800395256916996, |
| "grad_norm": 7.40413453314316, |
| "kl": 0.1298828125, |
| "learning_rate": 9.202075098814229e-07, |
| "loss": 0.0052, |
| "reward": 2.398099660873413, |
| "reward_std": 0.13175323605537415, |
| "rewards/accuracy_reward_stage2": 0.39809975028038025, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 324 |
| }, |
| { |
| "completion_length": 54.5625, |
| "epoch": 0.08028656126482213, |
| "grad_norm": 4.312175786875359, |
| "kl": 0.1376953125, |
| "learning_rate": 9.199604743083004e-07, |
| "loss": 0.0055, |
| "reward": 2.7443251609802246, |
| "reward_std": 0.012218557298183441, |
| "rewards/accuracy_reward_stage2": 0.7443252801895142, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 325 |
| }, |
| { |
| "completion_length": 39.75, |
| "epoch": 0.08053359683794467, |
| "grad_norm": 2.5230177648811254, |
| "kl": 0.22265625, |
| "learning_rate": 9.197134387351778e-07, |
| "loss": 0.0089, |
| "reward": 2.817213773727417, |
| "reward_std": 0.0013947999104857445, |
| "rewards/accuracy_reward_stage2": 0.817213773727417, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 326 |
| }, |
| { |
| "completion_length": 43.5625, |
| "epoch": 0.0807806324110672, |
| "grad_norm": 0.6751401066284123, |
| "kl": 0.19140625, |
| "learning_rate": 9.194664031620552e-07, |
| "loss": 0.0077, |
| "reward": 2.948148250579834, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.9481481313705444, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 327 |
| }, |
| { |
| "completion_length": 45.9375, |
| "epoch": 0.08102766798418973, |
| "grad_norm": 5.102879547791845, |
| "kl": 0.2001953125, |
| "learning_rate": 9.192193675889328e-07, |
| "loss": 0.008, |
| "reward": 2.7917566299438477, |
| "reward_std": 0.06986243277788162, |
| "rewards/accuracy_reward_stage2": 0.7917565107345581, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 328 |
| }, |
| { |
| "completion_length": 55.828125, |
| "epoch": 0.08127470355731226, |
| "grad_norm": 5.398203863071762, |
| "kl": 0.251953125, |
| "learning_rate": 9.189723320158103e-07, |
| "loss": 0.0101, |
| "reward": 2.423312187194824, |
| "reward_std": 0.2542467713356018, |
| "rewards/accuracy_reward_stage2": 0.5483123064041138, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.328125, |
| "step": 329 |
| }, |
| { |
| "completion_length": 68.328125, |
| "epoch": 0.08152173913043478, |
| "grad_norm": 3.744099222638442, |
| "kl": 0.1376953125, |
| "learning_rate": 9.187252964426877e-07, |
| "loss": 0.0055, |
| "reward": 2.623687744140625, |
| "reward_std": 0.061280906200408936, |
| "rewards/accuracy_reward_stage2": 0.6236876249313354, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 330 |
| }, |
| { |
| "completion_length": 61.140625, |
| "epoch": 0.08176877470355731, |
| "grad_norm": 4.174389727633853, |
| "kl": 0.1484375, |
| "learning_rate": 9.184782608695652e-07, |
| "loss": 0.0059, |
| "reward": 2.789412021636963, |
| "reward_std": 0.018945466727018356, |
| "rewards/accuracy_reward_stage2": 0.7894119620323181, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 331 |
| }, |
| { |
| "completion_length": 60.0, |
| "epoch": 0.08201581027667984, |
| "grad_norm": 5.617047129327734, |
| "kl": 0.130859375, |
| "learning_rate": 9.182312252964426e-07, |
| "loss": 0.0052, |
| "reward": 2.493664503097534, |
| "reward_std": 0.08623480051755905, |
| "rewards/accuracy_reward_stage2": 0.5092895030975342, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.140625, |
| "step": 332 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 0.08226284584980237, |
| "grad_norm": 4.192545514357886, |
| "kl": 0.15625, |
| "learning_rate": 9.179841897233202e-07, |
| "loss": 0.0063, |
| "reward": 2.833590030670166, |
| "reward_std": 0.03723323345184326, |
| "rewards/accuracy_reward_stage2": 0.8335901498794556, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 333 |
| }, |
| { |
| "completion_length": 70.625, |
| "epoch": 0.0825098814229249, |
| "grad_norm": 4.124911374118259, |
| "kl": 0.134765625, |
| "learning_rate": 9.177371541501976e-07, |
| "loss": 0.0054, |
| "reward": 2.8245418071746826, |
| "reward_std": 0.07523417472839355, |
| "rewards/accuracy_reward_stage2": 0.8245418071746826, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 334 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 0.08275691699604742, |
| "grad_norm": 4.044678481099241, |
| "kl": 0.150390625, |
| "learning_rate": 9.17490118577075e-07, |
| "loss": 0.006, |
| "reward": 2.784482955932617, |
| "reward_std": 0.0903862714767456, |
| "rewards/accuracy_reward_stage2": 0.7844830751419067, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 335 |
| }, |
| { |
| "completion_length": 64.4375, |
| "epoch": 0.08300395256916997, |
| "grad_norm": 4.32490069299913, |
| "kl": 0.12451171875, |
| "learning_rate": 9.172430830039525e-07, |
| "loss": 0.005, |
| "reward": 2.792992115020752, |
| "reward_std": 0.07582361996173859, |
| "rewards/accuracy_reward_stage2": 0.792992115020752, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 336 |
| }, |
| { |
| "completion_length": 68.875, |
| "epoch": 0.0832509881422925, |
| "grad_norm": 3.926256955205469, |
| "kl": 0.1064453125, |
| "learning_rate": 9.1699604743083e-07, |
| "loss": 0.0043, |
| "reward": 2.8083438873291016, |
| "reward_std": 0.11306163668632507, |
| "rewards/accuracy_reward_stage2": 0.8083438873291016, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 337 |
| }, |
| { |
| "completion_length": 70.75, |
| "epoch": 0.08349802371541502, |
| "grad_norm": 3.9512075265777455, |
| "kl": 0.1181640625, |
| "learning_rate": 9.167490118577074e-07, |
| "loss": 0.0047, |
| "reward": 2.7581210136413574, |
| "reward_std": 0.07616296410560608, |
| "rewards/accuracy_reward_stage2": 0.7581211924552917, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 338 |
| }, |
| { |
| "completion_length": 75.375, |
| "epoch": 0.08374505928853755, |
| "grad_norm": 4.658203187374032, |
| "kl": 0.115234375, |
| "learning_rate": 9.16501976284585e-07, |
| "loss": 0.0046, |
| "reward": 2.5720980167388916, |
| "reward_std": 0.10662204772233963, |
| "rewards/accuracy_reward_stage2": 0.5720980167388916, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 339 |
| }, |
| { |
| "completion_length": 80.34375, |
| "epoch": 0.08399209486166008, |
| "grad_norm": 4.101756140968318, |
| "kl": 0.11328125, |
| "learning_rate": 9.162549407114624e-07, |
| "loss": 0.0045, |
| "reward": 2.7045812606811523, |
| "reward_std": 0.0755821019411087, |
| "rewards/accuracy_reward_stage2": 0.7045812606811523, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 340 |
| }, |
| { |
| "completion_length": 66.0, |
| "epoch": 0.08423913043478261, |
| "grad_norm": 3.7647913238998947, |
| "kl": 0.123046875, |
| "learning_rate": 9.160079051383399e-07, |
| "loss": 0.0049, |
| "reward": 2.548119068145752, |
| "reward_std": 0.07794924080371857, |
| "rewards/accuracy_reward_stage2": 0.5481189489364624, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 341 |
| }, |
| { |
| "completion_length": 87.84375, |
| "epoch": 0.08448616600790514, |
| "grad_norm": 3.5158554366688306, |
| "kl": 0.10107421875, |
| "learning_rate": 9.157608695652174e-07, |
| "loss": 0.004, |
| "reward": 2.307880163192749, |
| "reward_std": 0.0058220368809998035, |
| "rewards/accuracy_reward_stage2": 0.37038010358810425, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 342 |
| }, |
| { |
| "completion_length": 68.9375, |
| "epoch": 0.08473320158102766, |
| "grad_norm": 4.574586844445791, |
| "kl": 0.1171875, |
| "learning_rate": 9.155138339920948e-07, |
| "loss": 0.0047, |
| "reward": 2.369117259979248, |
| "reward_std": 0.0667356476187706, |
| "rewards/accuracy_reward_stage2": 0.36911740899086, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 343 |
| }, |
| { |
| "completion_length": 72.625, |
| "epoch": 0.08498023715415019, |
| "grad_norm": 2.595510068327524, |
| "kl": 0.0947265625, |
| "learning_rate": 9.152667984189722e-07, |
| "loss": 0.0038, |
| "reward": 2.610255718231201, |
| "reward_std": 0.04115435481071472, |
| "rewards/accuracy_reward_stage2": 0.6102556586265564, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 344 |
| }, |
| { |
| "completion_length": 86.5, |
| "epoch": 0.08522727272727272, |
| "grad_norm": 3.6823970485562, |
| "kl": 0.08740234375, |
| "learning_rate": 9.150197628458498e-07, |
| "loss": 0.0035, |
| "reward": 2.6106173992156982, |
| "reward_std": 0.13690951466560364, |
| "rewards/accuracy_reward_stage2": 0.7356172800064087, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.078125, |
| "step": 345 |
| }, |
| { |
| "completion_length": 83.0625, |
| "epoch": 0.08547430830039526, |
| "grad_norm": 3.550714248285621, |
| "kl": 0.1015625, |
| "learning_rate": 9.147727272727272e-07, |
| "loss": 0.0041, |
| "reward": 2.5705089569091797, |
| "reward_std": 0.020351896062493324, |
| "rewards/accuracy_reward_stage2": 0.5705088973045349, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 346 |
| }, |
| { |
| "completion_length": 80.9375, |
| "epoch": 0.08572134387351779, |
| "grad_norm": 6.048964304026029, |
| "kl": 0.10107421875, |
| "learning_rate": 9.145256916996046e-07, |
| "loss": 0.004, |
| "reward": 2.619798183441162, |
| "reward_std": 0.1754463016986847, |
| "rewards/accuracy_reward_stage2": 0.6197980642318726, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 347 |
| }, |
| { |
| "completion_length": 97.828125, |
| "epoch": 0.08596837944664032, |
| "grad_norm": 3.753761799406494, |
| "kl": 0.07763671875, |
| "learning_rate": 9.142786561264822e-07, |
| "loss": 0.0031, |
| "reward": 2.7262320518493652, |
| "reward_std": 0.10837259888648987, |
| "rewards/accuracy_reward_stage2": 0.7324820160865784, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.140625, |
| "step": 348 |
| }, |
| { |
| "completion_length": 72.71875, |
| "epoch": 0.08621541501976285, |
| "grad_norm": 4.3265197363111385, |
| "kl": 0.109375, |
| "learning_rate": 9.140316205533597e-07, |
| "loss": 0.0044, |
| "reward": 2.7120370864868164, |
| "reward_std": 0.12224727869033813, |
| "rewards/accuracy_reward_stage2": 0.7120370864868164, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 349 |
| }, |
| { |
| "completion_length": 74.1875, |
| "epoch": 0.08646245059288538, |
| "grad_norm": 2.1002722035451935, |
| "kl": 0.09912109375, |
| "learning_rate": 9.137845849802372e-07, |
| "loss": 0.004, |
| "reward": 2.612084150314331, |
| "reward_std": 0.00017338224279228598, |
| "rewards/accuracy_reward_stage2": 0.6120842099189758, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 350 |
| }, |
| { |
| "completion_length": 90.875, |
| "epoch": 0.0867094861660079, |
| "grad_norm": 2.865518683426728, |
| "kl": 0.06591796875, |
| "learning_rate": 9.135375494071146e-07, |
| "loss": 0.0026, |
| "reward": 2.7866175174713135, |
| "reward_std": 0.0579838864505291, |
| "rewards/accuracy_reward_stage2": 0.7866175174713135, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 351 |
| }, |
| { |
| "completion_length": 98.328125, |
| "epoch": 0.08695652173913043, |
| "grad_norm": 4.334486790649066, |
| "kl": 0.0830078125, |
| "learning_rate": 9.13290513833992e-07, |
| "loss": 0.0033, |
| "reward": 2.560479164123535, |
| "reward_std": 0.028959453105926514, |
| "rewards/accuracy_reward_stage2": 0.5604792833328247, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 352 |
| }, |
| { |
| "completion_length": 90.0, |
| "epoch": 0.08720355731225296, |
| "grad_norm": 2.8099941143702236, |
| "kl": 0.0654296875, |
| "learning_rate": 9.130434782608695e-07, |
| "loss": 0.0026, |
| "reward": 2.7252159118652344, |
| "reward_std": 0.03723974525928497, |
| "rewards/accuracy_reward_stage2": 0.7252160310745239, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 353 |
| }, |
| { |
| "completion_length": 95.09375, |
| "epoch": 0.08745059288537549, |
| "grad_norm": 3.8428516026086887, |
| "kl": 0.08544921875, |
| "learning_rate": 9.12796442687747e-07, |
| "loss": 0.0034, |
| "reward": 2.444223642349243, |
| "reward_std": 0.01821870729327202, |
| "rewards/accuracy_reward_stage2": 0.4442237615585327, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 354 |
| }, |
| { |
| "completion_length": 101.8125, |
| "epoch": 0.08769762845849802, |
| "grad_norm": 4.237657395100437, |
| "kl": 0.08740234375, |
| "learning_rate": 9.125494071146244e-07, |
| "loss": 0.0035, |
| "reward": 2.5968141555786133, |
| "reward_std": 0.09012407064437866, |
| "rewards/accuracy_reward_stage2": 0.5968142151832581, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 355 |
| }, |
| { |
| "completion_length": 92.4375, |
| "epoch": 0.08794466403162056, |
| "grad_norm": 3.600620223298913, |
| "kl": 0.07421875, |
| "learning_rate": 9.123023715415019e-07, |
| "loss": 0.003, |
| "reward": 2.542025327682495, |
| "reward_std": 0.05679365620017052, |
| "rewards/accuracy_reward_stage2": 0.5420252680778503, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 356 |
| }, |
| { |
| "completion_length": 93.0, |
| "epoch": 0.08819169960474309, |
| "grad_norm": 4.863920897927563, |
| "kl": 0.08740234375, |
| "learning_rate": 9.120553359683793e-07, |
| "loss": 0.0035, |
| "reward": 2.5952115058898926, |
| "reward_std": 0.0393090695142746, |
| "rewards/accuracy_reward_stage2": 0.5952116250991821, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 357 |
| }, |
| { |
| "completion_length": 96.109375, |
| "epoch": 0.08843873517786562, |
| "grad_norm": 1.7790679985594122, |
| "kl": 0.10888671875, |
| "learning_rate": 9.11808300395257e-07, |
| "loss": 0.0044, |
| "reward": 2.6337594985961914, |
| "reward_std": 0.085614413022995, |
| "rewards/accuracy_reward_stage2": 0.6962594985961914, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 358 |
| }, |
| { |
| "completion_length": 83.8125, |
| "epoch": 0.08868577075098814, |
| "grad_norm": 3.551887755670501, |
| "kl": 0.0615234375, |
| "learning_rate": 9.115612648221344e-07, |
| "loss": 0.0025, |
| "reward": 2.706885814666748, |
| "reward_std": 0.08412647247314453, |
| "rewards/accuracy_reward_stage2": 0.7068856954574585, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 359 |
| }, |
| { |
| "completion_length": 85.875, |
| "epoch": 0.08893280632411067, |
| "grad_norm": 3.9494113031278744, |
| "kl": 0.1015625, |
| "learning_rate": 9.113142292490118e-07, |
| "loss": 0.0041, |
| "reward": 2.6664719581604004, |
| "reward_std": 0.028600279241800308, |
| "rewards/accuracy_reward_stage2": 0.6664718389511108, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 360 |
| }, |
| { |
| "completion_length": 82.625, |
| "epoch": 0.0891798418972332, |
| "grad_norm": 3.650417991791406, |
| "kl": 0.12890625, |
| "learning_rate": 9.110671936758893e-07, |
| "loss": 0.0052, |
| "reward": 2.6980035305023193, |
| "reward_std": 0.020932497456669807, |
| "rewards/accuracy_reward_stage2": 0.6980035305023193, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 361 |
| }, |
| { |
| "completion_length": 90.6875, |
| "epoch": 0.08942687747035573, |
| "grad_norm": 1.8849199154834935, |
| "kl": 0.06298828125, |
| "learning_rate": 9.108201581027668e-07, |
| "loss": 0.0025, |
| "reward": 2.5484163761138916, |
| "reward_std": 0.0038587902672588825, |
| "rewards/accuracy_reward_stage2": 0.5484163165092468, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 362 |
| }, |
| { |
| "completion_length": 72.0625, |
| "epoch": 0.08967391304347826, |
| "grad_norm": 3.494331373222489, |
| "kl": 0.08740234375, |
| "learning_rate": 9.105731225296442e-07, |
| "loss": 0.0035, |
| "reward": 2.810746431350708, |
| "reward_std": 0.07636210322380066, |
| "rewards/accuracy_reward_stage2": 0.810746431350708, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 363 |
| }, |
| { |
| "completion_length": 94.203125, |
| "epoch": 0.08992094861660078, |
| "grad_norm": 1.7491121800723743, |
| "kl": 0.07958984375, |
| "learning_rate": 9.103260869565217e-07, |
| "loss": 0.0032, |
| "reward": 2.7079660892486572, |
| "reward_std": 0.0014026534045115113, |
| "rewards/accuracy_reward_stage2": 0.707966148853302, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 364 |
| }, |
| { |
| "completion_length": 94.5625, |
| "epoch": 0.09016798418972333, |
| "grad_norm": 5.039313710437697, |
| "kl": 0.07275390625, |
| "learning_rate": 9.100790513833991e-07, |
| "loss": 0.0029, |
| "reward": 2.6045689582824707, |
| "reward_std": 0.06207848712801933, |
| "rewards/accuracy_reward_stage2": 0.6045687794685364, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 365 |
| }, |
| { |
| "completion_length": 78.5625, |
| "epoch": 0.09041501976284586, |
| "grad_norm": 5.0930488398379685, |
| "kl": 0.0927734375, |
| "learning_rate": 9.098320158102767e-07, |
| "loss": 0.0037, |
| "reward": 2.645688533782959, |
| "reward_std": 0.06729687005281448, |
| "rewards/accuracy_reward_stage2": 0.645688533782959, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 366 |
| }, |
| { |
| "completion_length": 80.125, |
| "epoch": 0.09066205533596838, |
| "grad_norm": 3.9786634024319487, |
| "kl": 0.0615234375, |
| "learning_rate": 9.095849802371542e-07, |
| "loss": 0.0025, |
| "reward": 2.5161356925964355, |
| "reward_std": 0.027481183409690857, |
| "rewards/accuracy_reward_stage2": 0.5161359310150146, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 367 |
| }, |
| { |
| "completion_length": 75.0, |
| "epoch": 0.09090909090909091, |
| "grad_norm": 3.523685489400653, |
| "kl": 0.052734375, |
| "learning_rate": 9.093379446640316e-07, |
| "loss": 0.0021, |
| "reward": 2.816563129425049, |
| "reward_std": 0.009983880445361137, |
| "rewards/accuracy_reward_stage2": 0.8165630102157593, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 368 |
| }, |
| { |
| "completion_length": 92.3125, |
| "epoch": 0.09115612648221344, |
| "grad_norm": 4.234417498920322, |
| "kl": 0.06640625, |
| "learning_rate": 9.09090909090909e-07, |
| "loss": 0.0026, |
| "reward": 2.7531919479370117, |
| "reward_std": 0.15805090963840485, |
| "rewards/accuracy_reward_stage2": 0.7531920671463013, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 369 |
| }, |
| { |
| "completion_length": 83.0625, |
| "epoch": 0.09140316205533597, |
| "grad_norm": 3.9114706673580932, |
| "kl": 0.09033203125, |
| "learning_rate": 9.088438735177866e-07, |
| "loss": 0.0036, |
| "reward": 2.5998778343200684, |
| "reward_std": 0.025657862424850464, |
| "rewards/accuracy_reward_stage2": 0.5998777747154236, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 370 |
| }, |
| { |
| "completion_length": 89.359375, |
| "epoch": 0.0916501976284585, |
| "grad_norm": 4.838298835178836, |
| "kl": 0.08203125, |
| "learning_rate": 9.08596837944664e-07, |
| "loss": 0.0033, |
| "reward": 2.607950448989868, |
| "reward_std": 0.05628318339586258, |
| "rewards/accuracy_reward_stage2": 0.6079504489898682, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 371 |
| }, |
| { |
| "completion_length": 82.28125, |
| "epoch": 0.09189723320158102, |
| "grad_norm": 3.7446689639370048, |
| "kl": 0.06982421875, |
| "learning_rate": 9.083498023715414e-07, |
| "loss": 0.0028, |
| "reward": 2.763612747192383, |
| "reward_std": 0.06537837535142899, |
| "rewards/accuracy_reward_stage2": 0.7636126279830933, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 372 |
| }, |
| { |
| "completion_length": 75.6875, |
| "epoch": 0.09214426877470355, |
| "grad_norm": 5.0039114879396855, |
| "kl": 0.07763671875, |
| "learning_rate": 9.081027667984189e-07, |
| "loss": 0.0031, |
| "reward": 2.7143735885620117, |
| "reward_std": 0.11156806349754333, |
| "rewards/accuracy_reward_stage2": 0.7143735885620117, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 373 |
| }, |
| { |
| "completion_length": 62.0, |
| "epoch": 0.09239130434782608, |
| "grad_norm": 2.1401690709477665, |
| "kl": 0.07421875, |
| "learning_rate": 9.078557312252963e-07, |
| "loss": 0.003, |
| "reward": 2.7579450607299805, |
| "reward_std": 0.0014047721633687615, |
| "rewards/accuracy_reward_stage2": 0.75794517993927, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 374 |
| }, |
| { |
| "completion_length": 74.296875, |
| "epoch": 0.09263833992094862, |
| "grad_norm": 3.17288826560754, |
| "kl": 0.07861328125, |
| "learning_rate": 9.076086956521739e-07, |
| "loss": 0.0031, |
| "reward": 2.858553171157837, |
| "reward_std": 0.07798619568347931, |
| "rewards/accuracy_reward_stage2": 0.8585531711578369, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 375 |
| }, |
| { |
| "completion_length": 83.046875, |
| "epoch": 0.09288537549407115, |
| "grad_norm": 4.880063981090028, |
| "kl": 0.0859375, |
| "learning_rate": 9.073616600790514e-07, |
| "loss": 0.0034, |
| "reward": 2.54172420501709, |
| "reward_std": 0.07892563194036484, |
| "rewards/accuracy_reward_stage2": 0.541724443435669, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 376 |
| }, |
| { |
| "completion_length": 76.625, |
| "epoch": 0.09313241106719368, |
| "grad_norm": 3.6385142711792717, |
| "kl": 0.0732421875, |
| "learning_rate": 9.071146245059288e-07, |
| "loss": 0.0029, |
| "reward": 2.5843570232391357, |
| "reward_std": 0.09440311044454575, |
| "rewards/accuracy_reward_stage2": 0.5843569040298462, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 377 |
| }, |
| { |
| "completion_length": 82.3125, |
| "epoch": 0.09337944664031621, |
| "grad_norm": 4.763527255160224, |
| "kl": 0.06787109375, |
| "learning_rate": 9.068675889328063e-07, |
| "loss": 0.0027, |
| "reward": 2.674133062362671, |
| "reward_std": 0.14093773066997528, |
| "rewards/accuracy_reward_stage2": 0.7366331219673157, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 378 |
| }, |
| { |
| "completion_length": 78.140625, |
| "epoch": 0.09362648221343874, |
| "grad_norm": 2.188818094769199, |
| "kl": 0.07763671875, |
| "learning_rate": 9.066205533596838e-07, |
| "loss": 0.0031, |
| "reward": 2.685183525085449, |
| "reward_std": 0.11984831094741821, |
| "rewards/accuracy_reward_stage2": 0.7039335370063782, |
| "rewards/format_reward_all_stage": 1.9812500476837158, |
| "scores/refine_times": 1.109375, |
| "step": 379 |
| }, |
| { |
| "completion_length": 89.515625, |
| "epoch": 0.09387351778656126, |
| "grad_norm": 2.065491538033126, |
| "kl": 0.1748046875, |
| "learning_rate": 9.063735177865612e-07, |
| "loss": 0.007, |
| "reward": 2.7357075214385986, |
| "reward_std": 0.016317401081323624, |
| "rewards/accuracy_reward_stage2": 0.7357075214385986, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 380 |
| }, |
| { |
| "completion_length": 71.890625, |
| "epoch": 0.09412055335968379, |
| "grad_norm": 5.304636175879363, |
| "kl": 0.08203125, |
| "learning_rate": 9.061264822134387e-07, |
| "loss": 0.0033, |
| "reward": 2.747291088104248, |
| "reward_std": 0.08369327336549759, |
| "rewards/accuracy_reward_stage2": 0.7629162073135376, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.0625, |
| "step": 381 |
| }, |
| { |
| "completion_length": 81.0, |
| "epoch": 0.09436758893280632, |
| "grad_norm": 3.2109084867310376, |
| "kl": 0.10205078125, |
| "learning_rate": 9.058794466403161e-07, |
| "loss": 0.0041, |
| "reward": 2.59745454788208, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward_stage2": 0.5974544286727905, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 382 |
| }, |
| { |
| "completion_length": 80.265625, |
| "epoch": 0.09461462450592885, |
| "grad_norm": 2.77987908921126, |
| "kl": 0.07421875, |
| "learning_rate": 9.056324110671936e-07, |
| "loss": 0.003, |
| "reward": 2.638756513595581, |
| "reward_std": 0.03335772827267647, |
| "rewards/accuracy_reward_stage2": 0.638756513595581, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 383 |
| }, |
| { |
| "completion_length": 91.359375, |
| "epoch": 0.09486166007905138, |
| "grad_norm": 4.517273905721912, |
| "kl": 0.10205078125, |
| "learning_rate": 9.053853754940711e-07, |
| "loss": 0.0041, |
| "reward": 2.5370707511901855, |
| "reward_std": 0.04199734330177307, |
| "rewards/accuracy_reward_stage2": 0.537070631980896, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 384 |
| }, |
| { |
| "completion_length": 92.0625, |
| "epoch": 0.09510869565217392, |
| "grad_norm": 4.582087540119265, |
| "kl": 0.058349609375, |
| "learning_rate": 9.051383399209486e-07, |
| "loss": 0.0023, |
| "reward": 2.4340434074401855, |
| "reward_std": 0.17110881209373474, |
| "rewards/accuracy_reward_stage2": 0.4340435266494751, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 385 |
| }, |
| { |
| "completion_length": 87.546875, |
| "epoch": 0.09535573122529645, |
| "grad_norm": 3.8514679024728227, |
| "kl": 0.07861328125, |
| "learning_rate": 9.04891304347826e-07, |
| "loss": 0.0032, |
| "reward": 2.805450677871704, |
| "reward_std": 0.08290667831897736, |
| "rewards/accuracy_reward_stage2": 0.8054506778717041, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 386 |
| }, |
| { |
| "completion_length": 111.59375, |
| "epoch": 0.09560276679841898, |
| "grad_norm": 2.995041280410516, |
| "kl": 0.076171875, |
| "learning_rate": 9.046442687747036e-07, |
| "loss": 0.003, |
| "reward": 2.571000099182129, |
| "reward_std": 0.1235700324177742, |
| "rewards/accuracy_reward_stage2": 0.6475626230239868, |
| "rewards/format_reward_all_stage": 1.923437476158142, |
| "scores/refine_times": 1.484375, |
| "step": 387 |
| }, |
| { |
| "completion_length": 93.390625, |
| "epoch": 0.0958498023715415, |
| "grad_norm": 3.1143672609105697, |
| "kl": 0.060791015625, |
| "learning_rate": 9.04397233201581e-07, |
| "loss": 0.0024, |
| "reward": 2.7106356620788574, |
| "reward_std": 0.10501326620578766, |
| "rewards/accuracy_reward_stage2": 0.7168859243392944, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.328125, |
| "step": 388 |
| }, |
| { |
| "completion_length": 115.40625, |
| "epoch": 0.09609683794466403, |
| "grad_norm": 3.2606535165707085, |
| "kl": 0.08056640625, |
| "learning_rate": 9.041501976284585e-07, |
| "loss": 0.0032, |
| "reward": 2.5893375873565674, |
| "reward_std": 0.07137158513069153, |
| "rewards/accuracy_reward_stage2": 0.5955876111984253, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.375, |
| "step": 389 |
| }, |
| { |
| "completion_length": 81.25, |
| "epoch": 0.09634387351778656, |
| "grad_norm": 0.7663992960647029, |
| "kl": 0.07421875, |
| "learning_rate": 9.039031620553359e-07, |
| "loss": 0.003, |
| "reward": 2.861607074737549, |
| "reward_std": 0.03788072615861893, |
| "rewards/accuracy_reward_stage2": 0.8616071343421936, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 390 |
| }, |
| { |
| "completion_length": 93.8125, |
| "epoch": 0.09659090909090909, |
| "grad_norm": 3.072101647281855, |
| "kl": 0.060546875, |
| "learning_rate": 9.036561264822133e-07, |
| "loss": 0.0024, |
| "reward": 2.6479721069335938, |
| "reward_std": 0.13277268409729004, |
| "rewards/accuracy_reward_stage2": 0.6479719877243042, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 391 |
| }, |
| { |
| "completion_length": 133.53125, |
| "epoch": 0.09683794466403162, |
| "grad_norm": 1.1412164642463887, |
| "kl": 0.060791015625, |
| "learning_rate": 9.034090909090909e-07, |
| "loss": 0.0024, |
| "reward": 2.759420871734619, |
| "reward_std": 0.021660229191184044, |
| "rewards/accuracy_reward_stage2": 0.7594207525253296, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.6875, |
| "step": 392 |
| }, |
| { |
| "completion_length": 109.796875, |
| "epoch": 0.09708498023715414, |
| "grad_norm": 3.7260651927810224, |
| "kl": 0.0849609375, |
| "learning_rate": 9.031620553359683e-07, |
| "loss": 0.0034, |
| "reward": 2.6653695106506348, |
| "reward_std": 0.1488296538591385, |
| "rewards/accuracy_reward_stage2": 0.6653696298599243, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.4375, |
| "step": 393 |
| }, |
| { |
| "completion_length": 97.5, |
| "epoch": 0.09733201581027667, |
| "grad_norm": 1.9771971561886401, |
| "kl": 0.07666015625, |
| "learning_rate": 9.029150197628458e-07, |
| "loss": 0.0031, |
| "reward": 2.64612078666687, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.6461206674575806, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 394 |
| }, |
| { |
| "completion_length": 136.671875, |
| "epoch": 0.09757905138339922, |
| "grad_norm": 2.9936463926600254, |
| "kl": 0.09716796875, |
| "learning_rate": 9.026679841897233e-07, |
| "loss": 0.0039, |
| "reward": 2.3593060970306396, |
| "reward_std": 0.10838481783866882, |
| "rewards/accuracy_reward_stage2": 0.42805612087249756, |
| "rewards/format_reward_all_stage": 1.931249976158142, |
| "scores/refine_times": 1.5, |
| "step": 395 |
| }, |
| { |
| "completion_length": 100.890625, |
| "epoch": 0.09782608695652174, |
| "grad_norm": 3.25385470248015, |
| "kl": 0.055908203125, |
| "learning_rate": 9.024209486166008e-07, |
| "loss": 0.0022, |
| "reward": 2.2150940895080566, |
| "reward_std": 0.3374716639518738, |
| "rewards/accuracy_reward_stage2": 0.3400941491127014, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.34375, |
| "step": 396 |
| }, |
| { |
| "completion_length": 100.21875, |
| "epoch": 0.09807312252964427, |
| "grad_norm": 2.687250333827823, |
| "kl": 0.0537109375, |
| "learning_rate": 9.021739130434782e-07, |
| "loss": 0.0022, |
| "reward": 2.6029052734375, |
| "reward_std": 0.09498636424541473, |
| "rewards/accuracy_reward_stage2": 0.6107178926467896, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.359375, |
| "step": 397 |
| }, |
| { |
| "completion_length": 118.1875, |
| "epoch": 0.0983201581027668, |
| "grad_norm": 3.207780448405167, |
| "kl": 0.0791015625, |
| "learning_rate": 9.019268774703557e-07, |
| "loss": 0.0032, |
| "reward": 2.7733564376831055, |
| "reward_std": 0.1665625274181366, |
| "rewards/accuracy_reward_stage2": 0.7733563184738159, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.390625, |
| "step": 398 |
| }, |
| { |
| "completion_length": 166.609375, |
| "epoch": 0.09856719367588933, |
| "grad_norm": 3.1116135663801625, |
| "kl": 0.0625, |
| "learning_rate": 9.016798418972331e-07, |
| "loss": 0.0025, |
| "reward": 2.6186110973358154, |
| "reward_std": 0.23827369511127472, |
| "rewards/accuracy_reward_stage2": 0.6482987403869629, |
| "rewards/format_reward_all_stage": 1.970312476158142, |
| "scores/refine_times": 1.953125, |
| "step": 399 |
| }, |
| { |
| "completion_length": 126.015625, |
| "epoch": 0.09881422924901186, |
| "grad_norm": 3.09906490210566, |
| "kl": 0.05322265625, |
| "learning_rate": 9.014328063241107e-07, |
| "loss": 0.0021, |
| "reward": 2.795719861984253, |
| "reward_std": 0.09762432426214218, |
| "rewards/accuracy_reward_stage2": 0.7957199215888977, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.53125, |
| "step": 400 |
| }, |
| { |
| "completion_length": 92.90625, |
| "epoch": 0.09906126482213438, |
| "grad_norm": 2.2514082728709557, |
| "kl": 0.1162109375, |
| "learning_rate": 9.011857707509881e-07, |
| "loss": 0.0046, |
| "reward": 2.7314138412475586, |
| "reward_std": 0.0348123237490654, |
| "rewards/accuracy_reward_stage2": 0.7418302893638611, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.34375, |
| "step": 401 |
| }, |
| { |
| "completion_length": 129.75, |
| "epoch": 0.09930830039525691, |
| "grad_norm": 3.6954546874529988, |
| "kl": 0.05615234375, |
| "learning_rate": 9.009387351778655e-07, |
| "loss": 0.0022, |
| "reward": 2.509495496749878, |
| "reward_std": 0.15696083009243011, |
| "rewards/accuracy_reward_stage2": 0.5157454013824463, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.671875, |
| "step": 402 |
| }, |
| { |
| "completion_length": 146.5625, |
| "epoch": 0.09955533596837944, |
| "grad_norm": 2.579997674658776, |
| "kl": 0.0673828125, |
| "learning_rate": 9.00691699604743e-07, |
| "loss": 0.0027, |
| "reward": 2.930208206176758, |
| "reward_std": 0.08919259905815125, |
| "rewards/accuracy_reward_stage2": 0.9302083253860474, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.640625, |
| "step": 403 |
| }, |
| { |
| "completion_length": 121.75, |
| "epoch": 0.09980237154150198, |
| "grad_norm": 2.1237078180503586, |
| "kl": 0.0556640625, |
| "learning_rate": 9.004446640316206e-07, |
| "loss": 0.0022, |
| "reward": 2.6469950675964355, |
| "reward_std": 0.07097595930099487, |
| "rewards/accuracy_reward_stage2": 0.6469952464103699, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 404 |
| }, |
| { |
| "completion_length": 130.65625, |
| "epoch": 0.10004940711462451, |
| "grad_norm": 2.444254134638145, |
| "kl": 0.0576171875, |
| "learning_rate": 9.00197628458498e-07, |
| "loss": 0.0023, |
| "reward": 2.7591514587402344, |
| "reward_std": 0.07375451177358627, |
| "rewards/accuracy_reward_stage2": 0.7591514587402344, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.625, |
| "step": 405 |
| }, |
| { |
| "completion_length": 120.765625, |
| "epoch": 0.10029644268774704, |
| "grad_norm": 2.1204759761599066, |
| "kl": 0.059814453125, |
| "learning_rate": 8.999505928853755e-07, |
| "loss": 0.0024, |
| "reward": 2.799839973449707, |
| "reward_std": 0.004150245804339647, |
| "rewards/accuracy_reward_stage2": 0.7998400926589966, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 406 |
| }, |
| { |
| "completion_length": 162.5625, |
| "epoch": 0.10054347826086957, |
| "grad_norm": 7.631897735555443, |
| "kl": 0.32421875, |
| "learning_rate": 8.997035573122529e-07, |
| "loss": 0.013, |
| "reward": 2.5346357822418213, |
| "reward_std": 0.32676130533218384, |
| "rewards/accuracy_reward_stage2": 0.6940107941627502, |
| "rewards/format_reward_all_stage": 1.8406249284744263, |
| "scores/refine_times": 1.921875, |
| "step": 407 |
| }, |
| { |
| "completion_length": 146.625, |
| "epoch": 0.1007905138339921, |
| "grad_norm": 3.855569269918276, |
| "kl": 0.048828125, |
| "learning_rate": 8.994565217391304e-07, |
| "loss": 0.002, |
| "reward": 2.6476688385009766, |
| "reward_std": 0.12627330422401428, |
| "rewards/accuracy_reward_stage2": 0.6476688981056213, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.75, |
| "step": 408 |
| }, |
| { |
| "completion_length": 128.796875, |
| "epoch": 0.10103754940711462, |
| "grad_norm": 2.917741896038562, |
| "kl": 0.0595703125, |
| "learning_rate": 8.992094861660079e-07, |
| "loss": 0.0024, |
| "reward": 2.773995876312256, |
| "reward_std": 0.10757410526275635, |
| "rewards/accuracy_reward_stage2": 0.7818082571029663, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.546875, |
| "step": 409 |
| }, |
| { |
| "completion_length": 155.734375, |
| "epoch": 0.10128458498023715, |
| "grad_norm": 2.9672829734154478, |
| "kl": 0.0673828125, |
| "learning_rate": 8.989624505928853e-07, |
| "loss": 0.0027, |
| "reward": 2.5923919677734375, |
| "reward_std": 0.29955703020095825, |
| "rewards/accuracy_reward_stage2": 0.7741629481315613, |
| "rewards/format_reward_all_stage": 1.8182291984558105, |
| "scores/refine_times": 2.0, |
| "step": 410 |
| }, |
| { |
| "completion_length": 193.625, |
| "epoch": 0.10153162055335968, |
| "grad_norm": 3.7254627268683578, |
| "kl": 0.05615234375, |
| "learning_rate": 8.987154150197627e-07, |
| "loss": 0.0022, |
| "reward": 2.3365869522094727, |
| "reward_std": 0.3466811776161194, |
| "rewards/accuracy_reward_stage2": 0.49075353145599365, |
| "rewards/format_reward_all_stage": 1.8458333015441895, |
| "scores/refine_times": 2.046875, |
| "step": 411 |
| }, |
| { |
| "completion_length": 149.046875, |
| "epoch": 0.10177865612648221, |
| "grad_norm": 2.8805192836507953, |
| "kl": 0.05419921875, |
| "learning_rate": 8.984683794466402e-07, |
| "loss": 0.0022, |
| "reward": 2.570408821105957, |
| "reward_std": 0.3919922709465027, |
| "rewards/accuracy_reward_stage2": 0.8464504480361938, |
| "rewards/format_reward_all_stage": 1.7239582538604736, |
| "scores/refine_times": 1.796875, |
| "step": 412 |
| }, |
| { |
| "completion_length": 138.328125, |
| "epoch": 0.10202569169960474, |
| "grad_norm": 2.4678069629785138, |
| "kl": 0.06298828125, |
| "learning_rate": 8.982213438735178e-07, |
| "loss": 0.0025, |
| "reward": 2.5044920444488525, |
| "reward_std": 0.28305938839912415, |
| "rewards/accuracy_reward_stage2": 0.7544921636581421, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.609375, |
| "step": 413 |
| }, |
| { |
| "completion_length": 92.421875, |
| "epoch": 0.10227272727272728, |
| "grad_norm": 3.564931456124642, |
| "kl": 0.06884765625, |
| "learning_rate": 8.979743083003953e-07, |
| "loss": 0.0027, |
| "reward": 2.6201610565185547, |
| "reward_std": 0.19829556345939636, |
| "rewards/accuracy_reward_stage2": 0.7659943103790283, |
| "rewards/format_reward_all_stage": 1.8541667461395264, |
| "scores/refine_times": 1.40625, |
| "step": 414 |
| }, |
| { |
| "completion_length": 140.8125, |
| "epoch": 0.10251976284584981, |
| "grad_norm": 4.105881747857916, |
| "kl": 0.06787109375, |
| "learning_rate": 8.977272727272727e-07, |
| "loss": 0.0027, |
| "reward": 2.1747279167175293, |
| "reward_std": 0.6268551349639893, |
| "rewards/accuracy_reward_stage2": 0.6966027021408081, |
| "rewards/format_reward_all_stage": 1.478124976158142, |
| "scores/refine_times": 1.546875, |
| "step": 415 |
| }, |
| { |
| "completion_length": 130.28125, |
| "epoch": 0.10276679841897234, |
| "grad_norm": 3.5739830819011034, |
| "kl": 0.0703125, |
| "learning_rate": 8.974802371541501e-07, |
| "loss": 0.0028, |
| "reward": 2.330864191055298, |
| "reward_std": 0.39451754093170166, |
| "rewards/accuracy_reward_stage2": 0.7058640718460083, |
| "rewards/format_reward_all_stage": 1.625, |
| "scores/refine_times": 1.421875, |
| "step": 416 |
| }, |
| { |
| "completion_length": 113.203125, |
| "epoch": 0.10301383399209486, |
| "grad_norm": 5.371473470225853, |
| "kl": 0.0927734375, |
| "learning_rate": 8.972332015810277e-07, |
| "loss": 0.0037, |
| "reward": 2.0138349533081055, |
| "reward_std": 0.6088952422142029, |
| "rewards/accuracy_reward_stage2": 0.6700849533081055, |
| "rewards/format_reward_all_stage": 1.34375, |
| "scores/refine_times": 1.296875, |
| "step": 417 |
| }, |
| { |
| "completion_length": 105.09375, |
| "epoch": 0.10326086956521739, |
| "grad_norm": 3.7462892766024387, |
| "kl": 0.06396484375, |
| "learning_rate": 8.969861660079051e-07, |
| "loss": 0.0026, |
| "reward": 2.5230908393859863, |
| "reward_std": 0.21397997438907623, |
| "rewards/accuracy_reward_stage2": 0.6480907797813416, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.375, |
| "step": 418 |
| }, |
| { |
| "completion_length": 91.484375, |
| "epoch": 0.10350790513833992, |
| "grad_norm": 2.5938381710499856, |
| "kl": 0.06640625, |
| "learning_rate": 8.967391304347825e-07, |
| "loss": 0.0026, |
| "reward": 2.7374203205108643, |
| "reward_std": 0.06906712800264359, |
| "rewards/accuracy_reward_stage2": 0.7374203205108643, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 419 |
| }, |
| { |
| "completion_length": 99.859375, |
| "epoch": 0.10375494071146245, |
| "grad_norm": 3.7475747032872726, |
| "kl": 0.0654296875, |
| "learning_rate": 8.9649209486166e-07, |
| "loss": 0.0026, |
| "reward": 2.4589004516601562, |
| "reward_std": 0.19246460497379303, |
| "rewards/accuracy_reward_stage2": 0.5839004516601562, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.234375, |
| "step": 420 |
| }, |
| { |
| "completion_length": 87.734375, |
| "epoch": 0.10400197628458498, |
| "grad_norm": 2.4100627297787027, |
| "kl": 0.0732421875, |
| "learning_rate": 8.962450592885375e-07, |
| "loss": 0.0029, |
| "reward": 2.7743234634399414, |
| "reward_std": 0.08512721955776215, |
| "rewards/accuracy_reward_stage2": 0.7743235230445862, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 421 |
| }, |
| { |
| "completion_length": 94.828125, |
| "epoch": 0.1042490118577075, |
| "grad_norm": 5.060240016474509, |
| "kl": 0.07861328125, |
| "learning_rate": 8.95998023715415e-07, |
| "loss": 0.0031, |
| "reward": 2.6438651084899902, |
| "reward_std": 0.12257831543684006, |
| "rewards/accuracy_reward_stage2": 0.6438649296760559, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 422 |
| }, |
| { |
| "completion_length": 103.109375, |
| "epoch": 0.10449604743083003, |
| "grad_norm": 4.702666849391409, |
| "kl": 0.053466796875, |
| "learning_rate": 8.957509881422925e-07, |
| "loss": 0.0021, |
| "reward": 2.2406249046325684, |
| "reward_std": 0.4183598756790161, |
| "rewards/accuracy_reward_stage2": 0.4906250238418579, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.28125, |
| "step": 423 |
| }, |
| { |
| "completion_length": 106.6875, |
| "epoch": 0.10474308300395258, |
| "grad_norm": 4.171493335376991, |
| "kl": 0.05517578125, |
| "learning_rate": 8.955039525691699e-07, |
| "loss": 0.0022, |
| "reward": 2.5777876377105713, |
| "reward_std": 0.1656649112701416, |
| "rewards/accuracy_reward_stage2": 0.7027875781059265, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.25, |
| "step": 424 |
| }, |
| { |
| "completion_length": 84.828125, |
| "epoch": 0.1049901185770751, |
| "grad_norm": 3.8218395659454005, |
| "kl": 0.09423828125, |
| "learning_rate": 8.952569169960475e-07, |
| "loss": 0.0038, |
| "reward": 2.5052084922790527, |
| "reward_std": 0.2631934583187103, |
| "rewards/accuracy_reward_stage2": 0.6875, |
| "rewards/format_reward_all_stage": 1.8177083730697632, |
| "scores/refine_times": 1.21875, |
| "step": 425 |
| }, |
| { |
| "completion_length": 105.5625, |
| "epoch": 0.10523715415019763, |
| "grad_norm": 3.4660828623228928, |
| "kl": 0.08935546875, |
| "learning_rate": 8.950098814229249e-07, |
| "loss": 0.0036, |
| "reward": 2.3972997665405273, |
| "reward_std": 0.22224318981170654, |
| "rewards/accuracy_reward_stage2": 0.584799587726593, |
| "rewards/format_reward_all_stage": 1.8125, |
| "scores/refine_times": 1.265625, |
| "step": 426 |
| }, |
| { |
| "completion_length": 98.421875, |
| "epoch": 0.10548418972332016, |
| "grad_norm": 4.869263400762264, |
| "kl": 0.0732421875, |
| "learning_rate": 8.947628458498023e-07, |
| "loss": 0.0029, |
| "reward": 2.5426318645477295, |
| "reward_std": 0.12406840175390244, |
| "rewards/accuracy_reward_stage2": 0.5426318645477295, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 427 |
| }, |
| { |
| "completion_length": 91.90625, |
| "epoch": 0.10573122529644269, |
| "grad_norm": 1.8180351543204958, |
| "kl": 0.07177734375, |
| "learning_rate": 8.945158102766798e-07, |
| "loss": 0.0029, |
| "reward": 2.8208327293395996, |
| "reward_std": 0.10960796475410461, |
| "rewards/accuracy_reward_stage2": 0.8208328485488892, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 428 |
| }, |
| { |
| "completion_length": 86.1875, |
| "epoch": 0.10597826086956522, |
| "grad_norm": 3.3436567770125656, |
| "kl": 0.07421875, |
| "learning_rate": 8.942687747035573e-07, |
| "loss": 0.003, |
| "reward": 2.865917682647705, |
| "reward_std": 0.02855822630226612, |
| "rewards/accuracy_reward_stage2": 0.8659177422523499, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 429 |
| }, |
| { |
| "completion_length": 72.46875, |
| "epoch": 0.10622529644268774, |
| "grad_norm": 4.139290116741471, |
| "kl": 0.169921875, |
| "learning_rate": 8.940217391304347e-07, |
| "loss": 0.0068, |
| "reward": 2.6120975017547607, |
| "reward_std": 0.2286979854106903, |
| "rewards/accuracy_reward_stage2": 0.7370975017547607, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 430 |
| }, |
| { |
| "completion_length": 71.3125, |
| "epoch": 0.10647233201581027, |
| "grad_norm": 3.7005291669713354, |
| "kl": 0.08203125, |
| "learning_rate": 8.937747035573123e-07, |
| "loss": 0.0033, |
| "reward": 2.71065092086792, |
| "reward_std": 0.08734364062547684, |
| "rewards/accuracy_reward_stage2": 0.7106510400772095, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 431 |
| }, |
| { |
| "completion_length": 85.984375, |
| "epoch": 0.1067193675889328, |
| "grad_norm": 5.202110977588656, |
| "kl": 0.1015625, |
| "learning_rate": 8.935276679841897e-07, |
| "loss": 0.0041, |
| "reward": 2.617884635925293, |
| "reward_std": 0.10500668734312057, |
| "rewards/accuracy_reward_stage2": 0.617884635925293, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 432 |
| }, |
| { |
| "completion_length": 72.53125, |
| "epoch": 0.10696640316205533, |
| "grad_norm": 5.720314610842664, |
| "kl": 0.103515625, |
| "learning_rate": 8.932806324110671e-07, |
| "loss": 0.0041, |
| "reward": 2.567166805267334, |
| "reward_std": 0.21528260409832, |
| "rewards/accuracy_reward_stage2": 0.5671666860580444, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 433 |
| }, |
| { |
| "completion_length": 84.546875, |
| "epoch": 0.10721343873517787, |
| "grad_norm": 3.2175328418513924, |
| "kl": 0.068359375, |
| "learning_rate": 8.930335968379447e-07, |
| "loss": 0.0027, |
| "reward": 2.581601142883301, |
| "reward_std": 0.04433634132146835, |
| "rewards/accuracy_reward_stage2": 0.581601083278656, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 434 |
| }, |
| { |
| "completion_length": 78.984375, |
| "epoch": 0.1074604743083004, |
| "grad_norm": 1.2926706812925475, |
| "kl": 0.12353515625, |
| "learning_rate": 8.927865612648221e-07, |
| "loss": 0.0049, |
| "reward": 2.9375431537628174, |
| "reward_std": 0.01452865544706583, |
| "rewards/accuracy_reward_stage2": 0.9375432133674622, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 435 |
| }, |
| { |
| "completion_length": 77.5625, |
| "epoch": 0.10770750988142293, |
| "grad_norm": 2.2617787059512224, |
| "kl": 0.0791015625, |
| "learning_rate": 8.925395256916995e-07, |
| "loss": 0.0032, |
| "reward": 2.767601251602173, |
| "reward_std": 0.04931104555726051, |
| "rewards/accuracy_reward_stage2": 0.7676013112068176, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 436 |
| }, |
| { |
| "completion_length": 101.75, |
| "epoch": 0.10795454545454546, |
| "grad_norm": 2.4619041669921216, |
| "kl": 0.078125, |
| "learning_rate": 8.92292490118577e-07, |
| "loss": 0.0031, |
| "reward": 2.6876630783081055, |
| "reward_std": 0.09527213126420975, |
| "rewards/accuracy_reward_stage2": 0.757975697517395, |
| "rewards/format_reward_all_stage": 1.9296875, |
| "scores/refine_times": 1.21875, |
| "step": 437 |
| }, |
| { |
| "completion_length": 74.546875, |
| "epoch": 0.10820158102766798, |
| "grad_norm": 3.6665021628241283, |
| "kl": 0.08349609375, |
| "learning_rate": 8.920454545454545e-07, |
| "loss": 0.0033, |
| "reward": 2.5134830474853516, |
| "reward_std": 0.11075843870639801, |
| "rewards/accuracy_reward_stage2": 0.5134831070899963, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 438 |
| }, |
| { |
| "completion_length": 83.828125, |
| "epoch": 0.10844861660079051, |
| "grad_norm": 0.2742410987570204, |
| "kl": 0.06787109375, |
| "learning_rate": 8.91798418972332e-07, |
| "loss": 0.0027, |
| "reward": 2.606250047683716, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.606249988079071, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 439 |
| }, |
| { |
| "completion_length": 70.625, |
| "epoch": 0.10869565217391304, |
| "grad_norm": 4.1049578784327085, |
| "kl": 0.08349609375, |
| "learning_rate": 8.915513833992094e-07, |
| "loss": 0.0033, |
| "reward": 2.747023820877075, |
| "reward_std": 0.15908406674861908, |
| "rewards/accuracy_reward_stage2": 0.7470238208770752, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 440 |
| }, |
| { |
| "completion_length": 83.796875, |
| "epoch": 0.10894268774703557, |
| "grad_norm": 3.890400064216742, |
| "kl": 0.072265625, |
| "learning_rate": 8.913043478260869e-07, |
| "loss": 0.0029, |
| "reward": 2.6413779258728027, |
| "reward_std": 0.15115022659301758, |
| "rewards/accuracy_reward_stage2": 0.6413780450820923, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 441 |
| }, |
| { |
| "completion_length": 82.5, |
| "epoch": 0.1091897233201581, |
| "grad_norm": 3.364268393065154, |
| "kl": 0.0869140625, |
| "learning_rate": 8.910573122529645e-07, |
| "loss": 0.0035, |
| "reward": 2.5104165077209473, |
| "reward_std": 0.23385357856750488, |
| "rewards/accuracy_reward_stage2": 0.6354166865348816, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.078125, |
| "step": 442 |
| }, |
| { |
| "completion_length": 93.9375, |
| "epoch": 0.10943675889328064, |
| "grad_norm": 3.999124238125438, |
| "kl": 0.0908203125, |
| "learning_rate": 8.908102766798419e-07, |
| "loss": 0.0036, |
| "reward": 2.6210455894470215, |
| "reward_std": 0.09140154719352722, |
| "rewards/accuracy_reward_stage2": 0.6210454106330872, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 443 |
| }, |
| { |
| "completion_length": 94.34375, |
| "epoch": 0.10968379446640317, |
| "grad_norm": 4.9639104904967235, |
| "kl": 0.1005859375, |
| "learning_rate": 8.905632411067193e-07, |
| "loss": 0.004, |
| "reward": 2.784119129180908, |
| "reward_std": 0.237242192029953, |
| "rewards/accuracy_reward_stage2": 0.7841191291809082, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 444 |
| }, |
| { |
| "completion_length": 92.8125, |
| "epoch": 0.1099308300395257, |
| "grad_norm": 5.1034757317692145, |
| "kl": 0.06396484375, |
| "learning_rate": 8.903162055335968e-07, |
| "loss": 0.0026, |
| "reward": 2.5643460750579834, |
| "reward_std": 0.06753169000148773, |
| "rewards/accuracy_reward_stage2": 0.564346194267273, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 445 |
| }, |
| { |
| "completion_length": 78.71875, |
| "epoch": 0.11017786561264822, |
| "grad_norm": 3.133436785678946, |
| "kl": 0.0908203125, |
| "learning_rate": 8.900691699604743e-07, |
| "loss": 0.0036, |
| "reward": 2.6766560077667236, |
| "reward_std": 0.004866618663072586, |
| "rewards/accuracy_reward_stage2": 0.6766558885574341, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 446 |
| }, |
| { |
| "completion_length": 88.4375, |
| "epoch": 0.11042490118577075, |
| "grad_norm": 5.09866753511643, |
| "kl": 0.078125, |
| "learning_rate": 8.898221343873517e-07, |
| "loss": 0.0031, |
| "reward": 2.419560194015503, |
| "reward_std": 0.09583514928817749, |
| "rewards/accuracy_reward_stage2": 0.41956019401550293, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 447 |
| }, |
| { |
| "completion_length": 88.25, |
| "epoch": 0.11067193675889328, |
| "grad_norm": 2.3473140239947465, |
| "kl": 0.0732421875, |
| "learning_rate": 8.895750988142292e-07, |
| "loss": 0.0029, |
| "reward": 2.59273099899292, |
| "reward_std": 0.1246790662407875, |
| "rewards/accuracy_reward_stage2": 0.5927308797836304, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 448 |
| }, |
| { |
| "completion_length": 104.453125, |
| "epoch": 0.11091897233201581, |
| "grad_norm": 3.1125557355492424, |
| "kl": 0.0771484375, |
| "learning_rate": 8.893280632411066e-07, |
| "loss": 0.0031, |
| "reward": 2.4798202514648438, |
| "reward_std": 0.06186756119132042, |
| "rewards/accuracy_reward_stage2": 0.47982022166252136, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 449 |
| }, |
| { |
| "completion_length": 88.078125, |
| "epoch": 0.11116600790513834, |
| "grad_norm": 2.9837836911221354, |
| "kl": 0.0810546875, |
| "learning_rate": 8.890810276679841e-07, |
| "loss": 0.0033, |
| "reward": 2.817561149597168, |
| "reward_std": 0.05040454864501953, |
| "rewards/accuracy_reward_stage2": 0.817561149597168, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 450 |
| }, |
| { |
| "completion_length": 98.609375, |
| "epoch": 0.11141304347826086, |
| "grad_norm": 4.441414350052775, |
| "kl": 0.0810546875, |
| "learning_rate": 8.888339920948617e-07, |
| "loss": 0.0032, |
| "reward": 2.681997299194336, |
| "reward_std": 0.05065108835697174, |
| "rewards/accuracy_reward_stage2": 0.6819972395896912, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 451 |
| }, |
| { |
| "completion_length": 101.296875, |
| "epoch": 0.11166007905138339, |
| "grad_norm": 3.7005571414160867, |
| "kl": 0.0771484375, |
| "learning_rate": 8.885869565217391e-07, |
| "loss": 0.0031, |
| "reward": 2.5746400356292725, |
| "reward_std": 0.08936276286840439, |
| "rewards/accuracy_reward_stage2": 0.5746400952339172, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 452 |
| }, |
| { |
| "completion_length": 78.4375, |
| "epoch": 0.11190711462450594, |
| "grad_norm": 4.167745774001736, |
| "kl": 0.08935546875, |
| "learning_rate": 8.883399209486165e-07, |
| "loss": 0.0036, |
| "reward": 2.710275173187256, |
| "reward_std": 0.0839356780052185, |
| "rewards/accuracy_reward_stage2": 0.7102750539779663, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 453 |
| }, |
| { |
| "completion_length": 108.59375, |
| "epoch": 0.11215415019762846, |
| "grad_norm": 2.4441985384576106, |
| "kl": 0.08251953125, |
| "learning_rate": 8.88092885375494e-07, |
| "loss": 0.0033, |
| "reward": 2.7707533836364746, |
| "reward_std": 0.10964667797088623, |
| "rewards/accuracy_reward_stage2": 0.8332535028457642, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.1875, |
| "step": 454 |
| }, |
| { |
| "completion_length": 89.09375, |
| "epoch": 0.11240118577075099, |
| "grad_norm": 3.4290524406940177, |
| "kl": 0.11572265625, |
| "learning_rate": 8.878458498023715e-07, |
| "loss": 0.0046, |
| "reward": 2.8360495567321777, |
| "reward_std": 0.06881996244192123, |
| "rewards/accuracy_reward_stage2": 0.8360496759414673, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 455 |
| }, |
| { |
| "completion_length": 93.546875, |
| "epoch": 0.11264822134387352, |
| "grad_norm": 4.2633390639176, |
| "kl": 0.091796875, |
| "learning_rate": 8.87598814229249e-07, |
| "loss": 0.0037, |
| "reward": 2.737834930419922, |
| "reward_std": 0.08329826593399048, |
| "rewards/accuracy_reward_stage2": 0.7378349304199219, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 456 |
| }, |
| { |
| "completion_length": 88.125, |
| "epoch": 0.11289525691699605, |
| "grad_norm": 4.185983875071428, |
| "kl": 0.07177734375, |
| "learning_rate": 8.873517786561264e-07, |
| "loss": 0.0029, |
| "reward": 2.7210230827331543, |
| "reward_std": 0.1462172567844391, |
| "rewards/accuracy_reward_stage2": 0.7210230231285095, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 457 |
| }, |
| { |
| "completion_length": 78.3125, |
| "epoch": 0.11314229249011858, |
| "grad_norm": 2.9568956363715557, |
| "kl": 0.0859375, |
| "learning_rate": 8.871047430830038e-07, |
| "loss": 0.0034, |
| "reward": 2.846480131149292, |
| "reward_std": 0.058452486991882324, |
| "rewards/accuracy_reward_stage2": 0.846480131149292, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 458 |
| }, |
| { |
| "completion_length": 93.96875, |
| "epoch": 0.1133893280632411, |
| "grad_norm": 3.702464502871565, |
| "kl": 0.11474609375, |
| "learning_rate": 8.868577075098815e-07, |
| "loss": 0.0046, |
| "reward": 2.6403086185455322, |
| "reward_std": 0.07300623506307602, |
| "rewards/accuracy_reward_stage2": 0.640308678150177, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 459 |
| }, |
| { |
| "completion_length": 82.015625, |
| "epoch": 0.11363636363636363, |
| "grad_norm": 3.5096468558603027, |
| "kl": 0.11767578125, |
| "learning_rate": 8.866106719367589e-07, |
| "loss": 0.0047, |
| "reward": 2.7332301139831543, |
| "reward_std": 0.1381341964006424, |
| "rewards/accuracy_reward_stage2": 0.7957301735877991, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 460 |
| }, |
| { |
| "completion_length": 78.125, |
| "epoch": 0.11388339920948616, |
| "grad_norm": 3.559169978335934, |
| "kl": 0.1015625, |
| "learning_rate": 8.863636363636363e-07, |
| "loss": 0.0041, |
| "reward": 2.8119444847106934, |
| "reward_std": 0.08849316835403442, |
| "rewards/accuracy_reward_stage2": 0.8119444847106934, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 461 |
| }, |
| { |
| "completion_length": 93.234375, |
| "epoch": 0.11413043478260869, |
| "grad_norm": 0.5282625223145849, |
| "kl": 0.091796875, |
| "learning_rate": 8.861166007905138e-07, |
| "loss": 0.0037, |
| "reward": 2.764010190963745, |
| "reward_std": 0.008434552699327469, |
| "rewards/accuracy_reward_stage2": 0.7640101909637451, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 462 |
| }, |
| { |
| "completion_length": 85.25, |
| "epoch": 0.11437747035573123, |
| "grad_norm": 2.9976744755473064, |
| "kl": 0.09814453125, |
| "learning_rate": 8.858695652173913e-07, |
| "loss": 0.0039, |
| "reward": 2.7464168071746826, |
| "reward_std": 0.07833977788686752, |
| "rewards/accuracy_reward_stage2": 0.7464168071746826, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 463 |
| }, |
| { |
| "completion_length": 83.9375, |
| "epoch": 0.11462450592885376, |
| "grad_norm": 3.6875932623475802, |
| "kl": 0.08642578125, |
| "learning_rate": 8.856225296442687e-07, |
| "loss": 0.0035, |
| "reward": 2.6572089195251465, |
| "reward_std": 0.027874791994690895, |
| "rewards/accuracy_reward_stage2": 0.6572088003158569, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 464 |
| }, |
| { |
| "completion_length": 93.09375, |
| "epoch": 0.11487154150197629, |
| "grad_norm": 3.818896478911207, |
| "kl": 0.09814453125, |
| "learning_rate": 8.853754940711462e-07, |
| "loss": 0.0039, |
| "reward": 2.522165298461914, |
| "reward_std": 0.0887691006064415, |
| "rewards/accuracy_reward_stage2": 0.522165060043335, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 465 |
| }, |
| { |
| "completion_length": 86.40625, |
| "epoch": 0.11511857707509882, |
| "grad_norm": 5.068588696355706, |
| "kl": 0.080078125, |
| "learning_rate": 8.851284584980236e-07, |
| "loss": 0.0032, |
| "reward": 2.7015743255615234, |
| "reward_std": 0.16825415194034576, |
| "rewards/accuracy_reward_stage2": 0.7640743255615234, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 466 |
| }, |
| { |
| "completion_length": 118.046875, |
| "epoch": 0.11536561264822134, |
| "grad_norm": 3.1937404370635414, |
| "kl": 0.0712890625, |
| "learning_rate": 8.848814229249012e-07, |
| "loss": 0.0028, |
| "reward": 2.629642963409424, |
| "reward_std": 0.1885630190372467, |
| "rewards/accuracy_reward_stage2": 0.6921432018280029, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.3125, |
| "step": 467 |
| }, |
| { |
| "completion_length": 78.03125, |
| "epoch": 0.11561264822134387, |
| "grad_norm": 3.2947592139517377, |
| "kl": 0.0732421875, |
| "learning_rate": 8.846343873517787e-07, |
| "loss": 0.0029, |
| "reward": 2.854365110397339, |
| "reward_std": 0.04581620916724205, |
| "rewards/accuracy_reward_stage2": 0.8543651103973389, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 468 |
| }, |
| { |
| "completion_length": 71.5625, |
| "epoch": 0.1158596837944664, |
| "grad_norm": 4.152892444324077, |
| "kl": 0.08837890625, |
| "learning_rate": 8.843873517786561e-07, |
| "loss": 0.0035, |
| "reward": 2.510572671890259, |
| "reward_std": 0.01925911381840706, |
| "rewards/accuracy_reward_stage2": 0.5105725526809692, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 469 |
| }, |
| { |
| "completion_length": 82.0625, |
| "epoch": 0.11610671936758893, |
| "grad_norm": 2.1146453684599917, |
| "kl": 0.06884765625, |
| "learning_rate": 8.841403162055336e-07, |
| "loss": 0.0028, |
| "reward": 2.793236494064331, |
| "reward_std": 0.0080789215862751, |
| "rewards/accuracy_reward_stage2": 0.793236494064331, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 77.34375, |
| "epoch": 0.11635375494071146, |
| "grad_norm": 2.209537382540903, |
| "kl": 0.07568359375, |
| "learning_rate": 8.83893280632411e-07, |
| "loss": 0.003, |
| "reward": 2.7740843296051025, |
| "reward_std": 0.023258034139871597, |
| "rewards/accuracy_reward_stage2": 0.7740844488143921, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 471 |
| }, |
| { |
| "completion_length": 76.640625, |
| "epoch": 0.116600790513834, |
| "grad_norm": 0.6746428954052648, |
| "kl": 0.06201171875, |
| "learning_rate": 8.836462450592885e-07, |
| "loss": 0.0025, |
| "reward": 2.7364659309387207, |
| "reward_std": 0.01504778116941452, |
| "rewards/accuracy_reward_stage2": 0.7364660501480103, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 472 |
| }, |
| { |
| "completion_length": 80.078125, |
| "epoch": 0.11684782608695653, |
| "grad_norm": 5.151503301895097, |
| "kl": 0.0732421875, |
| "learning_rate": 8.83399209486166e-07, |
| "loss": 0.0029, |
| "reward": 2.6196067333221436, |
| "reward_std": 0.10613559931516647, |
| "rewards/accuracy_reward_stage2": 0.6196067333221436, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 473 |
| }, |
| { |
| "completion_length": 76.203125, |
| "epoch": 0.11709486166007906, |
| "grad_norm": 3.6271936982955113, |
| "kl": 0.10205078125, |
| "learning_rate": 8.831521739130434e-07, |
| "loss": 0.0041, |
| "reward": 2.644803524017334, |
| "reward_std": 0.07510168105363846, |
| "rewards/accuracy_reward_stage2": 0.6552203893661499, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.078125, |
| "step": 474 |
| }, |
| { |
| "completion_length": 75.65625, |
| "epoch": 0.11734189723320158, |
| "grad_norm": 4.565040146312246, |
| "kl": 0.10302734375, |
| "learning_rate": 8.829051383399208e-07, |
| "loss": 0.0041, |
| "reward": 2.692810535430908, |
| "reward_std": 0.02393944561481476, |
| "rewards/accuracy_reward_stage2": 0.6928104162216187, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 475 |
| }, |
| { |
| "completion_length": 79.171875, |
| "epoch": 0.11758893280632411, |
| "grad_norm": 2.197838778708124, |
| "kl": 0.0849609375, |
| "learning_rate": 8.826581027667984e-07, |
| "loss": 0.0034, |
| "reward": 2.7445812225341797, |
| "reward_std": 0.10567092895507812, |
| "rewards/accuracy_reward_stage2": 0.8174980282783508, |
| "rewards/format_reward_all_stage": 1.9270833730697632, |
| "scores/refine_times": 1.140625, |
| "step": 476 |
| }, |
| { |
| "completion_length": 77.296875, |
| "epoch": 0.11783596837944664, |
| "grad_norm": 2.8788455034822484, |
| "kl": 0.08642578125, |
| "learning_rate": 8.824110671936759e-07, |
| "loss": 0.0035, |
| "reward": 2.6922109127044678, |
| "reward_std": 0.057185299694538116, |
| "rewards/accuracy_reward_stage2": 0.692210853099823, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 477 |
| }, |
| { |
| "completion_length": 80.078125, |
| "epoch": 0.11808300395256917, |
| "grad_norm": 2.881982205739493, |
| "kl": 0.08642578125, |
| "learning_rate": 8.821640316205533e-07, |
| "loss": 0.0035, |
| "reward": 2.9145209789276123, |
| "reward_std": 0.07034643739461899, |
| "rewards/accuracy_reward_stage2": 0.9145209789276123, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 478 |
| }, |
| { |
| "completion_length": 81.3125, |
| "epoch": 0.1183300395256917, |
| "grad_norm": 3.9207615453647953, |
| "kl": 0.0927734375, |
| "learning_rate": 8.819169960474308e-07, |
| "loss": 0.0037, |
| "reward": 2.5646958351135254, |
| "reward_std": 0.1530625820159912, |
| "rewards/accuracy_reward_stage2": 0.6896957159042358, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 479 |
| }, |
| { |
| "completion_length": 84.453125, |
| "epoch": 0.11857707509881422, |
| "grad_norm": 3.6748089134366064, |
| "kl": 0.0771484375, |
| "learning_rate": 8.816699604743083e-07, |
| "loss": 0.0031, |
| "reward": 2.8285746574401855, |
| "reward_std": 0.028202539309859276, |
| "rewards/accuracy_reward_stage2": 0.8285747766494751, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 480 |
| }, |
| { |
| "completion_length": 107.296875, |
| "epoch": 0.11882411067193675, |
| "grad_norm": 3.0552260659106762, |
| "kl": 0.0771484375, |
| "learning_rate": 8.814229249011858e-07, |
| "loss": 0.0031, |
| "reward": 2.7689619064331055, |
| "reward_std": 0.07823709398508072, |
| "rewards/accuracy_reward_stage2": 0.768962025642395, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 481 |
| }, |
| { |
| "completion_length": 69.515625, |
| "epoch": 0.1190711462450593, |
| "grad_norm": 3.1056533714213588, |
| "kl": 0.11083984375, |
| "learning_rate": 8.811758893280632e-07, |
| "loss": 0.0044, |
| "reward": 2.6699254512786865, |
| "reward_std": 0.10760709643363953, |
| "rewards/accuracy_reward_stage2": 0.6803420782089233, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.078125, |
| "step": 482 |
| }, |
| { |
| "completion_length": 95.75, |
| "epoch": 0.11931818181818182, |
| "grad_norm": 3.673929415547851, |
| "kl": 0.0771484375, |
| "learning_rate": 8.809288537549406e-07, |
| "loss": 0.0031, |
| "reward": 2.495757579803467, |
| "reward_std": 0.02522164396941662, |
| "rewards/accuracy_reward_stage2": 0.4957575500011444, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 483 |
| }, |
| { |
| "completion_length": 87.59375, |
| "epoch": 0.11956521739130435, |
| "grad_norm": 2.2653734549347004, |
| "kl": 0.06689453125, |
| "learning_rate": 8.806818181818182e-07, |
| "loss": 0.0027, |
| "reward": 2.5830860137939453, |
| "reward_std": 0.06735340505838394, |
| "rewards/accuracy_reward_stage2": 0.5830860137939453, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 484 |
| }, |
| { |
| "completion_length": 71.53125, |
| "epoch": 0.11981225296442688, |
| "grad_norm": 2.692817884354693, |
| "kl": 0.07470703125, |
| "learning_rate": 8.804347826086956e-07, |
| "loss": 0.003, |
| "reward": 2.6110339164733887, |
| "reward_std": 0.07797226309776306, |
| "rewards/accuracy_reward_stage2": 0.6110339164733887, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 485 |
| }, |
| { |
| "completion_length": 83.796875, |
| "epoch": 0.12005928853754941, |
| "grad_norm": 3.867866268455527, |
| "kl": 0.0888671875, |
| "learning_rate": 8.80187747035573e-07, |
| "loss": 0.0036, |
| "reward": 2.82053279876709, |
| "reward_std": 0.04790631681680679, |
| "rewards/accuracy_reward_stage2": 0.8205327987670898, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 486 |
| }, |
| { |
| "completion_length": 85.015625, |
| "epoch": 0.12030632411067194, |
| "grad_norm": 3.7006027639703243, |
| "kl": 0.0849609375, |
| "learning_rate": 8.799407114624506e-07, |
| "loss": 0.0034, |
| "reward": 2.630404472351074, |
| "reward_std": 0.037833116948604584, |
| "rewards/accuracy_reward_stage2": 0.6304043531417847, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 487 |
| }, |
| { |
| "completion_length": 89.59375, |
| "epoch": 0.12055335968379446, |
| "grad_norm": 3.3659415525573, |
| "kl": 0.0654296875, |
| "learning_rate": 8.79693675889328e-07, |
| "loss": 0.0026, |
| "reward": 2.5924062728881836, |
| "reward_std": 0.040243446826934814, |
| "rewards/accuracy_reward_stage2": 0.592406153678894, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 488 |
| }, |
| { |
| "completion_length": 84.96875, |
| "epoch": 0.12080039525691699, |
| "grad_norm": 2.345461810437249, |
| "kl": 0.0869140625, |
| "learning_rate": 8.794466403162055e-07, |
| "loss": 0.0035, |
| "reward": 2.8451998233795166, |
| "reward_std": 0.011534404940903187, |
| "rewards/accuracy_reward_stage2": 0.8451998233795166, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 489 |
| }, |
| { |
| "completion_length": 80.078125, |
| "epoch": 0.12104743083003952, |
| "grad_norm": 3.3056877683572123, |
| "kl": 0.07421875, |
| "learning_rate": 8.79199604743083e-07, |
| "loss": 0.003, |
| "reward": 2.674712657928467, |
| "reward_std": 0.0697299912571907, |
| "rewards/accuracy_reward_stage2": 0.6747127771377563, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 490 |
| }, |
| { |
| "completion_length": 100.78125, |
| "epoch": 0.12129446640316205, |
| "grad_norm": 4.62714518835741, |
| "kl": 0.07568359375, |
| "learning_rate": 8.789525691699604e-07, |
| "loss": 0.003, |
| "reward": 2.2546346187591553, |
| "reward_std": 0.18582692742347717, |
| "rewards/accuracy_reward_stage2": 0.3796347379684448, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.078125, |
| "step": 491 |
| }, |
| { |
| "completion_length": 87.46875, |
| "epoch": 0.12154150197628459, |
| "grad_norm": 4.0547374474409565, |
| "kl": 0.1064453125, |
| "learning_rate": 8.787055335968378e-07, |
| "loss": 0.0043, |
| "reward": 2.582868814468384, |
| "reward_std": 0.2643705904483795, |
| "rewards/accuracy_reward_stage2": 0.6609938144683838, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.328125, |
| "step": 492 |
| }, |
| { |
| "completion_length": 79.984375, |
| "epoch": 0.12178853754940712, |
| "grad_norm": 4.974773902072042, |
| "kl": 0.08447265625, |
| "learning_rate": 8.784584980237154e-07, |
| "loss": 0.0034, |
| "reward": 2.6923179626464844, |
| "reward_std": 0.11359754204750061, |
| "rewards/accuracy_reward_stage2": 0.744401216506958, |
| "rewards/format_reward_all_stage": 1.9479167461395264, |
| "scores/refine_times": 1.09375, |
| "step": 493 |
| }, |
| { |
| "completion_length": 99.75, |
| "epoch": 0.12203557312252965, |
| "grad_norm": 3.0636928116844575, |
| "kl": 0.0654296875, |
| "learning_rate": 8.782114624505928e-07, |
| "loss": 0.0026, |
| "reward": 2.845734119415283, |
| "reward_std": 0.01307186484336853, |
| "rewards/accuracy_reward_stage2": 0.8457342982292175, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 494 |
| }, |
| { |
| "completion_length": 80.859375, |
| "epoch": 0.12228260869565218, |
| "grad_norm": 3.434547157616726, |
| "kl": 0.11962890625, |
| "learning_rate": 8.779644268774703e-07, |
| "loss": 0.0048, |
| "reward": 2.5350117683410645, |
| "reward_std": 0.018023155629634857, |
| "rewards/accuracy_reward_stage2": 0.5350118279457092, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 495 |
| }, |
| { |
| "completion_length": 70.4375, |
| "epoch": 0.1225296442687747, |
| "grad_norm": 4.465055705056343, |
| "kl": 0.080078125, |
| "learning_rate": 8.777173913043478e-07, |
| "loss": 0.0032, |
| "reward": 2.4872050285339355, |
| "reward_std": 0.07076370716094971, |
| "rewards/accuracy_reward_stage2": 0.48720502853393555, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 496 |
| }, |
| { |
| "completion_length": 81.53125, |
| "epoch": 0.12277667984189723, |
| "grad_norm": 5.0334176088721545, |
| "kl": 0.11865234375, |
| "learning_rate": 8.774703557312253e-07, |
| "loss": 0.0047, |
| "reward": 2.3961634635925293, |
| "reward_std": 0.30635514855384827, |
| "rewards/accuracy_reward_stage2": 0.5784550905227661, |
| "rewards/format_reward_all_stage": 1.8177083730697632, |
| "scores/refine_times": 1.078125, |
| "step": 497 |
| }, |
| { |
| "completion_length": 65.046875, |
| "epoch": 0.12302371541501976, |
| "grad_norm": 0.8054176825918075, |
| "kl": 0.0869140625, |
| "learning_rate": 8.772233201581028e-07, |
| "loss": 0.0035, |
| "reward": 2.841981887817383, |
| "reward_std": 0.0124855637550354, |
| "rewards/accuracy_reward_stage2": 0.8419820070266724, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 498 |
| }, |
| { |
| "completion_length": 94.765625, |
| "epoch": 0.12327075098814229, |
| "grad_norm": 3.630597761876916, |
| "kl": 0.09130859375, |
| "learning_rate": 8.769762845849802e-07, |
| "loss": 0.0036, |
| "reward": 2.5977272987365723, |
| "reward_std": 0.2044953554868698, |
| "rewards/accuracy_reward_stage2": 0.8477272987365723, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.1875, |
| "step": 499 |
| }, |
| { |
| "completion_length": 79.140625, |
| "epoch": 0.12351778656126482, |
| "grad_norm": 3.823309287561965, |
| "kl": 0.08544921875, |
| "learning_rate": 8.767292490118576e-07, |
| "loss": 0.0034, |
| "reward": 2.621757984161377, |
| "reward_std": 0.1109161525964737, |
| "rewards/accuracy_reward_stage2": 0.6217580437660217, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 500 |
| }, |
| { |
| "completion_length": 79.546875, |
| "epoch": 0.12376482213438735, |
| "grad_norm": 0.2598621354632467, |
| "kl": 0.07763671875, |
| "learning_rate": 8.764822134387352e-07, |
| "loss": 0.0031, |
| "reward": 2.809523820877075, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.8095238208770752, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 501 |
| }, |
| { |
| "completion_length": 65.6875, |
| "epoch": 0.12401185770750989, |
| "grad_norm": 3.741876860167781, |
| "kl": 0.09130859375, |
| "learning_rate": 8.762351778656126e-07, |
| "loss": 0.0037, |
| "reward": 2.740185022354126, |
| "reward_std": 0.044610656797885895, |
| "rewards/accuracy_reward_stage2": 0.740185022354126, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 502 |
| }, |
| { |
| "completion_length": 64.9375, |
| "epoch": 0.12425889328063242, |
| "grad_norm": 2.1936854561420267, |
| "kl": 0.09912109375, |
| "learning_rate": 8.7598814229249e-07, |
| "loss": 0.004, |
| "reward": 2.96875, |
| "reward_std": 0.033407654613256454, |
| "rewards/accuracy_reward_stage2": 0.96875, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 503 |
| }, |
| { |
| "completion_length": 65.234375, |
| "epoch": 0.12450592885375494, |
| "grad_norm": 4.050893925110765, |
| "kl": 0.0869140625, |
| "learning_rate": 8.757411067193675e-07, |
| "loss": 0.0035, |
| "reward": 2.7537124156951904, |
| "reward_std": 0.11977555602788925, |
| "rewards/accuracy_reward_stage2": 0.7537122964859009, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 504 |
| }, |
| { |
| "completion_length": 84.359375, |
| "epoch": 0.12475296442687747, |
| "grad_norm": 3.072366621168365, |
| "kl": 0.107421875, |
| "learning_rate": 8.754940711462451e-07, |
| "loss": 0.0043, |
| "reward": 2.5886545181274414, |
| "reward_std": 0.1502598226070404, |
| "rewards/accuracy_reward_stage2": 0.713654637336731, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.15625, |
| "step": 505 |
| }, |
| { |
| "completion_length": 69.828125, |
| "epoch": 0.125, |
| "grad_norm": 5.845882236669313, |
| "kl": 0.1533203125, |
| "learning_rate": 8.752470355731226e-07, |
| "loss": 0.0061, |
| "reward": 2.244267463684082, |
| "reward_std": 0.35389357805252075, |
| "rewards/accuracy_reward_stage2": 0.49426767230033875, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.125, |
| "step": 506 |
| }, |
| { |
| "completion_length": 76.109375, |
| "epoch": 0.12524703557312253, |
| "grad_norm": 3.476292970725002, |
| "kl": 0.08642578125, |
| "learning_rate": 8.75e-07, |
| "loss": 0.0034, |
| "reward": 2.8092727661132812, |
| "reward_std": 0.034412235021591187, |
| "rewards/accuracy_reward_stage2": 0.809272825717926, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 507 |
| }, |
| { |
| "completion_length": 75.875, |
| "epoch": 0.12549407114624506, |
| "grad_norm": 3.331603296120503, |
| "kl": 0.08984375, |
| "learning_rate": 8.747529644268774e-07, |
| "loss": 0.0036, |
| "reward": 2.759597063064575, |
| "reward_std": 0.020664650946855545, |
| "rewards/accuracy_reward_stage2": 0.7595971822738647, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 508 |
| }, |
| { |
| "completion_length": 65.5625, |
| "epoch": 0.12574110671936758, |
| "grad_norm": 2.4995462264429062, |
| "kl": 0.08984375, |
| "learning_rate": 8.745059288537549e-07, |
| "loss": 0.0036, |
| "reward": 2.830458879470825, |
| "reward_std": 0.005241929553449154, |
| "rewards/accuracy_reward_stage2": 0.8304589986801147, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 509 |
| }, |
| { |
| "completion_length": 85.90625, |
| "epoch": 0.1259881422924901, |
| "grad_norm": 5.179493941114563, |
| "kl": 0.10400390625, |
| "learning_rate": 8.742588932806324e-07, |
| "loss": 0.0042, |
| "reward": 2.6034417152404785, |
| "reward_std": 0.12308812886476517, |
| "rewards/accuracy_reward_stage2": 0.6607334613800049, |
| "rewards/format_reward_all_stage": 1.9427082538604736, |
| "scores/refine_times": 1.21875, |
| "step": 510 |
| }, |
| { |
| "completion_length": 69.625, |
| "epoch": 0.12623517786561264, |
| "grad_norm": 4.072436652749926, |
| "kl": 0.10107421875, |
| "learning_rate": 8.740118577075098e-07, |
| "loss": 0.004, |
| "reward": 2.5392637252807617, |
| "reward_std": 0.061263859272003174, |
| "rewards/accuracy_reward_stage2": 0.5392636060714722, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 511 |
| }, |
| { |
| "completion_length": 60.015625, |
| "epoch": 0.12648221343873517, |
| "grad_norm": 4.246135496046692, |
| "kl": 0.1357421875, |
| "learning_rate": 8.737648221343873e-07, |
| "loss": 0.0054, |
| "reward": 2.6761271953582764, |
| "reward_std": 0.06654070317745209, |
| "rewards/accuracy_reward_stage2": 0.6761271953582764, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 512 |
| }, |
| { |
| "completion_length": 59.875, |
| "epoch": 0.1267292490118577, |
| "grad_norm": 4.924272097887162, |
| "kl": 0.11083984375, |
| "learning_rate": 8.735177865612647e-07, |
| "loss": 0.0044, |
| "reward": 2.692314624786377, |
| "reward_std": 0.0717814713716507, |
| "rewards/accuracy_reward_stage2": 0.6923147439956665, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 513 |
| }, |
| { |
| "completion_length": 78.4375, |
| "epoch": 0.12697628458498023, |
| "grad_norm": 4.1067478974761515, |
| "kl": 0.11572265625, |
| "learning_rate": 8.732707509881423e-07, |
| "loss": 0.0046, |
| "reward": 2.7923502922058105, |
| "reward_std": 0.03013404831290245, |
| "rewards/accuracy_reward_stage2": 0.7923504114151001, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 514 |
| }, |
| { |
| "completion_length": 80.125, |
| "epoch": 0.12722332015810275, |
| "grad_norm": 3.9493774213422403, |
| "kl": 0.11083984375, |
| "learning_rate": 8.730237154150198e-07, |
| "loss": 0.0044, |
| "reward": 2.1602487564086914, |
| "reward_std": 0.28394466638565063, |
| "rewards/accuracy_reward_stage2": 0.5352487564086914, |
| "rewards/format_reward_all_stage": 1.625, |
| "scores/refine_times": 1.140625, |
| "step": 515 |
| }, |
| { |
| "completion_length": 74.4375, |
| "epoch": 0.1274703557312253, |
| "grad_norm": 3.6966243378996513, |
| "kl": 0.1044921875, |
| "learning_rate": 8.727766798418972e-07, |
| "loss": 0.0042, |
| "reward": 2.8750760555267334, |
| "reward_std": 0.06863968074321747, |
| "rewards/accuracy_reward_stage2": 0.937576174736023, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.203125, |
| "step": 516 |
| }, |
| { |
| "completion_length": 89.984375, |
| "epoch": 0.12771739130434784, |
| "grad_norm": 3.8890101683067577, |
| "kl": 0.1181640625, |
| "learning_rate": 8.725296442687746e-07, |
| "loss": 0.0047, |
| "reward": 2.467437744140625, |
| "reward_std": 0.10733962059020996, |
| "rewards/accuracy_reward_stage2": 0.4830629527568817, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.21875, |
| "step": 517 |
| }, |
| { |
| "completion_length": 73.1875, |
| "epoch": 0.12796442687747037, |
| "grad_norm": 4.990132015423476, |
| "kl": 0.10009765625, |
| "learning_rate": 8.722826086956522e-07, |
| "loss": 0.004, |
| "reward": 2.5844149589538574, |
| "reward_std": 0.20980337262153625, |
| "rewards/accuracy_reward_stage2": 0.7094148397445679, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 518 |
| }, |
| { |
| "completion_length": 77.828125, |
| "epoch": 0.1282114624505929, |
| "grad_norm": 4.7214654461831485, |
| "kl": 0.12353515625, |
| "learning_rate": 8.720355731225296e-07, |
| "loss": 0.0049, |
| "reward": 2.7041687965393066, |
| "reward_std": 0.07556068897247314, |
| "rewards/accuracy_reward_stage2": 0.7041686177253723, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 519 |
| }, |
| { |
| "completion_length": 77.78125, |
| "epoch": 0.12845849802371542, |
| "grad_norm": 0.3211094077379723, |
| "kl": 0.111328125, |
| "learning_rate": 8.71788537549407e-07, |
| "loss": 0.0045, |
| "reward": 2.943162441253662, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.9431624412536621, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 520 |
| }, |
| { |
| "completion_length": 75.296875, |
| "epoch": 0.12870553359683795, |
| "grad_norm": 3.364555080905937, |
| "kl": 0.11083984375, |
| "learning_rate": 8.715415019762845e-07, |
| "loss": 0.0044, |
| "reward": 2.7595181465148926, |
| "reward_std": 0.03243761509656906, |
| "rewards/accuracy_reward_stage2": 0.7595181465148926, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.109375, |
| "step": 521 |
| }, |
| { |
| "completion_length": 75.125, |
| "epoch": 0.12895256916996048, |
| "grad_norm": 4.197238919282646, |
| "kl": 0.08935546875, |
| "learning_rate": 8.71294466403162e-07, |
| "loss": 0.0036, |
| "reward": 2.7527740001678467, |
| "reward_std": 0.15860606729984283, |
| "rewards/accuracy_reward_stage2": 0.8777740001678467, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 522 |
| }, |
| { |
| "completion_length": 83.046875, |
| "epoch": 0.129199604743083, |
| "grad_norm": 3.684564519838231, |
| "kl": 0.09912109375, |
| "learning_rate": 8.710474308300395e-07, |
| "loss": 0.004, |
| "reward": 2.7169008255004883, |
| "reward_std": 0.028284341096878052, |
| "rewards/accuracy_reward_stage2": 0.7169008851051331, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 523 |
| }, |
| { |
| "completion_length": 89.65625, |
| "epoch": 0.12944664031620554, |
| "grad_norm": 3.7050980630735038, |
| "kl": 0.0966796875, |
| "learning_rate": 8.70800395256917e-07, |
| "loss": 0.0039, |
| "reward": 2.706432342529297, |
| "reward_std": 0.03294907137751579, |
| "rewards/accuracy_reward_stage2": 0.7064325213432312, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 524 |
| }, |
| { |
| "completion_length": 87.4375, |
| "epoch": 0.12969367588932806, |
| "grad_norm": 4.395244733430323, |
| "kl": 0.0908203125, |
| "learning_rate": 8.705533596837944e-07, |
| "loss": 0.0036, |
| "reward": 2.6969168186187744, |
| "reward_std": 0.2856762111186981, |
| "rewards/accuracy_reward_stage2": 0.8219167590141296, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 525 |
| }, |
| { |
| "completion_length": 75.15625, |
| "epoch": 0.1299407114624506, |
| "grad_norm": 3.7899906109560693, |
| "kl": 0.09814453125, |
| "learning_rate": 8.70306324110672e-07, |
| "loss": 0.0039, |
| "reward": 2.7718465328216553, |
| "reward_std": 0.026267768815159798, |
| "rewards/accuracy_reward_stage2": 0.7718464136123657, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 526 |
| }, |
| { |
| "completion_length": 75.5, |
| "epoch": 0.13018774703557312, |
| "grad_norm": 4.216329302877091, |
| "kl": 0.07666015625, |
| "learning_rate": 8.700592885375494e-07, |
| "loss": 0.0031, |
| "reward": 2.68217134475708, |
| "reward_std": 0.045934125781059265, |
| "rewards/accuracy_reward_stage2": 0.6821711659431458, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 527 |
| }, |
| { |
| "completion_length": 68.9375, |
| "epoch": 0.13043478260869565, |
| "grad_norm": 0.2760461680822888, |
| "kl": 0.0830078125, |
| "learning_rate": 8.698122529644268e-07, |
| "loss": 0.0033, |
| "reward": 2.7838234901428223, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.7838236093521118, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 528 |
| }, |
| { |
| "completion_length": 80.09375, |
| "epoch": 0.13068181818181818, |
| "grad_norm": 4.353620248510361, |
| "kl": 0.09130859375, |
| "learning_rate": 8.695652173913043e-07, |
| "loss": 0.0036, |
| "reward": 2.865518093109131, |
| "reward_std": 0.09008646011352539, |
| "rewards/accuracy_reward_stage2": 0.8655180931091309, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 529 |
| }, |
| { |
| "completion_length": 79.921875, |
| "epoch": 0.1309288537549407, |
| "grad_norm": 4.4730643003742125, |
| "kl": 0.12451171875, |
| "learning_rate": 8.693181818181817e-07, |
| "loss": 0.005, |
| "reward": 2.5940661430358887, |
| "reward_std": 0.12470673769712448, |
| "rewards/accuracy_reward_stage2": 0.6565661430358887, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.203125, |
| "step": 530 |
| }, |
| { |
| "completion_length": 76.28125, |
| "epoch": 0.13117588932806323, |
| "grad_norm": 3.879298926548609, |
| "kl": 0.11376953125, |
| "learning_rate": 8.690711462450592e-07, |
| "loss": 0.0046, |
| "reward": 2.544191598892212, |
| "reward_std": 0.09751159697771072, |
| "rewards/accuracy_reward_stage2": 0.6212749481201172, |
| "rewards/format_reward_all_stage": 1.9229166507720947, |
| "scores/refine_times": 1.1875, |
| "step": 531 |
| }, |
| { |
| "completion_length": 78.6875, |
| "epoch": 0.13142292490118576, |
| "grad_norm": 2.89769607245115, |
| "kl": 0.09033203125, |
| "learning_rate": 8.688241106719367e-07, |
| "loss": 0.0036, |
| "reward": 2.672381639480591, |
| "reward_std": 0.004669596441090107, |
| "rewards/accuracy_reward_stage2": 0.6723816394805908, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 532 |
| }, |
| { |
| "completion_length": 70.1875, |
| "epoch": 0.1316699604743083, |
| "grad_norm": 4.060304413882462, |
| "kl": 0.125, |
| "learning_rate": 8.685770750988142e-07, |
| "loss": 0.005, |
| "reward": 2.491544246673584, |
| "reward_std": 0.013606157153844833, |
| "rewards/accuracy_reward_stage2": 0.4915444254875183, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 533 |
| }, |
| { |
| "completion_length": 87.75, |
| "epoch": 0.13191699604743082, |
| "grad_norm": 3.6628188266219768, |
| "kl": 0.10791015625, |
| "learning_rate": 8.683300395256917e-07, |
| "loss": 0.0043, |
| "reward": 2.5697736740112305, |
| "reward_std": 0.015459168702363968, |
| "rewards/accuracy_reward_stage2": 0.5697736740112305, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 534 |
| }, |
| { |
| "completion_length": 92.28125, |
| "epoch": 0.13216403162055335, |
| "grad_norm": 4.200607367033454, |
| "kl": 0.1044921875, |
| "learning_rate": 8.680830039525692e-07, |
| "loss": 0.0042, |
| "reward": 2.7578911781311035, |
| "reward_std": 0.1420280486345291, |
| "rewards/accuracy_reward_stage2": 0.836016058921814, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.1875, |
| "step": 535 |
| }, |
| { |
| "completion_length": 89.640625, |
| "epoch": 0.1324110671936759, |
| "grad_norm": 2.626610558071225, |
| "kl": 0.08349609375, |
| "learning_rate": 8.678359683794466e-07, |
| "loss": 0.0033, |
| "reward": 2.7216603755950928, |
| "reward_std": 0.013819479383528233, |
| "rewards/accuracy_reward_stage2": 0.7216602563858032, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 536 |
| }, |
| { |
| "completion_length": 93.765625, |
| "epoch": 0.13265810276679843, |
| "grad_norm": 4.500544096757364, |
| "kl": 0.107421875, |
| "learning_rate": 8.675889328063241e-07, |
| "loss": 0.0043, |
| "reward": 2.755054473876953, |
| "reward_std": 0.08167126029729843, |
| "rewards/accuracy_reward_stage2": 0.7550546526908875, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 537 |
| }, |
| { |
| "completion_length": 86.21875, |
| "epoch": 0.13290513833992096, |
| "grad_norm": 2.865246814533839, |
| "kl": 0.09619140625, |
| "learning_rate": 8.673418972332015e-07, |
| "loss": 0.0039, |
| "reward": 2.8893496990203857, |
| "reward_std": 0.024965433403849602, |
| "rewards/accuracy_reward_stage2": 0.8893496990203857, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 538 |
| }, |
| { |
| "completion_length": 76.4375, |
| "epoch": 0.1331521739130435, |
| "grad_norm": 1.67135798397581, |
| "kl": 0.0810546875, |
| "learning_rate": 8.67094861660079e-07, |
| "loss": 0.0033, |
| "reward": 2.7071239948272705, |
| "reward_std": 0.005690534599125385, |
| "rewards/accuracy_reward_stage2": 0.7071239948272705, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 539 |
| }, |
| { |
| "completion_length": 82.78125, |
| "epoch": 0.13339920948616601, |
| "grad_norm": 4.1245154427845465, |
| "kl": 0.0830078125, |
| "learning_rate": 8.668478260869565e-07, |
| "loss": 0.0033, |
| "reward": 2.630288600921631, |
| "reward_std": 0.06467457115650177, |
| "rewards/accuracy_reward_stage2": 0.6302886009216309, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 540 |
| }, |
| { |
| "completion_length": 83.0, |
| "epoch": 0.13364624505928854, |
| "grad_norm": 1.7077519811767492, |
| "kl": 0.07861328125, |
| "learning_rate": 8.666007905138339e-07, |
| "loss": 0.0031, |
| "reward": 2.87119722366333, |
| "reward_std": 0.012019687332212925, |
| "rewards/accuracy_reward_stage2": 0.8711971044540405, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 541 |
| }, |
| { |
| "completion_length": 75.5, |
| "epoch": 0.13389328063241107, |
| "grad_norm": 3.6652902376339846, |
| "kl": 0.0771484375, |
| "learning_rate": 8.663537549407114e-07, |
| "loss": 0.0031, |
| "reward": 2.528963565826416, |
| "reward_std": 0.03654494509100914, |
| "rewards/accuracy_reward_stage2": 0.5289634466171265, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 542 |
| }, |
| { |
| "completion_length": 82.125, |
| "epoch": 0.1341403162055336, |
| "grad_norm": 2.915704097691835, |
| "kl": 0.087890625, |
| "learning_rate": 8.66106719367589e-07, |
| "loss": 0.0035, |
| "reward": 2.725383996963501, |
| "reward_std": 0.05014697462320328, |
| "rewards/accuracy_reward_stage2": 0.725383996963501, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 543 |
| }, |
| { |
| "completion_length": 85.328125, |
| "epoch": 0.13438735177865613, |
| "grad_norm": 3.3523091039346906, |
| "kl": 0.07177734375, |
| "learning_rate": 8.658596837944664e-07, |
| "loss": 0.0029, |
| "reward": 2.68579363822937, |
| "reward_std": 0.02735856920480728, |
| "rewards/accuracy_reward_stage2": 0.6857935786247253, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 544 |
| }, |
| { |
| "completion_length": 66.1875, |
| "epoch": 0.13463438735177866, |
| "grad_norm": 5.104979767459419, |
| "kl": 0.09814453125, |
| "learning_rate": 8.656126482213438e-07, |
| "loss": 0.0039, |
| "reward": 2.6130058765411377, |
| "reward_std": 0.050167717039585114, |
| "rewards/accuracy_reward_stage2": 0.6130058169364929, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 545 |
| }, |
| { |
| "completion_length": 71.25, |
| "epoch": 0.13488142292490118, |
| "grad_norm": 3.450494920523269, |
| "kl": 0.08251953125, |
| "learning_rate": 8.653656126482213e-07, |
| "loss": 0.0033, |
| "reward": 2.6630539894104004, |
| "reward_std": 0.03502904251217842, |
| "rewards/accuracy_reward_stage2": 0.6630541086196899, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 546 |
| }, |
| { |
| "completion_length": 79.125, |
| "epoch": 0.1351284584980237, |
| "grad_norm": 2.859476004597891, |
| "kl": 0.068359375, |
| "learning_rate": 8.651185770750987e-07, |
| "loss": 0.0027, |
| "reward": 2.6337075233459473, |
| "reward_std": 0.03160533308982849, |
| "rewards/accuracy_reward_stage2": 0.6337075233459473, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 547 |
| }, |
| { |
| "completion_length": 73.0625, |
| "epoch": 0.13537549407114624, |
| "grad_norm": 4.327752965233732, |
| "kl": 0.07666015625, |
| "learning_rate": 8.648715415019763e-07, |
| "loss": 0.0031, |
| "reward": 2.5985307693481445, |
| "reward_std": 0.036348432302474976, |
| "rewards/accuracy_reward_stage2": 0.5985307097434998, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 548 |
| }, |
| { |
| "completion_length": 83.9375, |
| "epoch": 0.13562252964426877, |
| "grad_norm": 3.5893750391044117, |
| "kl": 0.080078125, |
| "learning_rate": 8.646245059288537e-07, |
| "loss": 0.0032, |
| "reward": 2.7797060012817383, |
| "reward_std": 0.03607073798775673, |
| "rewards/accuracy_reward_stage2": 0.7797058820724487, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 549 |
| }, |
| { |
| "completion_length": 101.328125, |
| "epoch": 0.1358695652173913, |
| "grad_norm": 4.38009132567829, |
| "kl": 0.07861328125, |
| "learning_rate": 8.643774703557311e-07, |
| "loss": 0.0031, |
| "reward": 2.667013645172119, |
| "reward_std": 0.07868792116641998, |
| "rewards/accuracy_reward_stage2": 0.6826385259628296, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.0625, |
| "step": 550 |
| }, |
| { |
| "completion_length": 94.125, |
| "epoch": 0.13611660079051383, |
| "grad_norm": 3.4410934285028305, |
| "kl": 0.049072265625, |
| "learning_rate": 8.641304347826086e-07, |
| "loss": 0.002, |
| "reward": 2.637599229812622, |
| "reward_std": 0.08176921308040619, |
| "rewards/accuracy_reward_stage2": 0.6375992298126221, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 551 |
| }, |
| { |
| "completion_length": 84.6875, |
| "epoch": 0.13636363636363635, |
| "grad_norm": 4.056780679390888, |
| "kl": 0.1005859375, |
| "learning_rate": 8.638833992094862e-07, |
| "loss": 0.004, |
| "reward": 2.6697425842285156, |
| "reward_std": 0.037977829575538635, |
| "rewards/accuracy_reward_stage2": 0.6697424650192261, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 552 |
| }, |
| { |
| "completion_length": 97.140625, |
| "epoch": 0.13661067193675888, |
| "grad_norm": 3.1912103956728206, |
| "kl": 0.0634765625, |
| "learning_rate": 8.636363636363636e-07, |
| "loss": 0.0025, |
| "reward": 2.673372745513916, |
| "reward_std": 0.17441077530384064, |
| "rewards/accuracy_reward_stage2": 0.813997745513916, |
| "rewards/format_reward_all_stage": 1.859375, |
| "scores/refine_times": 1.125, |
| "step": 553 |
| }, |
| { |
| "completion_length": 94.6875, |
| "epoch": 0.1368577075098814, |
| "grad_norm": 3.9139148065427922, |
| "kl": 0.08447265625, |
| "learning_rate": 8.633893280632411e-07, |
| "loss": 0.0034, |
| "reward": 2.507345676422119, |
| "reward_std": 0.1150667741894722, |
| "rewards/accuracy_reward_stage2": 0.5177624225616455, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.140625, |
| "step": 554 |
| }, |
| { |
| "completion_length": 97.640625, |
| "epoch": 0.13710474308300397, |
| "grad_norm": 2.778226584347817, |
| "kl": 0.09521484375, |
| "learning_rate": 8.631422924901185e-07, |
| "loss": 0.0038, |
| "reward": 2.8255248069763184, |
| "reward_std": 0.026099219918251038, |
| "rewards/accuracy_reward_stage2": 0.8255246877670288, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 555 |
| }, |
| { |
| "completion_length": 93.796875, |
| "epoch": 0.1373517786561265, |
| "grad_norm": 3.714775703693918, |
| "kl": 0.06982421875, |
| "learning_rate": 8.62895256916996e-07, |
| "loss": 0.0028, |
| "reward": 2.700383186340332, |
| "reward_std": 0.11565177142620087, |
| "rewards/accuracy_reward_stage2": 0.7003831267356873, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.109375, |
| "step": 556 |
| }, |
| { |
| "completion_length": 95.96875, |
| "epoch": 0.13759881422924902, |
| "grad_norm": 4.424621871770595, |
| "kl": 0.08203125, |
| "learning_rate": 8.626482213438735e-07, |
| "loss": 0.0033, |
| "reward": 2.531020164489746, |
| "reward_std": 0.1613694131374359, |
| "rewards/accuracy_reward_stage2": 0.5310203433036804, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 557 |
| }, |
| { |
| "completion_length": 74.40625, |
| "epoch": 0.13784584980237155, |
| "grad_norm": 0.2547649072682389, |
| "kl": 0.07421875, |
| "learning_rate": 8.624011857707509e-07, |
| "loss": 0.003, |
| "reward": 2.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.75, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 558 |
| }, |
| { |
| "completion_length": 91.046875, |
| "epoch": 0.13809288537549408, |
| "grad_norm": 4.493659007924147, |
| "kl": 0.08740234375, |
| "learning_rate": 8.621541501976283e-07, |
| "loss": 0.0035, |
| "reward": 2.4549224376678467, |
| "reward_std": 0.0722210705280304, |
| "rewards/accuracy_reward_stage2": 0.4705474376678467, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.078125, |
| "step": 559 |
| }, |
| { |
| "completion_length": 80.9375, |
| "epoch": 0.1383399209486166, |
| "grad_norm": 3.4555316703357213, |
| "kl": 0.072265625, |
| "learning_rate": 8.61907114624506e-07, |
| "loss": 0.0029, |
| "reward": 2.689997434616089, |
| "reward_std": 0.15605773031711578, |
| "rewards/accuracy_reward_stage2": 0.8149974346160889, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 560 |
| }, |
| { |
| "completion_length": 82.921875, |
| "epoch": 0.13858695652173914, |
| "grad_norm": 4.399711011954836, |
| "kl": 0.07861328125, |
| "learning_rate": 8.616600790513834e-07, |
| "loss": 0.0031, |
| "reward": 2.6843276023864746, |
| "reward_std": 0.048114050179719925, |
| "rewards/accuracy_reward_stage2": 0.6843275427818298, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 561 |
| }, |
| { |
| "completion_length": 69.4375, |
| "epoch": 0.13883399209486166, |
| "grad_norm": 2.544848259860156, |
| "kl": 0.060791015625, |
| "learning_rate": 8.614130434782609e-07, |
| "loss": 0.0024, |
| "reward": 2.7242729663848877, |
| "reward_std": 0.008474176749587059, |
| "rewards/accuracy_reward_stage2": 0.7242730259895325, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 562 |
| }, |
| { |
| "completion_length": 93.015625, |
| "epoch": 0.1390810276679842, |
| "grad_norm": 1.9603121727015835, |
| "kl": 0.08203125, |
| "learning_rate": 8.611660079051383e-07, |
| "loss": 0.0033, |
| "reward": 2.7193644046783447, |
| "reward_std": 0.0018559737363830209, |
| "rewards/accuracy_reward_stage2": 0.7193642854690552, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 563 |
| }, |
| { |
| "completion_length": 83.328125, |
| "epoch": 0.13932806324110672, |
| "grad_norm": 4.238916634053795, |
| "kl": 0.0908203125, |
| "learning_rate": 8.609189723320158e-07, |
| "loss": 0.0036, |
| "reward": 2.60569167137146, |
| "reward_std": 0.21955642104148865, |
| "rewards/accuracy_reward_stage2": 0.73069167137146, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 564 |
| }, |
| { |
| "completion_length": 87.390625, |
| "epoch": 0.13957509881422925, |
| "grad_norm": 4.382576919750248, |
| "kl": 0.1015625, |
| "learning_rate": 8.606719367588933e-07, |
| "loss": 0.0041, |
| "reward": 2.5854461193084717, |
| "reward_std": 0.15955139696598053, |
| "rewards/accuracy_reward_stage2": 0.7104461789131165, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 565 |
| }, |
| { |
| "completion_length": 80.25, |
| "epoch": 0.13982213438735178, |
| "grad_norm": 2.9867715867203857, |
| "kl": 0.0654296875, |
| "learning_rate": 8.604249011857707e-07, |
| "loss": 0.0026, |
| "reward": 2.7301557064056396, |
| "reward_std": 0.0774800181388855, |
| "rewards/accuracy_reward_stage2": 0.7301557064056396, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 566 |
| }, |
| { |
| "completion_length": 81.0625, |
| "epoch": 0.1400691699604743, |
| "grad_norm": 3.224273094621829, |
| "kl": 0.0771484375, |
| "learning_rate": 8.601778656126481e-07, |
| "loss": 0.0031, |
| "reward": 2.7714409828186035, |
| "reward_std": 0.010768895037472248, |
| "rewards/accuracy_reward_stage2": 0.7714409232139587, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 567 |
| }, |
| { |
| "completion_length": 87.375, |
| "epoch": 0.14031620553359683, |
| "grad_norm": 4.003108387284898, |
| "kl": 0.08544921875, |
| "learning_rate": 8.599308300395256e-07, |
| "loss": 0.0034, |
| "reward": 2.571406841278076, |
| "reward_std": 0.021448295563459396, |
| "rewards/accuracy_reward_stage2": 0.5714069604873657, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 568 |
| }, |
| { |
| "completion_length": 91.734375, |
| "epoch": 0.14056324110671936, |
| "grad_norm": 1.763343386987535, |
| "kl": 0.078125, |
| "learning_rate": 8.596837944664031e-07, |
| "loss": 0.0031, |
| "reward": 2.537046432495117, |
| "reward_std": 0.005295008420944214, |
| "rewards/accuracy_reward_stage2": 0.5370461940765381, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 569 |
| }, |
| { |
| "completion_length": 94.234375, |
| "epoch": 0.1408102766798419, |
| "grad_norm": 2.1101705072844963, |
| "kl": 0.080078125, |
| "learning_rate": 8.594367588932806e-07, |
| "loss": 0.0032, |
| "reward": 2.644087076187134, |
| "reward_std": 0.006828606594353914, |
| "rewards/accuracy_reward_stage2": 0.6440869569778442, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 570 |
| }, |
| { |
| "completion_length": 91.125, |
| "epoch": 0.14105731225296442, |
| "grad_norm": 2.4996061886316165, |
| "kl": 0.06689453125, |
| "learning_rate": 8.591897233201581e-07, |
| "loss": 0.0027, |
| "reward": 2.6300556659698486, |
| "reward_std": 0.06752649694681168, |
| "rewards/accuracy_reward_stage2": 0.6300556659698486, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 571 |
| }, |
| { |
| "completion_length": 83.796875, |
| "epoch": 0.14130434782608695, |
| "grad_norm": 2.4902164979539956, |
| "kl": 0.08642578125, |
| "learning_rate": 8.589426877470355e-07, |
| "loss": 0.0035, |
| "reward": 2.728515148162842, |
| "reward_std": 0.020385991781949997, |
| "rewards/accuracy_reward_stage2": 0.7285150289535522, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 572 |
| }, |
| { |
| "completion_length": 82.25, |
| "epoch": 0.14155138339920947, |
| "grad_norm": 4.393798659706149, |
| "kl": 0.0908203125, |
| "learning_rate": 8.586956521739131e-07, |
| "loss": 0.0036, |
| "reward": 2.5845212936401367, |
| "reward_std": 0.14551466703414917, |
| "rewards/accuracy_reward_stage2": 0.5845211744308472, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 573 |
| }, |
| { |
| "completion_length": 89.5625, |
| "epoch": 0.141798418972332, |
| "grad_norm": 3.4708980017734623, |
| "kl": 0.0625, |
| "learning_rate": 8.584486166007905e-07, |
| "loss": 0.0025, |
| "reward": 2.4852051734924316, |
| "reward_std": 0.04739289730787277, |
| "rewards/accuracy_reward_stage2": 0.4852050542831421, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 574 |
| }, |
| { |
| "completion_length": 75.3125, |
| "epoch": 0.14204545454545456, |
| "grad_norm": 1.9218103907041477, |
| "kl": 0.09130859375, |
| "learning_rate": 8.582015810276679e-07, |
| "loss": 0.0037, |
| "reward": 2.776808738708496, |
| "reward_std": 0.003206153865903616, |
| "rewards/accuracy_reward_stage2": 0.7768086194992065, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 575 |
| }, |
| { |
| "completion_length": 99.109375, |
| "epoch": 0.1422924901185771, |
| "grad_norm": 4.45653496204149, |
| "kl": 0.0791015625, |
| "learning_rate": 8.579545454545454e-07, |
| "loss": 0.0032, |
| "reward": 2.578134059906006, |
| "reward_std": 0.09177695214748383, |
| "rewards/accuracy_reward_stage2": 0.5781341791152954, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 576 |
| }, |
| { |
| "completion_length": 105.296875, |
| "epoch": 0.14253952569169961, |
| "grad_norm": 4.1541183942964235, |
| "kl": 0.0634765625, |
| "learning_rate": 8.577075098814229e-07, |
| "loss": 0.0025, |
| "reward": 2.668818950653076, |
| "reward_std": 0.14301809668540955, |
| "rewards/accuracy_reward_stage2": 0.6688190698623657, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 577 |
| }, |
| { |
| "completion_length": 86.34375, |
| "epoch": 0.14278656126482214, |
| "grad_norm": 3.6726618866240326, |
| "kl": 0.0810546875, |
| "learning_rate": 8.574604743083003e-07, |
| "loss": 0.0032, |
| "reward": 2.8135416507720947, |
| "reward_std": 0.09910938143730164, |
| "rewards/accuracy_reward_stage2": 0.8135416507720947, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 578 |
| }, |
| { |
| "completion_length": 96.5, |
| "epoch": 0.14303359683794467, |
| "grad_norm": 3.2088315728349333, |
| "kl": 0.09326171875, |
| "learning_rate": 8.572134387351779e-07, |
| "loss": 0.0037, |
| "reward": 2.813441514968872, |
| "reward_std": 0.01283353567123413, |
| "rewards/accuracy_reward_stage2": 0.8134413957595825, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 579 |
| }, |
| { |
| "completion_length": 89.125, |
| "epoch": 0.1432806324110672, |
| "grad_norm": 4.787753647506243, |
| "kl": 0.076171875, |
| "learning_rate": 8.569664031620553e-07, |
| "loss": 0.003, |
| "reward": 2.428907871246338, |
| "reward_std": 0.27077752351760864, |
| "rewards/accuracy_reward_stage2": 0.5539077520370483, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 580 |
| }, |
| { |
| "completion_length": 92.53125, |
| "epoch": 0.14352766798418973, |
| "grad_norm": 1.964050185140295, |
| "kl": 0.0693359375, |
| "learning_rate": 8.567193675889328e-07, |
| "loss": 0.0028, |
| "reward": 2.7752304077148438, |
| "reward_std": 0.017977114766836166, |
| "rewards/accuracy_reward_stage2": 0.7752305269241333, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 581 |
| }, |
| { |
| "completion_length": 98.734375, |
| "epoch": 0.14377470355731226, |
| "grad_norm": 4.5476829359160895, |
| "kl": 0.087890625, |
| "learning_rate": 8.564723320158103e-07, |
| "loss": 0.0035, |
| "reward": 2.6758265495300293, |
| "reward_std": 0.057866424322128296, |
| "rewards/accuracy_reward_stage2": 0.6758266687393188, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 582 |
| }, |
| { |
| "completion_length": 97.140625, |
| "epoch": 0.14402173913043478, |
| "grad_norm": 3.3204660955158856, |
| "kl": 0.08349609375, |
| "learning_rate": 8.562252964426877e-07, |
| "loss": 0.0033, |
| "reward": 2.78486967086792, |
| "reward_std": 0.08574660122394562, |
| "rewards/accuracy_reward_stage2": 0.7848696708679199, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 583 |
| }, |
| { |
| "completion_length": 100.046875, |
| "epoch": 0.1442687747035573, |
| "grad_norm": 3.826957325314654, |
| "kl": 0.07470703125, |
| "learning_rate": 8.559782608695651e-07, |
| "loss": 0.003, |
| "reward": 2.706620454788208, |
| "reward_std": 0.07693397253751755, |
| "rewards/accuracy_reward_stage2": 0.7066203355789185, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 584 |
| }, |
| { |
| "completion_length": 103.125, |
| "epoch": 0.14451581027667984, |
| "grad_norm": 3.8246431494565387, |
| "kl": 0.09375, |
| "learning_rate": 8.557312252964426e-07, |
| "loss": 0.0038, |
| "reward": 2.5936570167541504, |
| "reward_std": 0.09550125896930695, |
| "rewards/accuracy_reward_stage2": 0.5936569571495056, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 585 |
| }, |
| { |
| "completion_length": 103.671875, |
| "epoch": 0.14476284584980237, |
| "grad_norm": 2.914164966990848, |
| "kl": 0.07568359375, |
| "learning_rate": 8.554841897233201e-07, |
| "loss": 0.003, |
| "reward": 2.3438425064086914, |
| "reward_std": 0.029713183641433716, |
| "rewards/accuracy_reward_stage2": 0.3438425660133362, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 586 |
| }, |
| { |
| "completion_length": 92.34375, |
| "epoch": 0.1450098814229249, |
| "grad_norm": 4.580016718298798, |
| "kl": 0.0810546875, |
| "learning_rate": 8.552371541501975e-07, |
| "loss": 0.0032, |
| "reward": 2.644289016723633, |
| "reward_std": 0.057920269668102264, |
| "rewards/accuracy_reward_stage2": 0.6442890167236328, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 587 |
| }, |
| { |
| "completion_length": 91.328125, |
| "epoch": 0.14525691699604742, |
| "grad_norm": 2.3584839471859866, |
| "kl": 0.099609375, |
| "learning_rate": 8.549901185770751e-07, |
| "loss": 0.004, |
| "reward": 2.853529930114746, |
| "reward_std": 0.018497945740818977, |
| "rewards/accuracy_reward_stage2": 0.8535300493240356, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 588 |
| }, |
| { |
| "completion_length": 99.671875, |
| "epoch": 0.14550395256916995, |
| "grad_norm": 1.5376393488573932, |
| "kl": 0.0986328125, |
| "learning_rate": 8.547430830039525e-07, |
| "loss": 0.0039, |
| "reward": 2.908482074737549, |
| "reward_std": 0.0646936446428299, |
| "rewards/accuracy_reward_stage2": 0.9084821939468384, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 589 |
| }, |
| { |
| "completion_length": 108.59375, |
| "epoch": 0.14575098814229248, |
| "grad_norm": 2.616044570647881, |
| "kl": 0.07373046875, |
| "learning_rate": 8.544960474308301e-07, |
| "loss": 0.003, |
| "reward": 2.4426791667938232, |
| "reward_std": 0.09798327833414078, |
| "rewards/accuracy_reward_stage2": 0.4426790773868561, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 590 |
| }, |
| { |
| "completion_length": 75.9375, |
| "epoch": 0.145998023715415, |
| "grad_norm": 2.3565237541211075, |
| "kl": 0.087890625, |
| "learning_rate": 8.542490118577075e-07, |
| "loss": 0.0035, |
| "reward": 2.5625, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.5625, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 591 |
| }, |
| { |
| "completion_length": 98.359375, |
| "epoch": 0.14624505928853754, |
| "grad_norm": 5.707482842285747, |
| "kl": 0.09033203125, |
| "learning_rate": 8.540019762845849e-07, |
| "loss": 0.0036, |
| "reward": 2.3818490505218506, |
| "reward_std": 0.3349462151527405, |
| "rewards/accuracy_reward_stage2": 0.5693491101264954, |
| "rewards/format_reward_all_stage": 1.8125, |
| "scores/refine_times": 1.125, |
| "step": 592 |
| }, |
| { |
| "completion_length": 74.875, |
| "epoch": 0.14649209486166007, |
| "grad_norm": 2.5953914638869096, |
| "kl": 0.08056640625, |
| "learning_rate": 8.537549407114624e-07, |
| "loss": 0.0032, |
| "reward": 2.9376792907714844, |
| "reward_std": 0.004189035389572382, |
| "rewards/accuracy_reward_stage2": 0.9376791715621948, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 593 |
| }, |
| { |
| "completion_length": 96.953125, |
| "epoch": 0.14673913043478262, |
| "grad_norm": 5.554929821649331, |
| "kl": 0.09326171875, |
| "learning_rate": 8.535079051383399e-07, |
| "loss": 0.0037, |
| "reward": 2.597329616546631, |
| "reward_std": 0.27840977907180786, |
| "rewards/accuracy_reward_stage2": 0.6598294973373413, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 594 |
| }, |
| { |
| "completion_length": 89.15625, |
| "epoch": 0.14698616600790515, |
| "grad_norm": 4.019536724109511, |
| "kl": 0.059326171875, |
| "learning_rate": 8.532608695652173e-07, |
| "loss": 0.0024, |
| "reward": 2.6870036125183105, |
| "reward_std": 0.1373123675584793, |
| "rewards/accuracy_reward_stage2": 0.8120037317276001, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 595 |
| }, |
| { |
| "completion_length": 77.8125, |
| "epoch": 0.14723320158102768, |
| "grad_norm": 4.8955452689702685, |
| "kl": 0.0927734375, |
| "learning_rate": 8.530138339920948e-07, |
| "loss": 0.0037, |
| "reward": 2.7033681869506836, |
| "reward_std": 0.09230202436447144, |
| "rewards/accuracy_reward_stage2": 0.7033681869506836, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 596 |
| }, |
| { |
| "completion_length": 79.6875, |
| "epoch": 0.1474802371541502, |
| "grad_norm": 3.7796317517540103, |
| "kl": 0.08447265625, |
| "learning_rate": 8.527667984189722e-07, |
| "loss": 0.0034, |
| "reward": 2.491044521331787, |
| "reward_std": 0.14756934344768524, |
| "rewards/accuracy_reward_stage2": 0.4910443425178528, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 597 |
| }, |
| { |
| "completion_length": 78.09375, |
| "epoch": 0.14772727272727273, |
| "grad_norm": 3.8334181192499153, |
| "kl": 0.091796875, |
| "learning_rate": 8.525197628458499e-07, |
| "loss": 0.0037, |
| "reward": 2.8837780952453613, |
| "reward_std": 0.07180596143007278, |
| "rewards/accuracy_reward_stage2": 0.8837779760360718, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 598 |
| }, |
| { |
| "completion_length": 92.859375, |
| "epoch": 0.14797430830039526, |
| "grad_norm": 4.288955963549508, |
| "kl": 0.125, |
| "learning_rate": 8.522727272727273e-07, |
| "loss": 0.005, |
| "reward": 2.762850761413574, |
| "reward_std": 0.07867846637964249, |
| "rewards/accuracy_reward_stage2": 0.7628507018089294, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 599 |
| }, |
| { |
| "completion_length": 82.71875, |
| "epoch": 0.1482213438735178, |
| "grad_norm": 3.5528631311780385, |
| "kl": 0.10498046875, |
| "learning_rate": 8.520256916996047e-07, |
| "loss": 0.0042, |
| "reward": 2.6389195919036865, |
| "reward_std": 0.06157643720507622, |
| "rewards/accuracy_reward_stage2": 0.6389195919036865, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 600 |
| }, |
| { |
| "completion_length": 85.328125, |
| "epoch": 0.14846837944664032, |
| "grad_norm": 4.689305434315942, |
| "kl": 0.08984375, |
| "learning_rate": 8.517786561264822e-07, |
| "loss": 0.0036, |
| "reward": 2.7248332500457764, |
| "reward_std": 0.1498197615146637, |
| "rewards/accuracy_reward_stage2": 0.724833071231842, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 601 |
| }, |
| { |
| "completion_length": 63.34375, |
| "epoch": 0.14871541501976285, |
| "grad_norm": 3.314732002614642, |
| "kl": 0.13671875, |
| "learning_rate": 8.515316205533597e-07, |
| "loss": 0.0055, |
| "reward": 2.7788662910461426, |
| "reward_std": 0.035718683153390884, |
| "rewards/accuracy_reward_stage2": 0.7788662910461426, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 602 |
| }, |
| { |
| "completion_length": 91.640625, |
| "epoch": 0.14896245059288538, |
| "grad_norm": 3.0711151595440245, |
| "kl": 0.078125, |
| "learning_rate": 8.512845849802371e-07, |
| "loss": 0.0031, |
| "reward": 2.600371837615967, |
| "reward_std": 0.17952686548233032, |
| "rewards/accuracy_reward_stage2": 0.7253717184066772, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.21875, |
| "step": 603 |
| }, |
| { |
| "completion_length": 78.796875, |
| "epoch": 0.1492094861660079, |
| "grad_norm": 3.5027395607429717, |
| "kl": 0.09765625, |
| "learning_rate": 8.510375494071146e-07, |
| "loss": 0.0039, |
| "reward": 2.674034833908081, |
| "reward_std": 0.04458358883857727, |
| "rewards/accuracy_reward_stage2": 0.6740349531173706, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 604 |
| }, |
| { |
| "completion_length": 86.609375, |
| "epoch": 0.14945652173913043, |
| "grad_norm": 2.2600159130295654, |
| "kl": 0.0849609375, |
| "learning_rate": 8.50790513833992e-07, |
| "loss": 0.0034, |
| "reward": 2.6875, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.6875, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 605 |
| }, |
| { |
| "completion_length": 80.640625, |
| "epoch": 0.14970355731225296, |
| "grad_norm": 4.805520751259063, |
| "kl": 0.07861328125, |
| "learning_rate": 8.505434782608694e-07, |
| "loss": 0.0032, |
| "reward": 2.63192081451416, |
| "reward_std": 0.1455264687538147, |
| "rewards/accuracy_reward_stage2": 0.6475456357002258, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.15625, |
| "step": 606 |
| }, |
| { |
| "completion_length": 75.71875, |
| "epoch": 0.1499505928853755, |
| "grad_norm": 4.573576044918896, |
| "kl": 0.1044921875, |
| "learning_rate": 8.502964426877471e-07, |
| "loss": 0.0042, |
| "reward": 2.67458438873291, |
| "reward_std": 0.04098530113697052, |
| "rewards/accuracy_reward_stage2": 0.6745842695236206, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 607 |
| }, |
| { |
| "completion_length": 75.234375, |
| "epoch": 0.15019762845849802, |
| "grad_norm": 4.335859801113833, |
| "kl": 0.07470703125, |
| "learning_rate": 8.500494071146245e-07, |
| "loss": 0.003, |
| "reward": 2.4506468772888184, |
| "reward_std": 0.05327065661549568, |
| "rewards/accuracy_reward_stage2": 0.4506469666957855, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 608 |
| }, |
| { |
| "completion_length": 58.0, |
| "epoch": 0.15044466403162055, |
| "grad_norm": 4.152937219223513, |
| "kl": 0.09521484375, |
| "learning_rate": 8.498023715415019e-07, |
| "loss": 0.0038, |
| "reward": 2.6415534019470215, |
| "reward_std": 0.0792679637670517, |
| "rewards/accuracy_reward_stage2": 0.6415532827377319, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 609 |
| }, |
| { |
| "completion_length": 75.671875, |
| "epoch": 0.15069169960474307, |
| "grad_norm": 3.5202934389955143, |
| "kl": 0.0986328125, |
| "learning_rate": 8.495553359683794e-07, |
| "loss": 0.0039, |
| "reward": 2.627929210662842, |
| "reward_std": 0.08055642247200012, |
| "rewards/accuracy_reward_stage2": 0.6279292106628418, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 610 |
| }, |
| { |
| "completion_length": 68.421875, |
| "epoch": 0.1509387351778656, |
| "grad_norm": 3.9270342458590073, |
| "kl": 0.09326171875, |
| "learning_rate": 8.493083003952569e-07, |
| "loss": 0.0037, |
| "reward": 2.6131200790405273, |
| "reward_std": 0.07064563035964966, |
| "rewards/accuracy_reward_stage2": 0.6131199598312378, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 611 |
| }, |
| { |
| "completion_length": 94.046875, |
| "epoch": 0.15118577075098813, |
| "grad_norm": 2.797781979194763, |
| "kl": 0.138671875, |
| "learning_rate": 8.490612648221343e-07, |
| "loss": 0.0055, |
| "reward": 2.668482542037964, |
| "reward_std": 0.09077267348766327, |
| "rewards/accuracy_reward_stage2": 0.7257742285728455, |
| "rewards/format_reward_all_stage": 1.9427082538604736, |
| "scores/refine_times": 1.421875, |
| "step": 612 |
| }, |
| { |
| "completion_length": 76.25, |
| "epoch": 0.15143280632411066, |
| "grad_norm": 3.5823382600072833, |
| "kl": 0.1123046875, |
| "learning_rate": 8.488142292490118e-07, |
| "loss": 0.0045, |
| "reward": 2.8506646156311035, |
| "reward_std": 0.02524959295988083, |
| "rewards/accuracy_reward_stage2": 0.8506646156311035, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 613 |
| }, |
| { |
| "completion_length": 54.375, |
| "epoch": 0.15167984189723321, |
| "grad_norm": 5.43957715062048, |
| "kl": 0.1513671875, |
| "learning_rate": 8.485671936758892e-07, |
| "loss": 0.006, |
| "reward": 2.212973117828369, |
| "reward_std": 0.054874010384082794, |
| "rewards/accuracy_reward_stage2": 0.2129732221364975, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 614 |
| }, |
| { |
| "completion_length": 58.265625, |
| "epoch": 0.15192687747035574, |
| "grad_norm": 5.126288403045913, |
| "kl": 0.265625, |
| "learning_rate": 8.483201581027668e-07, |
| "loss": 0.0106, |
| "reward": 2.6679821014404297, |
| "reward_std": 0.11545281857252121, |
| "rewards/accuracy_reward_stage2": 0.6836073398590088, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.25, |
| "step": 615 |
| }, |
| { |
| "completion_length": 37.515625, |
| "epoch": 0.15217391304347827, |
| "grad_norm": 4.80980966637606, |
| "kl": 0.2392578125, |
| "learning_rate": 8.480731225296443e-07, |
| "loss": 0.0096, |
| "reward": 2.513650417327881, |
| "reward_std": 0.11596601456403732, |
| "rewards/accuracy_reward_stage2": 0.5136504769325256, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 616 |
| }, |
| { |
| "completion_length": 50.390625, |
| "epoch": 0.1524209486166008, |
| "grad_norm": 2.9052895258079197, |
| "kl": 0.1875, |
| "learning_rate": 8.478260869565217e-07, |
| "loss": 0.0075, |
| "reward": 2.665754795074463, |
| "reward_std": 0.09999995678663254, |
| "rewards/accuracy_reward_stage2": 0.6813797950744629, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.140625, |
| "step": 617 |
| }, |
| { |
| "completion_length": 42.734375, |
| "epoch": 0.15266798418972333, |
| "grad_norm": 2.9639210413309627, |
| "kl": 0.2138671875, |
| "learning_rate": 8.475790513833992e-07, |
| "loss": 0.0085, |
| "reward": 2.741377353668213, |
| "reward_std": 0.07839522510766983, |
| "rewards/accuracy_reward_stage2": 0.7413773536682129, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 618 |
| }, |
| { |
| "completion_length": 32.59375, |
| "epoch": 0.15291501976284586, |
| "grad_norm": 3.7719906987384793, |
| "kl": 0.322265625, |
| "learning_rate": 8.473320158102767e-07, |
| "loss": 0.0129, |
| "reward": 2.813457489013672, |
| "reward_std": 0.034740254282951355, |
| "rewards/accuracy_reward_stage2": 0.8134576082229614, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 619 |
| }, |
| { |
| "completion_length": 32.0625, |
| "epoch": 0.15316205533596838, |
| "grad_norm": 3.886802620815163, |
| "kl": 0.251953125, |
| "learning_rate": 8.470849802371541e-07, |
| "loss": 0.01, |
| "reward": 2.7390785217285156, |
| "reward_std": 0.08644495904445648, |
| "rewards/accuracy_reward_stage2": 0.7390785813331604, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 620 |
| }, |
| { |
| "completion_length": 28.53125, |
| "epoch": 0.1534090909090909, |
| "grad_norm": 4.3419292681332085, |
| "kl": 0.283203125, |
| "learning_rate": 8.468379446640316e-07, |
| "loss": 0.0113, |
| "reward": 2.72926664352417, |
| "reward_std": 0.02556372992694378, |
| "rewards/accuracy_reward_stage2": 0.7292666435241699, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 621 |
| }, |
| { |
| "completion_length": 31.265625, |
| "epoch": 0.15365612648221344, |
| "grad_norm": 4.426681437677741, |
| "kl": 0.330078125, |
| "learning_rate": 8.46590909090909e-07, |
| "loss": 0.0132, |
| "reward": 2.675302505493164, |
| "reward_std": 0.1827945113182068, |
| "rewards/accuracy_reward_stage2": 0.6753023862838745, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 622 |
| }, |
| { |
| "completion_length": 40.5, |
| "epoch": 0.15390316205533597, |
| "grad_norm": 4.901917034966575, |
| "kl": 0.2265625, |
| "learning_rate": 8.463438735177865e-07, |
| "loss": 0.0091, |
| "reward": 2.6904773712158203, |
| "reward_std": 0.09335070103406906, |
| "rewards/accuracy_reward_stage2": 0.7061026096343994, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.078125, |
| "step": 623 |
| }, |
| { |
| "completion_length": 25.71875, |
| "epoch": 0.1541501976284585, |
| "grad_norm": 2.0307140823798524, |
| "kl": 0.25390625, |
| "learning_rate": 8.46096837944664e-07, |
| "loss": 0.0102, |
| "reward": 2.9270834922790527, |
| "reward_std": 0.09627808630466461, |
| "rewards/accuracy_reward_stage2": 0.9375, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.203125, |
| "step": 624 |
| }, |
| { |
| "completion_length": 34.59375, |
| "epoch": 0.15439723320158102, |
| "grad_norm": 4.471368602534083, |
| "kl": 0.28515625, |
| "learning_rate": 8.458498023715415e-07, |
| "loss": 0.0114, |
| "reward": 2.7720367908477783, |
| "reward_std": 0.06439623236656189, |
| "rewards/accuracy_reward_stage2": 0.7876617908477783, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.25, |
| "step": 625 |
| }, |
| { |
| "completion_length": 36.0625, |
| "epoch": 0.15464426877470355, |
| "grad_norm": 4.686637990114619, |
| "kl": 0.25, |
| "learning_rate": 8.45602766798419e-07, |
| "loss": 0.01, |
| "reward": 2.6198482513427734, |
| "reward_std": 0.0595381073653698, |
| "rewards/accuracy_reward_stage2": 0.6198481917381287, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 626 |
| }, |
| { |
| "completion_length": 28.421875, |
| "epoch": 0.15489130434782608, |
| "grad_norm": 3.816332577374878, |
| "kl": 0.2578125, |
| "learning_rate": 8.453557312252964e-07, |
| "loss": 0.0103, |
| "reward": 2.7242279052734375, |
| "reward_std": 0.10282538086175919, |
| "rewards/accuracy_reward_stage2": 0.7242279648780823, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 627 |
| }, |
| { |
| "completion_length": 26.875, |
| "epoch": 0.1551383399209486, |
| "grad_norm": 5.68706231004042, |
| "kl": 0.275390625, |
| "learning_rate": 8.451086956521739e-07, |
| "loss": 0.011, |
| "reward": 2.6736927032470703, |
| "reward_std": 0.12340083718299866, |
| "rewards/accuracy_reward_stage2": 0.673692524433136, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 628 |
| }, |
| { |
| "completion_length": 29.078125, |
| "epoch": 0.15538537549407114, |
| "grad_norm": 2.328207752491545, |
| "kl": 0.283203125, |
| "learning_rate": 8.448616600790514e-07, |
| "loss": 0.0113, |
| "reward": 2.7866618633270264, |
| "reward_std": 0.03922741115093231, |
| "rewards/accuracy_reward_stage2": 0.7866617441177368, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 629 |
| }, |
| { |
| "completion_length": 35.9375, |
| "epoch": 0.15563241106719367, |
| "grad_norm": 0.5035644859778624, |
| "kl": 0.240234375, |
| "learning_rate": 8.446146245059288e-07, |
| "loss": 0.0096, |
| "reward": 2.75, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.75, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 630 |
| }, |
| { |
| "completion_length": 44.375, |
| "epoch": 0.1558794466403162, |
| "grad_norm": 3.7496541284685887, |
| "kl": 0.244140625, |
| "learning_rate": 8.443675889328062e-07, |
| "loss": 0.0098, |
| "reward": 2.7512311935424805, |
| "reward_std": 0.09940579533576965, |
| "rewards/accuracy_reward_stage2": 0.8085229396820068, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.296875, |
| "step": 631 |
| }, |
| { |
| "completion_length": 35.5, |
| "epoch": 0.15612648221343872, |
| "grad_norm": 3.8633097627706148, |
| "kl": 0.193359375, |
| "learning_rate": 8.441205533596838e-07, |
| "loss": 0.0077, |
| "reward": 2.653985023498535, |
| "reward_std": 0.035471752285957336, |
| "rewards/accuracy_reward_stage2": 0.6539848446846008, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 632 |
| }, |
| { |
| "completion_length": 35.65625, |
| "epoch": 0.15637351778656128, |
| "grad_norm": 2.102058612850822, |
| "kl": 0.2158203125, |
| "learning_rate": 8.438735177865612e-07, |
| "loss": 0.0086, |
| "reward": 2.83585524559021, |
| "reward_std": 0.07515110820531845, |
| "rewards/accuracy_reward_stage2": 0.8358553051948547, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 633 |
| }, |
| { |
| "completion_length": 27.09375, |
| "epoch": 0.1566205533596838, |
| "grad_norm": 5.37908292624943, |
| "kl": 0.2451171875, |
| "learning_rate": 8.436264822134386e-07, |
| "loss": 0.0098, |
| "reward": 2.67559814453125, |
| "reward_std": 0.1606387048959732, |
| "rewards/accuracy_reward_stage2": 0.6912230849266052, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.0625, |
| "step": 634 |
| }, |
| { |
| "completion_length": 57.015625, |
| "epoch": 0.15686758893280633, |
| "grad_norm": 1.930963468171265, |
| "kl": 0.203125, |
| "learning_rate": 8.433794466403162e-07, |
| "loss": 0.0081, |
| "reward": 2.9319870471954346, |
| "reward_std": 0.00449987780302763, |
| "rewards/accuracy_reward_stage2": 0.9319870471954346, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 635 |
| }, |
| { |
| "completion_length": 42.6875, |
| "epoch": 0.15711462450592886, |
| "grad_norm": 4.015840085817725, |
| "kl": 0.26953125, |
| "learning_rate": 8.431324110671937e-07, |
| "loss": 0.0109, |
| "reward": 2.7423253059387207, |
| "reward_std": 0.20044684410095215, |
| "rewards/accuracy_reward_stage2": 0.8048254251480103, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.140625, |
| "step": 636 |
| }, |
| { |
| "completion_length": 46.046875, |
| "epoch": 0.1573616600790514, |
| "grad_norm": 1.2951785973348273, |
| "kl": 0.2177734375, |
| "learning_rate": 8.428853754940711e-07, |
| "loss": 0.0087, |
| "reward": 2.8515625, |
| "reward_std": 0.022097086533904076, |
| "rewards/accuracy_reward_stage2": 0.8515625, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 637 |
| }, |
| { |
| "completion_length": 43.65625, |
| "epoch": 0.15760869565217392, |
| "grad_norm": 3.7465112575188666, |
| "kl": 0.23046875, |
| "learning_rate": 8.426383399209486e-07, |
| "loss": 0.0092, |
| "reward": 2.5327157974243164, |
| "reward_std": 0.17657220363616943, |
| "rewards/accuracy_reward_stage2": 0.6577157974243164, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.203125, |
| "step": 638 |
| }, |
| { |
| "completion_length": 36.625, |
| "epoch": 0.15785573122529645, |
| "grad_norm": 5.842474547361892, |
| "kl": 0.20703125, |
| "learning_rate": 8.42391304347826e-07, |
| "loss": 0.0083, |
| "reward": 2.6493334770202637, |
| "reward_std": 0.21465185284614563, |
| "rewards/accuracy_reward_stage2": 0.7743334770202637, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.140625, |
| "step": 639 |
| }, |
| { |
| "completion_length": 45.6875, |
| "epoch": 0.15810276679841898, |
| "grad_norm": 2.4835559209692075, |
| "kl": 0.24609375, |
| "learning_rate": 8.421442687747036e-07, |
| "loss": 0.0098, |
| "reward": 2.8289783000946045, |
| "reward_std": 0.016266150400042534, |
| "rewards/accuracy_reward_stage2": 0.8289782404899597, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 640 |
| }, |
| { |
| "completion_length": 59.5625, |
| "epoch": 0.1583498023715415, |
| "grad_norm": 5.194147021313203, |
| "kl": 0.294921875, |
| "learning_rate": 8.41897233201581e-07, |
| "loss": 0.0118, |
| "reward": 2.5701632499694824, |
| "reward_std": 0.032328180968761444, |
| "rewards/accuracy_reward_stage2": 0.5701633095741272, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 641 |
| }, |
| { |
| "completion_length": 41.265625, |
| "epoch": 0.15859683794466403, |
| "grad_norm": 1.9162461047086887, |
| "kl": 0.1650390625, |
| "learning_rate": 8.416501976284584e-07, |
| "loss": 0.0066, |
| "reward": 2.867898941040039, |
| "reward_std": 0.06962129473686218, |
| "rewards/accuracy_reward_stage2": 0.8678989410400391, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.390625, |
| "step": 642 |
| }, |
| { |
| "completion_length": 51.0, |
| "epoch": 0.15884387351778656, |
| "grad_norm": 2.573662013627622, |
| "kl": 0.193359375, |
| "learning_rate": 8.414031620553359e-07, |
| "loss": 0.0077, |
| "reward": 2.8405961990356445, |
| "reward_std": 0.0634043961763382, |
| "rewards/accuracy_reward_stage2": 0.8562212586402893, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.3125, |
| "step": 643 |
| }, |
| { |
| "completion_length": 71.03125, |
| "epoch": 0.1590909090909091, |
| "grad_norm": 1.7426308070279573, |
| "kl": 0.1787109375, |
| "learning_rate": 8.411561264822134e-07, |
| "loss": 0.0071, |
| "reward": 2.577765464782715, |
| "reward_std": 0.14586614072322845, |
| "rewards/accuracy_reward_stage2": 0.6663070917129517, |
| "rewards/format_reward_all_stage": 1.9114582538604736, |
| "scores/refine_times": 1.453125, |
| "step": 644 |
| }, |
| { |
| "completion_length": 46.453125, |
| "epoch": 0.15933794466403162, |
| "grad_norm": 4.271793905157046, |
| "kl": 0.28515625, |
| "learning_rate": 8.409090909090909e-07, |
| "loss": 0.0114, |
| "reward": 2.730409860610962, |
| "reward_std": 0.15720345079898834, |
| "rewards/accuracy_reward_stage2": 0.7929098606109619, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.25, |
| "step": 645 |
| }, |
| { |
| "completion_length": 67.453125, |
| "epoch": 0.15958498023715414, |
| "grad_norm": 5.033031730842746, |
| "kl": 0.2421875, |
| "learning_rate": 8.406620553359684e-07, |
| "loss": 0.0097, |
| "reward": 2.7433104515075684, |
| "reward_std": 0.05172478407621384, |
| "rewards/accuracy_reward_stage2": 0.7537272572517395, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.296875, |
| "step": 646 |
| }, |
| { |
| "completion_length": 76.640625, |
| "epoch": 0.15983201581027667, |
| "grad_norm": 3.2006449227471725, |
| "kl": 0.1611328125, |
| "learning_rate": 8.404150197628458e-07, |
| "loss": 0.0065, |
| "reward": 2.792520761489868, |
| "reward_std": 0.013646906241774559, |
| "rewards/accuracy_reward_stage2": 0.7925208210945129, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 647 |
| }, |
| { |
| "completion_length": 68.640625, |
| "epoch": 0.1600790513833992, |
| "grad_norm": 6.943408346730125, |
| "kl": 0.302734375, |
| "learning_rate": 8.401679841897232e-07, |
| "loss": 0.0121, |
| "reward": 2.5047967433929443, |
| "reward_std": 0.1528734713792801, |
| "rewards/accuracy_reward_stage2": 0.5829217433929443, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.25, |
| "step": 648 |
| }, |
| { |
| "completion_length": 77.828125, |
| "epoch": 0.16032608695652173, |
| "grad_norm": 4.0567635031379705, |
| "kl": 0.1923828125, |
| "learning_rate": 8.399209486166008e-07, |
| "loss": 0.0077, |
| "reward": 2.656846046447754, |
| "reward_std": 0.04590492323040962, |
| "rewards/accuracy_reward_stage2": 0.6568462252616882, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 649 |
| }, |
| { |
| "completion_length": 67.625, |
| "epoch": 0.16057312252964426, |
| "grad_norm": 3.370288275593572, |
| "kl": 0.1533203125, |
| "learning_rate": 8.396739130434782e-07, |
| "loss": 0.0061, |
| "reward": 2.65291690826416, |
| "reward_std": 0.07776036858558655, |
| "rewards/accuracy_reward_stage2": 0.6529167890548706, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 650 |
| }, |
| { |
| "completion_length": 77.75, |
| "epoch": 0.16082015810276679, |
| "grad_norm": 0.9094780307831853, |
| "kl": 0.10546875, |
| "learning_rate": 8.394268774703556e-07, |
| "loss": 0.0042, |
| "reward": 2.8998451232910156, |
| "reward_std": 0.051297787576913834, |
| "rewards/accuracy_reward_stage2": 0.9060951471328735, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.3125, |
| "step": 651 |
| }, |
| { |
| "completion_length": 77.46875, |
| "epoch": 0.16106719367588934, |
| "grad_norm": 4.930984465220881, |
| "kl": 0.1171875, |
| "learning_rate": 8.391798418972331e-07, |
| "loss": 0.0047, |
| "reward": 2.6480984687805176, |
| "reward_std": 0.18127982318401337, |
| "rewards/accuracy_reward_stage2": 0.6793487071990967, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.265625, |
| "step": 652 |
| }, |
| { |
| "completion_length": 60.984375, |
| "epoch": 0.16131422924901187, |
| "grad_norm": 5.128137555423027, |
| "kl": 0.134765625, |
| "learning_rate": 8.389328063241107e-07, |
| "loss": 0.0054, |
| "reward": 2.765324115753174, |
| "reward_std": 0.08034379780292511, |
| "rewards/accuracy_reward_stage2": 0.7809491157531738, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.140625, |
| "step": 653 |
| }, |
| { |
| "completion_length": 74.890625, |
| "epoch": 0.1615612648221344, |
| "grad_norm": 4.648661118886044, |
| "kl": 0.134765625, |
| "learning_rate": 8.386857707509882e-07, |
| "loss": 0.0054, |
| "reward": 2.691357135772705, |
| "reward_std": 0.13039422035217285, |
| "rewards/accuracy_reward_stage2": 0.6913573145866394, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 654 |
| }, |
| { |
| "completion_length": 98.640625, |
| "epoch": 0.16180830039525693, |
| "grad_norm": 4.064729740373123, |
| "kl": 0.12353515625, |
| "learning_rate": 8.384387351778656e-07, |
| "loss": 0.0049, |
| "reward": 2.5702481269836426, |
| "reward_std": 0.13462196290493011, |
| "rewards/accuracy_reward_stage2": 0.5702481269836426, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 655 |
| }, |
| { |
| "completion_length": 113.84375, |
| "epoch": 0.16205533596837945, |
| "grad_norm": 3.2862871577391393, |
| "kl": 0.10595703125, |
| "learning_rate": 8.38191699604743e-07, |
| "loss": 0.0042, |
| "reward": 2.511234760284424, |
| "reward_std": 0.2531359791755676, |
| "rewards/accuracy_reward_stage2": 0.5841513872146606, |
| "rewards/format_reward_all_stage": 1.9270833730697632, |
| "scores/refine_times": 1.671875, |
| "step": 656 |
| }, |
| { |
| "completion_length": 92.34375, |
| "epoch": 0.16230237154150198, |
| "grad_norm": 4.422132694926118, |
| "kl": 0.1474609375, |
| "learning_rate": 8.379446640316206e-07, |
| "loss": 0.0059, |
| "reward": 2.3517141342163086, |
| "reward_std": 0.16402184963226318, |
| "rewards/accuracy_reward_stage2": 0.476714164018631, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.46875, |
| "step": 657 |
| }, |
| { |
| "completion_length": 70.734375, |
| "epoch": 0.1625494071146245, |
| "grad_norm": 1.1685505706287254, |
| "kl": 0.1162109375, |
| "learning_rate": 8.37697628458498e-07, |
| "loss": 0.0046, |
| "reward": 2.7522194385528564, |
| "reward_std": 0.05886061489582062, |
| "rewards/accuracy_reward_stage2": 0.7678444981575012, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.375, |
| "step": 658 |
| }, |
| { |
| "completion_length": 91.140625, |
| "epoch": 0.16279644268774704, |
| "grad_norm": 3.257325899603518, |
| "kl": 0.14453125, |
| "learning_rate": 8.374505928853754e-07, |
| "loss": 0.0058, |
| "reward": 2.632218837738037, |
| "reward_std": 0.09933055937290192, |
| "rewards/accuracy_reward_stage2": 0.658260703086853, |
| "rewards/format_reward_all_stage": 1.9739583730697632, |
| "scores/refine_times": 1.53125, |
| "step": 659 |
| }, |
| { |
| "completion_length": 95.90625, |
| "epoch": 0.16304347826086957, |
| "grad_norm": 1.9690432413920704, |
| "kl": 0.134765625, |
| "learning_rate": 8.372035573122529e-07, |
| "loss": 0.0054, |
| "reward": 2.771512508392334, |
| "reward_std": 0.06438760459423065, |
| "rewards/accuracy_reward_stage2": 0.7715123891830444, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.59375, |
| "step": 660 |
| }, |
| { |
| "completion_length": 78.234375, |
| "epoch": 0.1632905138339921, |
| "grad_norm": 4.936444107949188, |
| "kl": 0.18359375, |
| "learning_rate": 8.369565217391304e-07, |
| "loss": 0.0073, |
| "reward": 2.2439303398132324, |
| "reward_std": 0.28851431608200073, |
| "rewards/accuracy_reward_stage2": 0.4574721157550812, |
| "rewards/format_reward_all_stage": 1.7864583730697632, |
| "scores/refine_times": 1.484375, |
| "step": 661 |
| }, |
| { |
| "completion_length": 82.96875, |
| "epoch": 0.16353754940711462, |
| "grad_norm": 4.166445669222274, |
| "kl": 0.1474609375, |
| "learning_rate": 8.367094861660079e-07, |
| "loss": 0.0059, |
| "reward": 2.672656297683716, |
| "reward_std": 0.28908365964889526, |
| "rewards/accuracy_reward_stage2": 0.9226562976837158, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.421875, |
| "step": 662 |
| }, |
| { |
| "completion_length": 99.703125, |
| "epoch": 0.16378458498023715, |
| "grad_norm": 2.9695692781319765, |
| "kl": 0.1494140625, |
| "learning_rate": 8.364624505928854e-07, |
| "loss": 0.006, |
| "reward": 2.593446969985962, |
| "reward_std": 0.07469912618398666, |
| "rewards/accuracy_reward_stage2": 0.5934468507766724, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 663 |
| }, |
| { |
| "completion_length": 62.421875, |
| "epoch": 0.16403162055335968, |
| "grad_norm": 3.313859207150031, |
| "kl": 0.19921875, |
| "learning_rate": 8.362154150197628e-07, |
| "loss": 0.008, |
| "reward": 2.719926357269287, |
| "reward_std": 0.06940320134162903, |
| "rewards/accuracy_reward_stage2": 0.7199262380599976, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 664 |
| }, |
| { |
| "completion_length": 96.4375, |
| "epoch": 0.1642786561264822, |
| "grad_norm": 2.5612629248896943, |
| "kl": 0.1484375, |
| "learning_rate": 8.359683794466402e-07, |
| "loss": 0.006, |
| "reward": 2.539092540740967, |
| "reward_std": 0.16509577631950378, |
| "rewards/accuracy_reward_stage2": 0.5609675049781799, |
| "rewards/format_reward_all_stage": 1.978124976158142, |
| "scores/refine_times": 1.5, |
| "step": 665 |
| }, |
| { |
| "completion_length": 92.9375, |
| "epoch": 0.16452569169960474, |
| "grad_norm": 4.211580308464546, |
| "kl": 0.2255859375, |
| "learning_rate": 8.357213438735178e-07, |
| "loss": 0.009, |
| "reward": 2.471635580062866, |
| "reward_std": 0.37825992703437805, |
| "rewards/accuracy_reward_stage2": 0.700802206993103, |
| "rewards/format_reward_all_stage": 1.7708333730697632, |
| "scores/refine_times": 1.390625, |
| "step": 666 |
| }, |
| { |
| "completion_length": 82.265625, |
| "epoch": 0.16477272727272727, |
| "grad_norm": 3.6219249232882227, |
| "kl": 0.2490234375, |
| "learning_rate": 8.354743083003952e-07, |
| "loss": 0.01, |
| "reward": 2.845177173614502, |
| "reward_std": 0.11333870142698288, |
| "rewards/accuracy_reward_stage2": 0.8451772332191467, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.515625, |
| "step": 667 |
| }, |
| { |
| "completion_length": 75.5, |
| "epoch": 0.1650197628458498, |
| "grad_norm": 2.3153256870816588, |
| "kl": 0.2080078125, |
| "learning_rate": 8.352272727272727e-07, |
| "loss": 0.0083, |
| "reward": 2.7491612434387207, |
| "reward_std": 0.14487169682979584, |
| "rewards/accuracy_reward_stage2": 0.7804111838340759, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.46875, |
| "step": 668 |
| }, |
| { |
| "completion_length": 92.0625, |
| "epoch": 0.16526679841897232, |
| "grad_norm": 2.3136812707049246, |
| "kl": 0.169921875, |
| "learning_rate": 8.349802371541501e-07, |
| "loss": 0.0068, |
| "reward": 2.8566527366638184, |
| "reward_std": 0.01673370786011219, |
| "rewards/accuracy_reward_stage2": 0.856652557849884, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.640625, |
| "step": 669 |
| }, |
| { |
| "completion_length": 81.703125, |
| "epoch": 0.16551383399209485, |
| "grad_norm": 2.4637521602454946, |
| "kl": 0.1640625, |
| "learning_rate": 8.347332015810276e-07, |
| "loss": 0.0066, |
| "reward": 2.7337234020233154, |
| "reward_std": 0.08656609803438187, |
| "rewards/accuracy_reward_stage2": 0.749348521232605, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.296875, |
| "step": 670 |
| }, |
| { |
| "completion_length": 72.140625, |
| "epoch": 0.16576086956521738, |
| "grad_norm": 4.509112434753636, |
| "kl": 0.203125, |
| "learning_rate": 8.344861660079052e-07, |
| "loss": 0.0081, |
| "reward": 2.5170774459838867, |
| "reward_std": 0.1543843299150467, |
| "rewards/accuracy_reward_stage2": 0.6420773267745972, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.390625, |
| "step": 671 |
| }, |
| { |
| "completion_length": 70.6875, |
| "epoch": 0.16600790513833993, |
| "grad_norm": 3.6089679100156515, |
| "kl": 0.310546875, |
| "learning_rate": 8.342391304347826e-07, |
| "loss": 0.0124, |
| "reward": 2.774411916732788, |
| "reward_std": 0.16703827679157257, |
| "rewards/accuracy_reward_stage2": 0.8994120359420776, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.3125, |
| "step": 672 |
| }, |
| { |
| "completion_length": 76.796875, |
| "epoch": 0.16625494071146246, |
| "grad_norm": 3.7602906391473985, |
| "kl": 0.220703125, |
| "learning_rate": 8.3399209486166e-07, |
| "loss": 0.0088, |
| "reward": 2.7770447731018066, |
| "reward_std": 0.1483728587627411, |
| "rewards/accuracy_reward_stage2": 0.8395448923110962, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.515625, |
| "step": 673 |
| }, |
| { |
| "completion_length": 105.421875, |
| "epoch": 0.166501976284585, |
| "grad_norm": 5.874770192574941, |
| "kl": 0.263671875, |
| "learning_rate": 8.337450592885376e-07, |
| "loss": 0.0105, |
| "reward": 2.7320079803466797, |
| "reward_std": 0.22740009427070618, |
| "rewards/accuracy_reward_stage2": 0.8517996072769165, |
| "rewards/format_reward_all_stage": 1.8802083730697632, |
| "scores/refine_times": 1.59375, |
| "step": 674 |
| }, |
| { |
| "completion_length": 55.96875, |
| "epoch": 0.16674901185770752, |
| "grad_norm": 6.3659434646120445, |
| "kl": 0.271484375, |
| "learning_rate": 8.33498023715415e-07, |
| "loss": 0.0108, |
| "reward": 2.706432580947876, |
| "reward_std": 0.12657755613327026, |
| "rewards/accuracy_reward_stage2": 0.7064326405525208, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 675 |
| }, |
| { |
| "completion_length": 62.515625, |
| "epoch": 0.16699604743083005, |
| "grad_norm": 3.602748165149088, |
| "kl": 0.2138671875, |
| "learning_rate": 8.332509881422924e-07, |
| "loss": 0.0086, |
| "reward": 2.6532373428344727, |
| "reward_std": 0.12004198879003525, |
| "rewards/accuracy_reward_stage2": 0.6532373428344727, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 676 |
| }, |
| { |
| "completion_length": 70.375, |
| "epoch": 0.16724308300395258, |
| "grad_norm": 3.767716762503837, |
| "kl": 0.1865234375, |
| "learning_rate": 8.330039525691699e-07, |
| "loss": 0.0075, |
| "reward": 2.7938568592071533, |
| "reward_std": 0.10841001570224762, |
| "rewards/accuracy_reward_stage2": 0.8094819784164429, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.171875, |
| "step": 677 |
| }, |
| { |
| "completion_length": 67.5, |
| "epoch": 0.1674901185770751, |
| "grad_norm": 5.950488721235662, |
| "kl": 0.19921875, |
| "learning_rate": 8.327569169960474e-07, |
| "loss": 0.008, |
| "reward": 2.2359559535980225, |
| "reward_std": 0.1503939926624298, |
| "rewards/accuracy_reward_stage2": 0.23595598340034485, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 678 |
| }, |
| { |
| "completion_length": 60.78125, |
| "epoch": 0.16773715415019763, |
| "grad_norm": 6.45907229221516, |
| "kl": 0.4375, |
| "learning_rate": 8.325098814229248e-07, |
| "loss": 0.0176, |
| "reward": 2.4382810592651367, |
| "reward_std": 0.24932968616485596, |
| "rewards/accuracy_reward_stage2": 0.5736978054046631, |
| "rewards/format_reward_all_stage": 1.8645833730697632, |
| "scores/refine_times": 1.375, |
| "step": 679 |
| }, |
| { |
| "completion_length": 70.515625, |
| "epoch": 0.16798418972332016, |
| "grad_norm": 3.8960198945184024, |
| "kl": 0.185546875, |
| "learning_rate": 8.322628458498023e-07, |
| "loss": 0.0074, |
| "reward": 2.7114639282226562, |
| "reward_std": 0.10305608808994293, |
| "rewards/accuracy_reward_stage2": 0.7114640474319458, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.4375, |
| "step": 680 |
| }, |
| { |
| "completion_length": 62.25, |
| "epoch": 0.1682312252964427, |
| "grad_norm": 2.3643728630811998, |
| "kl": 0.173828125, |
| "learning_rate": 8.320158102766798e-07, |
| "loss": 0.0069, |
| "reward": 2.6612939834594727, |
| "reward_std": 0.00641861604526639, |
| "rewards/accuracy_reward_stage2": 0.6612938642501831, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 681 |
| }, |
| { |
| "completion_length": 50.421875, |
| "epoch": 0.16847826086956522, |
| "grad_norm": 4.307232779181777, |
| "kl": 0.1630859375, |
| "learning_rate": 8.317687747035574e-07, |
| "loss": 0.0065, |
| "reward": 2.8651654720306396, |
| "reward_std": 0.07820230722427368, |
| "rewards/accuracy_reward_stage2": 0.8651654124259949, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 682 |
| }, |
| { |
| "completion_length": 62.4375, |
| "epoch": 0.16872529644268774, |
| "grad_norm": 4.587939777537952, |
| "kl": 0.2119140625, |
| "learning_rate": 8.315217391304348e-07, |
| "loss": 0.0085, |
| "reward": 2.7178468704223633, |
| "reward_std": 0.04774696007370949, |
| "rewards/accuracy_reward_stage2": 0.7178468704223633, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 683 |
| }, |
| { |
| "completion_length": 55.09375, |
| "epoch": 0.16897233201581027, |
| "grad_norm": 1.624931490884714, |
| "kl": 0.09228515625, |
| "learning_rate": 8.312747035573122e-07, |
| "loss": 0.0037, |
| "reward": 2.6188762187957764, |
| "reward_std": 0.0659729540348053, |
| "rewards/accuracy_reward_stage2": 0.6813762784004211, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.1875, |
| "step": 684 |
| }, |
| { |
| "completion_length": 44.921875, |
| "epoch": 0.1692193675889328, |
| "grad_norm": 2.746456916397565, |
| "kl": 0.177734375, |
| "learning_rate": 8.310276679841897e-07, |
| "loss": 0.0071, |
| "reward": 2.862575054168701, |
| "reward_std": 0.07828960567712784, |
| "rewards/accuracy_reward_stage2": 0.8781998753547668, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.140625, |
| "step": 685 |
| }, |
| { |
| "completion_length": 59.140625, |
| "epoch": 0.16946640316205533, |
| "grad_norm": 3.164878176588737, |
| "kl": 0.16015625, |
| "learning_rate": 8.307806324110671e-07, |
| "loss": 0.0064, |
| "reward": 2.7457258701324463, |
| "reward_std": 0.08368375897407532, |
| "rewards/accuracy_reward_stage2": 0.7613507509231567, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.203125, |
| "step": 686 |
| }, |
| { |
| "completion_length": 60.09375, |
| "epoch": 0.16971343873517786, |
| "grad_norm": 4.429099779687638, |
| "kl": 0.1953125, |
| "learning_rate": 8.305335968379446e-07, |
| "loss": 0.0078, |
| "reward": 2.592247486114502, |
| "reward_std": 0.12885862588882446, |
| "rewards/accuracy_reward_stage2": 0.6078723669052124, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.21875, |
| "step": 687 |
| }, |
| { |
| "completion_length": 53.765625, |
| "epoch": 0.16996047430830039, |
| "grad_norm": 5.471292386957025, |
| "kl": 0.173828125, |
| "learning_rate": 8.302865612648221e-07, |
| "loss": 0.007, |
| "reward": 2.387572765350342, |
| "reward_std": 0.07770496606826782, |
| "rewards/accuracy_reward_stage2": 0.3875727653503418, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 688 |
| }, |
| { |
| "completion_length": 48.375, |
| "epoch": 0.1702075098814229, |
| "grad_norm": 5.2370692806906325, |
| "kl": 0.1416015625, |
| "learning_rate": 8.300395256916995e-07, |
| "loss": 0.0057, |
| "reward": 2.590374231338501, |
| "reward_std": 0.030147546902298927, |
| "rewards/accuracy_reward_stage2": 0.5903741717338562, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 689 |
| }, |
| { |
| "completion_length": 68.265625, |
| "epoch": 0.17045454545454544, |
| "grad_norm": 3.3215452215659234, |
| "kl": 0.1416015625, |
| "learning_rate": 8.29792490118577e-07, |
| "loss": 0.0057, |
| "reward": 2.662672281265259, |
| "reward_std": 0.12791702151298523, |
| "rewards/accuracy_reward_stage2": 0.7303804159164429, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.28125, |
| "step": 690 |
| }, |
| { |
| "completion_length": 72.71875, |
| "epoch": 0.170701581027668, |
| "grad_norm": 4.221926503688189, |
| "kl": 0.1806640625, |
| "learning_rate": 8.295454545454546e-07, |
| "loss": 0.0072, |
| "reward": 2.465261697769165, |
| "reward_std": 0.19621172547340393, |
| "rewards/accuracy_reward_stage2": 0.6475533246994019, |
| "rewards/format_reward_all_stage": 1.8177083730697632, |
| "scores/refine_times": 1.40625, |
| "step": 691 |
| }, |
| { |
| "completion_length": 58.15625, |
| "epoch": 0.17094861660079053, |
| "grad_norm": 4.999034875079111, |
| "kl": 0.1943359375, |
| "learning_rate": 8.29298418972332e-07, |
| "loss": 0.0078, |
| "reward": 2.3911185264587402, |
| "reward_std": 0.3185882568359375, |
| "rewards/accuracy_reward_stage2": 0.6567436456680298, |
| "rewards/format_reward_all_stage": 1.734375, |
| "scores/refine_times": 1.125, |
| "step": 692 |
| }, |
| { |
| "completion_length": 49.921875, |
| "epoch": 0.17119565217391305, |
| "grad_norm": 3.6830696834703565, |
| "kl": 0.2421875, |
| "learning_rate": 8.290513833992095e-07, |
| "loss": 0.0097, |
| "reward": 2.8560991287231445, |
| "reward_std": 0.08281519263982773, |
| "rewards/accuracy_reward_stage2": 0.856099009513855, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 693 |
| }, |
| { |
| "completion_length": 34.125, |
| "epoch": 0.17144268774703558, |
| "grad_norm": 0.9783630797309559, |
| "kl": 0.2275390625, |
| "learning_rate": 8.288043478260869e-07, |
| "loss": 0.0091, |
| "reward": 2.609375, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward_stage2": 0.734375, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 694 |
| }, |
| { |
| "completion_length": 69.125, |
| "epoch": 0.1716897233201581, |
| "grad_norm": 3.0218653160299795, |
| "kl": 0.173828125, |
| "learning_rate": 8.285573122529644e-07, |
| "loss": 0.007, |
| "reward": 2.6367897987365723, |
| "reward_std": 0.12302423268556595, |
| "rewards/accuracy_reward_stage2": 0.7748106718063354, |
| "rewards/format_reward_all_stage": 1.8619791269302368, |
| "scores/refine_times": 1.265625, |
| "step": 695 |
| }, |
| { |
| "completion_length": 54.234375, |
| "epoch": 0.17193675889328064, |
| "grad_norm": 4.184913189641024, |
| "kl": 0.248046875, |
| "learning_rate": 8.283102766798419e-07, |
| "loss": 0.0099, |
| "reward": 2.395554542541504, |
| "reward_std": 0.39820319414138794, |
| "rewards/accuracy_reward_stage2": 0.6611795425415039, |
| "rewards/format_reward_all_stage": 1.734375, |
| "scores/refine_times": 1.203125, |
| "step": 696 |
| }, |
| { |
| "completion_length": 43.5, |
| "epoch": 0.17218379446640317, |
| "grad_norm": 2.595884108341551, |
| "kl": 0.248046875, |
| "learning_rate": 8.280632411067193e-07, |
| "loss": 0.0099, |
| "reward": 2.8225650787353516, |
| "reward_std": 0.17631623148918152, |
| "rewards/accuracy_reward_stage2": 0.9631900191307068, |
| "rewards/format_reward_all_stage": 1.859375, |
| "scores/refine_times": 1.171875, |
| "step": 697 |
| }, |
| { |
| "completion_length": 38.953125, |
| "epoch": 0.1724308300395257, |
| "grad_norm": 4.022940918000707, |
| "kl": 0.26171875, |
| "learning_rate": 8.278162055335967e-07, |
| "loss": 0.0105, |
| "reward": 2.7802834510803223, |
| "reward_std": 0.18306688964366913, |
| "rewards/accuracy_reward_stage2": 0.9052833318710327, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 698 |
| }, |
| { |
| "completion_length": 42.90625, |
| "epoch": 0.17267786561264822, |
| "grad_norm": 3.4391934805126767, |
| "kl": 0.2490234375, |
| "learning_rate": 8.275691699604744e-07, |
| "loss": 0.0099, |
| "reward": 2.6163432598114014, |
| "reward_std": 0.23740006983280182, |
| "rewards/accuracy_reward_stage2": 0.6319682002067566, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.125, |
| "step": 699 |
| }, |
| { |
| "completion_length": 49.4375, |
| "epoch": 0.17292490118577075, |
| "grad_norm": 5.477631698074841, |
| "kl": 0.34375, |
| "learning_rate": 8.273221343873518e-07, |
| "loss": 0.0137, |
| "reward": 2.540916681289673, |
| "reward_std": 0.21585610508918762, |
| "rewards/accuracy_reward_stage2": 0.6034167408943176, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.203125, |
| "step": 700 |
| }, |
| { |
| "completion_length": 37.3125, |
| "epoch": 0.17317193675889328, |
| "grad_norm": 4.275784599530968, |
| "kl": 0.1767578125, |
| "learning_rate": 8.270750988142292e-07, |
| "loss": 0.0071, |
| "reward": 2.82668399810791, |
| "reward_std": 0.12060071527957916, |
| "rewards/accuracy_reward_stage2": 0.8266839385032654, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 701 |
| }, |
| { |
| "completion_length": 44.0, |
| "epoch": 0.1734189723320158, |
| "grad_norm": 5.929761937498036, |
| "kl": 0.1982421875, |
| "learning_rate": 8.268280632411067e-07, |
| "loss": 0.0079, |
| "reward": 2.5984463691711426, |
| "reward_std": 0.10440248996019363, |
| "rewards/accuracy_reward_stage2": 0.5984464883804321, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 702 |
| }, |
| { |
| "completion_length": 38.046875, |
| "epoch": 0.17366600790513834, |
| "grad_norm": 4.36820409837742, |
| "kl": 0.2177734375, |
| "learning_rate": 8.265810276679841e-07, |
| "loss": 0.0087, |
| "reward": 2.6032559871673584, |
| "reward_std": 0.021172545850276947, |
| "rewards/accuracy_reward_stage2": 0.6032558679580688, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 703 |
| }, |
| { |
| "completion_length": 29.8125, |
| "epoch": 0.17391304347826086, |
| "grad_norm": 4.34358181377926, |
| "kl": 0.263671875, |
| "learning_rate": 8.263339920948616e-07, |
| "loss": 0.0106, |
| "reward": 2.7768826484680176, |
| "reward_std": 0.04401562735438347, |
| "rewards/accuracy_reward_stage2": 0.7768827080726624, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 704 |
| }, |
| { |
| "completion_length": 39.890625, |
| "epoch": 0.1741600790513834, |
| "grad_norm": 3.8777171386458273, |
| "kl": 0.2314453125, |
| "learning_rate": 8.260869565217391e-07, |
| "loss": 0.0093, |
| "reward": 2.51078200340271, |
| "reward_std": 0.15417703986167908, |
| "rewards/accuracy_reward_stage2": 0.5784904360771179, |
| "rewards/format_reward_all_stage": 1.9322916269302368, |
| "scores/refine_times": 1.078125, |
| "step": 705 |
| }, |
| { |
| "completion_length": 29.34375, |
| "epoch": 0.17440711462450592, |
| "grad_norm": 5.0953232936524895, |
| "kl": 0.296875, |
| "learning_rate": 8.258399209486165e-07, |
| "loss": 0.0118, |
| "reward": 2.5522358417510986, |
| "reward_std": 0.07304012030363083, |
| "rewards/accuracy_reward_stage2": 0.5522358417510986, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 706 |
| }, |
| { |
| "completion_length": 49.625, |
| "epoch": 0.17465415019762845, |
| "grad_norm": 2.902160155311922, |
| "kl": 0.2373046875, |
| "learning_rate": 8.255928853754939e-07, |
| "loss": 0.0095, |
| "reward": 2.645291805267334, |
| "reward_std": 0.0809149295091629, |
| "rewards/accuracy_reward_stage2": 0.8327919244766235, |
| "rewards/format_reward_all_stage": 1.8125, |
| "scores/refine_times": 1.125, |
| "step": 707 |
| }, |
| { |
| "completion_length": 30.375, |
| "epoch": 0.17490118577075098, |
| "grad_norm": 3.3841190373178414, |
| "kl": 0.224609375, |
| "learning_rate": 8.253458498023716e-07, |
| "loss": 0.009, |
| "reward": 2.648444414138794, |
| "reward_std": 0.0884200856089592, |
| "rewards/accuracy_reward_stage2": 0.6484442949295044, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 708 |
| }, |
| { |
| "completion_length": 37.4375, |
| "epoch": 0.1751482213438735, |
| "grad_norm": 2.569762486464144, |
| "kl": 0.22265625, |
| "learning_rate": 8.25098814229249e-07, |
| "loss": 0.0089, |
| "reward": 2.728829860687256, |
| "reward_std": 0.07126966118812561, |
| "rewards/accuracy_reward_stage2": 0.7913298606872559, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 709 |
| }, |
| { |
| "completion_length": 29.625, |
| "epoch": 0.17539525691699603, |
| "grad_norm": 2.9841615476853205, |
| "kl": 0.2001953125, |
| "learning_rate": 8.248517786561265e-07, |
| "loss": 0.008, |
| "reward": 2.771129608154297, |
| "reward_std": 0.01384773664176464, |
| "rewards/accuracy_reward_stage2": 0.7711294889450073, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 710 |
| }, |
| { |
| "completion_length": 38.1875, |
| "epoch": 0.1756422924901186, |
| "grad_norm": 3.3518342412176336, |
| "kl": 0.2314453125, |
| "learning_rate": 8.246047430830039e-07, |
| "loss": 0.0092, |
| "reward": 2.5392909049987793, |
| "reward_std": 0.027141904458403587, |
| "rewards/accuracy_reward_stage2": 0.5392909049987793, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 711 |
| }, |
| { |
| "completion_length": 45.421875, |
| "epoch": 0.17588932806324112, |
| "grad_norm": 5.272718022007329, |
| "kl": 0.32421875, |
| "learning_rate": 8.243577075098814e-07, |
| "loss": 0.013, |
| "reward": 2.4325010776519775, |
| "reward_std": 0.3576674461364746, |
| "rewards/accuracy_reward_stage2": 0.7450010180473328, |
| "rewards/format_reward_all_stage": 1.6875, |
| "scores/refine_times": 1.125, |
| "step": 712 |
| }, |
| { |
| "completion_length": 44.828125, |
| "epoch": 0.17613636363636365, |
| "grad_norm": 4.860902186643657, |
| "kl": 0.2353515625, |
| "learning_rate": 8.241106719367589e-07, |
| "loss": 0.0094, |
| "reward": 2.4493470191955566, |
| "reward_std": 0.18647147715091705, |
| "rewards/accuracy_reward_stage2": 0.5743468999862671, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.078125, |
| "step": 713 |
| }, |
| { |
| "completion_length": 38.359375, |
| "epoch": 0.17638339920948617, |
| "grad_norm": 4.4256498100793324, |
| "kl": 0.23046875, |
| "learning_rate": 8.238636363636363e-07, |
| "loss": 0.0093, |
| "reward": 2.6608829498291016, |
| "reward_std": 0.08993560075759888, |
| "rewards/accuracy_reward_stage2": 0.7233830690383911, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.1875, |
| "step": 714 |
| }, |
| { |
| "completion_length": 33.0625, |
| "epoch": 0.1766304347826087, |
| "grad_norm": 4.296817652715915, |
| "kl": 0.27734375, |
| "learning_rate": 8.236166007905137e-07, |
| "loss": 0.0111, |
| "reward": 2.7572100162506104, |
| "reward_std": 0.1819225251674652, |
| "rewards/accuracy_reward_stage2": 0.8822100162506104, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0, |
| "step": 715 |
| }, |
| { |
| "completion_length": 39.65625, |
| "epoch": 0.17687747035573123, |
| "grad_norm": 1.246417044381739, |
| "kl": 0.2236328125, |
| "learning_rate": 8.233695652173913e-07, |
| "loss": 0.0089, |
| "reward": 2.941277265548706, |
| "reward_std": 0.04274333268404007, |
| "rewards/accuracy_reward_stage2": 0.956902265548706, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.125, |
| "step": 716 |
| }, |
| { |
| "completion_length": 62.546875, |
| "epoch": 0.17712450592885376, |
| "grad_norm": 3.592067481060388, |
| "kl": 0.15234375, |
| "learning_rate": 8.231225296442687e-07, |
| "loss": 0.0061, |
| "reward": 2.607104778289795, |
| "reward_std": 0.10167138278484344, |
| "rewards/accuracy_reward_stage2": 0.6696048378944397, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 717 |
| }, |
| { |
| "completion_length": 65.015625, |
| "epoch": 0.1773715415019763, |
| "grad_norm": 4.019521038571473, |
| "kl": 0.158203125, |
| "learning_rate": 8.228754940711462e-07, |
| "loss": 0.0063, |
| "reward": 2.6828722953796387, |
| "reward_std": 0.044261686503887177, |
| "rewards/accuracy_reward_stage2": 0.6828722357749939, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 718 |
| }, |
| { |
| "completion_length": 45.25, |
| "epoch": 0.17761857707509882, |
| "grad_norm": 6.127963836128111, |
| "kl": 0.17578125, |
| "learning_rate": 8.226284584980237e-07, |
| "loss": 0.007, |
| "reward": 2.5614614486694336, |
| "reward_std": 0.16553114354610443, |
| "rewards/accuracy_reward_stage2": 0.6239614486694336, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 719 |
| }, |
| { |
| "completion_length": 62.0625, |
| "epoch": 0.17786561264822134, |
| "grad_norm": 5.5689391101787225, |
| "kl": 0.162109375, |
| "learning_rate": 8.223814229249012e-07, |
| "loss": 0.0065, |
| "reward": 2.605252265930176, |
| "reward_std": 0.15683504939079285, |
| "rewards/accuracy_reward_stage2": 0.6677523255348206, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.21875, |
| "step": 720 |
| }, |
| { |
| "completion_length": 53.53125, |
| "epoch": 0.17811264822134387, |
| "grad_norm": 3.7608697254725154, |
| "kl": 0.1787109375, |
| "learning_rate": 8.221343873517787e-07, |
| "loss": 0.0071, |
| "reward": 2.357903480529785, |
| "reward_std": 0.037971869111061096, |
| "rewards/accuracy_reward_stage2": 0.35790371894836426, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 721 |
| }, |
| { |
| "completion_length": 60.46875, |
| "epoch": 0.1783596837944664, |
| "grad_norm": 2.9452310520498797, |
| "kl": 0.11962890625, |
| "learning_rate": 8.218873517786561e-07, |
| "loss": 0.0048, |
| "reward": 2.753098964691162, |
| "reward_std": 0.056902870535850525, |
| "rewards/accuracy_reward_stage2": 0.7530988454818726, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 722 |
| }, |
| { |
| "completion_length": 69.234375, |
| "epoch": 0.17860671936758893, |
| "grad_norm": 1.1382233123873722, |
| "kl": 0.10400390625, |
| "learning_rate": 8.216403162055335e-07, |
| "loss": 0.0042, |
| "reward": 2.576420545578003, |
| "reward_std": 0.004017648287117481, |
| "rewards/accuracy_reward_stage2": 0.5764204859733582, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 723 |
| }, |
| { |
| "completion_length": 75.015625, |
| "epoch": 0.17885375494071146, |
| "grad_norm": 2.173861915208193, |
| "kl": 0.1181640625, |
| "learning_rate": 8.21393280632411e-07, |
| "loss": 0.0047, |
| "reward": 2.8701119422912598, |
| "reward_std": 0.03073050081729889, |
| "rewards/accuracy_reward_stage2": 0.8701118230819702, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 724 |
| }, |
| { |
| "completion_length": 102.875, |
| "epoch": 0.17910079051383399, |
| "grad_norm": 2.5923588665830204, |
| "kl": 0.11181640625, |
| "learning_rate": 8.211462450592885e-07, |
| "loss": 0.0045, |
| "reward": 2.534066677093506, |
| "reward_std": 0.11862494796514511, |
| "rewards/accuracy_reward_stage2": 0.6069832444190979, |
| "rewards/format_reward_all_stage": 1.9270833730697632, |
| "scores/refine_times": 1.4375, |
| "step": 725 |
| }, |
| { |
| "completion_length": 80.375, |
| "epoch": 0.1793478260869565, |
| "grad_norm": 4.406094013771105, |
| "kl": 0.09521484375, |
| "learning_rate": 8.208992094861659e-07, |
| "loss": 0.0038, |
| "reward": 2.7627735137939453, |
| "reward_std": 0.05562632530927658, |
| "rewards/accuracy_reward_stage2": 0.7627733945846558, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 726 |
| }, |
| { |
| "completion_length": 78.71875, |
| "epoch": 0.17959486166007904, |
| "grad_norm": 4.420639598786895, |
| "kl": 0.13671875, |
| "learning_rate": 8.206521739130435e-07, |
| "loss": 0.0055, |
| "reward": 2.427016258239746, |
| "reward_std": 0.20195448398590088, |
| "rewards/accuracy_reward_stage2": 0.5520162582397461, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 727 |
| }, |
| { |
| "completion_length": 102.9375, |
| "epoch": 0.17984189723320157, |
| "grad_norm": 3.7517930348608077, |
| "kl": 0.1044921875, |
| "learning_rate": 8.204051383399209e-07, |
| "loss": 0.0042, |
| "reward": 2.55942440032959, |
| "reward_std": 0.2075508087873459, |
| "rewards/accuracy_reward_stage2": 0.6844244003295898, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.140625, |
| "step": 728 |
| }, |
| { |
| "completion_length": 87.328125, |
| "epoch": 0.1800889328063241, |
| "grad_norm": 4.610010902240235, |
| "kl": 0.134765625, |
| "learning_rate": 8.201581027667984e-07, |
| "loss": 0.0054, |
| "reward": 2.5851545333862305, |
| "reward_std": 0.07183162122964859, |
| "rewards/accuracy_reward_stage2": 0.6476545333862305, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.1875, |
| "step": 729 |
| }, |
| { |
| "completion_length": 93.359375, |
| "epoch": 0.18033596837944665, |
| "grad_norm": 3.6922330703671884, |
| "kl": 0.0859375, |
| "learning_rate": 8.199110671936759e-07, |
| "loss": 0.0034, |
| "reward": 2.8179094791412354, |
| "reward_std": 0.018987158313393593, |
| "rewards/accuracy_reward_stage2": 0.8179094195365906, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 730 |
| }, |
| { |
| "completion_length": 87.75, |
| "epoch": 0.18058300395256918, |
| "grad_norm": 2.7718550607441177, |
| "kl": 0.08251953125, |
| "learning_rate": 8.196640316205533e-07, |
| "loss": 0.0033, |
| "reward": 2.840196371078491, |
| "reward_std": 0.13527555763721466, |
| "rewards/accuracy_reward_stage2": 0.9026963114738464, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.25, |
| "step": 731 |
| }, |
| { |
| "completion_length": 75.296875, |
| "epoch": 0.1808300395256917, |
| "grad_norm": 4.568714612629976, |
| "kl": 0.1865234375, |
| "learning_rate": 8.194169960474307e-07, |
| "loss": 0.0075, |
| "reward": 2.598619222640991, |
| "reward_std": 0.05828527733683586, |
| "rewards/accuracy_reward_stage2": 0.5986192226409912, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 732 |
| }, |
| { |
| "completion_length": 82.625, |
| "epoch": 0.18107707509881424, |
| "grad_norm": 3.3078917095674036, |
| "kl": 0.099609375, |
| "learning_rate": 8.191699604743083e-07, |
| "loss": 0.004, |
| "reward": 2.803985118865967, |
| "reward_std": 0.041249554604291916, |
| "rewards/accuracy_reward_stage2": 0.8039852380752563, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 733 |
| }, |
| { |
| "completion_length": 62.75, |
| "epoch": 0.18132411067193677, |
| "grad_norm": 4.665197863030277, |
| "kl": 0.1328125, |
| "learning_rate": 8.189229249011857e-07, |
| "loss": 0.0053, |
| "reward": 2.7739241123199463, |
| "reward_std": 0.09368322789669037, |
| "rewards/accuracy_reward_stage2": 0.7739241123199463, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 734 |
| }, |
| { |
| "completion_length": 87.03125, |
| "epoch": 0.1815711462450593, |
| "grad_norm": 5.230207042065446, |
| "kl": 0.1376953125, |
| "learning_rate": 8.186758893280632e-07, |
| "loss": 0.0055, |
| "reward": 2.6669585704803467, |
| "reward_std": 0.13236752152442932, |
| "rewards/accuracy_reward_stage2": 0.6669585704803467, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 735 |
| }, |
| { |
| "completion_length": 79.671875, |
| "epoch": 0.18181818181818182, |
| "grad_norm": 5.098012008996088, |
| "kl": 0.10400390625, |
| "learning_rate": 8.184288537549407e-07, |
| "loss": 0.0042, |
| "reward": 2.605584144592285, |
| "reward_std": 0.19998475909233093, |
| "rewards/accuracy_reward_stage2": 0.6055843830108643, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 736 |
| }, |
| { |
| "completion_length": 87.796875, |
| "epoch": 0.18206521739130435, |
| "grad_norm": 3.218557975199275, |
| "kl": 0.0927734375, |
| "learning_rate": 8.181818181818182e-07, |
| "loss": 0.0037, |
| "reward": 2.5339651107788086, |
| "reward_std": 0.09329190850257874, |
| "rewards/accuracy_reward_stage2": 0.6016733646392822, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.140625, |
| "step": 737 |
| }, |
| { |
| "completion_length": 91.671875, |
| "epoch": 0.18231225296442688, |
| "grad_norm": 2.9728018450072504, |
| "kl": 0.1220703125, |
| "learning_rate": 8.179347826086957e-07, |
| "loss": 0.0049, |
| "reward": 2.5654635429382324, |
| "reward_std": 0.07684879750013351, |
| "rewards/accuracy_reward_stage2": 0.5654636025428772, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 738 |
| }, |
| { |
| "completion_length": 76.875, |
| "epoch": 0.1825592885375494, |
| "grad_norm": 4.098914602594136, |
| "kl": 0.1220703125, |
| "learning_rate": 8.176877470355731e-07, |
| "loss": 0.0049, |
| "reward": 2.6792609691619873, |
| "reward_std": 0.1466922014951706, |
| "rewards/accuracy_reward_stage2": 0.8042609691619873, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.0625, |
| "step": 739 |
| }, |
| { |
| "completion_length": 58.1875, |
| "epoch": 0.18280632411067194, |
| "grad_norm": 0.34555494426893213, |
| "kl": 0.1044921875, |
| "learning_rate": 8.174407114624505e-07, |
| "loss": 0.0042, |
| "reward": 2.6822760105133057, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.6822760701179504, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 740 |
| }, |
| { |
| "completion_length": 71.609375, |
| "epoch": 0.18305335968379446, |
| "grad_norm": 4.662745533117084, |
| "kl": 0.119140625, |
| "learning_rate": 8.17193675889328e-07, |
| "loss": 0.0048, |
| "reward": 2.5369908809661865, |
| "reward_std": 0.11054471135139465, |
| "rewards/accuracy_reward_stage2": 0.5369909405708313, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 741 |
| }, |
| { |
| "completion_length": 69.75, |
| "epoch": 0.183300395256917, |
| "grad_norm": 2.243839387959608, |
| "kl": 0.11767578125, |
| "learning_rate": 8.169466403162055e-07, |
| "loss": 0.0047, |
| "reward": 2.672517776489258, |
| "reward_std": 0.029596952721476555, |
| "rewards/accuracy_reward_stage2": 0.6725177764892578, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 742 |
| }, |
| { |
| "completion_length": 83.4375, |
| "epoch": 0.18354743083003952, |
| "grad_norm": 3.3308848465244423, |
| "kl": 0.091796875, |
| "learning_rate": 8.166996047430829e-07, |
| "loss": 0.0037, |
| "reward": 2.738541603088379, |
| "reward_std": 0.1675356924533844, |
| "rewards/accuracy_reward_stage2": 0.7385417222976685, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 743 |
| }, |
| { |
| "completion_length": 80.859375, |
| "epoch": 0.18379446640316205, |
| "grad_norm": 4.228437836918937, |
| "kl": 0.13671875, |
| "learning_rate": 8.164525691699604e-07, |
| "loss": 0.0055, |
| "reward": 2.600447654724121, |
| "reward_std": 0.27785760164260864, |
| "rewards/accuracy_reward_stage2": 0.8035725355148315, |
| "rewards/format_reward_all_stage": 1.796875, |
| "scores/refine_times": 1.296875, |
| "step": 744 |
| }, |
| { |
| "completion_length": 77.90625, |
| "epoch": 0.18404150197628458, |
| "grad_norm": 2.75672529825747, |
| "kl": 0.0908203125, |
| "learning_rate": 8.162055335968378e-07, |
| "loss": 0.0036, |
| "reward": 2.814410448074341, |
| "reward_std": 0.03253195434808731, |
| "rewards/accuracy_reward_stage2": 0.8144104480743408, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 745 |
| }, |
| { |
| "completion_length": 64.78125, |
| "epoch": 0.1842885375494071, |
| "grad_norm": 2.6445111122830727, |
| "kl": 0.09716796875, |
| "learning_rate": 8.159584980237155e-07, |
| "loss": 0.0039, |
| "reward": 2.8770241737365723, |
| "reward_std": 0.07037458568811417, |
| "rewards/accuracy_reward_stage2": 0.9395240545272827, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.0625, |
| "step": 746 |
| }, |
| { |
| "completion_length": 69.5, |
| "epoch": 0.18453557312252963, |
| "grad_norm": 2.5683252121883693, |
| "kl": 0.1220703125, |
| "learning_rate": 8.157114624505929e-07, |
| "loss": 0.0049, |
| "reward": 2.676783561706543, |
| "reward_std": 0.033407654613256454, |
| "rewards/accuracy_reward_stage2": 0.6767836809158325, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 747 |
| }, |
| { |
| "completion_length": 75.671875, |
| "epoch": 0.18478260869565216, |
| "grad_norm": 1.7986182845085459, |
| "kl": 0.10986328125, |
| "learning_rate": 8.154644268774703e-07, |
| "loss": 0.0044, |
| "reward": 2.851139783859253, |
| "reward_std": 0.00538706174120307, |
| "rewards/accuracy_reward_stage2": 0.8511397838592529, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 748 |
| }, |
| { |
| "completion_length": 70.0, |
| "epoch": 0.1850296442687747, |
| "grad_norm": 5.4302296395693235, |
| "kl": 0.12158203125, |
| "learning_rate": 8.152173913043478e-07, |
| "loss": 0.0049, |
| "reward": 2.357592821121216, |
| "reward_std": 0.05461695045232773, |
| "rewards/accuracy_reward_stage2": 0.3575928211212158, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 749 |
| }, |
| { |
| "completion_length": 80.65625, |
| "epoch": 0.18527667984189725, |
| "grad_norm": 5.106369592091597, |
| "kl": 0.2060546875, |
| "learning_rate": 8.149703557312253e-07, |
| "loss": 0.0082, |
| "reward": 2.7984001636505127, |
| "reward_std": 0.09079733490943909, |
| "rewards/accuracy_reward_stage2": 0.7984002828598022, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 750 |
| }, |
| { |
| "completion_length": 95.71875, |
| "epoch": 0.18552371541501977, |
| "grad_norm": 3.9061202300388365, |
| "kl": 0.09033203125, |
| "learning_rate": 8.147233201581027e-07, |
| "loss": 0.0036, |
| "reward": 2.681049108505249, |
| "reward_std": 0.08273804187774658, |
| "rewards/accuracy_reward_stage2": 0.7018824219703674, |
| "rewards/format_reward_all_stage": 1.9791667461395264, |
| "scores/refine_times": 1.234375, |
| "step": 751 |
| }, |
| { |
| "completion_length": 87.859375, |
| "epoch": 0.1857707509881423, |
| "grad_norm": 4.0976955965839785, |
| "kl": 0.11181640625, |
| "learning_rate": 8.144762845849802e-07, |
| "loss": 0.0045, |
| "reward": 2.7674148082733154, |
| "reward_std": 0.1868867129087448, |
| "rewards/accuracy_reward_stage2": 0.7778315544128418, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.140625, |
| "step": 752 |
| }, |
| { |
| "completion_length": 88.75, |
| "epoch": 0.18601778656126483, |
| "grad_norm": 4.555919227092463, |
| "kl": 0.10888671875, |
| "learning_rate": 8.142292490118576e-07, |
| "loss": 0.0044, |
| "reward": 2.463618278503418, |
| "reward_std": 0.16842269897460938, |
| "rewards/accuracy_reward_stage2": 0.5886183977127075, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.234375, |
| "step": 753 |
| }, |
| { |
| "completion_length": 84.171875, |
| "epoch": 0.18626482213438736, |
| "grad_norm": 5.010271159735403, |
| "kl": 0.125, |
| "learning_rate": 8.139822134387351e-07, |
| "loss": 0.005, |
| "reward": 2.6570065021514893, |
| "reward_std": 0.05975431948900223, |
| "rewards/accuracy_reward_stage2": 0.657006561756134, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 754 |
| }, |
| { |
| "completion_length": 88.21875, |
| "epoch": 0.1865118577075099, |
| "grad_norm": 5.939693635822729, |
| "kl": 0.1064453125, |
| "learning_rate": 8.137351778656127e-07, |
| "loss": 0.0043, |
| "reward": 2.694657325744629, |
| "reward_std": 0.11330029368400574, |
| "rewards/accuracy_reward_stage2": 0.6946573257446289, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 755 |
| }, |
| { |
| "completion_length": 90.359375, |
| "epoch": 0.18675889328063242, |
| "grad_norm": 3.393811545438695, |
| "kl": 0.130859375, |
| "learning_rate": 8.134881422924901e-07, |
| "loss": 0.0052, |
| "reward": 2.7568604946136475, |
| "reward_std": 0.1113467812538147, |
| "rewards/accuracy_reward_stage2": 0.819360613822937, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.265625, |
| "step": 756 |
| }, |
| { |
| "completion_length": 62.125, |
| "epoch": 0.18700592885375494, |
| "grad_norm": 3.0805751298380057, |
| "kl": 0.11865234375, |
| "learning_rate": 8.132411067193675e-07, |
| "loss": 0.0047, |
| "reward": 2.7386887073516846, |
| "reward_std": 0.0412348210811615, |
| "rewards/accuracy_reward_stage2": 0.7386887073516846, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 757 |
| }, |
| { |
| "completion_length": 81.953125, |
| "epoch": 0.18725296442687747, |
| "grad_norm": 3.183809799693692, |
| "kl": 0.1748046875, |
| "learning_rate": 8.129940711462451e-07, |
| "loss": 0.007, |
| "reward": 2.7576744556427, |
| "reward_std": 0.07029517740011215, |
| "rewards/accuracy_reward_stage2": 0.8149662017822266, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.25, |
| "step": 758 |
| }, |
| { |
| "completion_length": 59.0625, |
| "epoch": 0.1875, |
| "grad_norm": 2.2324711430978, |
| "kl": 0.11083984375, |
| "learning_rate": 8.127470355731225e-07, |
| "loss": 0.0044, |
| "reward": 2.9286131858825684, |
| "reward_std": 0.00950054731220007, |
| "rewards/accuracy_reward_stage2": 0.9286130666732788, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 759 |
| }, |
| { |
| "completion_length": 76.4375, |
| "epoch": 0.18774703557312253, |
| "grad_norm": 5.275881267988074, |
| "kl": 0.1455078125, |
| "learning_rate": 8.125e-07, |
| "loss": 0.0058, |
| "reward": 2.655832290649414, |
| "reward_std": 0.12385143339633942, |
| "rewards/accuracy_reward_stage2": 0.6558322906494141, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 760 |
| }, |
| { |
| "completion_length": 72.859375, |
| "epoch": 0.18799407114624506, |
| "grad_norm": 2.148609964785086, |
| "kl": 0.1044921875, |
| "learning_rate": 8.122529644268774e-07, |
| "loss": 0.0042, |
| "reward": 2.609675884246826, |
| "reward_std": 0.002886358881369233, |
| "rewards/accuracy_reward_stage2": 0.6096760034561157, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 761 |
| }, |
| { |
| "completion_length": 73.375, |
| "epoch": 0.18824110671936758, |
| "grad_norm": 3.404198278689953, |
| "kl": 0.11962890625, |
| "learning_rate": 8.120059288537548e-07, |
| "loss": 0.0048, |
| "reward": 2.573883056640625, |
| "reward_std": 0.015004590153694153, |
| "rewards/accuracy_reward_stage2": 0.5738831758499146, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 762 |
| }, |
| { |
| "completion_length": 70.546875, |
| "epoch": 0.1884881422924901, |
| "grad_norm": 4.508482517354418, |
| "kl": 0.11279296875, |
| "learning_rate": 8.117588932806324e-07, |
| "loss": 0.0045, |
| "reward": 2.5378634929656982, |
| "reward_std": 0.10779360681772232, |
| "rewards/accuracy_reward_stage2": 0.5378634929656982, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 763 |
| }, |
| { |
| "completion_length": 84.3125, |
| "epoch": 0.18873517786561264, |
| "grad_norm": 3.1922499449312167, |
| "kl": 0.11669921875, |
| "learning_rate": 8.115118577075099e-07, |
| "loss": 0.0047, |
| "reward": 2.71618914604187, |
| "reward_std": 0.14423049986362457, |
| "rewards/accuracy_reward_stage2": 0.7422308325767517, |
| "rewards/format_reward_all_stage": 1.9739583730697632, |
| "scores/refine_times": 1.34375, |
| "step": 764 |
| }, |
| { |
| "completion_length": 94.890625, |
| "epoch": 0.18898221343873517, |
| "grad_norm": 2.7685322764181715, |
| "kl": 0.1953125, |
| "learning_rate": 8.112648221343873e-07, |
| "loss": 0.0078, |
| "reward": 2.739062547683716, |
| "reward_std": 0.11331679672002792, |
| "rewards/accuracy_reward_stage2": 0.7390625476837158, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 765 |
| }, |
| { |
| "completion_length": 77.0625, |
| "epoch": 0.1892292490118577, |
| "grad_norm": 3.9402093829627516, |
| "kl": 0.10546875, |
| "learning_rate": 8.110177865612648e-07, |
| "loss": 0.0042, |
| "reward": 2.555873155593872, |
| "reward_std": 0.16401194036006927, |
| "rewards/accuracy_reward_stage2": 0.6131649017333984, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.15625, |
| "step": 766 |
| }, |
| { |
| "completion_length": 62.984375, |
| "epoch": 0.18947628458498023, |
| "grad_norm": 3.3270805279161335, |
| "kl": 0.1064453125, |
| "learning_rate": 8.107707509881423e-07, |
| "loss": 0.0043, |
| "reward": 2.593876361846924, |
| "reward_std": 0.14388912916183472, |
| "rewards/accuracy_reward_stage2": 0.7188762426376343, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.125, |
| "step": 767 |
| }, |
| { |
| "completion_length": 63.375, |
| "epoch": 0.18972332015810275, |
| "grad_norm": 4.575039803879313, |
| "kl": 0.12451171875, |
| "learning_rate": 8.105237154150197e-07, |
| "loss": 0.005, |
| "reward": 2.676608085632324, |
| "reward_std": 0.06587755680084229, |
| "rewards/accuracy_reward_stage2": 0.6766082048416138, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 768 |
| }, |
| { |
| "completion_length": 80.796875, |
| "epoch": 0.1899703557312253, |
| "grad_norm": 4.913789346219929, |
| "kl": 0.11181640625, |
| "learning_rate": 8.102766798418972e-07, |
| "loss": 0.0045, |
| "reward": 2.4672067165374756, |
| "reward_std": 0.14596593379974365, |
| "rewards/accuracy_reward_stage2": 0.5922067165374756, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.140625, |
| "step": 769 |
| }, |
| { |
| "completion_length": 64.25, |
| "epoch": 0.19021739130434784, |
| "grad_norm": 4.260844595348638, |
| "kl": 0.1357421875, |
| "learning_rate": 8.100296442687746e-07, |
| "loss": 0.0054, |
| "reward": 2.771862030029297, |
| "reward_std": 0.03155703470110893, |
| "rewards/accuracy_reward_stage2": 0.7718619108200073, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 770 |
| }, |
| { |
| "completion_length": 72.265625, |
| "epoch": 0.19046442687747037, |
| "grad_norm": 4.247208948225819, |
| "kl": 0.11181640625, |
| "learning_rate": 8.097826086956521e-07, |
| "loss": 0.0045, |
| "reward": 2.562706708908081, |
| "reward_std": 0.1308896243572235, |
| "rewards/accuracy_reward_stage2": 0.5627066493034363, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 771 |
| }, |
| { |
| "completion_length": 81.90625, |
| "epoch": 0.1907114624505929, |
| "grad_norm": 5.532229311907246, |
| "kl": 0.150390625, |
| "learning_rate": 8.095355731225296e-07, |
| "loss": 0.006, |
| "reward": 2.4436357021331787, |
| "reward_std": 0.2998715341091156, |
| "rewards/accuracy_reward_stage2": 0.6936356425285339, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.140625, |
| "step": 772 |
| }, |
| { |
| "completion_length": 96.015625, |
| "epoch": 0.19095849802371542, |
| "grad_norm": 3.8686169326260247, |
| "kl": 0.1005859375, |
| "learning_rate": 8.092885375494071e-07, |
| "loss": 0.004, |
| "reward": 2.8755273818969727, |
| "reward_std": 0.06159983575344086, |
| "rewards/accuracy_reward_stage2": 0.8859438896179199, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.4375, |
| "step": 773 |
| }, |
| { |
| "completion_length": 74.765625, |
| "epoch": 0.19120553359683795, |
| "grad_norm": 4.19101339210458, |
| "kl": 0.107421875, |
| "learning_rate": 8.090415019762846e-07, |
| "loss": 0.0043, |
| "reward": 2.772860527038574, |
| "reward_std": 0.14349845051765442, |
| "rewards/accuracy_reward_stage2": 0.772860586643219, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 774 |
| }, |
| { |
| "completion_length": 87.46875, |
| "epoch": 0.19145256916996048, |
| "grad_norm": 2.3971160307653996, |
| "kl": 0.11181640625, |
| "learning_rate": 8.087944664031621e-07, |
| "loss": 0.0045, |
| "reward": 2.9060990810394287, |
| "reward_std": 0.0646752119064331, |
| "rewards/accuracy_reward_stage2": 0.9165157079696655, |
| "rewards/format_reward_all_stage": 1.9895832538604736, |
| "scores/refine_times": 1.359375, |
| "step": 775 |
| }, |
| { |
| "completion_length": 81.578125, |
| "epoch": 0.191699604743083, |
| "grad_norm": 1.6984897790582976, |
| "kl": 0.10400390625, |
| "learning_rate": 8.085474308300395e-07, |
| "loss": 0.0042, |
| "reward": 2.6885228157043457, |
| "reward_std": 0.06918665766716003, |
| "rewards/accuracy_reward_stage2": 0.7432103157043457, |
| "rewards/format_reward_all_stage": 1.9453125, |
| "scores/refine_times": 1.359375, |
| "step": 776 |
| }, |
| { |
| "completion_length": 66.953125, |
| "epoch": 0.19194664031620554, |
| "grad_norm": 4.087458802427488, |
| "kl": 0.09814453125, |
| "learning_rate": 8.08300395256917e-07, |
| "loss": 0.0039, |
| "reward": 2.8248817920684814, |
| "reward_std": 0.12380017340183258, |
| "rewards/accuracy_reward_stage2": 0.8248817920684814, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 777 |
| }, |
| { |
| "completion_length": 88.25, |
| "epoch": 0.19219367588932806, |
| "grad_norm": 2.418293642121235, |
| "kl": 0.1201171875, |
| "learning_rate": 8.080533596837944e-07, |
| "loss": 0.0048, |
| "reward": 2.590397357940674, |
| "reward_std": 0.12917400896549225, |
| "rewards/accuracy_reward_stage2": 0.5903975367546082, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 778 |
| }, |
| { |
| "completion_length": 93.75, |
| "epoch": 0.1924407114624506, |
| "grad_norm": 3.2915590592064, |
| "kl": 0.1162109375, |
| "learning_rate": 8.078063241106719e-07, |
| "loss": 0.0047, |
| "reward": 2.8586063385009766, |
| "reward_std": 0.02293376252055168, |
| "rewards/accuracy_reward_stage2": 0.8586064577102661, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 779 |
| }, |
| { |
| "completion_length": 89.515625, |
| "epoch": 0.19268774703557312, |
| "grad_norm": 4.172983979860826, |
| "kl": 0.12890625, |
| "learning_rate": 8.075592885375494e-07, |
| "loss": 0.0051, |
| "reward": 2.6774847507476807, |
| "reward_std": 0.16821645200252533, |
| "rewards/accuracy_reward_stage2": 0.7931098341941833, |
| "rewards/format_reward_all_stage": 1.884374976158142, |
| "scores/refine_times": 1.515625, |
| "step": 780 |
| }, |
| { |
| "completion_length": 88.65625, |
| "epoch": 0.19293478260869565, |
| "grad_norm": 3.7850164391712373, |
| "kl": 0.1923828125, |
| "learning_rate": 8.073122529644268e-07, |
| "loss": 0.0077, |
| "reward": 2.632174015045166, |
| "reward_std": 0.18120327591896057, |
| "rewards/accuracy_reward_stage2": 0.6998822689056396, |
| "rewards/format_reward_all_stage": 1.9322916269302368, |
| "scores/refine_times": 1.328125, |
| "step": 781 |
| }, |
| { |
| "completion_length": 88.5, |
| "epoch": 0.19318181818181818, |
| "grad_norm": 2.957342350183697, |
| "kl": 0.1376953125, |
| "learning_rate": 8.070652173913042e-07, |
| "loss": 0.0055, |
| "reward": 2.5614137649536133, |
| "reward_std": 0.07992963492870331, |
| "rewards/accuracy_reward_stage2": 0.5614137053489685, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 782 |
| }, |
| { |
| "completion_length": 112.296875, |
| "epoch": 0.1934288537549407, |
| "grad_norm": 3.8559803732895497, |
| "kl": 0.09228515625, |
| "learning_rate": 8.068181818181818e-07, |
| "loss": 0.0037, |
| "reward": 2.7399330139160156, |
| "reward_std": 0.1664211004972458, |
| "rewards/accuracy_reward_stage2": 0.7399328947067261, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.71875, |
| "step": 783 |
| }, |
| { |
| "completion_length": 88.875, |
| "epoch": 0.19367588932806323, |
| "grad_norm": 4.502436247020987, |
| "kl": 0.1298828125, |
| "learning_rate": 8.065711462450593e-07, |
| "loss": 0.0052, |
| "reward": 2.3374903202056885, |
| "reward_std": 0.2821962237358093, |
| "rewards/accuracy_reward_stage2": 0.5354070663452148, |
| "rewards/format_reward_all_stage": 1.8020832538604736, |
| "scores/refine_times": 1.265625, |
| "step": 784 |
| }, |
| { |
| "completion_length": 80.90625, |
| "epoch": 0.19392292490118576, |
| "grad_norm": 3.7802077823940197, |
| "kl": 0.11279296875, |
| "learning_rate": 8.063241106719367e-07, |
| "loss": 0.0045, |
| "reward": 2.731360912322998, |
| "reward_std": 0.03464128449559212, |
| "rewards/accuracy_reward_stage2": 0.7313610315322876, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 785 |
| }, |
| { |
| "completion_length": 118.15625, |
| "epoch": 0.1941699604743083, |
| "grad_norm": 3.555349001791891, |
| "kl": 0.0888671875, |
| "learning_rate": 8.060770750988142e-07, |
| "loss": 0.0036, |
| "reward": 2.5097384452819824, |
| "reward_std": 0.14271043241024017, |
| "rewards/accuracy_reward_stage2": 0.5982798933982849, |
| "rewards/format_reward_all_stage": 1.9114582538604736, |
| "scores/refine_times": 1.640625, |
| "step": 786 |
| }, |
| { |
| "completion_length": 106.03125, |
| "epoch": 0.19441699604743082, |
| "grad_norm": 2.85710040568485, |
| "kl": 0.08203125, |
| "learning_rate": 8.058300395256916e-07, |
| "loss": 0.0033, |
| "reward": 2.8455610275268555, |
| "reward_std": 0.05401609092950821, |
| "rewards/accuracy_reward_stage2": 0.8559777736663818, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.515625, |
| "step": 787 |
| }, |
| { |
| "completion_length": 118.0625, |
| "epoch": 0.19466403162055335, |
| "grad_norm": 3.7964493038018245, |
| "kl": 0.1572265625, |
| "learning_rate": 8.055830039525692e-07, |
| "loss": 0.0063, |
| "reward": 2.6106669902801514, |
| "reward_std": 0.21474987268447876, |
| "rewards/accuracy_reward_stage2": 0.7356671094894409, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.53125, |
| "step": 788 |
| }, |
| { |
| "completion_length": 126.265625, |
| "epoch": 0.1949110671936759, |
| "grad_norm": 2.9916850524673837, |
| "kl": 0.1044921875, |
| "learning_rate": 8.053359683794466e-07, |
| "loss": 0.0042, |
| "reward": 2.669743537902832, |
| "reward_std": 0.17657098174095154, |
| "rewards/accuracy_reward_stage2": 0.7478686571121216, |
| "rewards/format_reward_all_stage": 1.921875, |
| "scores/refine_times": 1.53125, |
| "step": 789 |
| }, |
| { |
| "completion_length": 106.75, |
| "epoch": 0.19515810276679843, |
| "grad_norm": 3.059045891709165, |
| "kl": 0.1103515625, |
| "learning_rate": 8.05088932806324e-07, |
| "loss": 0.0044, |
| "reward": 2.5722293853759766, |
| "reward_std": 0.19657298922538757, |
| "rewards/accuracy_reward_stage2": 0.5847293138504028, |
| "rewards/format_reward_all_stage": 1.9874999523162842, |
| "scores/refine_times": 1.515625, |
| "step": 790 |
| }, |
| { |
| "completion_length": 115.171875, |
| "epoch": 0.19540513833992096, |
| "grad_norm": 4.038115134383084, |
| "kl": 0.134765625, |
| "learning_rate": 8.048418972332015e-07, |
| "loss": 0.0054, |
| "reward": 2.6473679542541504, |
| "reward_std": 0.3247292637825012, |
| "rewards/accuracy_reward_stage2": 0.838513970375061, |
| "rewards/format_reward_all_stage": 1.808854103088379, |
| "scores/refine_times": 1.734375, |
| "step": 791 |
| }, |
| { |
| "completion_length": 119.84375, |
| "epoch": 0.1956521739130435, |
| "grad_norm": 3.9096509332723537, |
| "kl": 0.1083984375, |
| "learning_rate": 8.045948616600791e-07, |
| "loss": 0.0043, |
| "reward": 2.6962451934814453, |
| "reward_std": 0.12264476716518402, |
| "rewards/accuracy_reward_stage2": 0.7597866058349609, |
| "rewards/format_reward_all_stage": 1.9364583492279053, |
| "scores/refine_times": 1.671875, |
| "step": 792 |
| }, |
| { |
| "completion_length": 123.046875, |
| "epoch": 0.19589920948616601, |
| "grad_norm": 3.453621564913509, |
| "kl": 0.1005859375, |
| "learning_rate": 8.043478260869565e-07, |
| "loss": 0.004, |
| "reward": 2.64453125, |
| "reward_std": 0.14773526787757874, |
| "rewards/accuracy_reward_stage2": 0.6445313096046448, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.671875, |
| "step": 793 |
| }, |
| { |
| "completion_length": 136.171875, |
| "epoch": 0.19614624505928854, |
| "grad_norm": 3.244903631558797, |
| "kl": 0.08203125, |
| "learning_rate": 8.04100790513834e-07, |
| "loss": 0.0033, |
| "reward": 2.459873676300049, |
| "reward_std": 0.250729501247406, |
| "rewards/accuracy_reward_stage2": 0.5989362001419067, |
| "rewards/format_reward_all_stage": 1.860937476158142, |
| "scores/refine_times": 1.828125, |
| "step": 794 |
| }, |
| { |
| "completion_length": 121.84375, |
| "epoch": 0.19639328063241107, |
| "grad_norm": 3.0812502917172724, |
| "kl": 0.1201171875, |
| "learning_rate": 8.038537549407114e-07, |
| "loss": 0.0048, |
| "reward": 2.6600656509399414, |
| "reward_std": 0.21765968203544617, |
| "rewards/accuracy_reward_stage2": 0.7225657105445862, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.8125, |
| "step": 795 |
| }, |
| { |
| "completion_length": 80.90625, |
| "epoch": 0.1966403162055336, |
| "grad_norm": 3.1572409050375336, |
| "kl": 0.1474609375, |
| "learning_rate": 8.036067193675889e-07, |
| "loss": 0.0059, |
| "reward": 2.801659345626831, |
| "reward_std": 0.1534295380115509, |
| "rewards/accuracy_reward_stage2": 0.8172844648361206, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.28125, |
| "step": 796 |
| }, |
| { |
| "completion_length": 102.078125, |
| "epoch": 0.19688735177865613, |
| "grad_norm": 3.677604625623705, |
| "kl": 0.091796875, |
| "learning_rate": 8.033596837944664e-07, |
| "loss": 0.0037, |
| "reward": 2.7125244140625, |
| "reward_std": 0.21848973631858826, |
| "rewards/accuracy_reward_stage2": 0.8437741994857788, |
| "rewards/format_reward_all_stage": 1.868749976158142, |
| "scores/refine_times": 1.375, |
| "step": 797 |
| }, |
| { |
| "completion_length": 115.859375, |
| "epoch": 0.19713438735177866, |
| "grad_norm": 4.2639279037433075, |
| "kl": 0.1005859375, |
| "learning_rate": 8.031126482213438e-07, |
| "loss": 0.004, |
| "reward": 2.6685781478881836, |
| "reward_std": 0.12759262323379517, |
| "rewards/accuracy_reward_stage2": 0.731078028678894, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.5, |
| "step": 798 |
| }, |
| { |
| "completion_length": 61.765625, |
| "epoch": 0.19738142292490118, |
| "grad_norm": 7.203216209798114, |
| "kl": 0.1357421875, |
| "learning_rate": 8.028656126482212e-07, |
| "loss": 0.0054, |
| "reward": 2.653932571411133, |
| "reward_std": 0.2906397581100464, |
| "rewards/accuracy_reward_stage2": 0.6539325714111328, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 799 |
| }, |
| { |
| "completion_length": 96.875, |
| "epoch": 0.1976284584980237, |
| "grad_norm": 5.129742489372007, |
| "kl": 0.12451171875, |
| "learning_rate": 8.026185770750987e-07, |
| "loss": 0.005, |
| "reward": 2.4581518173217773, |
| "reward_std": 0.11399400979280472, |
| "rewards/accuracy_reward_stage2": 0.5154435038566589, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.359375, |
| "step": 800 |
| }, |
| { |
| "completion_length": 95.671875, |
| "epoch": 0.19787549407114624, |
| "grad_norm": 4.764586841064228, |
| "kl": 0.11181640625, |
| "learning_rate": 8.023715415019763e-07, |
| "loss": 0.0045, |
| "reward": 2.610954999923706, |
| "reward_std": 0.2346010059118271, |
| "rewards/accuracy_reward_stage2": 0.6734550595283508, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.265625, |
| "step": 801 |
| }, |
| { |
| "completion_length": 99.140625, |
| "epoch": 0.19812252964426877, |
| "grad_norm": 4.333849192860251, |
| "kl": 0.10107421875, |
| "learning_rate": 8.021245059288538e-07, |
| "loss": 0.004, |
| "reward": 2.6565942764282227, |
| "reward_std": 0.178715318441391, |
| "rewards/accuracy_reward_stage2": 0.6628443598747253, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.359375, |
| "step": 802 |
| }, |
| { |
| "completion_length": 128.1875, |
| "epoch": 0.1983695652173913, |
| "grad_norm": 4.089422740146013, |
| "kl": 0.119140625, |
| "learning_rate": 8.018774703557312e-07, |
| "loss": 0.0048, |
| "reward": 2.6466307640075684, |
| "reward_std": 0.2680560052394867, |
| "rewards/accuracy_reward_stage2": 0.7716308832168579, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.4375, |
| "step": 803 |
| }, |
| { |
| "completion_length": 81.5625, |
| "epoch": 0.19861660079051383, |
| "grad_norm": 4.131156028784667, |
| "kl": 0.1083984375, |
| "learning_rate": 8.016304347826086e-07, |
| "loss": 0.0043, |
| "reward": 2.7747063636779785, |
| "reward_std": 0.027518026530742645, |
| "rewards/accuracy_reward_stage2": 0.7747063636779785, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 804 |
| }, |
| { |
| "completion_length": 89.6875, |
| "epoch": 0.19886363636363635, |
| "grad_norm": 4.292467738406767, |
| "kl": 0.08740234375, |
| "learning_rate": 8.013833992094862e-07, |
| "loss": 0.0035, |
| "reward": 2.638862133026123, |
| "reward_std": 0.23011387884616852, |
| "rewards/accuracy_reward_stage2": 0.670112133026123, |
| "rewards/format_reward_all_stage": 1.96875, |
| "scores/refine_times": 1.25, |
| "step": 805 |
| }, |
| { |
| "completion_length": 87.78125, |
| "epoch": 0.19911067193675888, |
| "grad_norm": 3.6376261756995625, |
| "kl": 0.10205078125, |
| "learning_rate": 8.011363636363636e-07, |
| "loss": 0.0041, |
| "reward": 2.8962717056274414, |
| "reward_std": 0.030569197610020638, |
| "rewards/accuracy_reward_stage2": 0.8962716460227966, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 806 |
| }, |
| { |
| "completion_length": 86.96875, |
| "epoch": 0.1993577075098814, |
| "grad_norm": 4.069978684984678, |
| "kl": 0.10546875, |
| "learning_rate": 8.00889328063241e-07, |
| "loss": 0.0042, |
| "reward": 2.4667863845825195, |
| "reward_std": 0.08214451372623444, |
| "rewards/accuracy_reward_stage2": 0.4667862355709076, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 807 |
| }, |
| { |
| "completion_length": 124.8125, |
| "epoch": 0.19960474308300397, |
| "grad_norm": 3.6030777655101205, |
| "kl": 0.08935546875, |
| "learning_rate": 8.006422924901185e-07, |
| "loss": 0.0036, |
| "reward": 2.402216672897339, |
| "reward_std": 0.21567249298095703, |
| "rewards/accuracy_reward_stage2": 0.5272166132926941, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.453125, |
| "step": 808 |
| }, |
| { |
| "completion_length": 95.796875, |
| "epoch": 0.1998517786561265, |
| "grad_norm": 3.960382990766777, |
| "kl": 0.08154296875, |
| "learning_rate": 8.00395256916996e-07, |
| "loss": 0.0033, |
| "reward": 2.651146650314331, |
| "reward_std": 0.1337394118309021, |
| "rewards/accuracy_reward_stage2": 0.6511465907096863, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 809 |
| }, |
| { |
| "completion_length": 126.8125, |
| "epoch": 0.20009881422924902, |
| "grad_norm": 3.305927068278349, |
| "kl": 0.12060546875, |
| "learning_rate": 8.001482213438735e-07, |
| "loss": 0.0048, |
| "reward": 2.4156692028045654, |
| "reward_std": 0.21533656120300293, |
| "rewards/accuracy_reward_stage2": 0.5406690835952759, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.359375, |
| "step": 810 |
| }, |
| { |
| "completion_length": 111.78125, |
| "epoch": 0.20034584980237155, |
| "grad_norm": 2.2768402753567365, |
| "kl": 0.095703125, |
| "learning_rate": 7.99901185770751e-07, |
| "loss": 0.0038, |
| "reward": 2.449721336364746, |
| "reward_std": 0.011049442924559116, |
| "rewards/accuracy_reward_stage2": 0.44972118735313416, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 811 |
| }, |
| { |
| "completion_length": 88.09375, |
| "epoch": 0.20059288537549408, |
| "grad_norm": 4.766874200244394, |
| "kl": 0.0791015625, |
| "learning_rate": 7.996541501976284e-07, |
| "loss": 0.0032, |
| "reward": 2.599069595336914, |
| "reward_std": 0.21623960137367249, |
| "rewards/accuracy_reward_stage2": 0.7396947741508484, |
| "rewards/format_reward_all_stage": 1.859375, |
| "scores/refine_times": 1.125, |
| "step": 812 |
| }, |
| { |
| "completion_length": 89.125, |
| "epoch": 0.2008399209486166, |
| "grad_norm": 3.457097069645932, |
| "kl": 0.1435546875, |
| "learning_rate": 7.99407114624506e-07, |
| "loss": 0.0058, |
| "reward": 2.5180485248565674, |
| "reward_std": 0.2695591151714325, |
| "rewards/accuracy_reward_stage2": 0.7680485844612122, |
| "rewards/format_reward_all_stage": 1.75, |
| "scores/refine_times": 1.15625, |
| "step": 813 |
| }, |
| { |
| "completion_length": 109.375, |
| "epoch": 0.20108695652173914, |
| "grad_norm": 3.1285474295421047, |
| "kl": 0.07080078125, |
| "learning_rate": 7.991600790513834e-07, |
| "loss": 0.0028, |
| "reward": 2.4723763465881348, |
| "reward_std": 0.13593828678131104, |
| "rewards/accuracy_reward_stage2": 0.4723762571811676, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 814 |
| }, |
| { |
| "completion_length": 107.375, |
| "epoch": 0.20133399209486166, |
| "grad_norm": 4.642239916817279, |
| "kl": 0.09814453125, |
| "learning_rate": 7.989130434782608e-07, |
| "loss": 0.0039, |
| "reward": 2.4711804389953613, |
| "reward_std": 0.18163591623306274, |
| "rewards/accuracy_reward_stage2": 0.48159706592559814, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.296875, |
| "step": 815 |
| }, |
| { |
| "completion_length": 91.28125, |
| "epoch": 0.2015810276679842, |
| "grad_norm": 3.2296620896928436, |
| "kl": 0.10107421875, |
| "learning_rate": 7.986660079051383e-07, |
| "loss": 0.0041, |
| "reward": 2.847200870513916, |
| "reward_std": 0.04103899747133255, |
| "rewards/accuracy_reward_stage2": 0.8472008109092712, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 816 |
| }, |
| { |
| "completion_length": 91.984375, |
| "epoch": 0.20182806324110672, |
| "grad_norm": 4.058195341850405, |
| "kl": 0.12060546875, |
| "learning_rate": 7.984189723320158e-07, |
| "loss": 0.0048, |
| "reward": 2.5396575927734375, |
| "reward_std": 0.08468227833509445, |
| "rewards/accuracy_reward_stage2": 0.539657473564148, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 817 |
| }, |
| { |
| "completion_length": 87.640625, |
| "epoch": 0.20207509881422925, |
| "grad_norm": 3.411264589140427, |
| "kl": 0.10693359375, |
| "learning_rate": 7.981719367588932e-07, |
| "loss": 0.0043, |
| "reward": 2.590893268585205, |
| "reward_std": 0.013569341972470284, |
| "rewards/accuracy_reward_stage2": 0.5908934473991394, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 818 |
| }, |
| { |
| "completion_length": 118.203125, |
| "epoch": 0.20232213438735178, |
| "grad_norm": 3.0412069435274183, |
| "kl": 0.10595703125, |
| "learning_rate": 7.979249011857708e-07, |
| "loss": 0.0042, |
| "reward": 2.5969619750976562, |
| "reward_std": 0.06461979448795319, |
| "rewards/accuracy_reward_stage2": 0.5969619750976562, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 819 |
| }, |
| { |
| "completion_length": 106.609375, |
| "epoch": 0.2025691699604743, |
| "grad_norm": 3.9366533721995354, |
| "kl": 0.083984375, |
| "learning_rate": 7.976778656126482e-07, |
| "loss": 0.0034, |
| "reward": 2.7181453704833984, |
| "reward_std": 0.06762054562568665, |
| "rewards/accuracy_reward_stage2": 0.7754369974136353, |
| "rewards/format_reward_all_stage": 1.9427082538604736, |
| "scores/refine_times": 1.21875, |
| "step": 820 |
| }, |
| { |
| "completion_length": 101.75, |
| "epoch": 0.20281620553359683, |
| "grad_norm": 2.896684409936336, |
| "kl": 0.0869140625, |
| "learning_rate": 7.974308300395256e-07, |
| "loss": 0.0035, |
| "reward": 2.7476139068603516, |
| "reward_std": 0.13944795727729797, |
| "rewards/accuracy_reward_stage2": 0.8726138472557068, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.265625, |
| "step": 821 |
| }, |
| { |
| "completion_length": 90.9375, |
| "epoch": 0.20306324110671936, |
| "grad_norm": 4.01535366563514, |
| "kl": 0.0908203125, |
| "learning_rate": 7.971837944664032e-07, |
| "loss": 0.0036, |
| "reward": 2.506101131439209, |
| "reward_std": 0.08851330727338791, |
| "rewards/accuracy_reward_stage2": 0.5061010122299194, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 822 |
| }, |
| { |
| "completion_length": 98.8125, |
| "epoch": 0.2033102766798419, |
| "grad_norm": 3.350308544289007, |
| "kl": 0.1005859375, |
| "learning_rate": 7.969367588932806e-07, |
| "loss": 0.004, |
| "reward": 2.8014578819274902, |
| "reward_std": 0.08593001216650009, |
| "rewards/accuracy_reward_stage2": 0.8092702627182007, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.203125, |
| "step": 823 |
| }, |
| { |
| "completion_length": 107.828125, |
| "epoch": 0.20355731225296442, |
| "grad_norm": 3.0783484634726546, |
| "kl": 0.1005859375, |
| "learning_rate": 7.96689723320158e-07, |
| "loss": 0.004, |
| "reward": 2.5391650199890137, |
| "reward_std": 0.10030417889356613, |
| "rewards/accuracy_reward_stage2": 0.5391650795936584, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 824 |
| }, |
| { |
| "completion_length": 66.0625, |
| "epoch": 0.20380434782608695, |
| "grad_norm": 4.045892982620479, |
| "kl": 0.099609375, |
| "learning_rate": 7.964426877470355e-07, |
| "loss": 0.004, |
| "reward": 2.76053524017334, |
| "reward_std": 0.01285035815089941, |
| "rewards/accuracy_reward_stage2": 0.7605355381965637, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 825 |
| }, |
| { |
| "completion_length": 88.390625, |
| "epoch": 0.20405138339920947, |
| "grad_norm": 2.4907619826917724, |
| "kl": 0.0859375, |
| "learning_rate": 7.96195652173913e-07, |
| "loss": 0.0034, |
| "reward": 2.803270101547241, |
| "reward_std": 0.03927075117826462, |
| "rewards/accuracy_reward_stage2": 0.8032701015472412, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 826 |
| }, |
| { |
| "completion_length": 92.828125, |
| "epoch": 0.204298418972332, |
| "grad_norm": 3.734962052920799, |
| "kl": 0.091796875, |
| "learning_rate": 7.959486166007904e-07, |
| "loss": 0.0037, |
| "reward": 2.655478000640869, |
| "reward_std": 0.028529653325676918, |
| "rewards/accuracy_reward_stage2": 0.6554780602455139, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 827 |
| }, |
| { |
| "completion_length": 118.625, |
| "epoch": 0.20454545454545456, |
| "grad_norm": 3.570207149465539, |
| "kl": 0.0810546875, |
| "learning_rate": 7.957015810276679e-07, |
| "loss": 0.0032, |
| "reward": 2.61575984954834, |
| "reward_std": 0.04259338974952698, |
| "rewards/accuracy_reward_stage2": 0.6157597303390503, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 828 |
| }, |
| { |
| "completion_length": 107.734375, |
| "epoch": 0.2047924901185771, |
| "grad_norm": 3.982682150827258, |
| "kl": 0.068359375, |
| "learning_rate": 7.954545454545454e-07, |
| "loss": 0.0027, |
| "reward": 2.3884055614471436, |
| "reward_std": 0.16423478722572327, |
| "rewards/accuracy_reward_stage2": 0.45871806144714355, |
| "rewards/format_reward_all_stage": 1.9296875, |
| "scores/refine_times": 1.28125, |
| "step": 829 |
| }, |
| { |
| "completion_length": 80.328125, |
| "epoch": 0.20503952569169961, |
| "grad_norm": 0.8626963889416652, |
| "kl": 0.07568359375, |
| "learning_rate": 7.95207509881423e-07, |
| "loss": 0.003, |
| "reward": 2.832073211669922, |
| "reward_std": 0.012326827272772789, |
| "rewards/accuracy_reward_stage2": 0.8320731520652771, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 830 |
| }, |
| { |
| "completion_length": 111.59375, |
| "epoch": 0.20528656126482214, |
| "grad_norm": 1.9373113229582275, |
| "kl": 0.08447265625, |
| "learning_rate": 7.949604743083004e-07, |
| "loss": 0.0034, |
| "reward": 2.71048903465271, |
| "reward_std": 0.10930237174034119, |
| "rewards/accuracy_reward_stage2": 0.7365307211875916, |
| "rewards/format_reward_all_stage": 1.9739583730697632, |
| "scores/refine_times": 1.328125, |
| "step": 831 |
| }, |
| { |
| "completion_length": 108.078125, |
| "epoch": 0.20553359683794467, |
| "grad_norm": 3.4279539102400514, |
| "kl": 0.10302734375, |
| "learning_rate": 7.947134387351778e-07, |
| "loss": 0.0041, |
| "reward": 2.6778836250305176, |
| "reward_std": 0.05375240743160248, |
| "rewards/accuracy_reward_stage2": 0.6778836250305176, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 832 |
| }, |
| { |
| "completion_length": 104.078125, |
| "epoch": 0.2057806324110672, |
| "grad_norm": 12.44012981683348, |
| "kl": 0.470703125, |
| "learning_rate": 7.944664031620553e-07, |
| "loss": 0.0189, |
| "reward": 2.657538414001465, |
| "reward_std": 0.05722765997052193, |
| "rewards/accuracy_reward_stage2": 0.6575384140014648, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 833 |
| }, |
| { |
| "completion_length": 149.515625, |
| "epoch": 0.20602766798418973, |
| "grad_norm": 1.2161205838642917, |
| "kl": 0.068359375, |
| "learning_rate": 7.942193675889328e-07, |
| "loss": 0.0027, |
| "reward": 2.7292327880859375, |
| "reward_std": 0.020421404391527176, |
| "rewards/accuracy_reward_stage2": 0.7292326092720032, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 834 |
| }, |
| { |
| "completion_length": 120.046875, |
| "epoch": 0.20627470355731226, |
| "grad_norm": 4.827345708037369, |
| "kl": 0.09814453125, |
| "learning_rate": 7.939723320158102e-07, |
| "loss": 0.0039, |
| "reward": 2.404914379119873, |
| "reward_std": 0.24059876799583435, |
| "rewards/accuracy_reward_stage2": 0.598664402961731, |
| "rewards/format_reward_all_stage": 1.806249976158142, |
| "scores/refine_times": 1.328125, |
| "step": 835 |
| }, |
| { |
| "completion_length": 139.640625, |
| "epoch": 0.20652173913043478, |
| "grad_norm": 2.549571532494945, |
| "kl": 0.08642578125, |
| "learning_rate": 7.937252964426877e-07, |
| "loss": 0.0035, |
| "reward": 2.589641571044922, |
| "reward_std": 0.2173718512058258, |
| "rewards/accuracy_reward_stage2": 0.7146413922309875, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.515625, |
| "step": 836 |
| }, |
| { |
| "completion_length": 102.03125, |
| "epoch": 0.2067687747035573, |
| "grad_norm": 2.3032759644093197, |
| "kl": 0.07958984375, |
| "learning_rate": 7.934782608695651e-07, |
| "loss": 0.0032, |
| "reward": 2.698106050491333, |
| "reward_std": 0.1670789122581482, |
| "rewards/accuracy_reward_stage2": 0.760606050491333, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.28125, |
| "step": 837 |
| }, |
| { |
| "completion_length": 102.625, |
| "epoch": 0.20701581027667984, |
| "grad_norm": 2.7054313518366575, |
| "kl": 0.08935546875, |
| "learning_rate": 7.932312252964426e-07, |
| "loss": 0.0036, |
| "reward": 2.827049970626831, |
| "reward_std": 0.02332034707069397, |
| "rewards/accuracy_reward_stage2": 0.827049970626831, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 838 |
| }, |
| { |
| "completion_length": 134.90625, |
| "epoch": 0.20726284584980237, |
| "grad_norm": 3.525087666814418, |
| "kl": 0.083984375, |
| "learning_rate": 7.929841897233202e-07, |
| "loss": 0.0034, |
| "reward": 2.7634880542755127, |
| "reward_std": 0.16727206110954285, |
| "rewards/accuracy_reward_stage2": 0.8963003754615784, |
| "rewards/format_reward_all_stage": 1.8671875, |
| "scores/refine_times": 1.453125, |
| "step": 839 |
| }, |
| { |
| "completion_length": 133.34375, |
| "epoch": 0.2075098814229249, |
| "grad_norm": 2.803983186699194, |
| "kl": 0.0732421875, |
| "learning_rate": 7.927371541501976e-07, |
| "loss": 0.0029, |
| "reward": 2.6079330444335938, |
| "reward_std": 0.0722799152135849, |
| "rewards/accuracy_reward_stage2": 0.6235581040382385, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.703125, |
| "step": 840 |
| }, |
| { |
| "completion_length": 84.6875, |
| "epoch": 0.20775691699604742, |
| "grad_norm": 0.9087254095102063, |
| "kl": 0.08203125, |
| "learning_rate": 7.92490118577075e-07, |
| "loss": 0.0033, |
| "reward": 2.972470283508301, |
| "reward_std": 0.04419417306780815, |
| "rewards/accuracy_reward_stage2": 0.972470223903656, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 841 |
| }, |
| { |
| "completion_length": 96.421875, |
| "epoch": 0.20800395256916995, |
| "grad_norm": 4.268367932857312, |
| "kl": 0.1064453125, |
| "learning_rate": 7.922430830039525e-07, |
| "loss": 0.0043, |
| "reward": 2.7230210304260254, |
| "reward_std": 0.09623900800943375, |
| "rewards/accuracy_reward_stage2": 0.7308334112167358, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.1875, |
| "step": 842 |
| }, |
| { |
| "completion_length": 94.1875, |
| "epoch": 0.20825098814229248, |
| "grad_norm": 3.751185950471063, |
| "kl": 0.08740234375, |
| "learning_rate": 7.9199604743083e-07, |
| "loss": 0.0035, |
| "reward": 2.553612470626831, |
| "reward_std": 0.13736850023269653, |
| "rewards/accuracy_reward_stage2": 0.6192374229431152, |
| "rewards/format_reward_all_stage": 1.9343750476837158, |
| "scores/refine_times": 1.265625, |
| "step": 843 |
| }, |
| { |
| "completion_length": 111.6875, |
| "epoch": 0.208498023715415, |
| "grad_norm": 2.3607450354442774, |
| "kl": 0.09326171875, |
| "learning_rate": 7.917490118577075e-07, |
| "loss": 0.0037, |
| "reward": 2.6834843158721924, |
| "reward_std": 0.04238169267773628, |
| "rewards/accuracy_reward_stage2": 0.6897343993186951, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.421875, |
| "step": 844 |
| }, |
| { |
| "completion_length": 97.8125, |
| "epoch": 0.20874505928853754, |
| "grad_norm": 2.810706581086738, |
| "kl": 0.10302734375, |
| "learning_rate": 7.915019762845849e-07, |
| "loss": 0.0041, |
| "reward": 2.604111433029175, |
| "reward_std": 0.10309471189975739, |
| "rewards/accuracy_reward_stage2": 0.6614030599594116, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.21875, |
| "step": 845 |
| }, |
| { |
| "completion_length": 117.25, |
| "epoch": 0.20899209486166007, |
| "grad_norm": 2.7354608601120294, |
| "kl": 0.12353515625, |
| "learning_rate": 7.912549407114623e-07, |
| "loss": 0.0049, |
| "reward": 2.600095272064209, |
| "reward_std": 0.10307259112596512, |
| "rewards/accuracy_reward_stage2": 0.6188453435897827, |
| "rewards/format_reward_all_stage": 1.9812500476837158, |
| "scores/refine_times": 1.703125, |
| "step": 846 |
| }, |
| { |
| "completion_length": 94.34375, |
| "epoch": 0.20923913043478262, |
| "grad_norm": 2.56722136517573, |
| "kl": 0.10107421875, |
| "learning_rate": 7.9100790513834e-07, |
| "loss": 0.0041, |
| "reward": 2.5527243614196777, |
| "reward_std": 0.12820011377334595, |
| "rewards/accuracy_reward_stage2": 0.5995994806289673, |
| "rewards/format_reward_all_stage": 1.953125, |
| "scores/refine_times": 1.4375, |
| "step": 847 |
| }, |
| { |
| "completion_length": 127.0625, |
| "epoch": 0.20948616600790515, |
| "grad_norm": 2.391728807768981, |
| "kl": 0.0751953125, |
| "learning_rate": 7.907608695652174e-07, |
| "loss": 0.003, |
| "reward": 2.7395739555358887, |
| "reward_std": 0.15919004380702972, |
| "rewards/accuracy_reward_stage2": 0.7614490985870361, |
| "rewards/format_reward_all_stage": 1.978124976158142, |
| "scores/refine_times": 1.6875, |
| "step": 848 |
| }, |
| { |
| "completion_length": 139.4375, |
| "epoch": 0.20973320158102768, |
| "grad_norm": 2.186650932071641, |
| "kl": 0.07763671875, |
| "learning_rate": 7.905138339920948e-07, |
| "loss": 0.0031, |
| "reward": 2.6006877422332764, |
| "reward_std": 0.20573003590106964, |
| "rewards/accuracy_reward_stage2": 0.7256878018379211, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.703125, |
| "step": 849 |
| }, |
| { |
| "completion_length": 120.609375, |
| "epoch": 0.2099802371541502, |
| "grad_norm": 1.7864914929400448, |
| "kl": 0.087890625, |
| "learning_rate": 7.902667984189723e-07, |
| "loss": 0.0035, |
| "reward": 2.523496627807617, |
| "reward_std": 0.06999015808105469, |
| "rewards/accuracy_reward_stage2": 0.5297467708587646, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.46875, |
| "step": 850 |
| }, |
| { |
| "completion_length": 106.8125, |
| "epoch": 0.21022727272727273, |
| "grad_norm": 1.806522155761803, |
| "kl": 0.078125, |
| "learning_rate": 7.900197628458498e-07, |
| "loss": 0.0031, |
| "reward": 2.625, |
| "reward_std": 0.13363061845302582, |
| "rewards/accuracy_reward_stage2": 0.75, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.234375, |
| "step": 851 |
| }, |
| { |
| "completion_length": 115.234375, |
| "epoch": 0.21047430830039526, |
| "grad_norm": 4.110304936221235, |
| "kl": 0.1533203125, |
| "learning_rate": 7.897727272727272e-07, |
| "loss": 0.0061, |
| "reward": 2.6135501861572266, |
| "reward_std": 0.0783662497997284, |
| "rewards/accuracy_reward_stage2": 0.6135500073432922, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.546875, |
| "step": 852 |
| }, |
| { |
| "completion_length": 97.859375, |
| "epoch": 0.2107213438735178, |
| "grad_norm": 2.359536619986747, |
| "kl": 0.078125, |
| "learning_rate": 7.895256916996047e-07, |
| "loss": 0.0031, |
| "reward": 2.7388830184936523, |
| "reward_std": 0.04497361183166504, |
| "rewards/accuracy_reward_stage2": 0.7451329827308655, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.3125, |
| "step": 853 |
| }, |
| { |
| "completion_length": 84.375, |
| "epoch": 0.21096837944664032, |
| "grad_norm": 1.9983144321891226, |
| "kl": 0.0830078125, |
| "learning_rate": 7.892786561264821e-07, |
| "loss": 0.0033, |
| "reward": 2.8331847190856934, |
| "reward_std": 0.003715165425091982, |
| "rewards/accuracy_reward_stage2": 0.8331847190856934, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 854 |
| }, |
| { |
| "completion_length": 95.53125, |
| "epoch": 0.21121541501976285, |
| "grad_norm": 4.26865105498033, |
| "kl": 0.1103515625, |
| "learning_rate": 7.890316205533597e-07, |
| "loss": 0.0044, |
| "reward": 2.5257091522216797, |
| "reward_std": 0.08476169407367706, |
| "rewards/accuracy_reward_stage2": 0.5934174060821533, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.359375, |
| "step": 855 |
| }, |
| { |
| "completion_length": 84.3125, |
| "epoch": 0.21146245059288538, |
| "grad_norm": 3.9489524139672327, |
| "kl": 0.0693359375, |
| "learning_rate": 7.887845849802372e-07, |
| "loss": 0.0028, |
| "reward": 2.546104907989502, |
| "reward_std": 0.08534545451402664, |
| "rewards/accuracy_reward_stage2": 0.5461047291755676, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 856 |
| }, |
| { |
| "completion_length": 102.078125, |
| "epoch": 0.2117094861660079, |
| "grad_norm": 3.4011550909615917, |
| "kl": 0.087890625, |
| "learning_rate": 7.885375494071146e-07, |
| "loss": 0.0035, |
| "reward": 2.5729289054870605, |
| "reward_std": 0.024190258234739304, |
| "rewards/accuracy_reward_stage2": 0.5729289054870605, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 857 |
| }, |
| { |
| "completion_length": 94.0, |
| "epoch": 0.21195652173913043, |
| "grad_norm": 3.6691553352007062, |
| "kl": 0.09765625, |
| "learning_rate": 7.882905138339921e-07, |
| "loss": 0.0039, |
| "reward": 2.7282519340515137, |
| "reward_std": 0.14093205332756042, |
| "rewards/accuracy_reward_stage2": 0.7282518148422241, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 858 |
| }, |
| { |
| "completion_length": 136.171875, |
| "epoch": 0.21220355731225296, |
| "grad_norm": 3.438976115970983, |
| "kl": 0.0859375, |
| "learning_rate": 7.880434782608695e-07, |
| "loss": 0.0034, |
| "reward": 2.5337977409362793, |
| "reward_std": 0.17680373787879944, |
| "rewards/accuracy_reward_stage2": 0.6035893559455872, |
| "rewards/format_reward_all_stage": 1.9302083253860474, |
| "scores/refine_times": 1.65625, |
| "step": 859 |
| }, |
| { |
| "completion_length": 116.359375, |
| "epoch": 0.2124505928853755, |
| "grad_norm": 3.3182114661161433, |
| "kl": 0.087890625, |
| "learning_rate": 7.87796442687747e-07, |
| "loss": 0.0035, |
| "reward": 2.672604560852051, |
| "reward_std": 0.22977818548679352, |
| "rewards/accuracy_reward_stage2": 0.7351046800613403, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.453125, |
| "step": 860 |
| }, |
| { |
| "completion_length": 86.671875, |
| "epoch": 0.21269762845849802, |
| "grad_norm": 3.0669111128024706, |
| "kl": 0.0712890625, |
| "learning_rate": 7.875494071146245e-07, |
| "loss": 0.0029, |
| "reward": 2.748086452484131, |
| "reward_std": 0.03686128184199333, |
| "rewards/accuracy_reward_stage2": 0.7480865716934204, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 861 |
| }, |
| { |
| "completion_length": 104.65625, |
| "epoch": 0.21294466403162055, |
| "grad_norm": 2.7593142734907627, |
| "kl": 0.0888671875, |
| "learning_rate": 7.873023715415019e-07, |
| "loss": 0.0036, |
| "reward": 2.674123525619507, |
| "reward_std": 0.15257620811462402, |
| "rewards/accuracy_reward_stage2": 0.68818598985672, |
| "rewards/format_reward_all_stage": 1.985937476158142, |
| "scores/refine_times": 1.5625, |
| "step": 862 |
| }, |
| { |
| "completion_length": 110.3125, |
| "epoch": 0.21319169960474307, |
| "grad_norm": 2.715061158769814, |
| "kl": 0.0771484375, |
| "learning_rate": 7.870553359683793e-07, |
| "loss": 0.0031, |
| "reward": 2.671416759490967, |
| "reward_std": 0.09916723519563675, |
| "rewards/accuracy_reward_stage2": 0.6714168787002563, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 863 |
| }, |
| { |
| "completion_length": 140.625, |
| "epoch": 0.2134387351778656, |
| "grad_norm": 4.1276033550796765, |
| "kl": 0.080078125, |
| "learning_rate": 7.868083003952569e-07, |
| "loss": 0.0032, |
| "reward": 2.622908115386963, |
| "reward_std": 0.11840154230594635, |
| "rewards/accuracy_reward_stage2": 0.6229078769683838, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.546875, |
| "step": 864 |
| }, |
| { |
| "completion_length": 106.6875, |
| "epoch": 0.21368577075098813, |
| "grad_norm": 1.7834332667991692, |
| "kl": 0.07275390625, |
| "learning_rate": 7.865612648221343e-07, |
| "loss": 0.0029, |
| "reward": 2.72995662689209, |
| "reward_std": 0.06430064141750336, |
| "rewards/accuracy_reward_stage2": 0.7299565076828003, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 865 |
| }, |
| { |
| "completion_length": 109.0, |
| "epoch": 0.21393280632411066, |
| "grad_norm": 6.533478455009108, |
| "kl": 0.056396484375, |
| "learning_rate": 7.863142292490119e-07, |
| "loss": 0.0023, |
| "reward": 2.5403122901916504, |
| "reward_std": 0.14339181780815125, |
| "rewards/accuracy_reward_stage2": 0.5403121113777161, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.109375, |
| "step": 866 |
| }, |
| { |
| "completion_length": 127.734375, |
| "epoch": 0.21417984189723321, |
| "grad_norm": 2.927182477110509, |
| "kl": 0.09375, |
| "learning_rate": 7.860671936758893e-07, |
| "loss": 0.0038, |
| "reward": 2.6628847122192383, |
| "reward_std": 0.1707833856344223, |
| "rewards/accuracy_reward_stage2": 0.6847599744796753, |
| "rewards/format_reward_all_stage": 1.978124976158142, |
| "scores/refine_times": 1.5625, |
| "step": 867 |
| }, |
| { |
| "completion_length": 84.125, |
| "epoch": 0.21442687747035574, |
| "grad_norm": 2.022836352589753, |
| "kl": 0.0771484375, |
| "learning_rate": 7.858201581027668e-07, |
| "loss": 0.0031, |
| "reward": 2.7814769744873047, |
| "reward_std": 0.03681057691574097, |
| "rewards/accuracy_reward_stage2": 0.7814772129058838, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 868 |
| }, |
| { |
| "completion_length": 112.1875, |
| "epoch": 0.21467391304347827, |
| "grad_norm": 4.342396733664187, |
| "kl": 0.07177734375, |
| "learning_rate": 7.855731225296443e-07, |
| "loss": 0.0029, |
| "reward": 2.410027503967285, |
| "reward_std": 0.20548567175865173, |
| "rewards/accuracy_reward_stage2": 0.5131524801254272, |
| "rewards/format_reward_all_stage": 1.896875023841858, |
| "scores/refine_times": 1.4375, |
| "step": 869 |
| }, |
| { |
| "completion_length": 123.40625, |
| "epoch": 0.2149209486166008, |
| "grad_norm": 1.617395557985399, |
| "kl": 0.07373046875, |
| "learning_rate": 7.853260869565217e-07, |
| "loss": 0.003, |
| "reward": 2.830418348312378, |
| "reward_std": 0.01754125952720642, |
| "rewards/accuracy_reward_stage2": 0.8304183483123779, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 870 |
| }, |
| { |
| "completion_length": 75.0625, |
| "epoch": 0.21516798418972333, |
| "grad_norm": 4.3176509106435255, |
| "kl": 0.078125, |
| "learning_rate": 7.850790513833991e-07, |
| "loss": 0.0031, |
| "reward": 2.724806308746338, |
| "reward_std": 0.04878024384379387, |
| "rewards/accuracy_reward_stage2": 0.7248064875602722, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 871 |
| }, |
| { |
| "completion_length": 129.4375, |
| "epoch": 0.21541501976284586, |
| "grad_norm": 2.812378128182156, |
| "kl": 0.083984375, |
| "learning_rate": 7.848320158102767e-07, |
| "loss": 0.0034, |
| "reward": 2.668769359588623, |
| "reward_std": 0.09495694190263748, |
| "rewards/accuracy_reward_stage2": 0.6750193238258362, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.546875, |
| "step": 872 |
| }, |
| { |
| "completion_length": 89.0, |
| "epoch": 0.21566205533596838, |
| "grad_norm": 2.8304688457113945, |
| "kl": 0.0869140625, |
| "learning_rate": 7.845849802371541e-07, |
| "loss": 0.0035, |
| "reward": 2.8485946655273438, |
| "reward_std": 0.05919472128152847, |
| "rewards/accuracy_reward_stage2": 0.8485945463180542, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 873 |
| }, |
| { |
| "completion_length": 90.578125, |
| "epoch": 0.2159090909090909, |
| "grad_norm": 3.8502592160508633, |
| "kl": 0.06982421875, |
| "learning_rate": 7.843379446640315e-07, |
| "loss": 0.0028, |
| "reward": 2.465599298477173, |
| "reward_std": 0.2232595980167389, |
| "rewards/accuracy_reward_stage2": 0.5905991792678833, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.203125, |
| "step": 874 |
| }, |
| { |
| "completion_length": 107.96875, |
| "epoch": 0.21615612648221344, |
| "grad_norm": 2.163127681716706, |
| "kl": 0.08740234375, |
| "learning_rate": 7.840909090909091e-07, |
| "loss": 0.0035, |
| "reward": 2.5576090812683105, |
| "reward_std": 0.07345429062843323, |
| "rewards/accuracy_reward_stage2": 0.5576090812683105, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.40625, |
| "step": 875 |
| }, |
| { |
| "completion_length": 90.609375, |
| "epoch": 0.21640316205533597, |
| "grad_norm": 2.911026203681279, |
| "kl": 0.07568359375, |
| "learning_rate": 7.838438735177866e-07, |
| "loss": 0.003, |
| "reward": 2.7440195083618164, |
| "reward_std": 0.10027820616960526, |
| "rewards/accuracy_reward_stage2": 0.7440195083618164, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 876 |
| }, |
| { |
| "completion_length": 118.5625, |
| "epoch": 0.2166501976284585, |
| "grad_norm": 3.7722970526311936, |
| "kl": 0.078125, |
| "learning_rate": 7.83596837944664e-07, |
| "loss": 0.0031, |
| "reward": 2.705394744873047, |
| "reward_std": 0.2528733015060425, |
| "rewards/accuracy_reward_stage2": 0.7116448283195496, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.46875, |
| "step": 877 |
| }, |
| { |
| "completion_length": 79.015625, |
| "epoch": 0.21689723320158102, |
| "grad_norm": 2.2348619430388097, |
| "kl": 0.1025390625, |
| "learning_rate": 7.833498023715415e-07, |
| "loss": 0.0041, |
| "reward": 2.8125, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.8125, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 878 |
| }, |
| { |
| "completion_length": 101.59375, |
| "epoch": 0.21714426877470355, |
| "grad_norm": 1.7133622297820073, |
| "kl": 0.06201171875, |
| "learning_rate": 7.831027667984189e-07, |
| "loss": 0.0025, |
| "reward": 2.6325061321258545, |
| "reward_std": 0.13117417693138123, |
| "rewards/accuracy_reward_stage2": 0.7575061321258545, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.15625, |
| "step": 879 |
| }, |
| { |
| "completion_length": 126.6875, |
| "epoch": 0.21739130434782608, |
| "grad_norm": 3.445132940662978, |
| "kl": 0.0703125, |
| "learning_rate": 7.828557312252963e-07, |
| "loss": 0.0028, |
| "reward": 2.574112892150879, |
| "reward_std": 0.10674077272415161, |
| "rewards/accuracy_reward_stage2": 0.5741128921508789, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 880 |
| }, |
| { |
| "completion_length": 111.34375, |
| "epoch": 0.2176383399209486, |
| "grad_norm": 3.4192114253246424, |
| "kl": 0.07666015625, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 0.0031, |
| "reward": 2.6431050300598145, |
| "reward_std": 0.21736359596252441, |
| "rewards/accuracy_reward_stage2": 0.7681052088737488, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.3125, |
| "step": 881 |
| }, |
| { |
| "completion_length": 102.953125, |
| "epoch": 0.21788537549407114, |
| "grad_norm": 2.257112329346978, |
| "kl": 0.0791015625, |
| "learning_rate": 7.823616600790513e-07, |
| "loss": 0.0032, |
| "reward": 2.7524609565734863, |
| "reward_std": 0.0027502209413796663, |
| "rewards/accuracy_reward_stage2": 0.7524610161781311, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 882 |
| }, |
| { |
| "completion_length": 103.15625, |
| "epoch": 0.21813241106719367, |
| "grad_norm": 3.942982015966452, |
| "kl": 0.072265625, |
| "learning_rate": 7.821146245059288e-07, |
| "loss": 0.0029, |
| "reward": 2.6912498474121094, |
| "reward_std": 0.0877779871225357, |
| "rewards/accuracy_reward_stage2": 0.7016666531562805, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.296875, |
| "step": 883 |
| }, |
| { |
| "completion_length": 92.46875, |
| "epoch": 0.2183794466403162, |
| "grad_norm": 4.112406050123377, |
| "kl": 0.1376953125, |
| "learning_rate": 7.818675889328063e-07, |
| "loss": 0.0055, |
| "reward": 2.843651056289673, |
| "reward_std": 0.11524404585361481, |
| "rewards/accuracy_reward_stage2": 0.8436509966850281, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 884 |
| }, |
| { |
| "completion_length": 100.5625, |
| "epoch": 0.21862648221343872, |
| "grad_norm": 2.8425864804826904, |
| "kl": 0.099609375, |
| "learning_rate": 7.816205533596838e-07, |
| "loss": 0.004, |
| "reward": 2.7560110092163086, |
| "reward_std": 0.07648982852697372, |
| "rewards/accuracy_reward_stage2": 0.7560111284255981, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 885 |
| }, |
| { |
| "completion_length": 153.015625, |
| "epoch": 0.21887351778656128, |
| "grad_norm": 1.7320481010726945, |
| "kl": 0.07861328125, |
| "learning_rate": 7.813735177865613e-07, |
| "loss": 0.0031, |
| "reward": 2.765270233154297, |
| "reward_std": 0.033314161002635956, |
| "rewards/accuracy_reward_stage2": 0.7652702331542969, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.640625, |
| "step": 886 |
| }, |
| { |
| "completion_length": 165.3125, |
| "epoch": 0.2191205533596838, |
| "grad_norm": 3.2769897873042666, |
| "kl": 0.06396484375, |
| "learning_rate": 7.811264822134387e-07, |
| "loss": 0.0025, |
| "reward": 2.6706483364105225, |
| "reward_std": 0.09551921486854553, |
| "rewards/accuracy_reward_stage2": 0.6706483364105225, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.75, |
| "step": 887 |
| }, |
| { |
| "completion_length": 87.90625, |
| "epoch": 0.21936758893280633, |
| "grad_norm": 2.9755951499704385, |
| "kl": 0.09765625, |
| "learning_rate": 7.808794466403161e-07, |
| "loss": 0.0039, |
| "reward": 2.736006736755371, |
| "reward_std": 0.0732710063457489, |
| "rewards/accuracy_reward_stage2": 0.7360066771507263, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 888 |
| }, |
| { |
| "completion_length": 95.109375, |
| "epoch": 0.21961462450592886, |
| "grad_norm": 3.388507614969505, |
| "kl": 0.07568359375, |
| "learning_rate": 7.806324110671937e-07, |
| "loss": 0.003, |
| "reward": 2.7535786628723145, |
| "reward_std": 0.028518326580524445, |
| "rewards/accuracy_reward_stage2": 0.7535787224769592, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 889 |
| }, |
| { |
| "completion_length": 85.640625, |
| "epoch": 0.2198616600790514, |
| "grad_norm": 3.8655007295215915, |
| "kl": 0.06298828125, |
| "learning_rate": 7.803853754940711e-07, |
| "loss": 0.0025, |
| "reward": 2.7656240463256836, |
| "reward_std": 0.10700556635856628, |
| "rewards/accuracy_reward_stage2": 0.7656240463256836, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 890 |
| }, |
| { |
| "completion_length": 93.90625, |
| "epoch": 0.22010869565217392, |
| "grad_norm": 1.700698474645943, |
| "kl": 0.08544921875, |
| "learning_rate": 7.801383399209485e-07, |
| "loss": 0.0034, |
| "reward": 2.713747501373291, |
| "reward_std": 0.02260005660355091, |
| "rewards/accuracy_reward_stage2": 0.7137476205825806, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 891 |
| }, |
| { |
| "completion_length": 118.640625, |
| "epoch": 0.22035573122529645, |
| "grad_norm": 2.2855748354605305, |
| "kl": 0.0634765625, |
| "learning_rate": 7.79891304347826e-07, |
| "loss": 0.0025, |
| "reward": 2.671875, |
| "reward_std": 0.11100947856903076, |
| "rewards/accuracy_reward_stage2": 0.671875, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 892 |
| }, |
| { |
| "completion_length": 88.109375, |
| "epoch": 0.22060276679841898, |
| "grad_norm": 3.9695782246909497, |
| "kl": 0.06005859375, |
| "learning_rate": 7.796442687747036e-07, |
| "loss": 0.0024, |
| "reward": 2.7079808712005615, |
| "reward_std": 0.1411241590976715, |
| "rewards/accuracy_reward_stage2": 0.7079808712005615, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 893 |
| }, |
| { |
| "completion_length": 100.046875, |
| "epoch": 0.2208498023715415, |
| "grad_norm": 3.3606486294352473, |
| "kl": 0.0810546875, |
| "learning_rate": 7.793972332015811e-07, |
| "loss": 0.0033, |
| "reward": 2.6410939693450928, |
| "reward_std": 0.11747156083583832, |
| "rewards/accuracy_reward_stage2": 0.6410939693450928, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 894 |
| }, |
| { |
| "completion_length": 84.78125, |
| "epoch": 0.22109683794466403, |
| "grad_norm": 3.441456907155892, |
| "kl": 0.0859375, |
| "learning_rate": 7.791501976284585e-07, |
| "loss": 0.0034, |
| "reward": 2.6340184211730957, |
| "reward_std": 0.08257357776165009, |
| "rewards/accuracy_reward_stage2": 0.6340183019638062, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 895 |
| }, |
| { |
| "completion_length": 110.4375, |
| "epoch": 0.22134387351778656, |
| "grad_norm": 2.6882059908650606, |
| "kl": 0.0634765625, |
| "learning_rate": 7.789031620553359e-07, |
| "loss": 0.0025, |
| "reward": 2.633941173553467, |
| "reward_std": 0.11902174353599548, |
| "rewards/accuracy_reward_stage2": 0.6912329196929932, |
| "rewards/format_reward_all_stage": 1.9427083730697632, |
| "scores/refine_times": 1.328125, |
| "step": 896 |
| }, |
| { |
| "completion_length": 111.765625, |
| "epoch": 0.2215909090909091, |
| "grad_norm": 2.5472008274936466, |
| "kl": 0.07763671875, |
| "learning_rate": 7.786561264822134e-07, |
| "loss": 0.0031, |
| "reward": 2.6494970321655273, |
| "reward_std": 0.1488344967365265, |
| "rewards/accuracy_reward_stage2": 0.6494969725608826, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.625, |
| "step": 897 |
| }, |
| { |
| "completion_length": 142.640625, |
| "epoch": 0.22183794466403162, |
| "grad_norm": 3.487433794581114, |
| "kl": 0.061767578125, |
| "learning_rate": 7.784090909090909e-07, |
| "loss": 0.0025, |
| "reward": 2.633687734603882, |
| "reward_std": 0.16814634203910828, |
| "rewards/accuracy_reward_stage2": 0.6336876749992371, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.515625, |
| "step": 898 |
| }, |
| { |
| "completion_length": 97.71875, |
| "epoch": 0.22208498023715414, |
| "grad_norm": 2.1276640745086226, |
| "kl": 0.068359375, |
| "learning_rate": 7.781620553359683e-07, |
| "loss": 0.0027, |
| "reward": 2.71793270111084, |
| "reward_std": 0.02380518987774849, |
| "rewards/accuracy_reward_stage2": 0.7179328799247742, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 899 |
| }, |
| { |
| "completion_length": 89.171875, |
| "epoch": 0.22233201581027667, |
| "grad_norm": 4.0853981081869435, |
| "kl": 0.08642578125, |
| "learning_rate": 7.779150197628458e-07, |
| "loss": 0.0035, |
| "reward": 2.381068229675293, |
| "reward_std": 0.2457825243473053, |
| "rewards/accuracy_reward_stage2": 0.5060682892799377, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.265625, |
| "step": 900 |
| }, |
| { |
| "completion_length": 86.390625, |
| "epoch": 0.2225790513833992, |
| "grad_norm": 3.8442263343748673, |
| "kl": 0.0703125, |
| "learning_rate": 7.776679841897232e-07, |
| "loss": 0.0028, |
| "reward": 2.7973074913024902, |
| "reward_std": 0.070428267121315, |
| "rewards/accuracy_reward_stage2": 0.7973074913024902, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 901 |
| }, |
| { |
| "completion_length": 109.34375, |
| "epoch": 0.22282608695652173, |
| "grad_norm": 2.3746786204682397, |
| "kl": 0.07861328125, |
| "learning_rate": 7.774209486166008e-07, |
| "loss": 0.0031, |
| "reward": 2.5825467109680176, |
| "reward_std": 0.22069254517555237, |
| "rewards/accuracy_reward_stage2": 0.7231717109680176, |
| "rewards/format_reward_all_stage": 1.859375, |
| "scores/refine_times": 1.28125, |
| "step": 902 |
| }, |
| { |
| "completion_length": 115.71875, |
| "epoch": 0.22307312252964426, |
| "grad_norm": 3.744151198948681, |
| "kl": 0.1181640625, |
| "learning_rate": 7.771739130434783e-07, |
| "loss": 0.0047, |
| "reward": 2.5876035690307617, |
| "reward_std": 0.13640564680099487, |
| "rewards/accuracy_reward_stage2": 0.6110408306121826, |
| "rewards/format_reward_all_stage": 1.9765625, |
| "scores/refine_times": 1.5625, |
| "step": 903 |
| }, |
| { |
| "completion_length": 95.859375, |
| "epoch": 0.22332015810276679, |
| "grad_norm": 3.9864543123830023, |
| "kl": 0.0888671875, |
| "learning_rate": 7.769268774703557e-07, |
| "loss": 0.0036, |
| "reward": 2.503157615661621, |
| "reward_std": 0.07907243072986603, |
| "rewards/accuracy_reward_stage2": 0.5031576156616211, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 904 |
| }, |
| { |
| "completion_length": 124.15625, |
| "epoch": 0.22356719367588934, |
| "grad_norm": 3.414528616278631, |
| "kl": 0.10009765625, |
| "learning_rate": 7.766798418972331e-07, |
| "loss": 0.004, |
| "reward": 2.6220712661743164, |
| "reward_std": 0.03463301062583923, |
| "rewards/accuracy_reward_stage2": 0.6220711469650269, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.59375, |
| "step": 905 |
| }, |
| { |
| "completion_length": 108.421875, |
| "epoch": 0.22381422924901187, |
| "grad_norm": 2.65452224241206, |
| "kl": 0.07080078125, |
| "learning_rate": 7.764328063241107e-07, |
| "loss": 0.0028, |
| "reward": 2.69258713722229, |
| "reward_std": 0.15411412715911865, |
| "rewards/accuracy_reward_stage2": 0.75508713722229, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.4375, |
| "step": 906 |
| }, |
| { |
| "completion_length": 120.15625, |
| "epoch": 0.2240612648221344, |
| "grad_norm": 3.8526319437643717, |
| "kl": 0.09619140625, |
| "learning_rate": 7.761857707509881e-07, |
| "loss": 0.0038, |
| "reward": 2.6302669048309326, |
| "reward_std": 0.13176321983337402, |
| "rewards/accuracy_reward_stage2": 0.6302669048309326, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.59375, |
| "step": 907 |
| }, |
| { |
| "completion_length": 103.4375, |
| "epoch": 0.22430830039525693, |
| "grad_norm": 2.8425333906095998, |
| "kl": 0.083984375, |
| "learning_rate": 7.759387351778656e-07, |
| "loss": 0.0034, |
| "reward": 2.5473570823669434, |
| "reward_std": 0.11453460156917572, |
| "rewards/accuracy_reward_stage2": 0.5473569631576538, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.421875, |
| "step": 908 |
| }, |
| { |
| "completion_length": 108.078125, |
| "epoch": 0.22455533596837945, |
| "grad_norm": 4.249231689338272, |
| "kl": 0.09716796875, |
| "learning_rate": 7.75691699604743e-07, |
| "loss": 0.0039, |
| "reward": 2.6726608276367188, |
| "reward_std": 0.05191066488623619, |
| "rewards/accuracy_reward_stage2": 0.6726609468460083, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 909 |
| }, |
| { |
| "completion_length": 100.75, |
| "epoch": 0.22480237154150198, |
| "grad_norm": 1.3264075118453993, |
| "kl": 0.0751953125, |
| "learning_rate": 7.754446640316205e-07, |
| "loss": 0.003, |
| "reward": 2.7188069820404053, |
| "reward_std": 0.10090002417564392, |
| "rewards/accuracy_reward_stage2": 0.7344319820404053, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.375, |
| "step": 910 |
| }, |
| { |
| "completion_length": 100.671875, |
| "epoch": 0.2250494071146245, |
| "grad_norm": 2.6499971246685643, |
| "kl": 0.0966796875, |
| "learning_rate": 7.75197628458498e-07, |
| "loss": 0.0039, |
| "reward": 2.651829481124878, |
| "reward_std": 0.05644657090306282, |
| "rewards/accuracy_reward_stage2": 0.6518294215202332, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.375, |
| "step": 911 |
| }, |
| { |
| "completion_length": 82.625, |
| "epoch": 0.22529644268774704, |
| "grad_norm": 4.2859115031657, |
| "kl": 0.0908203125, |
| "learning_rate": 7.749505928853755e-07, |
| "loss": 0.0036, |
| "reward": 2.6028127670288086, |
| "reward_std": 0.09801940619945526, |
| "rewards/accuracy_reward_stage2": 0.6106254458427429, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.234375, |
| "step": 912 |
| }, |
| { |
| "completion_length": 82.1875, |
| "epoch": 0.22554347826086957, |
| "grad_norm": 3.7686117517627125, |
| "kl": 0.07421875, |
| "learning_rate": 7.747035573122529e-07, |
| "loss": 0.003, |
| "reward": 2.5454444885253906, |
| "reward_std": 0.11515636742115021, |
| "rewards/accuracy_reward_stage2": 0.5454442501068115, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.078125, |
| "step": 913 |
| }, |
| { |
| "completion_length": 111.703125, |
| "epoch": 0.2257905138339921, |
| "grad_norm": 2.298219131706107, |
| "kl": 0.0732421875, |
| "learning_rate": 7.744565217391305e-07, |
| "loss": 0.0029, |
| "reward": 2.8503479957580566, |
| "reward_std": 0.055034905672073364, |
| "rewards/accuracy_reward_stage2": 0.8503477573394775, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.4375, |
| "step": 914 |
| }, |
| { |
| "completion_length": 121.0, |
| "epoch": 0.22603754940711462, |
| "grad_norm": 2.4396894747569893, |
| "kl": 0.10302734375, |
| "learning_rate": 7.742094861660079e-07, |
| "loss": 0.0041, |
| "reward": 2.753645658493042, |
| "reward_std": 0.07929170876741409, |
| "rewards/accuracy_reward_stage2": 0.753645658493042, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.515625, |
| "step": 915 |
| }, |
| { |
| "completion_length": 118.328125, |
| "epoch": 0.22628458498023715, |
| "grad_norm": 1.9607094755268186, |
| "kl": 0.09375, |
| "learning_rate": 7.739624505928853e-07, |
| "loss": 0.0038, |
| "reward": 2.658937454223633, |
| "reward_std": 0.02175423502922058, |
| "rewards/accuracy_reward_stage2": 0.658937394618988, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 916 |
| }, |
| { |
| "completion_length": 93.40625, |
| "epoch": 0.22653162055335968, |
| "grad_norm": 2.5887397012016566, |
| "kl": 0.07958984375, |
| "learning_rate": 7.737154150197628e-07, |
| "loss": 0.0032, |
| "reward": 2.5921926498413086, |
| "reward_std": 0.07951997220516205, |
| "rewards/accuracy_reward_stage2": 0.5921927690505981, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 917 |
| }, |
| { |
| "completion_length": 102.40625, |
| "epoch": 0.2267786561264822, |
| "grad_norm": 2.6225329982689742, |
| "kl": 0.0732421875, |
| "learning_rate": 7.734683794466402e-07, |
| "loss": 0.0029, |
| "reward": 2.766453266143799, |
| "reward_std": 0.06807538866996765, |
| "rewards/accuracy_reward_stage2": 0.7664532661437988, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 918 |
| }, |
| { |
| "completion_length": 87.078125, |
| "epoch": 0.22702569169960474, |
| "grad_norm": 2.812032447985376, |
| "kl": 0.06884765625, |
| "learning_rate": 7.732213438735177e-07, |
| "loss": 0.0027, |
| "reward": 2.8405091762542725, |
| "reward_std": 0.07255198061466217, |
| "rewards/accuracy_reward_stage2": 0.9030092358589172, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.125, |
| "step": 919 |
| }, |
| { |
| "completion_length": 88.578125, |
| "epoch": 0.22727272727272727, |
| "grad_norm": 3.352416052089907, |
| "kl": 0.08056640625, |
| "learning_rate": 7.729743083003952e-07, |
| "loss": 0.0032, |
| "reward": 2.8867108821868896, |
| "reward_std": 0.13572408258914948, |
| "rewards/accuracy_reward_stage2": 0.8971275687217712, |
| "rewards/format_reward_all_stage": 1.9895832538604736, |
| "scores/refine_times": 1.140625, |
| "step": 920 |
| }, |
| { |
| "completion_length": 120.65625, |
| "epoch": 0.2275197628458498, |
| "grad_norm": 1.9280858288695926, |
| "kl": 0.08544921875, |
| "learning_rate": 7.727272727272727e-07, |
| "loss": 0.0034, |
| "reward": 2.597029209136963, |
| "reward_std": 0.10120301693677902, |
| "rewards/accuracy_reward_stage2": 0.5970292687416077, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.546875, |
| "step": 921 |
| }, |
| { |
| "completion_length": 95.28125, |
| "epoch": 0.22776679841897232, |
| "grad_norm": 3.494588928223786, |
| "kl": 0.0849609375, |
| "learning_rate": 7.724802371541502e-07, |
| "loss": 0.0034, |
| "reward": 2.81472110748291, |
| "reward_std": 0.046646032482385635, |
| "rewards/accuracy_reward_stage2": 0.8147209882736206, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 922 |
| }, |
| { |
| "completion_length": 145.453125, |
| "epoch": 0.22801383399209485, |
| "grad_norm": 4.451245280949107, |
| "kl": 0.251953125, |
| "learning_rate": 7.722332015810277e-07, |
| "loss": 0.01, |
| "reward": 2.779994010925293, |
| "reward_std": 0.1533237248659134, |
| "rewards/accuracy_reward_stage2": 0.9049938917160034, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.65625, |
| "step": 923 |
| }, |
| { |
| "completion_length": 141.109375, |
| "epoch": 0.22826086956521738, |
| "grad_norm": 3.10942872267095, |
| "kl": 0.08740234375, |
| "learning_rate": 7.719861660079051e-07, |
| "loss": 0.0035, |
| "reward": 2.6243720054626465, |
| "reward_std": 0.08140174299478531, |
| "rewards/accuracy_reward_stage2": 0.6243720650672913, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.453125, |
| "step": 924 |
| }, |
| { |
| "completion_length": 113.125, |
| "epoch": 0.22850790513833993, |
| "grad_norm": 2.764415964686548, |
| "kl": 0.0986328125, |
| "learning_rate": 7.717391304347826e-07, |
| "loss": 0.0039, |
| "reward": 2.570035457611084, |
| "reward_std": 0.056792110204696655, |
| "rewards/accuracy_reward_stage2": 0.5700353384017944, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 925 |
| }, |
| { |
| "completion_length": 107.390625, |
| "epoch": 0.22875494071146246, |
| "grad_norm": 3.2431842066672534, |
| "kl": 0.123046875, |
| "learning_rate": 7.7149209486166e-07, |
| "loss": 0.0049, |
| "reward": 2.628124952316284, |
| "reward_std": 0.16987210512161255, |
| "rewards/accuracy_reward_stage2": 0.690625011920929, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.4375, |
| "step": 926 |
| }, |
| { |
| "completion_length": 116.078125, |
| "epoch": 0.229001976284585, |
| "grad_norm": 3.0388485387607407, |
| "kl": 0.0859375, |
| "learning_rate": 7.712450592885375e-07, |
| "loss": 0.0034, |
| "reward": 2.7118468284606934, |
| "reward_std": 0.11388581991195679, |
| "rewards/accuracy_reward_stage2": 0.7274720668792725, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.453125, |
| "step": 927 |
| }, |
| { |
| "completion_length": 125.390625, |
| "epoch": 0.22924901185770752, |
| "grad_norm": 2.4235888830830064, |
| "kl": 0.07373046875, |
| "learning_rate": 7.70998023715415e-07, |
| "loss": 0.0029, |
| "reward": 2.7621376514434814, |
| "reward_std": 0.01607050932943821, |
| "rewards/accuracy_reward_stage2": 0.7621376514434814, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.40625, |
| "step": 928 |
| }, |
| { |
| "completion_length": 152.75, |
| "epoch": 0.22949604743083005, |
| "grad_norm": 3.48783524178748, |
| "kl": 0.11083984375, |
| "learning_rate": 7.707509881422924e-07, |
| "loss": 0.0044, |
| "reward": 2.7437613010406494, |
| "reward_std": 0.1258169412612915, |
| "rewards/accuracy_reward_stage2": 0.7437613010406494, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.59375, |
| "step": 929 |
| }, |
| { |
| "completion_length": 125.78125, |
| "epoch": 0.22974308300395258, |
| "grad_norm": 2.84328279429593, |
| "kl": 0.07763671875, |
| "learning_rate": 7.705039525691699e-07, |
| "loss": 0.0031, |
| "reward": 2.6663095951080322, |
| "reward_std": 0.07182341814041138, |
| "rewards/accuracy_reward_stage2": 0.6819344162940979, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.609375, |
| "step": 930 |
| }, |
| { |
| "completion_length": 92.515625, |
| "epoch": 0.2299901185770751, |
| "grad_norm": 4.25905140589049, |
| "kl": 0.107421875, |
| "learning_rate": 7.702569169960475e-07, |
| "loss": 0.0043, |
| "reward": 2.658834218978882, |
| "reward_std": 0.1202336847782135, |
| "rewards/accuracy_reward_stage2": 0.6588343381881714, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 931 |
| }, |
| { |
| "completion_length": 131.265625, |
| "epoch": 0.23023715415019763, |
| "grad_norm": 3.87126995736488, |
| "kl": 0.083984375, |
| "learning_rate": 7.700098814229249e-07, |
| "loss": 0.0034, |
| "reward": 2.730388641357422, |
| "reward_std": 0.14922493696212769, |
| "rewards/accuracy_reward_stage2": 0.7928886413574219, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.5, |
| "step": 932 |
| }, |
| { |
| "completion_length": 153.6875, |
| "epoch": 0.23048418972332016, |
| "grad_norm": 3.205279272182687, |
| "kl": 0.0966796875, |
| "learning_rate": 7.697628458498024e-07, |
| "loss": 0.0039, |
| "reward": 2.547888994216919, |
| "reward_std": 0.1733008772134781, |
| "rewards/accuracy_reward_stage2": 0.6728890538215637, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.671875, |
| "step": 933 |
| }, |
| { |
| "completion_length": 126.84375, |
| "epoch": 0.2307312252964427, |
| "grad_norm": 3.400946207231384, |
| "kl": 0.083984375, |
| "learning_rate": 7.695158102766798e-07, |
| "loss": 0.0033, |
| "reward": 2.578770160675049, |
| "reward_std": 0.13955532014369965, |
| "rewards/accuracy_reward_stage2": 0.6516869068145752, |
| "rewards/format_reward_all_stage": 1.9270832538604736, |
| "scores/refine_times": 1.453125, |
| "step": 934 |
| }, |
| { |
| "completion_length": 122.65625, |
| "epoch": 0.23097826086956522, |
| "grad_norm": 3.248359221948695, |
| "kl": 0.0830078125, |
| "learning_rate": 7.692687747035573e-07, |
| "loss": 0.0033, |
| "reward": 2.5240418910980225, |
| "reward_std": 0.21152980625629425, |
| "rewards/accuracy_reward_stage2": 0.6438335180282593, |
| "rewards/format_reward_all_stage": 1.8802083730697632, |
| "scores/refine_times": 1.359375, |
| "step": 935 |
| }, |
| { |
| "completion_length": 133.171875, |
| "epoch": 0.23122529644268774, |
| "grad_norm": 2.4067173262093635, |
| "kl": 0.0810546875, |
| "learning_rate": 7.690217391304348e-07, |
| "loss": 0.0032, |
| "reward": 2.5248830318450928, |
| "reward_std": 0.36290836334228516, |
| "rewards/accuracy_reward_stage2": 0.7384247183799744, |
| "rewards/format_reward_all_stage": 1.7864583730697632, |
| "scores/refine_times": 1.59375, |
| "step": 936 |
| }, |
| { |
| "completion_length": 100.078125, |
| "epoch": 0.23147233201581027, |
| "grad_norm": 3.3677672306290787, |
| "kl": 0.06591796875, |
| "learning_rate": 7.687747035573122e-07, |
| "loss": 0.0026, |
| "reward": 2.5898051261901855, |
| "reward_std": 0.08437584340572357, |
| "rewards/accuracy_reward_stage2": 0.5898053646087646, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 937 |
| }, |
| { |
| "completion_length": 104.90625, |
| "epoch": 0.2317193675889328, |
| "grad_norm": 4.578017762931248, |
| "kl": 0.0986328125, |
| "learning_rate": 7.685276679841896e-07, |
| "loss": 0.0039, |
| "reward": 2.447653293609619, |
| "reward_std": 0.44203102588653564, |
| "rewards/accuracy_reward_stage2": 0.7757784128189087, |
| "rewards/format_reward_all_stage": 1.671875, |
| "scores/refine_times": 1.296875, |
| "step": 938 |
| }, |
| { |
| "completion_length": 101.359375, |
| "epoch": 0.23196640316205533, |
| "grad_norm": 4.2459965005065925, |
| "kl": 0.1328125, |
| "learning_rate": 7.682806324110671e-07, |
| "loss": 0.0053, |
| "reward": 2.704395294189453, |
| "reward_std": 0.073523610830307, |
| "rewards/accuracy_reward_stage2": 0.7200204133987427, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.1875, |
| "step": 939 |
| }, |
| { |
| "completion_length": 141.59375, |
| "epoch": 0.23221343873517786, |
| "grad_norm": 3.0089788909015347, |
| "kl": 0.11181640625, |
| "learning_rate": 7.680335968379447e-07, |
| "loss": 0.0045, |
| "reward": 2.509549140930176, |
| "reward_std": 0.21922683715820312, |
| "rewards/accuracy_reward_stage2": 0.6345490217208862, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.53125, |
| "step": 940 |
| }, |
| { |
| "completion_length": 88.828125, |
| "epoch": 0.23246047430830039, |
| "grad_norm": 3.976294816968691, |
| "kl": 0.07861328125, |
| "learning_rate": 7.677865612648221e-07, |
| "loss": 0.0031, |
| "reward": 2.5723557472229004, |
| "reward_std": 0.0782502293586731, |
| "rewards/accuracy_reward_stage2": 0.5723556280136108, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 941 |
| }, |
| { |
| "completion_length": 87.703125, |
| "epoch": 0.2327075098814229, |
| "grad_norm": 4.189264645949934, |
| "kl": 0.1025390625, |
| "learning_rate": 7.675395256916996e-07, |
| "loss": 0.0041, |
| "reward": 2.315624952316284, |
| "reward_std": 0.4645420014858246, |
| "rewards/accuracy_reward_stage2": 0.690625011920929, |
| "rewards/format_reward_all_stage": 1.625, |
| "scores/refine_times": 1.265625, |
| "step": 942 |
| }, |
| { |
| "completion_length": 107.46875, |
| "epoch": 0.23295454545454544, |
| "grad_norm": 3.1924104886203106, |
| "kl": 0.08935546875, |
| "learning_rate": 7.67292490118577e-07, |
| "loss": 0.0036, |
| "reward": 2.8017683029174805, |
| "reward_std": 0.05147233232855797, |
| "rewards/accuracy_reward_stage2": 0.8017681837081909, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.484375, |
| "step": 943 |
| }, |
| { |
| "completion_length": 116.75, |
| "epoch": 0.233201581027668, |
| "grad_norm": 2.5798367724537314, |
| "kl": 0.08349609375, |
| "learning_rate": 7.670454545454545e-07, |
| "loss": 0.0033, |
| "reward": 2.5186541080474854, |
| "reward_std": 0.1710503101348877, |
| "rewards/accuracy_reward_stage2": 0.6436540484428406, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.59375, |
| "step": 944 |
| }, |
| { |
| "completion_length": 92.71875, |
| "epoch": 0.23344861660079053, |
| "grad_norm": 2.7387857072229442, |
| "kl": 0.07177734375, |
| "learning_rate": 7.66798418972332e-07, |
| "loss": 0.0029, |
| "reward": 2.7367796897888184, |
| "reward_std": 0.0510886088013649, |
| "rewards/accuracy_reward_stage2": 0.7367798089981079, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 945 |
| }, |
| { |
| "completion_length": 122.171875, |
| "epoch": 0.23369565217391305, |
| "grad_norm": 4.013586606877289, |
| "kl": 0.08154296875, |
| "learning_rate": 7.665513833992094e-07, |
| "loss": 0.0033, |
| "reward": 2.650787353515625, |
| "reward_std": 0.2529640793800354, |
| "rewards/accuracy_reward_stage2": 0.7757871150970459, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.484375, |
| "step": 946 |
| }, |
| { |
| "completion_length": 123.203125, |
| "epoch": 0.23394268774703558, |
| "grad_norm": 3.047862851518325, |
| "kl": 0.08251953125, |
| "learning_rate": 7.663043478260868e-07, |
| "loss": 0.0033, |
| "reward": 2.8821022510528564, |
| "reward_std": 0.11100947856903076, |
| "rewards/accuracy_reward_stage2": 0.8821022510528564, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.53125, |
| "step": 947 |
| }, |
| { |
| "completion_length": 114.65625, |
| "epoch": 0.2341897233201581, |
| "grad_norm": 3.592191483783041, |
| "kl": 0.07958984375, |
| "learning_rate": 7.660573122529644e-07, |
| "loss": 0.0032, |
| "reward": 2.639193534851074, |
| "reward_std": 0.07999169081449509, |
| "rewards/accuracy_reward_stage2": 0.6391934156417847, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.4375, |
| "step": 948 |
| }, |
| { |
| "completion_length": 84.3125, |
| "epoch": 0.23443675889328064, |
| "grad_norm": 2.1159024076815345, |
| "kl": 0.0751953125, |
| "learning_rate": 7.658102766798419e-07, |
| "loss": 0.003, |
| "reward": 2.8071794509887695, |
| "reward_std": 0.0024097806308418512, |
| "rewards/accuracy_reward_stage2": 0.8071794509887695, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 949 |
| }, |
| { |
| "completion_length": 115.6875, |
| "epoch": 0.23468379446640317, |
| "grad_norm": 2.8719086261669236, |
| "kl": 0.0888671875, |
| "learning_rate": 7.655632411067194e-07, |
| "loss": 0.0036, |
| "reward": 2.5976603031158447, |
| "reward_std": 0.19923239946365356, |
| "rewards/accuracy_reward_stage2": 0.7226603031158447, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.578125, |
| "step": 950 |
| }, |
| { |
| "completion_length": 130.765625, |
| "epoch": 0.2349308300395257, |
| "grad_norm": 2.8415116889005954, |
| "kl": 0.08203125, |
| "learning_rate": 7.653162055335968e-07, |
| "loss": 0.0033, |
| "reward": 2.8011350631713867, |
| "reward_std": 0.024894852191209793, |
| "rewards/accuracy_reward_stage2": 0.8011349439620972, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.484375, |
| "step": 951 |
| }, |
| { |
| "completion_length": 113.90625, |
| "epoch": 0.23517786561264822, |
| "grad_norm": 1.8274315476260714, |
| "kl": 0.09228515625, |
| "learning_rate": 7.650691699604743e-07, |
| "loss": 0.0037, |
| "reward": 2.7534830570220947, |
| "reward_std": 0.07084871828556061, |
| "rewards/accuracy_reward_stage2": 0.75348299741745, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 952 |
| }, |
| { |
| "completion_length": 111.015625, |
| "epoch": 0.23542490118577075, |
| "grad_norm": 4.2390429237173235, |
| "kl": 0.0927734375, |
| "learning_rate": 7.648221343873518e-07, |
| "loss": 0.0037, |
| "reward": 2.6470580101013184, |
| "reward_std": 0.18395009636878967, |
| "rewards/accuracy_reward_stage2": 0.7199746370315552, |
| "rewards/format_reward_all_stage": 1.9270833730697632, |
| "scores/refine_times": 1.515625, |
| "step": 953 |
| }, |
| { |
| "completion_length": 77.625, |
| "epoch": 0.23567193675889328, |
| "grad_norm": 4.251078064094217, |
| "kl": 0.10302734375, |
| "learning_rate": 7.645750988142292e-07, |
| "loss": 0.0041, |
| "reward": 2.7399420738220215, |
| "reward_std": 0.034756097942590714, |
| "rewards/accuracy_reward_stage2": 0.7399421334266663, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 954 |
| }, |
| { |
| "completion_length": 83.3125, |
| "epoch": 0.2359189723320158, |
| "grad_norm": 4.688994500071449, |
| "kl": 0.099609375, |
| "learning_rate": 7.643280632411066e-07, |
| "loss": 0.004, |
| "reward": 2.4010202884674072, |
| "reward_std": 0.06971799582242966, |
| "rewards/accuracy_reward_stage2": 0.40102022886276245, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.21875, |
| "step": 955 |
| }, |
| { |
| "completion_length": 90.90625, |
| "epoch": 0.23616600790513834, |
| "grad_norm": 3.0926787814732064, |
| "kl": 0.068359375, |
| "learning_rate": 7.640810276679841e-07, |
| "loss": 0.0027, |
| "reward": 2.7051496505737305, |
| "reward_std": 0.012495389208197594, |
| "rewards/accuracy_reward_stage2": 0.7051496505737305, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 956 |
| }, |
| { |
| "completion_length": 120.546875, |
| "epoch": 0.23641304347826086, |
| "grad_norm": 3.619910409738279, |
| "kl": 0.11669921875, |
| "learning_rate": 7.638339920948616e-07, |
| "loss": 0.0047, |
| "reward": 2.355205535888672, |
| "reward_std": 0.16237197816371918, |
| "rewards/accuracy_reward_stage2": 0.469789057970047, |
| "rewards/format_reward_all_stage": 1.8854166269302368, |
| "scores/refine_times": 1.53125, |
| "step": 957 |
| }, |
| { |
| "completion_length": 104.4375, |
| "epoch": 0.2366600790513834, |
| "grad_norm": 4.1469106870051835, |
| "kl": 0.109375, |
| "learning_rate": 7.635869565217391e-07, |
| "loss": 0.0044, |
| "reward": 2.559445858001709, |
| "reward_std": 0.15328779816627502, |
| "rewards/accuracy_reward_stage2": 0.5698623061180115, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.53125, |
| "step": 958 |
| }, |
| { |
| "completion_length": 79.09375, |
| "epoch": 0.23690711462450592, |
| "grad_norm": 1.1230604076719064, |
| "kl": 0.08935546875, |
| "learning_rate": 7.633399209486166e-07, |
| "loss": 0.0036, |
| "reward": 2.766244649887085, |
| "reward_std": 0.012647372670471668, |
| "rewards/accuracy_reward_stage2": 0.7662445902824402, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.15625, |
| "step": 959 |
| }, |
| { |
| "completion_length": 81.59375, |
| "epoch": 0.23715415019762845, |
| "grad_norm": 4.099677035647309, |
| "kl": 0.095703125, |
| "learning_rate": 7.63092885375494e-07, |
| "loss": 0.0038, |
| "reward": 2.7637486457824707, |
| "reward_std": 0.053550224751234055, |
| "rewards/accuracy_reward_stage2": 0.7637484073638916, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 960 |
| }, |
| { |
| "completion_length": 117.5, |
| "epoch": 0.23740118577075098, |
| "grad_norm": 2.4667498900768488, |
| "kl": 0.07568359375, |
| "learning_rate": 7.628458498023716e-07, |
| "loss": 0.003, |
| "reward": 2.700864791870117, |
| "reward_std": 0.13465310633182526, |
| "rewards/accuracy_reward_stage2": 0.7008647918701172, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 961 |
| }, |
| { |
| "completion_length": 68.125, |
| "epoch": 0.2376482213438735, |
| "grad_norm": 4.113070453187623, |
| "kl": 0.09033203125, |
| "learning_rate": 7.62598814229249e-07, |
| "loss": 0.0036, |
| "reward": 2.504481077194214, |
| "reward_std": 0.016891546547412872, |
| "rewards/accuracy_reward_stage2": 0.5044810771942139, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0, |
| "step": 962 |
| }, |
| { |
| "completion_length": 92.0625, |
| "epoch": 0.23789525691699603, |
| "grad_norm": 3.146318360416011, |
| "kl": 0.0703125, |
| "learning_rate": 7.623517786561264e-07, |
| "loss": 0.0028, |
| "reward": 2.723379135131836, |
| "reward_std": 0.16081024706363678, |
| "rewards/accuracy_reward_stage2": 0.8483791351318359, |
| "rewards/format_reward_all_stage": 1.875, |
| "scores/refine_times": 1.203125, |
| "step": 963 |
| }, |
| { |
| "completion_length": 101.640625, |
| "epoch": 0.2381422924901186, |
| "grad_norm": 1.2878437891896688, |
| "kl": 0.0810546875, |
| "learning_rate": 7.621047430830039e-07, |
| "loss": 0.0032, |
| "reward": 2.9084339141845703, |
| "reward_std": 0.018986623734235764, |
| "rewards/accuracy_reward_stage2": 0.9084337949752808, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 964 |
| }, |
| { |
| "completion_length": 95.4375, |
| "epoch": 0.23838932806324112, |
| "grad_norm": 3.417312289254013, |
| "kl": 0.08740234375, |
| "learning_rate": 7.618577075098814e-07, |
| "loss": 0.0035, |
| "reward": 2.6807949542999268, |
| "reward_std": 0.14062517881393433, |
| "rewards/accuracy_reward_stage2": 0.6964200735092163, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.296875, |
| "step": 965 |
| }, |
| { |
| "completion_length": 77.859375, |
| "epoch": 0.23863636363636365, |
| "grad_norm": 4.137033586610404, |
| "kl": 0.08251953125, |
| "learning_rate": 7.616106719367588e-07, |
| "loss": 0.0033, |
| "reward": 2.4498672485351562, |
| "reward_std": 0.012593725696206093, |
| "rewards/accuracy_reward_stage2": 0.4498673677444458, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 966 |
| }, |
| { |
| "completion_length": 97.296875, |
| "epoch": 0.23888339920948617, |
| "grad_norm": 3.496193472267087, |
| "kl": 0.0927734375, |
| "learning_rate": 7.613636363636364e-07, |
| "loss": 0.0037, |
| "reward": 2.7825238704681396, |
| "reward_std": 0.07081930339336395, |
| "rewards/accuracy_reward_stage2": 0.7825238704681396, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 967 |
| }, |
| { |
| "completion_length": 91.4375, |
| "epoch": 0.2391304347826087, |
| "grad_norm": 3.9040594654045284, |
| "kl": 0.0849609375, |
| "learning_rate": 7.611166007905138e-07, |
| "loss": 0.0034, |
| "reward": 2.6640286445617676, |
| "reward_std": 0.02129337564110756, |
| "rewards/accuracy_reward_stage2": 0.6640284657478333, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 968 |
| }, |
| { |
| "completion_length": 85.109375, |
| "epoch": 0.23937747035573123, |
| "grad_norm": 4.528559323672272, |
| "kl": 0.083984375, |
| "learning_rate": 7.608695652173913e-07, |
| "loss": 0.0034, |
| "reward": 2.4769628047943115, |
| "reward_std": 0.14590422809123993, |
| "rewards/accuracy_reward_stage2": 0.4769628345966339, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 969 |
| }, |
| { |
| "completion_length": 85.4375, |
| "epoch": 0.23962450592885376, |
| "grad_norm": 2.181031316308596, |
| "kl": 0.11279296875, |
| "learning_rate": 7.606225296442688e-07, |
| "loss": 0.0045, |
| "reward": 2.686516284942627, |
| "reward_std": 0.03785046935081482, |
| "rewards/accuracy_reward_stage2": 0.6865162253379822, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 970 |
| }, |
| { |
| "completion_length": 105.796875, |
| "epoch": 0.2398715415019763, |
| "grad_norm": 2.563633299576689, |
| "kl": 0.1435546875, |
| "learning_rate": 7.603754940711462e-07, |
| "loss": 0.0057, |
| "reward": 2.738003730773926, |
| "reward_std": 0.037225548177957535, |
| "rewards/accuracy_reward_stage2": 0.7380036115646362, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.515625, |
| "step": 971 |
| }, |
| { |
| "completion_length": 85.921875, |
| "epoch": 0.24011857707509882, |
| "grad_norm": 5.044397747161594, |
| "kl": 0.10302734375, |
| "learning_rate": 7.601284584980236e-07, |
| "loss": 0.0041, |
| "reward": 2.5778369903564453, |
| "reward_std": 0.07295674085617065, |
| "rewards/accuracy_reward_stage2": 0.5934619307518005, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.203125, |
| "step": 972 |
| }, |
| { |
| "completion_length": 94.234375, |
| "epoch": 0.24036561264822134, |
| "grad_norm": 4.030016499108354, |
| "kl": 0.09228515625, |
| "learning_rate": 7.598814229249012e-07, |
| "loss": 0.0037, |
| "reward": 2.612272262573242, |
| "reward_std": 0.03439907357096672, |
| "rewards/accuracy_reward_stage2": 0.6122722029685974, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.421875, |
| "step": 973 |
| }, |
| { |
| "completion_length": 100.96875, |
| "epoch": 0.24061264822134387, |
| "grad_norm": 2.7858570805718648, |
| "kl": 0.0751953125, |
| "learning_rate": 7.596343873517786e-07, |
| "loss": 0.003, |
| "reward": 2.638523578643799, |
| "reward_std": 0.0695050060749054, |
| "rewards/accuracy_reward_stage2": 0.6385236978530884, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.40625, |
| "step": 974 |
| }, |
| { |
| "completion_length": 100.5, |
| "epoch": 0.2408596837944664, |
| "grad_norm": 3.356791391332148, |
| "kl": 0.1005859375, |
| "learning_rate": 7.59387351778656e-07, |
| "loss": 0.004, |
| "reward": 2.85272216796875, |
| "reward_std": 0.05522162467241287, |
| "rewards/accuracy_reward_stage2": 0.85272216796875, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.453125, |
| "step": 975 |
| }, |
| { |
| "completion_length": 90.484375, |
| "epoch": 0.24110671936758893, |
| "grad_norm": 2.5461528562751194, |
| "kl": 0.099609375, |
| "learning_rate": 7.591403162055335e-07, |
| "loss": 0.004, |
| "reward": 2.7427287101745605, |
| "reward_std": 0.06681530922651291, |
| "rewards/accuracy_reward_stage2": 0.7427287697792053, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.375, |
| "step": 976 |
| }, |
| { |
| "completion_length": 99.84375, |
| "epoch": 0.24135375494071146, |
| "grad_norm": 3.1758314697986822, |
| "kl": 0.1171875, |
| "learning_rate": 7.58893280632411e-07, |
| "loss": 0.0047, |
| "reward": 2.753756523132324, |
| "reward_std": 0.08295577764511108, |
| "rewards/accuracy_reward_stage2": 0.7537566423416138, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 977 |
| }, |
| { |
| "completion_length": 69.4375, |
| "epoch": 0.24160079051383399, |
| "grad_norm": 2.2364912811701285, |
| "kl": 0.109375, |
| "learning_rate": 7.586462450592886e-07, |
| "loss": 0.0044, |
| "reward": 2.868748188018799, |
| "reward_std": 0.00799154955893755, |
| "rewards/accuracy_reward_stage2": 0.8687483072280884, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 978 |
| }, |
| { |
| "completion_length": 92.125, |
| "epoch": 0.2418478260869565, |
| "grad_norm": 3.7954929280625547, |
| "kl": 0.08251953125, |
| "learning_rate": 7.58399209486166e-07, |
| "loss": 0.0033, |
| "reward": 2.7547996044158936, |
| "reward_std": 0.14635036885738373, |
| "rewards/accuracy_reward_stage2": 0.8199037313461304, |
| "rewards/format_reward_all_stage": 1.9348958730697632, |
| "scores/refine_times": 1.40625, |
| "step": 979 |
| }, |
| { |
| "completion_length": 75.46875, |
| "epoch": 0.24209486166007904, |
| "grad_norm": 2.5535180288665376, |
| "kl": 0.09130859375, |
| "learning_rate": 7.581521739130434e-07, |
| "loss": 0.0037, |
| "reward": 2.840176582336426, |
| "reward_std": 0.07781560719013214, |
| "rewards/accuracy_reward_stage2": 0.8401765823364258, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 980 |
| }, |
| { |
| "completion_length": 133.421875, |
| "epoch": 0.24234189723320157, |
| "grad_norm": 2.3731209551138543, |
| "kl": 0.0712890625, |
| "learning_rate": 7.579051383399209e-07, |
| "loss": 0.0029, |
| "reward": 2.651949882507324, |
| "reward_std": 0.11739328503608704, |
| "rewards/accuracy_reward_stage2": 0.7144500017166138, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.90625, |
| "step": 981 |
| }, |
| { |
| "completion_length": 84.84375, |
| "epoch": 0.2425889328063241, |
| "grad_norm": 5.0187188109372975, |
| "kl": 0.0966796875, |
| "learning_rate": 7.576581027667984e-07, |
| "loss": 0.0039, |
| "reward": 2.6951634883880615, |
| "reward_std": 0.04134798049926758, |
| "rewards/accuracy_reward_stage2": 0.6951634883880615, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 982 |
| }, |
| { |
| "completion_length": 84.671875, |
| "epoch": 0.24283596837944665, |
| "grad_norm": 2.31892736773026, |
| "kl": 0.07080078125, |
| "learning_rate": 7.574110671936758e-07, |
| "loss": 0.0028, |
| "reward": 2.881075859069824, |
| "reward_std": 0.06218536198139191, |
| "rewards/accuracy_reward_stage2": 0.8914925456047058, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.1875, |
| "step": 983 |
| }, |
| { |
| "completion_length": 101.5, |
| "epoch": 0.24308300395256918, |
| "grad_norm": 4.532001278651413, |
| "kl": 0.09228515625, |
| "learning_rate": 7.571640316205533e-07, |
| "loss": 0.0037, |
| "reward": 2.686784505844116, |
| "reward_std": 0.14913466572761536, |
| "rewards/accuracy_reward_stage2": 0.6945970058441162, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.359375, |
| "step": 984 |
| }, |
| { |
| "completion_length": 78.65625, |
| "epoch": 0.2433300395256917, |
| "grad_norm": 2.427351371197481, |
| "kl": 0.1025390625, |
| "learning_rate": 7.569169960474307e-07, |
| "loss": 0.0041, |
| "reward": 2.7506766319274902, |
| "reward_std": 0.001765109016560018, |
| "rewards/accuracy_reward_stage2": 0.7506764531135559, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 985 |
| }, |
| { |
| "completion_length": 69.65625, |
| "epoch": 0.24357707509881424, |
| "grad_norm": 4.71948540499666, |
| "kl": 0.095703125, |
| "learning_rate": 7.566699604743084e-07, |
| "loss": 0.0038, |
| "reward": 2.7605538368225098, |
| "reward_std": 0.06787852942943573, |
| "rewards/accuracy_reward_stage2": 0.7605538964271545, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 986 |
| }, |
| { |
| "completion_length": 62.53125, |
| "epoch": 0.24382411067193677, |
| "grad_norm": 2.790062335645781, |
| "kl": 0.095703125, |
| "learning_rate": 7.564229249011858e-07, |
| "loss": 0.0038, |
| "reward": 2.872288703918457, |
| "reward_std": 0.0004953596508130431, |
| "rewards/accuracy_reward_stage2": 0.8722887635231018, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.0625, |
| "step": 987 |
| }, |
| { |
| "completion_length": 92.09375, |
| "epoch": 0.2440711462450593, |
| "grad_norm": 2.5955491599396074, |
| "kl": 0.0712890625, |
| "learning_rate": 7.561758893280632e-07, |
| "loss": 0.0029, |
| "reward": 2.7611703872680664, |
| "reward_std": 0.0292732622474432, |
| "rewards/accuracy_reward_stage2": 0.7611702680587769, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 988 |
| }, |
| { |
| "completion_length": 124.15625, |
| "epoch": 0.24431818181818182, |
| "grad_norm": 2.4770893409000316, |
| "kl": 0.07470703125, |
| "learning_rate": 7.559288537549407e-07, |
| "loss": 0.003, |
| "reward": 2.566910743713379, |
| "reward_std": 0.079023078083992, |
| "rewards/accuracy_reward_stage2": 0.5669107437133789, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.65625, |
| "step": 989 |
| }, |
| { |
| "completion_length": 88.0, |
| "epoch": 0.24456521739130435, |
| "grad_norm": 4.556477660838416, |
| "kl": 0.0869140625, |
| "learning_rate": 7.556818181818182e-07, |
| "loss": 0.0035, |
| "reward": 2.6633074283599854, |
| "reward_std": 0.09606172144412994, |
| "rewards/accuracy_reward_stage2": 0.6633073687553406, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.125, |
| "step": 990 |
| }, |
| { |
| "completion_length": 104.796875, |
| "epoch": 0.24481225296442688, |
| "grad_norm": 2.6122086287602713, |
| "kl": 0.07666015625, |
| "learning_rate": 7.554347826086956e-07, |
| "loss": 0.0031, |
| "reward": 2.571780204772949, |
| "reward_std": 0.07648254930973053, |
| "rewards/accuracy_reward_stage2": 0.5717802047729492, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.453125, |
| "step": 991 |
| }, |
| { |
| "completion_length": 76.59375, |
| "epoch": 0.2450592885375494, |
| "grad_norm": 4.7804674694975775, |
| "kl": 0.07080078125, |
| "learning_rate": 7.551877470355731e-07, |
| "loss": 0.0028, |
| "reward": 2.714232921600342, |
| "reward_std": 0.06315543502569199, |
| "rewards/accuracy_reward_stage2": 0.7142329216003418, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.109375, |
| "step": 992 |
| }, |
| { |
| "completion_length": 95.734375, |
| "epoch": 0.24530632411067194, |
| "grad_norm": 2.5192200109285294, |
| "kl": 0.08984375, |
| "learning_rate": 7.549407114624505e-07, |
| "loss": 0.0036, |
| "reward": 2.724212408065796, |
| "reward_std": 0.016075868159532547, |
| "rewards/accuracy_reward_stage2": 0.7242124676704407, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 993 |
| }, |
| { |
| "completion_length": 91.53125, |
| "epoch": 0.24555335968379446, |
| "grad_norm": 3.1808436621622116, |
| "kl": 0.07470703125, |
| "learning_rate": 7.546936758893279e-07, |
| "loss": 0.003, |
| "reward": 2.5958046913146973, |
| "reward_std": 0.02057287096977234, |
| "rewards/accuracy_reward_stage2": 0.595804750919342, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 994 |
| }, |
| { |
| "completion_length": 88.078125, |
| "epoch": 0.245800395256917, |
| "grad_norm": 3.075094133501221, |
| "kl": 0.08544921875, |
| "learning_rate": 7.544466403162056e-07, |
| "loss": 0.0034, |
| "reward": 2.7611076831817627, |
| "reward_std": 0.09097757935523987, |
| "rewards/accuracy_reward_stage2": 0.7611076831817627, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.234375, |
| "step": 995 |
| }, |
| { |
| "completion_length": 106.84375, |
| "epoch": 0.24604743083003952, |
| "grad_norm": 2.2773682917414493, |
| "kl": 0.060546875, |
| "learning_rate": 7.54199604743083e-07, |
| "loss": 0.0024, |
| "reward": 2.7901453971862793, |
| "reward_std": 0.0059069436974823475, |
| "rewards/accuracy_reward_stage2": 0.7901455163955688, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.328125, |
| "step": 996 |
| }, |
| { |
| "completion_length": 103.9375, |
| "epoch": 0.24629446640316205, |
| "grad_norm": 3.5787808365072875, |
| "kl": 0.09765625, |
| "learning_rate": 7.539525691699604e-07, |
| "loss": 0.0039, |
| "reward": 2.538865566253662, |
| "reward_std": 0.17112267017364502, |
| "rewards/accuracy_reward_stage2": 0.5388656258583069, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.375, |
| "step": 997 |
| }, |
| { |
| "completion_length": 122.25, |
| "epoch": 0.24654150197628458, |
| "grad_norm": 3.092097391385804, |
| "kl": 0.0625, |
| "learning_rate": 7.537055335968379e-07, |
| "loss": 0.0025, |
| "reward": 2.701768636703491, |
| "reward_std": 0.07486184686422348, |
| "rewards/accuracy_reward_stage2": 0.7017685770988464, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.453125, |
| "step": 998 |
| }, |
| { |
| "completion_length": 106.09375, |
| "epoch": 0.2467885375494071, |
| "grad_norm": 3.818115552269006, |
| "kl": 0.08837890625, |
| "learning_rate": 7.534584980237154e-07, |
| "loss": 0.0035, |
| "reward": 2.655691385269165, |
| "reward_std": 0.1008155569434166, |
| "rewards/accuracy_reward_stage2": 0.6556915044784546, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.390625, |
| "step": 999 |
| }, |
| { |
| "completion_length": 75.1875, |
| "epoch": 0.24703557312252963, |
| "grad_norm": 1.9883109917462498, |
| "kl": 0.076171875, |
| "learning_rate": 7.532114624505929e-07, |
| "loss": 0.003, |
| "reward": 2.7738916873931885, |
| "reward_std": 0.03292723000049591, |
| "rewards/accuracy_reward_stage2": 0.7738916873931885, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.171875, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 115.71875, |
| "epoch": 0.24728260869565216, |
| "grad_norm": 3.1982546373064786, |
| "kl": 0.09033203125, |
| "learning_rate": 7.529644268774703e-07, |
| "loss": 0.0036, |
| "reward": 2.843109607696533, |
| "reward_std": 0.12261004000902176, |
| "rewards/accuracy_reward_stage2": 0.8431097269058228, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.484375, |
| "step": 1001 |
| }, |
| { |
| "completion_length": 101.109375, |
| "epoch": 0.2475296442687747, |
| "grad_norm": 2.574042445943342, |
| "kl": 0.06689453125, |
| "learning_rate": 7.527173913043477e-07, |
| "loss": 0.0027, |
| "reward": 2.5342187881469727, |
| "reward_std": 0.12293697893619537, |
| "rewards/accuracy_reward_stage2": 0.5420314073562622, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.390625, |
| "step": 1002 |
| }, |
| { |
| "completion_length": 93.890625, |
| "epoch": 0.24777667984189725, |
| "grad_norm": 0.19752743577346163, |
| "kl": 0.0615234375, |
| "learning_rate": 7.524703557312253e-07, |
| "loss": 0.0025, |
| "reward": 2.5, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 0.5, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.3125, |
| "step": 1003 |
| }, |
| { |
| "completion_length": 74.140625, |
| "epoch": 0.24802371541501977, |
| "grad_norm": 3.25297354456302, |
| "kl": 0.115234375, |
| "learning_rate": 7.522233201581028e-07, |
| "loss": 0.0046, |
| "reward": 2.7395200729370117, |
| "reward_std": 0.0691099762916565, |
| "rewards/accuracy_reward_stage2": 0.7395201921463013, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 1004 |
| }, |
| { |
| "completion_length": 101.265625, |
| "epoch": 0.2482707509881423, |
| "grad_norm": 2.786549692375035, |
| "kl": 0.07421875, |
| "learning_rate": 7.519762845849802e-07, |
| "loss": 0.003, |
| "reward": 2.7277092933654785, |
| "reward_std": 0.08703543990850449, |
| "rewards/accuracy_reward_stage2": 0.7277094125747681, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.34375, |
| "step": 1005 |
| }, |
| { |
| "completion_length": 73.265625, |
| "epoch": 0.24851778656126483, |
| "grad_norm": 2.9933686564396313, |
| "kl": 0.06396484375, |
| "learning_rate": 7.517292490118577e-07, |
| "loss": 0.0026, |
| "reward": 2.625016212463379, |
| "reward_std": 0.05336294695734978, |
| "rewards/accuracy_reward_stage2": 0.6250161528587341, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.140625, |
| "step": 1006 |
| }, |
| { |
| "completion_length": 90.71875, |
| "epoch": 0.24876482213438736, |
| "grad_norm": 3.37668174197228, |
| "kl": 0.07275390625, |
| "learning_rate": 7.514822134387352e-07, |
| "loss": 0.0029, |
| "reward": 2.533514976501465, |
| "reward_std": 0.07456710934638977, |
| "rewards/accuracy_reward_stage2": 0.5335150957107544, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 1007 |
| }, |
| { |
| "completion_length": 109.453125, |
| "epoch": 0.2490118577075099, |
| "grad_norm": 3.9869505928266453, |
| "kl": 0.10546875, |
| "learning_rate": 7.512351778656126e-07, |
| "loss": 0.0042, |
| "reward": 2.6633753776550293, |
| "reward_std": 0.08245876431465149, |
| "rewards/accuracy_reward_stage2": 0.6633754372596741, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 1008 |
| }, |
| { |
| "completion_length": 101.828125, |
| "epoch": 0.24925889328063242, |
| "grad_norm": 3.3323257698984188, |
| "kl": 0.06884765625, |
| "learning_rate": 7.509881422924901e-07, |
| "loss": 0.0028, |
| "reward": 2.789440870285034, |
| "reward_std": 0.11122827976942062, |
| "rewards/accuracy_reward_stage2": 0.7894407510757446, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.28125, |
| "step": 1009 |
| }, |
| { |
| "completion_length": 154.296875, |
| "epoch": 0.24950592885375494, |
| "grad_norm": 3.1765086057799063, |
| "kl": 0.07373046875, |
| "learning_rate": 7.507411067193675e-07, |
| "loss": 0.0029, |
| "reward": 2.5902161598205566, |
| "reward_std": 0.2405627965927124, |
| "rewards/accuracy_reward_stage2": 0.615216076374054, |
| "rewards/format_reward_all_stage": 1.975000023841858, |
| "scores/refine_times": 2.078125, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 113.5625, |
| "epoch": 0.24975296442687747, |
| "grad_norm": 3.620470729082483, |
| "kl": 0.0830078125, |
| "learning_rate": 7.50494071146245e-07, |
| "loss": 0.0033, |
| "reward": 2.6796677112579346, |
| "reward_std": 0.16890506446361542, |
| "rewards/accuracy_reward_stage2": 0.6796677112579346, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.5625, |
| "step": 1011 |
| }, |
| { |
| "completion_length": 124.171875, |
| "epoch": 0.25, |
| "grad_norm": 3.01631508432083, |
| "kl": 0.10400390625, |
| "learning_rate": 7.502470355731225e-07, |
| "loss": 0.0042, |
| "reward": 2.738471269607544, |
| "reward_std": 0.10241246223449707, |
| "rewards/accuracy_reward_stage2": 0.7384714484214783, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.859375, |
| "step": 1012 |
| }, |
| { |
| "completion_length": 148.671875, |
| "epoch": 0.25024703557312256, |
| "grad_norm": 1.4692891485044033, |
| "kl": 0.076171875, |
| "learning_rate": 7.5e-07, |
| "loss": 0.003, |
| "reward": 2.6656787395477295, |
| "reward_std": 0.08242332935333252, |
| "rewards/accuracy_reward_stage2": 0.6656786799430847, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.984375, |
| "step": 1013 |
| }, |
| { |
| "completion_length": 130.828125, |
| "epoch": 0.25049407114624506, |
| "grad_norm": 2.7604690193081596, |
| "kl": 0.07666015625, |
| "learning_rate": 7.497529644268775e-07, |
| "loss": 0.0031, |
| "reward": 2.571310520172119, |
| "reward_std": 0.060206782072782516, |
| "rewards/accuracy_reward_stage2": 0.5713105201721191, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.875, |
| "step": 1014 |
| }, |
| { |
| "completion_length": 90.125, |
| "epoch": 0.2507411067193676, |
| "grad_norm": 3.249774020301462, |
| "kl": 0.0703125, |
| "learning_rate": 7.495059288537549e-07, |
| "loss": 0.0028, |
| "reward": 2.7852025032043457, |
| "reward_std": 0.005425943061709404, |
| "rewards/accuracy_reward_stage2": 0.7852025628089905, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.265625, |
| "step": 1015 |
| }, |
| { |
| "completion_length": 90.21875, |
| "epoch": 0.2509881422924901, |
| "grad_norm": 3.406147860849275, |
| "kl": 0.0791015625, |
| "learning_rate": 7.492588932806324e-07, |
| "loss": 0.0032, |
| "reward": 2.6671500205993652, |
| "reward_std": 0.06550855934619904, |
| "rewards/accuracy_reward_stage2": 0.6671501994132996, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.296875, |
| "step": 1016 |
| }, |
| { |
| "completion_length": 119.359375, |
| "epoch": 0.25123517786561267, |
| "grad_norm": 2.5853308765349716, |
| "kl": 0.0634765625, |
| "learning_rate": 7.490118577075099e-07, |
| "loss": 0.0025, |
| "reward": 2.4825072288513184, |
| "reward_std": 0.08852247148752213, |
| "rewards/accuracy_reward_stage2": 0.4825071692466736, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.46875, |
| "step": 1017 |
| }, |
| { |
| "completion_length": 108.234375, |
| "epoch": 0.25148221343873517, |
| "grad_norm": 0.17500328397226975, |
| "kl": 0.04931640625, |
| "learning_rate": 7.487648221343873e-07, |
| "loss": 0.002, |
| "reward": 3.0, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward_stage2": 1.0, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.421875, |
| "step": 1018 |
| }, |
| { |
| "completion_length": 114.9375, |
| "epoch": 0.2517292490118577, |
| "grad_norm": 1.8677219538650585, |
| "kl": 0.06298828125, |
| "learning_rate": 7.485177865612647e-07, |
| "loss": 0.0025, |
| "reward": 2.8387131690979004, |
| "reward_std": 0.03201249614357948, |
| "rewards/accuracy_reward_stage2": 0.8387130498886108, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.515625, |
| "step": 1019 |
| }, |
| { |
| "completion_length": 116.5, |
| "epoch": 0.2519762845849802, |
| "grad_norm": 3.918756506685844, |
| "kl": 0.08935546875, |
| "learning_rate": 7.482707509881423e-07, |
| "loss": 0.0036, |
| "reward": 2.706343650817871, |
| "reward_std": 0.1193799301981926, |
| "rewards/accuracy_reward_stage2": 0.7688437700271606, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.546875, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 103.046875, |
| "epoch": 0.2522233201581028, |
| "grad_norm": 2.2787870909974424, |
| "kl": 0.099609375, |
| "learning_rate": 7.480237154150197e-07, |
| "loss": 0.004, |
| "reward": 2.6051716804504395, |
| "reward_std": 0.0634067952632904, |
| "rewards/accuracy_reward_stage2": 0.6114215850830078, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.546875, |
| "step": 1021 |
| }, |
| { |
| "completion_length": 140.375, |
| "epoch": 0.2524703557312253, |
| "grad_norm": 1.5786423143571975, |
| "kl": 0.0634765625, |
| "learning_rate": 7.477766798418971e-07, |
| "loss": 0.0025, |
| "reward": 2.8234591484069824, |
| "reward_std": 0.1224951446056366, |
| "rewards/accuracy_reward_stage2": 0.8234590888023376, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.90625, |
| "step": 1022 |
| }, |
| { |
| "completion_length": 130.015625, |
| "epoch": 0.25271739130434784, |
| "grad_norm": 3.6531054074473466, |
| "kl": 0.0654296875, |
| "learning_rate": 7.475296442687747e-07, |
| "loss": 0.0026, |
| "reward": 2.7177672386169434, |
| "reward_std": 0.05522045120596886, |
| "rewards/accuracy_reward_stage2": 0.7177671194076538, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.53125, |
| "step": 1023 |
| }, |
| { |
| "completion_length": 138.125, |
| "epoch": 0.25296442687747034, |
| "grad_norm": 1.4829799008640552, |
| "kl": 0.09619140625, |
| "learning_rate": 7.472826086956522e-07, |
| "loss": 0.0038, |
| "reward": 2.8771204948425293, |
| "reward_std": 0.04724188148975372, |
| "rewards/accuracy_reward_stage2": 0.8911830186843872, |
| "rewards/format_reward_all_stage": 1.985937476158142, |
| "scores/refine_times": 1.96875, |
| "step": 1024 |
| }, |
| { |
| "completion_length": 127.140625, |
| "epoch": 0.2532114624505929, |
| "grad_norm": 1.5881287321745357, |
| "kl": 0.0771484375, |
| "learning_rate": 7.470355731225296e-07, |
| "loss": 0.0031, |
| "reward": 2.7447822093963623, |
| "reward_std": 0.0155083192512393, |
| "rewards/accuracy_reward_stage2": 0.7447823286056519, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.546875, |
| "step": 1025 |
| }, |
| { |
| "completion_length": 84.609375, |
| "epoch": 0.2534584980237154, |
| "grad_norm": 3.0329888153387974, |
| "kl": 0.07080078125, |
| "learning_rate": 7.467885375494071e-07, |
| "loss": 0.0028, |
| "reward": 2.5835673809051514, |
| "reward_std": 0.016321806237101555, |
| "rewards/accuracy_reward_stage2": 0.5835674405097961, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.203125, |
| "step": 1026 |
| }, |
| { |
| "completion_length": 115.34375, |
| "epoch": 0.25370553359683795, |
| "grad_norm": 3.298502539047111, |
| "kl": 0.0654296875, |
| "learning_rate": 7.465415019762845e-07, |
| "loss": 0.0026, |
| "reward": 2.6484375, |
| "reward_std": 0.12232004851102829, |
| "rewards/accuracy_reward_stage2": 0.65625, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.515625, |
| "step": 1027 |
| }, |
| { |
| "completion_length": 131.609375, |
| "epoch": 0.25395256916996045, |
| "grad_norm": 1.8281242653098133, |
| "kl": 0.05712890625, |
| "learning_rate": 7.462944664031621e-07, |
| "loss": 0.0023, |
| "reward": 2.6590847969055176, |
| "reward_std": 0.05560882389545441, |
| "rewards/accuracy_reward_stage2": 0.6590847373008728, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.625, |
| "step": 1028 |
| }, |
| { |
| "completion_length": 123.59375, |
| "epoch": 0.254199604743083, |
| "grad_norm": 4.069162484990196, |
| "kl": 0.078125, |
| "learning_rate": 7.460474308300395e-07, |
| "loss": 0.0031, |
| "reward": 2.5520071983337402, |
| "reward_std": 0.1841985285282135, |
| "rewards/accuracy_reward_stage2": 0.5582571029663086, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.625, |
| "step": 1029 |
| }, |
| { |
| "completion_length": 110.25, |
| "epoch": 0.2544466403162055, |
| "grad_norm": 3.208104168556757, |
| "kl": 0.1005859375, |
| "learning_rate": 7.458003952569169e-07, |
| "loss": 0.004, |
| "reward": 2.6219356060028076, |
| "reward_std": 0.02025969699025154, |
| "rewards/accuracy_reward_stage2": 0.6219354867935181, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.390625, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 92.171875, |
| "epoch": 0.25469367588932806, |
| "grad_norm": 2.5573874648011263, |
| "kl": 0.07666015625, |
| "learning_rate": 7.455533596837944e-07, |
| "loss": 0.0031, |
| "reward": 2.6007657051086426, |
| "reward_std": 0.07939323782920837, |
| "rewards/accuracy_reward_stage2": 0.6007658243179321, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.1875, |
| "step": 1031 |
| }, |
| { |
| "completion_length": 114.59375, |
| "epoch": 0.2549407114624506, |
| "grad_norm": 1.6819453240515354, |
| "kl": 0.06884765625, |
| "learning_rate": 7.45306324110672e-07, |
| "loss": 0.0028, |
| "reward": 2.874180555343628, |
| "reward_std": 0.039452213793992996, |
| "rewards/accuracy_reward_stage2": 0.8845971822738647, |
| "rewards/format_reward_all_stage": 1.9895832538604736, |
| "scores/refine_times": 1.546875, |
| "step": 1032 |
| }, |
| { |
| "completion_length": 94.640625, |
| "epoch": 0.2551877470355731, |
| "grad_norm": 3.445762215503828, |
| "kl": 0.09033203125, |
| "learning_rate": 7.450592885375494e-07, |
| "loss": 0.0036, |
| "reward": 2.756856918334961, |
| "reward_std": 0.0909591019153595, |
| "rewards/accuracy_reward_stage2": 0.7568570375442505, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.375, |
| "step": 1033 |
| }, |
| { |
| "completion_length": 117.0625, |
| "epoch": 0.2554347826086957, |
| "grad_norm": 3.6510651933840377, |
| "kl": 0.07958984375, |
| "learning_rate": 7.448122529644269e-07, |
| "loss": 0.0032, |
| "reward": 2.7670958042144775, |
| "reward_std": 0.1569576859474182, |
| "rewards/accuracy_reward_stage2": 0.7733457684516907, |
| "rewards/format_reward_all_stage": 1.993749976158142, |
| "scores/refine_times": 1.40625, |
| "step": 1034 |
| }, |
| { |
| "completion_length": 84.296875, |
| "epoch": 0.2556818181818182, |
| "grad_norm": 2.941993649127529, |
| "kl": 0.10400390625, |
| "learning_rate": 7.445652173913043e-07, |
| "loss": 0.0042, |
| "reward": 2.608677625656128, |
| "reward_std": 0.01940278336405754, |
| "rewards/accuracy_reward_stage2": 0.6086776256561279, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 1035 |
| }, |
| { |
| "completion_length": 141.015625, |
| "epoch": 0.25592885375494073, |
| "grad_norm": 2.011819562020041, |
| "kl": 0.091796875, |
| "learning_rate": 7.443181818181817e-07, |
| "loss": 0.0037, |
| "reward": 2.705474853515625, |
| "reward_std": 0.15190356969833374, |
| "rewards/accuracy_reward_stage2": 0.7408912777900696, |
| "rewards/format_reward_all_stage": 1.964583396911621, |
| "scores/refine_times": 1.75, |
| "step": 1036 |
| }, |
| { |
| "completion_length": 112.796875, |
| "epoch": 0.25617588932806323, |
| "grad_norm": 1.4643025390300515, |
| "kl": 0.099609375, |
| "learning_rate": 7.440711462450593e-07, |
| "loss": 0.004, |
| "reward": 2.752537727355957, |
| "reward_std": 0.04632541537284851, |
| "rewards/accuracy_reward_stage2": 0.7603504061698914, |
| "rewards/format_reward_all_stage": 1.9921875, |
| "scores/refine_times": 1.578125, |
| "step": 1037 |
| }, |
| { |
| "completion_length": 119.546875, |
| "epoch": 0.2564229249011858, |
| "grad_norm": 2.068330713002787, |
| "kl": 0.0654296875, |
| "learning_rate": 7.438241106719367e-07, |
| "loss": 0.0026, |
| "reward": 2.9245011806488037, |
| "reward_std": 0.0003982662165071815, |
| "rewards/accuracy_reward_stage2": 0.9245011806488037, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.484375, |
| "step": 1038 |
| }, |
| { |
| "completion_length": 76.03125, |
| "epoch": 0.2566699604743083, |
| "grad_norm": 3.220272666537123, |
| "kl": 0.08154296875, |
| "learning_rate": 7.435770750988141e-07, |
| "loss": 0.0033, |
| "reward": 2.675968885421753, |
| "reward_std": 0.027604416012763977, |
| "rewards/accuracy_reward_stage2": 0.6759688854217529, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.09375, |
| "step": 1039 |
| }, |
| { |
| "completion_length": 114.046875, |
| "epoch": 0.25691699604743085, |
| "grad_norm": 2.6078505140682116, |
| "kl": 0.09619140625, |
| "learning_rate": 7.433300395256916e-07, |
| "loss": 0.0039, |
| "reward": 2.783844232559204, |
| "reward_std": 0.16356155276298523, |
| "rewards/accuracy_reward_stage2": 0.7838441133499146, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.53125, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 104.578125, |
| "epoch": 0.25716403162055335, |
| "grad_norm": 2.81805356497672, |
| "kl": 0.09228515625, |
| "learning_rate": 7.430830039525692e-07, |
| "loss": 0.0037, |
| "reward": 2.5050501823425293, |
| "reward_std": 0.08182928711175919, |
| "rewards/accuracy_reward_stage2": 0.5050500631332397, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.4375, |
| "step": 1041 |
| }, |
| { |
| "completion_length": 95.8125, |
| "epoch": 0.2574110671936759, |
| "grad_norm": 2.0889313582541154, |
| "kl": 0.08154296875, |
| "learning_rate": 7.428359683794467e-07, |
| "loss": 0.0033, |
| "reward": 2.8057987689971924, |
| "reward_std": 0.03332037478685379, |
| "rewards/accuracy_reward_stage2": 0.8057988882064819, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.40625, |
| "step": 1042 |
| }, |
| { |
| "completion_length": 105.6875, |
| "epoch": 0.2576581027667984, |
| "grad_norm": 3.2267311653851065, |
| "kl": 0.1650390625, |
| "learning_rate": 7.425889328063241e-07, |
| "loss": 0.0066, |
| "reward": 2.6126112937927246, |
| "reward_std": 0.17364341020584106, |
| "rewards/accuracy_reward_stage2": 0.7454237937927246, |
| "rewards/format_reward_all_stage": 1.8671875, |
| "scores/refine_times": 1.5625, |
| "step": 1043 |
| }, |
| { |
| "completion_length": 100.609375, |
| "epoch": 0.25790513833992096, |
| "grad_norm": 1.925286487853523, |
| "kl": 0.09912109375, |
| "learning_rate": 7.423418972332015e-07, |
| "loss": 0.004, |
| "reward": 2.8033337593078613, |
| "reward_std": 0.09121645987033844, |
| "rewards/accuracy_reward_stage2": 0.8189588189125061, |
| "rewards/format_reward_all_stage": 1.984375, |
| "scores/refine_times": 1.4375, |
| "step": 1044 |
| }, |
| { |
| "completion_length": 91.796875, |
| "epoch": 0.25815217391304346, |
| "grad_norm": 0.7462047196287118, |
| "kl": 0.0908203125, |
| "learning_rate": 7.420948616600791e-07, |
| "loss": 0.0036, |
| "reward": 2.6150918006896973, |
| "reward_std": 0.02063605561852455, |
| "rewards/accuracy_reward_stage2": 0.6150918006896973, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.25, |
| "step": 1045 |
| }, |
| { |
| "completion_length": 118.390625, |
| "epoch": 0.258399209486166, |
| "grad_norm": 2.5501600272917697, |
| "kl": 0.08837890625, |
| "learning_rate": 7.418478260869565e-07, |
| "loss": 0.0035, |
| "reward": 2.7188854217529297, |
| "reward_std": 0.07711507380008698, |
| "rewards/accuracy_reward_stage2": 0.7293022274971008, |
| "rewards/format_reward_all_stage": 1.9895833730697632, |
| "scores/refine_times": 1.546875, |
| "step": 1046 |
| }, |
| { |
| "completion_length": 96.46875, |
| "epoch": 0.2586462450592885, |
| "grad_norm": 2.45339267390217, |
| "kl": 0.0869140625, |
| "learning_rate": 7.416007905138339e-07, |
| "loss": 0.0035, |
| "reward": 2.712198257446289, |
| "reward_std": 0.06328752636909485, |
| "rewards/accuracy_reward_stage2": 0.7121983766555786, |
| "rewards/format_reward_all_stage": 2.0, |
| "scores/refine_times": 1.359375, |
| "step": 1047 |
| }, |
| { |
| "completion_length": 89.875, |
| "epoch": 0.25889328063241107, |
| "grad_norm": 3.8252492083394243, |
| "kl": 0.09814453125, |
| "learning_rate": 7.413537549407114e-07, |
| "loss": 0.0039, |
| "reward": 2.5751380920410156, |
| "reward_std": 0.16470584273338318, |
| "rewards/accuracy_reward_stage2": 0.6376380920410156, |
| "rewards/format_reward_all_stage": 1.9375, |
| "scores/refine_times": 1.203125, |
| "step": 1048 |
| }, |
| { |
| "completion_length": 138.625, |
| "epoch": 0.25914031620553357, |
| "grad_norm": 2.602693075049154, |
| "kl": 0.0869140625, |
| "learning_rate": 7.411067193675889e-07, |
| "loss": 0.0035, |
| "reward": 2.7950634956359863, |
| "reward_std": 0.13027089834213257, |
| "rewards/accuracy_reward_stage2": 0.8653761148452759, |
| "rewards/format_reward_all_stage": 1.9296875, |
| "scores/refine_times": 1.6875, |
| "step": 1049 |
| }, |
| { |
| "completion_length": 105.390625, |
| "epoch": 0.25938735177865613, |
| "grad_norm": 2.8055130377676445, |
| "kl": 0.0810546875, |
| "learning_rate": 7.408596837944664e-07, |
| "loss": 0.0032, |
| "reward": 2.751202344894409, |
| "reward_std": 0.08926315605640411, |
| "rewards/accuracy_reward_stage2": 0.818910539150238, |
| "rewards/format_reward_all_stage": 1.9322917461395264, |
| "scores/refine_times": 1.359375, |
| "step": 1050 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 4048, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|