diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5370 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9985007496251874, + "eval_steps": 500, + "global_step": 333, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 125.27083587646484, + "epoch": 0.0029985007496251873, + "grad_norm": 2.316068982886434, + "kl": 0.0, + "learning_rate": 5.882352941176471e-07, + "loss": -0.139, + "reward": 0.4294887036085129, + "reward_std": 0.6044371202588081, + "rewards/accuracy_reward": 0.09375000419095159, + "rewards/reasoning_steps_reward": 0.0920138955116272, + "rewards/repetition_penalty_reward": -0.05184812843799591, + "rewards/tag_count_reward": 0.295572929084301, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 129.07812881469727, + "epoch": 0.005997001499250375, + "grad_norm": 2.528986939979676, + "kl": 0.0, + "learning_rate": 1.1764705882352942e-06, + "loss": -0.1784, + "reward": 0.4756753593683243, + "reward_std": 0.6413701921701431, + "rewards/accuracy_reward": 0.11979167070239782, + "rewards/reasoning_steps_reward": 0.1041666753590107, + "rewards/repetition_penalty_reward": -0.04776214715093374, + "rewards/tag_count_reward": 0.299479179084301, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.48437881469727, + "epoch": 0.008995502248875561, + "grad_norm": 2.8435255560528976, + "kl": 0.00046443939208984375, + "learning_rate": 1.7647058823529414e-06, + "loss": -0.1512, + "reward": 0.4948234558105469, + "reward_std": 0.6535268872976303, + "rewards/accuracy_reward": 0.1510416679084301, + "rewards/reasoning_steps_reward": 0.07118056109175086, + "rewards/repetition_penalty_reward": -0.04380502179265022, + "rewards/tag_count_reward": 0.3164062649011612, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.15625190734863, + "epoch": 0.01199400299850075, + "grad_norm": 2.6137621112730782, + "kl": 0.0006656646728515625, + "learning_rate": 2.3529411764705885e-06, + "loss": -0.0962, + "reward": 0.39275161921977997, + "reward_std": 0.5565674006938934, + "rewards/accuracy_reward": 0.0937500037252903, + "rewards/reasoning_steps_reward": 0.0590277835726738, + "rewards/repetition_penalty_reward": -0.0386720122769475, + "rewards/tag_count_reward": 0.2786458358168602, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.72396278381348, + "epoch": 0.014992503748125937, + "grad_norm": 2.3547773173814814, + "kl": 0.0024394989013671875, + "learning_rate": 2.9411764705882355e-06, + "loss": -0.125, + "reward": 0.5311619490385056, + "reward_std": 0.61662757396698, + "rewards/accuracy_reward": 0.14062500139698386, + "rewards/reasoning_steps_reward": 0.0885416716337204, + "rewards/repetition_penalty_reward": -0.05737974401563406, + "rewards/tag_count_reward": 0.3593750074505806, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 138.2447967529297, + "epoch": 0.017991004497751123, + "grad_norm": 2.1062805811826757, + "kl": 0.055267333984375, + "learning_rate": 3.529411764705883e-06, + "loss": -0.0627, + "reward": 0.7481685727834702, + "reward_std": 0.7483109384775162, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.06076389271765947, + "rewards/repetition_penalty_reward": -0.06129324156790972, + "rewards/tag_count_reward": 0.4778645858168602, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.7708396911621, + "epoch": 0.020989505247376312, + "grad_norm": 91.80938835633462, + "kl": 2.9296875, + "learning_rate": 4.11764705882353e-06, + "loss": 0.0604, + "reward": 1.0586681962013245, + "reward_std": 0.6929789483547211, + "rewards/accuracy_reward": 0.3489583358168602, + "rewards/reasoning_steps_reward": 0.1371527910232544, + "rewards/repetition_penalty_reward": -0.06676589138805866, + "rewards/tag_count_reward": 0.6393229365348816, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.0104217529297, + "epoch": 0.0239880059970015, + "grad_norm": 80.12581730059306, + "kl": 2.7734375, + "learning_rate": 4.705882352941177e-06, + "loss": 0.0618, + "reward": 1.2028335630893707, + "reward_std": 0.6846612095832825, + "rewards/accuracy_reward": 0.4687500074505806, + "rewards/reasoning_steps_reward": 0.1232638955116272, + "rewards/repetition_penalty_reward": -0.06626365892589092, + "rewards/tag_count_reward": 0.6770833730697632, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completion_length": 161.4791717529297, + "epoch": 0.026986506746626688, + "grad_norm": 25.478259616973777, + "kl": 0.8330078125, + "learning_rate": 5.294117647058824e-06, + "loss": -0.0184, + "reward": 1.3118138909339905, + "reward_std": 0.7422287911176682, + "rewards/accuracy_reward": 0.4843750149011612, + "rewards/reasoning_steps_reward": 0.18055557273328304, + "rewards/repetition_penalty_reward": -0.07056461833417416, + "rewards/tag_count_reward": 0.7174479365348816, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completion_length": 165.47916793823242, + "epoch": 0.029985007496251874, + "grad_norm": 3.3317610347487396, + "kl": 0.205810546875, + "learning_rate": 5.882352941176471e-06, + "loss": -0.0374, + "reward": 1.234879344701767, + "reward_std": 0.7370101809501648, + "rewards/accuracy_reward": 0.432291679084301, + "rewards/reasoning_steps_reward": 0.1458333469927311, + "rewards/repetition_penalty_reward": -0.06069365330040455, + "rewards/tag_count_reward": 0.7174479365348816, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.56771087646484, + "epoch": 0.03298350824587706, + "grad_norm": 1.2483918606755664, + "kl": 0.08367919921875, + "learning_rate": 6.470588235294119e-06, + "loss": -0.064, + "reward": 1.2948878109455109, + "reward_std": 0.7816713899374008, + "rewards/accuracy_reward": 0.3489583432674408, + "rewards/reasoning_steps_reward": 0.2638889104127884, + "rewards/repetition_penalty_reward": -0.07577204331755638, + "rewards/tag_count_reward": 0.7578125149011612, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.20312881469727, + "epoch": 0.035982008995502246, + "grad_norm": 1.2630629002924778, + "kl": 0.05889892578125, + "learning_rate": 7.058823529411766e-06, + "loss": -0.0437, + "reward": 1.472415030002594, + "reward_std": 0.7569083422422409, + "rewards/accuracy_reward": 0.473958358168602, + "rewards/reasoning_steps_reward": 0.2899305671453476, + "rewards/repetition_penalty_reward": -0.061005206778645515, + "rewards/tag_count_reward": 0.7695312649011612, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.9791717529297, + "epoch": 0.038980509745127435, + "grad_norm": 1.1307015314913393, + "kl": 0.04443359375, + "learning_rate": 7.647058823529411e-06, + "loss": -0.0193, + "reward": 1.6883811056613922, + "reward_std": 0.7850492298603058, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.392361119389534, + "rewards/repetition_penalty_reward": -0.06986541766673326, + "rewards/tag_count_reward": 0.8033854216337204, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.8385467529297, + "epoch": 0.041979010494752625, + "grad_norm": 1.0685540714109272, + "kl": 0.08837890625, + "learning_rate": 8.23529411764706e-06, + "loss": -0.0437, + "reward": 1.8284152746200562, + "reward_std": 0.7906496375799179, + "rewards/accuracy_reward": 0.5468750149011612, + "rewards/reasoning_steps_reward": 0.5121527835726738, + "rewards/repetition_penalty_reward": -0.07045630738139153, + "rewards/tag_count_reward": 0.8398437649011612, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.41667556762695, + "epoch": 0.044977511244377814, + "grad_norm": 0.9074510920159916, + "kl": 0.08544921875, + "learning_rate": 8.823529411764707e-06, + "loss": -0.0056, + "reward": 2.034987300634384, + "reward_std": 0.6369659751653671, + "rewards/accuracy_reward": 0.5000000074505806, + "rewards/reasoning_steps_reward": 0.7569444626569748, + "rewards/repetition_penalty_reward": -0.10607173293828964, + "rewards/tag_count_reward": 0.884114608168602, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.24480438232422, + "epoch": 0.047976011994003, + "grad_norm": 0.958833819569195, + "kl": 0.107177734375, + "learning_rate": 9.411764705882354e-06, + "loss": 0.064, + "reward": 2.3543498516082764, + "reward_std": 0.5774905681610107, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.833333358168602, + "rewards/repetition_penalty_reward": -0.09486887603998184, + "rewards/tag_count_reward": 0.9335937649011612, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.5833396911621, + "epoch": 0.050974512743628186, + "grad_norm": 32.488470313797116, + "kl": 0.901611328125, + "learning_rate": 1e-05, + "loss": 0.0729, + "reward": 2.285028785467148, + "reward_std": 0.5724412277340889, + "rewards/accuracy_reward": 0.5885416716337204, + "rewards/reasoning_steps_reward": 0.885416716337204, + "rewards/repetition_penalty_reward": -0.0938775297254324, + "rewards/tag_count_reward": 0.9049479365348816, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.55729293823242, + "epoch": 0.053973013493253376, + "grad_norm": 1.0958281725310235, + "kl": 0.14453125, + "learning_rate": 1.0588235294117648e-05, + "loss": -0.0121, + "reward": 2.4053784608840942, + "reward_std": 0.49531228840351105, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.8975695073604584, + "rewards/repetition_penalty_reward": -0.08724310249090195, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.67708587646484, + "epoch": 0.05697151424287856, + "grad_norm": 1.0829252560391265, + "kl": 0.15185546875, + "learning_rate": 1.1176470588235295e-05, + "loss": -0.0053, + "reward": 2.3993316292762756, + "reward_std": 0.49921783059835434, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.08981773070991039, + "rewards/tag_count_reward": 0.8815104365348816, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.6354217529297, + "epoch": 0.05997001499250375, + "grad_norm": 1.1922172194368157, + "kl": 0.1689453125, + "learning_rate": 1.1764705882352942e-05, + "loss": 0.1106, + "reward": 2.2201938033103943, + "reward_std": 0.5487363934516907, + "rewards/accuracy_reward": 0.4947916716337204, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.09664654545485973, + "rewards/tag_count_reward": 0.8776042014360428, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.5833435058594, + "epoch": 0.06296851574212893, + "grad_norm": 0.9900748090272767, + "kl": 0.1285400390625, + "learning_rate": 1.235294117647059e-05, + "loss": 0.1992, + "reward": 2.4696518182754517, + "reward_std": 0.5737268030643463, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.97743059694767, + "rewards/repetition_penalty_reward": -0.10543505474925041, + "rewards/tag_count_reward": 0.899739608168602, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.5416717529297, + "epoch": 0.06596701649175413, + "grad_norm": 0.9436079749621433, + "kl": 0.150146484375, + "learning_rate": 1.2941176470588238e-05, + "loss": 0.2047, + "reward": 2.5528025031089783, + "reward_std": 0.5636586248874664, + "rewards/accuracy_reward": 0.7447917014360428, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.10822184756398201, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.51563262939453, + "epoch": 0.06896551724137931, + "grad_norm": 3.8939799298403286, + "kl": 0.23046875, + "learning_rate": 1.3529411764705885e-05, + "loss": 0.1481, + "reward": 2.538209021091461, + "reward_std": 0.5256823599338531, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9809028208255768, + "rewards/repetition_penalty_reward": -0.10154793784022331, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.1458511352539, + "epoch": 0.07196401799100449, + "grad_norm": 1.4799872075073064, + "kl": 0.28955078125, + "learning_rate": 1.4117647058823532e-05, + "loss": 0.2011, + "reward": 2.411967933177948, + "reward_std": 0.7274067103862762, + "rewards/accuracy_reward": 0.6822916716337204, + "rewards/reasoning_steps_reward": 0.9496527910232544, + "rewards/repetition_penalty_reward": -0.09758076071739197, + "rewards/tag_count_reward": 0.8776041716337204, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.09375762939453, + "epoch": 0.07496251874062969, + "grad_norm": 1.8716934303767792, + "kl": 0.21533203125, + "learning_rate": 1.4705882352941179e-05, + "loss": 0.2849, + "reward": 2.35833877325058, + "reward_std": 0.6804773062467575, + "rewards/accuracy_reward": 0.6093750223517418, + "rewards/reasoning_steps_reward": 0.9583333879709244, + "rewards/repetition_penalty_reward": -0.10780714452266693, + "rewards/tag_count_reward": 0.8984375149011612, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.9166793823242, + "epoch": 0.07796101949025487, + "grad_norm": 10.274951887282484, + "kl": 0.6142578125, + "learning_rate": 1.5294117647058822e-05, + "loss": 0.3997, + "reward": 2.2658557891845703, + "reward_std": 0.7715227752923965, + "rewards/accuracy_reward": 0.6093750223517418, + "rewards/reasoning_steps_reward": 0.94618059694767, + "rewards/repetition_penalty_reward": -0.0852727573364973, + "rewards/tag_count_reward": 0.7955729514360428, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.1614685058594, + "epoch": 0.08095952023988005, + "grad_norm": 6.823619814725695, + "kl": 1.068359375, + "learning_rate": 1.5882352941176473e-05, + "loss": 0.4192, + "reward": 1.9341484606266022, + "reward_std": 0.9543884545564651, + "rewards/accuracy_reward": 0.4635416716337204, + "rewards/reasoning_steps_reward": 0.8663194328546524, + "rewards/repetition_penalty_reward": -0.06888982094824314, + "rewards/tag_count_reward": 0.673177108168602, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.0052185058594, + "epoch": 0.08395802098950525, + "grad_norm": 5.114037986110308, + "kl": 0.4541015625, + "learning_rate": 1.647058823529412e-05, + "loss": 0.3594, + "reward": 2.284946322441101, + "reward_std": 0.8626722097396851, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.890625, + "rewards/repetition_penalty_reward": -0.0783350057899952, + "rewards/tag_count_reward": 0.8164062798023224, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.8229217529297, + "epoch": 0.08695652173913043, + "grad_norm": 1009.5340319486562, + "kl": 16.3125, + "learning_rate": 1.7058823529411767e-05, + "loss": 1.4554, + "reward": 2.209560751914978, + "reward_std": 0.7678176611661911, + "rewards/accuracy_reward": 0.5781250149011612, + "rewards/reasoning_steps_reward": 0.897569477558136, + "rewards/repetition_penalty_reward": -0.06561294477432966, + "rewards/tag_count_reward": 0.7994791865348816, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.0572967529297, + "epoch": 0.08995502248875563, + "grad_norm": 23.197023290991723, + "kl": 0.953125, + "learning_rate": 1.7647058823529414e-05, + "loss": 0.2222, + "reward": 1.9122081696987152, + "reward_std": 0.8025388270616531, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.8281250298023224, + "rewards/repetition_penalty_reward": -0.06175030395388603, + "rewards/tag_count_reward": 0.7083333432674408, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.0104217529297, + "epoch": 0.09295352323838081, + "grad_norm": 8.878208799628512, + "kl": 0.9375, + "learning_rate": 1.823529411764706e-05, + "loss": 0.1336, + "reward": 1.718822568655014, + "reward_std": 0.798391655087471, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/reasoning_steps_reward": 0.8263888955116272, + "rewards/repetition_penalty_reward": -0.058087206445634365, + "rewards/tag_count_reward": 0.638020858168602, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.17189025878906, + "epoch": 0.095952023988006, + "grad_norm": 5.191420623812987, + "kl": 2.138671875, + "learning_rate": 1.8823529411764708e-05, + "loss": -0.0731, + "reward": 1.7180909216403961, + "reward_std": 0.795787900686264, + "rewards/accuracy_reward": 0.2604166753590107, + "rewards/reasoning_steps_reward": 0.8732639402151108, + "rewards/repetition_penalty_reward": -0.04579801578074694, + "rewards/tag_count_reward": 0.630208358168602, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.6666793823242, + "epoch": 0.09895052473763119, + "grad_norm": 20.842747943141333, + "kl": 4.875, + "learning_rate": 1.9411764705882355e-05, + "loss": -0.1732, + "reward": 1.663686603307724, + "reward_std": 0.7371216714382172, + "rewards/accuracy_reward": 0.2239583432674408, + "rewards/reasoning_steps_reward": 0.8593750298023224, + "rewards/repetition_penalty_reward": -0.06417796947062016, + "rewards/tag_count_reward": 0.6445312649011612, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.96875762939453, + "epoch": 0.10194902548725637, + "grad_norm": 3.7039312187876177, + "kl": 4.06640625, + "learning_rate": 2e-05, + "loss": -0.3241, + "reward": 1.5329826474189758, + "reward_std": 0.797045961022377, + "rewards/accuracy_reward": 0.1770833358168602, + "rewards/reasoning_steps_reward": 0.819444477558136, + "rewards/repetition_penalty_reward": -0.058597257360816, + "rewards/tag_count_reward": 0.595052108168602, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.40625762939453, + "epoch": 0.10494752623688156, + "grad_norm": 19.322333158500665, + "kl": 2.0654296875, + "learning_rate": 1.9999448019954837e-05, + "loss": -0.1809, + "reward": 1.9347608387470245, + "reward_std": 0.8773187696933746, + "rewards/accuracy_reward": 0.4270833507180214, + "rewards/reasoning_steps_reward": 0.8923610895872116, + "rewards/repetition_penalty_reward": -0.11254816874861717, + "rewards/tag_count_reward": 0.7278645932674408, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 522.7396087646484, + "epoch": 0.10794602698650675, + "grad_norm": 3.6713399077901303, + "kl": 0.58203125, + "learning_rate": 1.9997792140755746e-05, + "loss": -0.0915, + "reward": 2.4146523475646973, + "reward_std": 0.6919413357973099, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9340278208255768, + "rewards/repetition_penalty_reward": -0.15739644691348076, + "rewards/tag_count_reward": 0.888020858168602, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.7916793823242, + "epoch": 0.11094452773613193, + "grad_norm": 2.113817065645351, + "kl": 0.7890625, + "learning_rate": 1.999503254520518e-05, + "loss": -0.1104, + "reward": 2.4119739532470703, + "reward_std": 0.7027525901794434, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.18524829670786858, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2708435058594, + "epoch": 0.11394302848575712, + "grad_norm": 3.3062068674445064, + "kl": 2.06640625, + "learning_rate": 1.999116953795147e-05, + "loss": -0.1854, + "reward": 2.173910915851593, + "reward_std": 0.8623018711805344, + "rewards/accuracy_reward": 0.6614583507180214, + "rewards/reasoning_steps_reward": 0.845486119389534, + "rewards/repetition_penalty_reward": -0.18329406157135963, + "rewards/tag_count_reward": 0.8502604365348816, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.3020935058594, + "epoch": 0.11694152923538231, + "grad_norm": 2.487067537690022, + "kl": 1.455078125, + "learning_rate": 1.9986203545455205e-05, + "loss": -0.1053, + "reward": 2.481999635696411, + "reward_std": 0.6076074242591858, + "rewards/accuracy_reward": 0.8906250298023224, + "rewards/reasoning_steps_reward": 0.8750000447034836, + "rewards/repetition_penalty_reward": -0.23544833436608315, + "rewards/tag_count_reward": 0.9518229514360428, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.07813262939453, + "epoch": 0.1199400299850075, + "grad_norm": 12.184820883902836, + "kl": 3.974609375, + "learning_rate": 1.9980135115942135e-05, + "loss": -0.0478, + "reward": 2.3865994215011597, + "reward_std": 0.5614848285913467, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.876736119389534, + "rewards/repetition_penalty_reward": -0.19586599990725517, + "rewards/tag_count_reward": 0.934895858168602, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.33334350585938, + "epoch": 0.12293853073463268, + "grad_norm": 6.144455968970083, + "kl": 3.984375, + "learning_rate": 1.9972964919342664e-05, + "loss": -0.0385, + "reward": 2.3252296447753906, + "reward_std": 0.7344638109207153, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.8819445073604584, + "rewards/repetition_penalty_reward": -0.1699962317943573, + "rewards/tag_count_reward": 0.8632812649011612, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.4791717529297, + "epoch": 0.12593703148425786, + "grad_norm": 4998.4510260355355, + "kl": 91.7841796875, + "learning_rate": 1.9964693747217873e-05, + "loss": 4.7855, + "reward": 2.487729072570801, + "reward_std": 0.477145679295063, + "rewards/accuracy_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9340278208255768, + "rewards/repetition_penalty_reward": -0.18067368865013123, + "rewards/tag_count_reward": 0.864583358168602, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6145935058594, + "epoch": 0.12893553223388307, + "grad_norm": 5.6325713646688405, + "kl": 1.0654296875, + "learning_rate": 1.9955322512672162e-05, + "loss": -0.064, + "reward": 2.384717285633087, + "reward_std": 0.6374871581792831, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.901041716337204, + "rewards/repetition_penalty_reward": -0.1426265835762024, + "rewards/tag_count_reward": 0.7877604216337204, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.10939025878906, + "epoch": 0.13193403298350825, + "grad_norm": 1.0900138361408922, + "kl": 0.6494140625, + "learning_rate": 1.9944852250252416e-05, + "loss": -0.0128, + "reward": 2.404056489467621, + "reward_std": 0.5440548211336136, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.8906250298023224, + "rewards/repetition_penalty_reward": -0.13500603288412094, + "rewards/tag_count_reward": 0.9036458432674408, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.9635543823242, + "epoch": 0.13493253373313344, + "grad_norm": 447.68957414171246, + "kl": 15.2900390625, + "learning_rate": 1.993328411583383e-05, + "loss": 1.0339, + "reward": 2.6905240416526794, + "reward_std": 0.36615417525172234, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9184028059244156, + "rewards/repetition_penalty_reward": -0.13022266328334808, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 583.7187805175781, + "epoch": 0.13793103448275862, + "grad_norm": 1.610843891213588, + "kl": 0.5166015625, + "learning_rate": 1.992061938649227e-05, + "loss": 0.0019, + "reward": 2.6286553740501404, + "reward_std": 0.3213835656642914, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9774305522441864, + "rewards/repetition_penalty_reward": -0.10788994282484055, + "rewards/tag_count_reward": 0.9882812649011612, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completion_length": 726.1250152587891, + "epoch": 0.1409295352323838, + "grad_norm": 7.039405614303865, + "kl": 0.6123046875, + "learning_rate": 1.9906859460363307e-05, + "loss": -0.0022, + "reward": 2.555482268333435, + "reward_std": 0.44716860353946686, + "rewards/accuracy_reward": 0.7187500074505806, + "rewards/reasoning_steps_reward": 0.9600694179534912, + "rewards/repetition_penalty_reward": -0.06474354676902294, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completion_length": 823.7031402587891, + "epoch": 0.14392803598200898, + "grad_norm": 7.8638528575359885, + "kl": 0.5986328125, + "learning_rate": 1.989200585648788e-05, + "loss": 0.0195, + "reward": 2.400337427854538, + "reward_std": 0.5809096917510033, + "rewards/accuracy_reward": 0.7031250223517418, + "rewards/reasoning_steps_reward": 0.9097222238779068, + "rewards/repetition_penalty_reward": -0.05625995807349682, + "rewards/tag_count_reward": 0.8437500149011612, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completion_length": 958.1198120117188, + "epoch": 0.1469265367316342, + "grad_norm": 3.4063309297902227, + "kl": 0.56640625, + "learning_rate": 1.9876060214644568e-05, + "loss": 0.0231, + "reward": 1.8466069400310516, + "reward_std": 0.7590629458427429, + "rewards/accuracy_reward": 0.432291679084301, + "rewards/reasoning_steps_reward": 0.8385417014360428, + "rewards/repetition_penalty_reward": -0.034903590101748705, + "rewards/tag_count_reward": 0.6106770932674408, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completion_length": 996.1250305175781, + "epoch": 0.14992503748125938, + "grad_norm": 3.75487802054811, + "kl": 0.9189453125, + "learning_rate": 1.9859024295168593e-05, + "loss": 0.0443, + "reward": 1.636008232831955, + "reward_std": 0.7502989023923874, + "rewards/accuracy_reward": 0.3906250149011612, + "rewards/reasoning_steps_reward": 0.788194477558136, + "rewards/repetition_penalty_reward": -0.0336966784670949, + "rewards/tag_count_reward": 0.4908854439854622, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 1002.0833435058594, + "epoch": 0.15292353823088456, + "grad_norm": 26.36570320352729, + "kl": 2.140625, + "learning_rate": 1.9840899978757483e-05, + "loss": 0.0805, + "reward": 1.6898488700389862, + "reward_std": 0.7471358329057693, + "rewards/accuracy_reward": 0.3489583432674408, + "rewards/reasoning_steps_reward": 0.8315972834825516, + "rewards/repetition_penalty_reward": -0.046696340665221214, + "rewards/tag_count_reward": 0.5559896007180214, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.2968902587891, + "epoch": 0.15592203898050974, + "grad_norm": 29.24774607573346, + "kl": 6.390625, + "learning_rate": 1.9821689266263425e-05, + "loss": 0.1745, + "reward": 1.9025542736053467, + "reward_std": 0.8440426588058472, + "rewards/accuracy_reward": 0.5052083507180214, + "rewards/reasoning_steps_reward": 0.7934027910232544, + "rewards/repetition_penalty_reward": -0.06272357050329447, + "rewards/tag_count_reward": 0.6666666716337204, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completion_length": 564.7864761352539, + "epoch": 0.15892053973013492, + "grad_norm": 21.634969699176207, + "kl": 1.97265625, + "learning_rate": 1.980139427847242e-05, + "loss": 0.6096, + "reward": 2.039624661207199, + "reward_std": 0.7389360666275024, + "rewards/accuracy_reward": 0.5468750074505806, + "rewards/reasoning_steps_reward": 0.8038195073604584, + "rewards/repetition_penalty_reward": -0.05976778268814087, + "rewards/tag_count_reward": 0.7486979365348816, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.5000152587891, + "epoch": 0.1619190404797601, + "grad_norm": 20.599440681644708, + "kl": 1.03125, + "learning_rate": 1.9780017255870114e-05, + "loss": 0.5382, + "reward": 1.7361580431461334, + "reward_std": 0.7883689254522324, + "rewards/accuracy_reward": 0.3593750074505806, + "rewards/reasoning_steps_reward": 0.7899305671453476, + "rewards/repetition_penalty_reward": -0.047262136824429035, + "rewards/tag_count_reward": 0.634114608168602, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completion_length": 813.6041717529297, + "epoch": 0.16491754122938532, + "grad_norm": 14.859517097725304, + "kl": 2.419921875, + "learning_rate": 1.9757560558394493e-05, + "loss": 0.3132, + "reward": 1.3911511600017548, + "reward_std": 0.7518036961555481, + "rewards/accuracy_reward": 0.2187500074505806, + "rewards/reasoning_steps_reward": 0.6961806118488312, + "rewards/repetition_penalty_reward": -0.026383677031844854, + "rewards/tag_count_reward": 0.5026041865348816, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 910.8177337646484, + "epoch": 0.1679160419790105, + "grad_norm": 6.499919813214984, + "kl": 0.9853515625, + "learning_rate": 1.9734026665175335e-05, + "loss": 0.0658, + "reward": 1.1412139385938644, + "reward_std": 0.6940838098526001, + "rewards/accuracy_reward": 0.1197916679084301, + "rewards/reasoning_steps_reward": 0.6545139253139496, + "rewards/repetition_penalty_reward": -0.0172062402125448, + "rewards/tag_count_reward": 0.3841145858168602, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completion_length": 949.7083587646484, + "epoch": 0.17091454272863568, + "grad_norm": 1.0108146959506905, + "kl": 0.8154296875, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.0444, + "reward": 1.0093085765838623, + "reward_std": 0.6726640909910202, + "rewards/accuracy_reward": 0.07812500186264515, + "rewards/reasoning_steps_reward": 0.5972222685813904, + "rewards/repetition_penalty_reward": -0.014997066929936409, + "rewards/tag_count_reward": 0.3489583432674408, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completion_length": 923.5104370117188, + "epoch": 0.17391304347826086, + "grad_norm": 2.2827458536380156, + "kl": 0.9248046875, + "learning_rate": 1.9683737802329242e-05, + "loss": 0.0701, + "reward": 1.2827005088329315, + "reward_std": 0.6604696810245514, + "rewards/accuracy_reward": 0.11458333721384406, + "rewards/reasoning_steps_reward": 0.7690972834825516, + "rewards/repetition_penalty_reward": -0.021553035592660308, + "rewards/tag_count_reward": 0.4205729216337204, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completion_length": 832.3229370117188, + "epoch": 0.17691154422788605, + "grad_norm": 11.547641471792147, + "kl": 1.482421875, + "learning_rate": 1.9656988384392075e-05, + "loss": 0.2098, + "reward": 1.5504461526870728, + "reward_std": 0.5935858860611916, + "rewards/accuracy_reward": 0.16666667256504297, + "rewards/reasoning_steps_reward": 0.855902835726738, + "rewards/repetition_penalty_reward": -0.029415032360702753, + "rewards/tag_count_reward": 0.5572916865348816, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completion_length": 811.9479217529297, + "epoch": 0.17991004497751126, + "grad_norm": 7.13025211784621, + "kl": 2.056640625, + "learning_rate": 1.9629172873477995e-05, + "loss": 0.1859, + "reward": 1.8182637393474579, + "reward_std": 0.7368517369031906, + "rewards/accuracy_reward": 0.3593750111758709, + "rewards/reasoning_steps_reward": 0.8611111044883728, + "rewards/repetition_penalty_reward": -0.0441494369879365, + "rewards/tag_count_reward": 0.641927108168602, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 754.9479370117188, + "epoch": 0.18290854572713644, + "grad_norm": 8.766884713406547, + "kl": 1.634765625, + "learning_rate": 1.96002943403084e-05, + "loss": 0.2092, + "reward": 2.166029632091522, + "reward_std": 0.7226460427045822, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9079861789941788, + "rewards/repetition_penalty_reward": -0.06487324088811874, + "rewards/tag_count_reward": 0.6770833730697632, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.5833587646484, + "epoch": 0.18590704647676162, + "grad_norm": 5.967296694311895, + "kl": 1.23828125, + "learning_rate": 1.9570355972958098e-05, + "loss": 0.2609, + "reward": 1.9569042026996613, + "reward_std": 0.6188908591866493, + "rewards/accuracy_reward": 0.4114583432674408, + "rewards/reasoning_steps_reward": 0.9444444626569748, + "rewards/repetition_penalty_reward": -0.07347787916660309, + "rewards/tag_count_reward": 0.6744792014360428, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.6041870117188, + "epoch": 0.1889055472263868, + "grad_norm": 8.21428152429746, + "kl": 1.9296875, + "learning_rate": 1.953936107650336e-05, + "loss": 0.2559, + "reward": 1.7195740938186646, + "reward_std": 0.5279941856861115, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.10290857404470444, + "rewards/tag_count_reward": 0.657552108168602, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completion_length": 768.5729217529297, + "epoch": 0.191904047976012, + "grad_norm": 4.853080558533273, + "kl": 3.99609375, + "learning_rate": 1.9507313072657057e-05, + "loss": 0.2929, + "reward": 1.5277346670627594, + "reward_std": 0.40065091848373413, + "rewards/accuracy_reward": 0.0677083358168602, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.15542514994740486, + "rewards/tag_count_reward": 0.6640625149011612, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completion_length": 745.9583587646484, + "epoch": 0.19490254872563717, + "grad_norm": 2.314098183884576, + "kl": 3.66015625, + "learning_rate": 1.9474215499390912e-05, + "loss": 0.3051, + "reward": 1.5480964481830597, + "reward_std": 0.35690316557884216, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/reasoning_steps_reward": 0.9253472238779068, + "rewards/repetition_penalty_reward": -0.1090216189622879, + "rewards/tag_count_reward": 0.716145858168602, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 594.9427261352539, + "epoch": 0.19790104947526238, + "grad_norm": 2.348236466880056, + "kl": 2.451171875, + "learning_rate": 1.944007201054492e-05, + "loss": 0.3924, + "reward": 1.5447391867637634, + "reward_std": 0.509847991168499, + "rewards/accuracy_reward": 0.03645833441987634, + "rewards/reasoning_steps_reward": 0.8750000149011612, + "rewards/repetition_penalty_reward": -0.10630252212285995, + "rewards/tag_count_reward": 0.739583358168602, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.7760620117188, + "epoch": 0.20089955022488756, + "grad_norm": 0.6607962588027059, + "kl": 4.9921875, + "learning_rate": 1.9404886375423982e-05, + "loss": 0.3226, + "reward": 1.5875399112701416, + "reward_std": 0.30489787086844444, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/reasoning_steps_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.11298095621168613, + "rewards/tag_count_reward": 0.7369792014360428, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.3229293823242, + "epoch": 0.20389805097451275, + "grad_norm": 1.2843597560384168, + "kl": 4.6953125, + "learning_rate": 1.93686624783818e-05, + "loss": 0.2727, + "reward": 1.6841119825839996, + "reward_std": 0.3194814845919609, + "rewards/accuracy_reward": 0.03125000046566129, + "rewards/reasoning_steps_reward": 0.9565972238779068, + "rewards/repetition_penalty_reward": -0.11493314802646637, + "rewards/tag_count_reward": 0.8111979514360428, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completion_length": 547.8958435058594, + "epoch": 0.20689655172413793, + "grad_norm": 0.709620329405288, + "kl": 4.9921875, + "learning_rate": 1.9331404318392028e-05, + "loss": 0.2179, + "reward": 1.8003253638744354, + "reward_std": 0.37343159317970276, + "rewards/accuracy_reward": 0.08854166837409139, + "rewards/reasoning_steps_reward": 0.9565972685813904, + "rewards/repetition_penalty_reward": -0.1015844214707613, + "rewards/tag_count_reward": 0.856770858168602, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completion_length": 495.00000762939453, + "epoch": 0.2098950524737631, + "grad_norm": 0.7619813319396578, + "kl": 4.671875, + "learning_rate": 1.9293116008606838e-05, + "loss": 0.188, + "reward": 1.8160299956798553, + "reward_std": 0.27780742943286896, + "rewards/accuracy_reward": 0.046875000931322575, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.1188659518957138, + "rewards/tag_count_reward": 0.9140625298023224, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 646.7083511352539, + "epoch": 0.2128935532233883, + "grad_norm": 0.7272889048910283, + "kl": 5.125, + "learning_rate": 1.925380177590282e-05, + "loss": 0.212, + "reward": 1.8391913771629333, + "reward_std": 0.2926352843642235, + "rewards/accuracy_reward": 0.03125000046566129, + "rewards/reasoning_steps_reward": 0.9635417014360428, + "rewards/repetition_penalty_reward": -0.09830864518880844, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completion_length": 691.5677108764648, + "epoch": 0.2158920539730135, + "grad_norm": 0.5588946822791705, + "kl": 5.1640625, + "learning_rate": 1.921346596041437e-05, + "loss": 0.2451, + "reward": 1.953030288219452, + "reward_std": 0.3250604011118412, + "rewards/accuracy_reward": 0.09895833348855376, + "rewards/reasoning_steps_reward": 0.9583333879709244, + "rewards/repetition_penalty_reward": -0.061292664147913456, + "rewards/tag_count_reward": 0.9570312798023224, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completion_length": 690.0364837646484, + "epoch": 0.21889055472263869, + "grad_norm": 0.5154663707678571, + "kl": 4.48828125, + "learning_rate": 1.917211301505453e-05, + "loss": 0.2857, + "reward": 1.984506070613861, + "reward_std": 0.4579034373164177, + "rewards/accuracy_reward": 0.2083333395421505, + "rewards/reasoning_steps_reward": 0.8940972983837128, + "rewards/repetition_penalty_reward": -0.043705823831260204, + "rewards/tag_count_reward": 0.9257812798023224, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.1666793823242, + "epoch": 0.22188905547226387, + "grad_norm": 0.7720729416463754, + "kl": 2.33984375, + "learning_rate": 1.9129747505023438e-05, + "loss": 0.2752, + "reward": 2.0761736631393433, + "reward_std": 0.5404656231403351, + "rewards/accuracy_reward": 0.27604167349636555, + "rewards/reasoning_steps_reward": 0.8906250894069672, + "rewards/repetition_penalty_reward": -0.041013902984559536, + "rewards/tag_count_reward": 0.950520858168602, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5260429382324, + "epoch": 0.22488755622188905, + "grad_norm": 0.8315806740092944, + "kl": 1.33984375, + "learning_rate": 1.9086374107304312e-05, + "loss": 0.2576, + "reward": 2.031211197376251, + "reward_std": 0.42878295481204987, + "rewards/accuracy_reward": 0.18229167303070426, + "rewards/reasoning_steps_reward": 0.9322916269302368, + "rewards/repetition_penalty_reward": -0.03389316704124212, + "rewards/tag_count_reward": 0.950520858168602, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.10938262939453, + "epoch": 0.22788605697151423, + "grad_norm": 0.9080987838709209, + "kl": 0.7822265625, + "learning_rate": 1.9041997610147166e-05, + "loss": 0.1866, + "reward": 2.127915918827057, + "reward_std": 0.4003848433494568, + "rewards/accuracy_reward": 0.244791679084301, + "rewards/reasoning_steps_reward": 0.9739583879709244, + "rewards/repetition_penalty_reward": -0.0491675129160285, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.5625114440918, + "epoch": 0.23088455772113944, + "grad_norm": 1.013137501122327, + "kl": 0.517578125, + "learning_rate": 1.8996622912540182e-05, + "loss": 0.1469, + "reward": 2.171510338783264, + "reward_std": 0.43282945454120636, + "rewards/accuracy_reward": 0.260416679084301, + "rewards/reasoning_steps_reward": 0.9947916567325592, + "rewards/repetition_penalty_reward": -0.048541837371885777, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.3698043823242, + "epoch": 0.23388305847076463, + "grad_norm": 0.8031277081942563, + "kl": 0.859375, + "learning_rate": 1.8950255023668876e-05, + "loss": 0.4293, + "reward": 2.1424886882305145, + "reward_std": 0.5950686857104301, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.9635417014360428, + "rewards/repetition_penalty_reward": -0.046313464641571045, + "rewards/tag_count_reward": 0.8710937649011612, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completion_length": 491.32814025878906, + "epoch": 0.2368815592203898, + "grad_norm": 1.7806633395846703, + "kl": 0.951171875, + "learning_rate": 1.8902899062363142e-05, + "loss": 0.6398, + "reward": 1.9999366104602814, + "reward_std": 0.7513662576675415, + "rewards/accuracy_reward": 0.3437500074505806, + "rewards/reasoning_steps_reward": 0.9409722983837128, + "rewards/repetition_penalty_reward": -0.032181489281356335, + "rewards/tag_count_reward": 0.747395858168602, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completion_length": 629.9583587646484, + "epoch": 0.239880059970015, + "grad_norm": 0.894829793618965, + "kl": 0.8115234375, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.5427, + "reward": 1.8730204403400421, + "reward_std": 0.8077119290828705, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.9253472834825516, + "rewards/repetition_penalty_reward": -0.02628516126424074, + "rewards/tag_count_reward": 0.6197916865348816, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.9895935058594, + "epoch": 0.24287856071964017, + "grad_norm": 1.5621375006184983, + "kl": 0.9130859375, + "learning_rate": 1.8805243942587e-05, + "loss": 0.7422, + "reward": 2.229804575443268, + "reward_std": 0.8122627884149551, + "rewards/accuracy_reward": 0.5520833358168602, + "rewards/reasoning_steps_reward": 0.9565972536802292, + "rewards/repetition_penalty_reward": -0.04970931261777878, + "rewards/tag_count_reward": 0.7708333432674408, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.78126525878906, + "epoch": 0.24587706146926536, + "grad_norm": 0.734038354451238, + "kl": 0.5283203125, + "learning_rate": 1.8754955564852082e-05, + "loss": 0.4824, + "reward": 2.3457452058792114, + "reward_std": 0.41162994503974915, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.08611250855028629, + "rewards/tag_count_reward": 0.8971354216337204, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.6093864440918, + "epoch": 0.24887556221889057, + "grad_norm": 0.7617786864490775, + "kl": 0.4599609375, + "learning_rate": 1.870370067496355e-05, + "loss": 0.4915, + "reward": 2.696290135383606, + "reward_std": 0.42367345839738846, + "rewards/accuracy_reward": 0.8697916716337204, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.09581072442233562, + "rewards/tag_count_reward": 0.9466145932674408, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.5833396911621, + "epoch": 0.2518740629685157, + "grad_norm": 0.8354783555661154, + "kl": 0.4912109375, + "learning_rate": 1.8651484931256685e-05, + "loss": 0.3193, + "reward": 2.5308874249458313, + "reward_std": 0.34281000867486, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.09281065501272678, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.48438262939453, + "epoch": 0.25487256371814093, + "grad_norm": 0.8219462123752692, + "kl": 0.4619140625, + "learning_rate": 1.8598314098141208e-05, + "loss": 0.3273, + "reward": 2.6751975417137146, + "reward_std": 0.2594960853457451, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.0817470382899046, + "rewards/tag_count_reward": 0.9791667014360428, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.35937881469727, + "epoch": 0.25787106446776614, + "grad_norm": 0.8013857932799926, + "kl": 0.5302734375, + "learning_rate": 1.8544194045464888e-05, + "loss": 0.5448, + "reward": 2.324418604373932, + "reward_std": 0.5969930738210678, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.8784722536802292, + "rewards/repetition_penalty_reward": -0.06837661191821098, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.87500762939453, + "epoch": 0.2608695652173913, + "grad_norm": 1.2404216424832606, + "kl": 0.623046875, + "learning_rate": 1.848913074786555e-05, + "loss": 0.3312, + "reward": 2.4448606371879578, + "reward_std": 0.5359718501567841, + "rewards/accuracy_reward": 0.6927083507180214, + "rewards/reasoning_steps_reward": 0.8732638955116272, + "rewards/repetition_penalty_reward": -0.07163260504603386, + "rewards/tag_count_reward": 0.950520858168602, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.10939025878906, + "epoch": 0.2638680659670165, + "grad_norm": 2.28058295704287, + "kl": 0.61572265625, + "learning_rate": 1.843313028411149e-05, + "loss": 0.5956, + "reward": 2.511428415775299, + "reward_std": 0.48727843910455704, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9253472536802292, + "rewards/repetition_penalty_reward": -0.05845023598521948, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9114685058594, + "epoch": 0.26686656671664166, + "grad_norm": 259.99242737782754, + "kl": 3.0830078125, + "learning_rate": 1.8376198836430415e-05, + "loss": 0.6364, + "reward": 2.46868097782135, + "reward_std": 0.635415643453598, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9461806118488312, + "rewards/repetition_penalty_reward": -0.06083299312740564, + "rewards/tag_count_reward": 0.9166667014360428, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.1302185058594, + "epoch": 0.2698650674662669, + "grad_norm": 1.9382632692598936, + "kl": 0.8193359375, + "learning_rate": 1.8318342689826938e-05, + "loss": 0.7494, + "reward": 2.3082011342048645, + "reward_std": 0.7488896995782852, + "rewards/accuracy_reward": 0.5572916716337204, + "rewards/reasoning_steps_reward": 0.9392361342906952, + "rewards/repetition_penalty_reward": -0.04249336663633585, + "rewards/tag_count_reward": 0.8541667014360428, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.7395935058594, + "epoch": 0.272863568215892, + "grad_norm": 1.4041925703207834, + "kl": 0.7041015625, + "learning_rate": 1.8259568231388737e-05, + "loss": 0.7354, + "reward": 2.346260666847229, + "reward_std": 0.6538338512182236, + "rewards/accuracy_reward": 0.552083358168602, + "rewards/reasoning_steps_reward": 0.9739583730697632, + "rewards/repetition_penalty_reward": -0.04696857463568449, + "rewards/tag_count_reward": 0.8671875149011612, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.8073043823242, + "epoch": 0.27586206896551724, + "grad_norm": 0.9388360459997904, + "kl": 0.697265625, + "learning_rate": 1.819988194958146e-05, + "loss": 0.7864, + "reward": 2.366865336894989, + "reward_std": 0.7087779939174652, + "rewards/accuracy_reward": 0.5937500149011612, + "rewards/reasoning_steps_reward": 0.947916716337204, + "rewards/repetition_penalty_reward": -0.053707641549408436, + "rewards/tag_count_reward": 0.8789062798023224, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.0104293823242, + "epoch": 0.27886056971514245, + "grad_norm": 1.0904070944040622, + "kl": 0.67578125, + "learning_rate": 1.8139290433532415e-05, + "loss": 0.7764, + "reward": 2.373181402683258, + "reward_std": 0.7174654752016068, + "rewards/accuracy_reward": 0.6093750298023224, + "rewards/reasoning_steps_reward": 0.9392361044883728, + "rewards/repetition_penalty_reward": -0.05433602724224329, + "rewards/tag_count_reward": 0.8789062798023224, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.31250762939453, + "epoch": 0.2818590704647676, + "grad_norm": 1.345138349799746, + "kl": 0.6591796875, + "learning_rate": 1.807780037230315e-05, + "loss": 0.9459, + "reward": 2.401767611503601, + "reward_std": 0.7818585783243179, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.9218750447034836, + "rewards/repetition_penalty_reward": -0.052659488283097744, + "rewards/tag_count_reward": 0.8606770932674408, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.9583435058594, + "epoch": 0.2848575712143928, + "grad_norm": 1.6639693291434057, + "kl": 0.837890625, + "learning_rate": 1.8015418554151024e-05, + "loss": 0.7482, + "reward": 2.097085416316986, + "reward_std": 0.9631160348653793, + "rewards/accuracy_reward": 0.536458358168602, + "rewards/reasoning_steps_reward": 0.8246527761220932, + "rewards/repetition_penalty_reward": -0.04397374298423529, + "rewards/tag_count_reward": 0.7799479216337204, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.17187881469727, + "epoch": 0.28785607196401797, + "grad_norm": 2.6809348147477285, + "kl": 1.669921875, + "learning_rate": 1.7952151865779792e-05, + "loss": -0.1937, + "reward": 1.646718680858612, + "reward_std": 1.0644067823886871, + "rewards/accuracy_reward": 0.3437500037252903, + "rewards/reasoning_steps_reward": 0.7274306118488312, + "rewards/repetition_penalty_reward": -0.03253482934087515, + "rewards/tag_count_reward": 0.6080729514360428, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completion_length": 137.05208587646484, + "epoch": 0.2908545727136432, + "grad_norm": 1.661973040336361, + "kl": 0.6298828125, + "learning_rate": 1.7888007291579357e-05, + "loss": -0.0513, + "reward": 2.4063791632652283, + "reward_std": 0.647190622985363, + "rewards/accuracy_reward": 0.6197916865348816, + "rewards/reasoning_steps_reward": 0.9531250596046448, + "rewards/repetition_penalty_reward": -0.0636730408295989, + "rewards/tag_count_reward": 0.8971354365348816, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.68750381469727, + "epoch": 0.2938530734632684, + "grad_norm": 2.7127088778027684, + "kl": 0.8984375, + "learning_rate": 1.7822991912854716e-05, + "loss": 0.195, + "reward": 2.5627546310424805, + "reward_std": 0.4789634570479393, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.060509427450597286, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.39062881469727, + "epoch": 0.29685157421289354, + "grad_norm": 18.504039023965966, + "kl": 1.9638671875, + "learning_rate": 1.77571129070442e-05, + "loss": 0.3326, + "reward": 2.2352594137191772, + "reward_std": 0.7260878309607506, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.890625, + "rewards/repetition_penalty_reward": -0.06291783228516579, + "rewards/tag_count_reward": 0.82421875, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completion_length": 410.6458435058594, + "epoch": 0.29985007496251875, + "grad_norm": 7.369461971351607, + "kl": 1.18359375, + "learning_rate": 1.7690377546927134e-05, + "loss": 0.182, + "reward": 2.507060259580612, + "reward_std": 0.49995335936546326, + "rewards/accuracy_reward": 0.6458333488553762, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.055439687334001064, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 614.0625152587891, + "epoch": 0.3028485757121439, + "grad_norm": 4.425874093822116, + "kl": 1.77734375, + "learning_rate": 1.7622793199820935e-05, + "loss": 0.2128, + "reward": 2.6125723719596863, + "reward_std": 0.4479042589664459, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.0510562164708972, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completion_length": 817.4895935058594, + "epoch": 0.3058470764617691, + "grad_norm": 4.565018933297784, + "kl": 3.43359375, + "learning_rate": 1.7554367326767793e-05, + "loss": 0.1219, + "reward": 2.5981725454330444, + "reward_std": 0.4292430207133293, + "rewards/accuracy_reward": 0.6875000223517418, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.043320574797689915, + "rewards/tag_count_reward": 0.9609375298023224, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.87500381469727, + "epoch": 0.30884557721139433, + "grad_norm": 14.749084867590199, + "kl": 1.5126953125, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.3689, + "reward": 2.522024154663086, + "reward_std": 0.3764337971806526, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9878472238779068, + "rewards/repetition_penalty_reward": -0.07910444028675556, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completion_length": 140.06771278381348, + "epoch": 0.3118440779610195, + "grad_norm": 1.5854423021252229, + "kl": 0.75390625, + "learning_rate": 1.7415021310661073e-05, + "loss": 0.2421, + "reward": 2.5152770280838013, + "reward_std": 0.5358111336827278, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.08628544956445694, + "rewards/tag_count_reward": 0.9453125149011612, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completion_length": 167.8229217529297, + "epoch": 0.3148425787106447, + "grad_norm": 2.0606435436365245, + "kl": 0.578125, + "learning_rate": 1.7344116550851546e-05, + "loss": 0.477, + "reward": 2.623923897743225, + "reward_std": 0.48354143649339676, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.07659695856273174, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 142.77083587646484, + "epoch": 0.31784107946026985, + "grad_norm": 10.522238174054582, + "kl": 1.28515625, + "learning_rate": 1.7272401029884932e-05, + "loss": 0.2673, + "reward": 2.4795719981193542, + "reward_std": 0.5011198297142982, + "rewards/accuracy_reward": 0.5937500149011612, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.08379613049328327, + "rewards/tag_count_reward": 0.9765625149011612, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completion_length": 134.3802146911621, + "epoch": 0.32083958020989506, + "grad_norm": 3.3181439930079146, + "kl": 0.7978515625, + "learning_rate": 1.719988266486854e-05, + "loss": 0.1636, + "reward": 2.5852218866348267, + "reward_std": 0.4785446897149086, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.08665317296981812, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.4479217529297, + "epoch": 0.3238380809595202, + "grad_norm": 3.2954339112996336, + "kl": 0.8662109375, + "learning_rate": 1.7126569461540445e-05, + "loss": 0.1118, + "reward": 2.564746856689453, + "reward_std": 0.34381402283906937, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.08759699948132038, + "rewards/tag_count_reward": 0.9908854365348816, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completion_length": 170.9166717529297, + "epoch": 0.3268365817091454, + "grad_norm": 5.1235193996536745, + "kl": 1.318359375, + "learning_rate": 1.70524695133857e-05, + "loss": 0.2634, + "reward": 2.745190441608429, + "reward_std": 0.3354305140674114, + "rewards/accuracy_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.09465333260595798, + "rewards/tag_count_reward": 0.9752604365348816, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.3489646911621, + "epoch": 0.32983508245877063, + "grad_norm": 6.042829717081394, + "kl": 0.63818359375, + "learning_rate": 1.6977591000742855e-05, + "loss": 0.1922, + "reward": 2.687567353248596, + "reward_std": 0.2190675288438797, + "rewards/accuracy_reward": 0.7916667014360428, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.08326606638729572, + "rewards/tag_count_reward": 0.9843750149011612, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.07813262939453, + "epoch": 0.3328335832083958, + "grad_norm": 27.456070537172923, + "kl": 1.2197265625, + "learning_rate": 1.6901942189900867e-05, + "loss": 0.0855, + "reward": 2.70491623878479, + "reward_std": 0.4226943477988243, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.07372974790632725, + "rewards/tag_count_reward": 0.981770858168602, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completion_length": 152.4583396911621, + "epoch": 0.335832083958021, + "grad_norm": 1317.4357368028043, + "kl": 24.67529296875, + "learning_rate": 1.6825531432186545e-05, + "loss": 1.4049, + "reward": 2.6002301573753357, + "reward_std": 0.3956380560994148, + "rewards/accuracy_reward": 0.6979166716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0807595532387495, + "rewards/tag_count_reward": 0.9830729365348816, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.70313262939453, + "epoch": 0.33883058470764615, + "grad_norm": 186.82025106193726, + "kl": 5.892578125, + "learning_rate": 1.6748367163042577e-05, + "loss": 0.1938, + "reward": 2.6722583174705505, + "reward_std": 0.47787418961524963, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.07123132981359959, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completion_length": 151.92188262939453, + "epoch": 0.34182908545727136, + "grad_norm": 4.951400788715416, + "kl": 0.75830078125, + "learning_rate": 1.6670457901096328e-05, + "loss": 0.0216, + "reward": 2.5308876037597656, + "reward_std": 0.4789142981171608, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.07197706028819084, + "rewards/tag_count_reward": 0.9361979514360428, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.68750381469727, + "epoch": 0.3448275862068966, + "grad_norm": 1.1162974968117996, + "kl": 0.4677734375, + "learning_rate": 1.659181224721938e-05, + "loss": 0.021, + "reward": 2.730695605278015, + "reward_std": 0.3208945095539093, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.07008570153266191, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 166.71875381469727, + "epoch": 0.34782608695652173, + "grad_norm": 2.052554622098234, + "kl": 0.48193359375, + "learning_rate": 1.6512438883578047e-05, + "loss": 0.0718, + "reward": 2.7236560583114624, + "reward_std": 0.2550913393497467, + "rewards/accuracy_reward": 0.8437500298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0706147812306881, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.95833587646484, + "epoch": 0.35082458770614694, + "grad_norm": 7.239442859707766, + "kl": 1.1904296875, + "learning_rate": 1.6432346572674897e-05, + "loss": 0.0808, + "reward": 2.7389387488365173, + "reward_std": 0.36363864317536354, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.06835305411368608, + "rewards/tag_count_reward": 0.9531250149011612, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completion_length": 159.48958587646484, + "epoch": 0.3538230884557721, + "grad_norm": 2.1682395821362865, + "kl": 0.53564453125, + "learning_rate": 1.6351544156381413e-05, + "loss": 0.0727, + "reward": 2.877605438232422, + "reward_std": 0.2183693777769804, + "rewards/accuracy_reward": 0.9687500149011612, + "rewards/reasoning_steps_reward": 0.9965277761220932, + "rewards/repetition_penalty_reward": -0.055120449513196945, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completion_length": 157.2395896911621, + "epoch": 0.3568215892053973, + "grad_norm": 38.71837348464348, + "kl": 2.7021484375, + "learning_rate": 1.6270040554961866e-05, + "loss": 0.2602, + "reward": 2.6591410040855408, + "reward_std": 0.34208307787775993, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05440086964517832, + "rewards/tag_count_reward": 0.973958358168602, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.41666793823242, + "epoch": 0.3598200899550225, + "grad_norm": 4.316214986822223, + "kl": 0.68115234375, + "learning_rate": 1.6187844766088586e-05, + "loss": 0.2187, + "reward": 2.7976441979408264, + "reward_std": 0.3110164441168308, + "rewards/accuracy_reward": 0.8802083432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.043501587584614754, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 130.14062690734863, + "epoch": 0.36281859070464767, + "grad_norm": 1.648081178543267, + "kl": 0.45947265625, + "learning_rate": 1.6104965863848615e-05, + "loss": -0.0071, + "reward": 2.7758957743644714, + "reward_std": 0.303048393689096, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.027055577840656042, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.65104675292969, + "epoch": 0.3658170914542729, + "grad_norm": 1.2095122817688286, + "kl": 0.59716796875, + "learning_rate": 1.6021412997741994e-05, + "loss": -0.0131, + "reward": 2.820314347743988, + "reward_std": 0.2720828726887703, + "rewards/accuracy_reward": 0.8854166865348816, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.027342125307768583, + "rewards/tag_count_reward": 0.977864608168602, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completion_length": 149.05729293823242, + "epoch": 0.36881559220389803, + "grad_norm": 58.69921900530141, + "kl": 5.037109375, + "learning_rate": 1.593719539167169e-05, + "loss": 0.309, + "reward": 2.668303608894348, + "reward_std": 0.38671524077653885, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.01963039650581777, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.10937881469727, + "epoch": 0.37181409295352325, + "grad_norm": 13.742825745337187, + "kl": 0.82275390625, + "learning_rate": 1.5852322342925294e-05, + "loss": 0.0778, + "reward": 2.820654571056366, + "reward_std": 0.292279414832592, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.021793504944071174, + "rewards/tag_count_reward": 0.9726562649011612, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.92187881469727, + "epoch": 0.3748125937031484, + "grad_norm": 2.232033331871603, + "kl": 0.484375, + "learning_rate": 1.5766803221148676e-05, + "loss": 0.105, + "reward": 2.7213205695152283, + "reward_std": 0.32770272716879845, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.0208670892752707, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.58854293823242, + "epoch": 0.3778110944527736, + "grad_norm": 1.0178638016852966, + "kl": 0.49951171875, + "learning_rate": 1.568064746731156e-05, + "loss": 0.011, + "reward": 2.6907403469085693, + "reward_std": 0.35912006720900536, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.01498887687921524, + "rewards/tag_count_reward": 0.9765625149011612, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.31250190734863, + "epoch": 0.3808095952023988, + "grad_norm": 1.1770522092684346, + "kl": 0.888671875, + "learning_rate": 1.5593864592665333e-05, + "loss": -0.0463, + "reward": 2.7778323888778687, + "reward_std": 0.38024942576885223, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.014268482336774468, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.88021087646484, + "epoch": 0.383808095952024, + "grad_norm": 1.1780540061687166, + "kl": 0.45703125, + "learning_rate": 1.550646417769301e-05, + "loss": -0.0279, + "reward": 2.692775547504425, + "reward_std": 0.4350905865430832, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.012953592464327812, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completion_length": 113.96354484558105, + "epoch": 0.3868065967016492, + "grad_norm": 1.695442486056536, + "kl": 0.6533203125, + "learning_rate": 1.541845587105159e-05, + "loss": -0.0497, + "reward": 2.6655062437057495, + "reward_std": 0.5277450531721115, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.965277835726738, + "rewards/repetition_penalty_reward": -0.014615388121455908, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.38020896911621, + "epoch": 0.38980509745127434, + "grad_norm": 1.53256816336331, + "kl": 0.7158203125, + "learning_rate": 1.532984938850689e-05, + "loss": -0.0148, + "reward": 2.6938071250915527, + "reward_std": 0.4943315237760544, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9756944924592972, + "rewards/repetition_penalty_reward": -0.009752006619237363, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.53646087646484, + "epoch": 0.39280359820089955, + "grad_norm": 2.1945480451348214, + "kl": 0.9951171875, + "learning_rate": 1.524065451186095e-05, + "loss": -0.0313, + "reward": 2.531009793281555, + "reward_std": 0.5116675943136215, + "rewards/accuracy_reward": 0.6302083432674408, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.016733432421460748, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.78125381469727, + "epoch": 0.39580209895052476, + "grad_norm": 4.5129778094997155, + "kl": 1.5009765625, + "learning_rate": 1.5150881087872184e-05, + "loss": 0.0509, + "reward": 2.7841535210609436, + "reward_std": 0.3951072469353676, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.013589567271992564, + "rewards/tag_count_reward": 0.966145858168602, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completion_length": 127.78646087646484, + "epoch": 0.3988005997001499, + "grad_norm": 38.945437112042384, + "kl": 6.130859375, + "learning_rate": 1.5060539027168317e-05, + "loss": 0.2991, + "reward": 2.6191622018814087, + "reward_std": 0.470632191747427, + "rewards/accuracy_reward": 0.7031250149011612, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.013216342777013779, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.30208587646484, + "epoch": 0.4017991004497751, + "grad_norm": 11.506961978299184, + "kl": 2.6162109375, + "learning_rate": 1.4969638303152296e-05, + "loss": 0.0406, + "reward": 2.60586279630661, + "reward_std": 0.49396244436502457, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9687500596046448, + "rewards/repetition_penalty_reward": -0.019137236289680004, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.50521278381348, + "epoch": 0.4047976011994003, + "grad_norm": 2.0647869792670006, + "kl": 0.6298828125, + "learning_rate": 1.4878188950901275e-05, + "loss": 0.0324, + "reward": 2.8466954231262207, + "reward_std": 0.35728102922439575, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.011811425443738699, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.55729293823242, + "epoch": 0.4077961019490255, + "grad_norm": 1.2964235131560873, + "kl": 0.46630859375, + "learning_rate": 1.4786201066058767e-05, + "loss": 0.023, + "reward": 2.9288823008537292, + "reward_std": 0.20188137842342257, + "rewards/accuracy_reward": 0.9531250149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.011221994878724217, + "rewards/tag_count_reward": 0.9921875149011612, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.47917175292969, + "epoch": 0.4107946026986507, + "grad_norm": 4.427417919453853, + "kl": 0.8115234375, + "learning_rate": 1.4693684803720139e-05, + "loss": 0.0257, + "reward": 2.752627432346344, + "reward_std": 0.28887180984020233, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9878472238779068, + "rewards/repetition_penalty_reward": -0.017771947663277388, + "rewards/tag_count_reward": 0.9856770932674408, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.52604484558105, + "epoch": 0.41379310344827586, + "grad_norm": 3.187742864763733, + "kl": 0.6494140625, + "learning_rate": 1.4600650377311523e-05, + "loss": 0.0343, + "reward": 2.9099594950675964, + "reward_std": 0.214247893425636, + "rewards/accuracy_reward": 0.9375000149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.011915652547031641, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.72396087646484, + "epoch": 0.41679160419790107, + "grad_norm": 1.8697959272406102, + "kl": 0.705078125, + "learning_rate": 1.4507108057462297e-05, + "loss": -0.0029, + "reward": 2.6899372935295105, + "reward_std": 0.2231074832379818, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9878472238779068, + "rewards/repetition_penalty_reward": -0.010149642825126648, + "rewards/tag_count_reward": 0.9830729365348816, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.42708778381348, + "epoch": 0.4197901049475262, + "grad_norm": 23.21366760717317, + "kl": 1.552734375, + "learning_rate": 1.4413068170871252e-05, + "loss": 0.0225, + "reward": 2.7771247029304504, + "reward_std": 0.2754965058993548, + "rewards/accuracy_reward": 0.8229167014360428, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.008465770864859223, + "rewards/tag_count_reward": 0.9765625149011612, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.60937881469727, + "epoch": 0.42278860569715143, + "grad_norm": 1.2588277588625658, + "kl": 0.47802734375, + "learning_rate": 1.4318541099166556e-05, + "loss": 0.03, + "reward": 2.9449267387390137, + "reward_std": 0.10798337496817112, + "rewards/accuracy_reward": 0.9635416716337204, + "rewards/reasoning_steps_reward": 0.9965277761220932, + "rewards/repetition_penalty_reward": -0.009934437868651003, + "rewards/tag_count_reward": 0.9947916716337204, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.51041793823242, + "epoch": 0.4257871064467766, + "grad_norm": 1.6180185537864251, + "kl": 0.7041015625, + "learning_rate": 1.4223537277759667e-05, + "loss": -0.0281, + "reward": 2.9196689128875732, + "reward_std": 0.2676460010698065, + "rewards/accuracy_reward": 0.9635416865348816, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.005244338244665414, + "rewards/tag_count_reward": 0.98046875, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.96354293823242, + "epoch": 0.4287856071964018, + "grad_norm": 3.173672388607253, + "kl": 0.861328125, + "learning_rate": 1.4128067194693316e-05, + "loss": 0.0193, + "reward": 2.866965413093567, + "reward_std": 0.36002135276794434, + "rewards/accuracy_reward": 0.911458358168602, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.006298626307398081, + "rewards/tag_count_reward": 0.9687500149011612, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.04166984558105, + "epoch": 0.431784107946027, + "grad_norm": 3.72381753576245, + "kl": 1.1474609375, + "learning_rate": 1.4032141389483648e-05, + "loss": 0.0526, + "reward": 2.784775972366333, + "reward_std": 0.2770255096256733, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9913194328546524, + "rewards/repetition_penalty_reward": -0.007324688020162284, + "rewards/tag_count_reward": 0.9830729365348816, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.05208396911621, + "epoch": 0.43478260869565216, + "grad_norm": 98.84817232879341, + "kl": 7.1748046875, + "learning_rate": 1.3935770451956732e-05, + "loss": 0.2566, + "reward": 2.7451387643814087, + "reward_std": 0.23619456216692924, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.006163446931168437, + "rewards/tag_count_reward": 0.9856770932674408, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 95.88541984558105, + "epoch": 0.43778110944527737, + "grad_norm": 1.6790256708529574, + "kl": 0.71337890625, + "learning_rate": 1.3838965021079447e-05, + "loss": 0.022, + "reward": 2.582995355129242, + "reward_std": 0.33666881918907166, + "rewards/accuracy_reward": 0.630208358168602, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.0033762191596906632, + "rewards/tag_count_reward": 0.9804687649011612, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.12500190734863, + "epoch": 0.4407796101949025, + "grad_norm": 1.000815515911012, + "kl": 0.4951171875, + "learning_rate": 1.3741735783785022e-05, + "loss": 0.014, + "reward": 2.7670981884002686, + "reward_std": 0.24415395595133305, + "rewards/accuracy_reward": 0.7864583432674408, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.0037352032377384603, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.77604293823242, + "epoch": 0.44377811094452774, + "grad_norm": 10.482881482497314, + "kl": 1.677734375, + "learning_rate": 1.3644093473793213e-05, + "loss": 0.0242, + "reward": 2.8607059121131897, + "reward_std": 0.39441923797130585, + "rewards/accuracy_reward": 0.9166667014360428, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.004745513964735437, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.33333587646484, + "epoch": 0.44677661169415295, + "grad_norm": 3.0619014190201397, + "kl": 0.70263671875, + "learning_rate": 1.3546048870425356e-05, + "loss": -0.0034, + "reward": 2.7546103596687317, + "reward_std": 0.3782291766256094, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.007976488093845546, + "rewards/tag_count_reward": 0.9778645932674408, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.68750381469727, + "epoch": 0.4497751124437781, + "grad_norm": 1.757178598271745, + "kl": 0.6220703125, + "learning_rate": 1.3447612797414371e-05, + "loss": -0.0372, + "reward": 2.696329712867737, + "reward_std": 0.48958031833171844, + "rewards/accuracy_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.003757232567295432, + "rewards/tag_count_reward": 0.9466146230697632, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.88021087646484, + "epoch": 0.4527736131934033, + "grad_norm": 1.0864550515764144, + "kl": 0.47802734375, + "learning_rate": 1.3348796121709862e-05, + "loss": -0.0041, + "reward": 2.914130389690399, + "reward_std": 0.26327061653137207, + "rewards/accuracy_reward": 0.942708358168602, + "rewards/reasoning_steps_reward": 0.991319477558136, + "rewards/repetition_penalty_reward": -0.004272434976883233, + "rewards/tag_count_reward": 0.9843750149011612, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.10416793823242, + "epoch": 0.45577211394302847, + "grad_norm": 1.1731563120826676, + "kl": 0.47802734375, + "learning_rate": 1.3249609752278454e-05, + "loss": 0.006, + "reward": 2.5750681161880493, + "reward_std": 0.31880153343081474, + "rewards/accuracy_reward": 0.6093750074505806, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.006529179809149355, + "rewards/tag_count_reward": 0.9791666716337204, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.29166984558105, + "epoch": 0.4587706146926537, + "grad_norm": 1.1346690177946404, + "kl": 0.45361328125, + "learning_rate": 1.315006463889948e-05, + "loss": -0.0176, + "reward": 2.9095540046691895, + "reward_std": 0.2783464193344116, + "rewards/accuracy_reward": 0.9531250298023224, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.0027724849642254412, + "rewards/tag_count_reward": 0.9765625149011612, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.83333587646484, + "epoch": 0.4617691154422789, + "grad_norm": 1.1933914174647031, + "kl": 0.48486328125, + "learning_rate": 1.3050171770956176e-05, + "loss": 0.0015, + "reward": 2.72132408618927, + "reward_std": 0.32843077182769775, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.009144885872956365, + "rewards/tag_count_reward": 0.9752604365348816, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.83333587646484, + "epoch": 0.46476761619190404, + "grad_norm": 1.2021972493346804, + "kl": 0.47314453125, + "learning_rate": 1.2949942176222497e-05, + "loss": -0.0459, + "reward": 2.7491848468780518, + "reward_std": 0.4561289846897125, + "rewards/accuracy_reward": 0.8229167014360428, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.0034193213214166462, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.64062690734863, + "epoch": 0.46776611694152925, + "grad_norm": 1.3131511244620084, + "kl": 0.56982421875, + "learning_rate": 1.2849386919645686e-05, + "loss": -0.0388, + "reward": 2.743965268135071, + "reward_std": 0.39171791821718216, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.010809163737576455, + "rewards/tag_count_reward": 0.9440104216337204, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.53125190734863, + "epoch": 0.4707646176911544, + "grad_norm": 1.69232758602469, + "kl": 0.56396484375, + "learning_rate": 1.2748517102124755e-05, + "loss": -0.0245, + "reward": 2.7549294233322144, + "reward_std": 0.3383819945156574, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9826389402151108, + "rewards/repetition_penalty_reward": -0.006355439778417349, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.05208587646484, + "epoch": 0.4737631184407796, + "grad_norm": 2.265769890602018, + "kl": 0.73681640625, + "learning_rate": 1.2647343859284997e-05, + "loss": -0.0689, + "reward": 2.662193477153778, + "reward_std": 0.4981583207845688, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.011851702001877129, + "rewards/tag_count_reward": 0.9153645932674408, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.22916984558105, + "epoch": 0.4767616191904048, + "grad_norm": 2.255759107812268, + "kl": 0.79541015625, + "learning_rate": 1.2545878360248633e-05, + "loss": 0.0144, + "reward": 2.8301729559898376, + "reward_std": 0.3695299196988344, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9982638955116272, + "rewards/repetition_penalty_reward": -0.009236796642653644, + "rewards/tag_count_reward": 0.9401041865348816, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.39062881469727, + "epoch": 0.47976011994003, + "grad_norm": 7.190371202010685, + "kl": 1.99609375, + "learning_rate": 1.2444131806401818e-05, + "loss": 0.0233, + "reward": 2.587311804294586, + "reward_std": 0.4335063770413399, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.007306311279535294, + "rewards/tag_count_reward": 0.9348958432674408, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.15104484558105, + "epoch": 0.4827586206896552, + "grad_norm": 11.57931701299926, + "kl": 2.580078125, + "learning_rate": 1.2342115430158024e-05, + "loss": 0.0864, + "reward": 2.5142497420310974, + "reward_std": 0.592901311814785, + "rewards/accuracy_reward": 0.6510417014360428, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.012660016654990613, + "rewards/tag_count_reward": 0.903645858168602, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.13541793823242, + "epoch": 0.48575712143928035, + "grad_norm": 4.161233572872993, + "kl": 1.421875, + "learning_rate": 1.223984049371805e-05, + "loss": 0.0328, + "reward": 2.5102869868278503, + "reward_std": 0.34895218163728714, + "rewards/accuracy_reward": 0.5937500223517418, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0066401001531630754, + "rewards/tag_count_reward": 0.9440104365348816, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.828125, + "epoch": 0.48875562218890556, + "grad_norm": 3.9809594705586053, + "kl": 1.162109375, + "learning_rate": 1.2137318287826699e-05, + "loss": -0.0078, + "reward": 2.5699517726898193, + "reward_std": 0.4455106034874916, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.015117747941985726, + "rewards/tag_count_reward": 0.942708358168602, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.94270896911621, + "epoch": 0.4917541229385307, + "grad_norm": 19.775528338909474, + "kl": 2.1142578125, + "learning_rate": 1.2034560130526341e-05, + "loss": 0.0267, + "reward": 2.7591124176979065, + "reward_std": 0.3192651905119419, + "rewards/accuracy_reward": 0.8281250298023224, + "rewards/reasoning_steps_reward": 0.9687500447034836, + "rewards/repetition_penalty_reward": -0.006512661639135331, + "rewards/tag_count_reward": 0.9687500149011612, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.59896278381348, + "epoch": 0.4947526236881559, + "grad_norm": 1.1276005185146887, + "kl": 0.5009765625, + "learning_rate": 1.1931577365907433e-05, + "loss": 0.0095, + "reward": 2.7071834206581116, + "reward_std": 0.24475602060556412, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.009830599068664014, + "rewards/tag_count_reward": 0.984375, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.80729675292969, + "epoch": 0.49775112443778113, + "grad_norm": 1.4396819065845348, + "kl": 0.5517578125, + "learning_rate": 1.1828381362856195e-05, + "loss": 0.0061, + "reward": 2.7072086334228516, + "reward_std": 0.2500213086605072, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.008069265051744878, + "rewards/tag_count_reward": 0.9791666716337204, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.23437690734863, + "epoch": 0.5007496251874063, + "grad_norm": 1.4162738213578512, + "kl": 0.58447265625, + "learning_rate": 1.1724983513799505e-05, + "loss": -0.028, + "reward": 2.83200603723526, + "reward_std": 0.2547433339059353, + "rewards/accuracy_reward": 0.880208358168602, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.006101653270889074, + "rewards/tag_count_reward": 0.98046875, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completion_length": 102.34896087646484, + "epoch": 0.5037481259370314, + "grad_norm": 1.5741520276879495, + "kl": 0.5859375, + "learning_rate": 1.1621395233447247e-05, + "loss": -0.0284, + "reward": 2.856131374835968, + "reward_std": 0.336722657084465, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.00628192734438926, + "rewards/tag_count_reward": 0.9752604216337204, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.50521087646484, + "epoch": 0.5067466266866567, + "grad_norm": 3.143223791327549, + "kl": 0.8681640625, + "learning_rate": 1.1517627957532155e-05, + "loss": -0.0218, + "reward": 2.8190484642982483, + "reward_std": 0.3480563126504421, + "rewards/accuracy_reward": 0.927083358168602, + "rewards/reasoning_steps_reward": 0.9409722536802292, + "rewards/repetition_penalty_reward": -0.012548819300718606, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.671875, + "epoch": 0.5097451274362819, + "grad_norm": 4.087516775956481, + "kl": 1.3857421875, + "learning_rate": 1.1413693141547354e-05, + "loss": -0.0334, + "reward": 2.7752469778060913, + "reward_std": 0.4824451133608818, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.009909308631904423, + "rewards/tag_count_reward": 0.9309895932674408, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 108.43750190734863, + "epoch": 0.512743628185907, + "grad_norm": 1.808660970560849, + "kl": 1.2373046875, + "learning_rate": 1.1309602259481726e-05, + "loss": -0.0589, + "reward": 2.6380687952041626, + "reward_std": 0.5042757764458656, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.97743059694767, + "rewards/repetition_penalty_reward": -0.013841008301824331, + "rewards/tag_count_reward": 0.9453125, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.00000190734863, + "epoch": 0.5157421289355323, + "grad_norm": 3.2045053936261754, + "kl": 1.26953125, + "learning_rate": 1.1205366802553231e-05, + "loss": -0.0773, + "reward": 2.749893009662628, + "reward_std": 0.5450254082679749, + "rewards/accuracy_reward": 0.8437500298023224, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.007485455018468201, + "rewards/tag_count_reward": 0.9466145932674408, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completion_length": 104.08854484558105, + "epoch": 0.5187406296851574, + "grad_norm": 4.461811464537438, + "kl": 1.9453125, + "learning_rate": 1.1100998277940316e-05, + "loss": -0.0487, + "reward": 2.8390066623687744, + "reward_std": 0.4234722927212715, + "rewards/accuracy_reward": 0.927083358168602, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.007347506354562938, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.59895896911621, + "epoch": 0.5217391304347826, + "grad_norm": 1.1431834037063382, + "kl": 0.70361328125, + "learning_rate": 1.0996508207511565e-05, + "loss": -0.0495, + "reward": 2.7981215715408325, + "reward_std": 0.32188424095511436, + "rewards/accuracy_reward": 0.880208358168602, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.02262500289361924, + "rewards/tag_count_reward": 0.9596354216337204, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.19270896911621, + "epoch": 0.5247376311844077, + "grad_norm": 6.089832729503498, + "kl": 1.47265625, + "learning_rate": 1.089190812655374e-05, + "loss": 0.0148, + "reward": 2.733491837978363, + "reward_std": 0.37758616358041763, + "rewards/accuracy_reward": 0.8020833432674408, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.019112566020339727, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.48437690734863, + "epoch": 0.527736131934033, + "grad_norm": 1.0305128960164511, + "kl": 0.55419921875, + "learning_rate": 1.0787209582498315e-05, + "loss": 0.0177, + "reward": 2.8952815532684326, + "reward_std": 0.21657648496329784, + "rewards/accuracy_reward": 0.9322916865348816, + "rewards/reasoning_steps_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.0161769341211766, + "rewards/tag_count_reward": 0.9843750149011612, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completion_length": 122.35416984558105, + "epoch": 0.5307346326836582, + "grad_norm": 1.3250160837320035, + "kl": 0.54052734375, + "learning_rate": 1.0682424133646712e-05, + "loss": 0.0118, + "reward": 2.8939477801322937, + "reward_std": 0.1932453876361251, + "rewards/accuracy_reward": 0.9218750149011612, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.012302407994866371, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.97916984558105, + "epoch": 0.5337331334332833, + "grad_norm": 1.1202124937220295, + "kl": 0.50048828125, + "learning_rate": 1.0577563347894286e-05, + "loss": 0.0343, + "reward": 2.8213730454444885, + "reward_std": 0.12381599424406886, + "rewards/accuracy_reward": 0.8489583432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017168688587844372, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.73958587646484, + "epoch": 0.5367316341829086, + "grad_norm": 0.9845571322277763, + "kl": 0.4287109375, + "learning_rate": 1.0472638801453287e-05, + "loss": 0.0274, + "reward": 2.886799395084381, + "reward_std": 0.19121930375695229, + "rewards/accuracy_reward": 0.9010417014360428, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.011638134252279997, + "rewards/tag_count_reward": 0.9973958432674408, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.58333396911621, + "epoch": 0.5397301349325337, + "grad_norm": 1.3925075814052417, + "kl": 0.671875, + "learning_rate": 1.0367662077574898e-05, + "loss": 0.006, + "reward": 2.823054552078247, + "reward_std": 0.2618987523019314, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.998263880610466, + "rewards/repetition_penalty_reward": -0.009844740270636976, + "rewards/tag_count_reward": 0.9752604514360428, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 105.11979484558105, + "epoch": 0.5427286356821589, + "grad_norm": 2.280358299836913, + "kl": 0.8330078125, + "learning_rate": 1.0262644765270472e-05, + "loss": 0.0481, + "reward": 2.8652560710906982, + "reward_std": 0.35385340452194214, + "rewards/accuracy_reward": 0.9062500298023224, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.008007904718397185, + "rewards/tag_count_reward": 0.973958358168602, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.85416793823242, + "epoch": 0.545727136431784, + "grad_norm": 8.675677318683208, + "kl": 1.4765625, + "learning_rate": 1.0157598458032165e-05, + "loss": 0.056, + "reward": 2.7417566776275635, + "reward_std": 0.3702938035130501, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.004337083431892097, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.06771087646484, + "epoch": 0.5487256371814093, + "grad_norm": 9.74056119805903, + "kl": 1.44775390625, + "learning_rate": 1.0052534752553063e-05, + "loss": 0.0459, + "reward": 2.8742672204971313, + "reward_std": 0.2908545406535268, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.0042049614421557635, + "rewards/tag_count_reward": 0.984375, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completion_length": 94.85937690734863, + "epoch": 0.5517241379310345, + "grad_norm": 6.447606788332282, + "kl": 1.68603515625, + "learning_rate": 9.947465247446942e-06, + "loss": 0.0884, + "reward": 2.8351686000823975, + "reward_std": 0.41117405891418457, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.007279429468326271, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completion_length": 92.75000381469727, + "epoch": 0.5547226386806596, + "grad_norm": 38.497156677104286, + "kl": 2.2099609375, + "learning_rate": 9.842401541967838e-06, + "loss": 0.1526, + "reward": 2.821943938732147, + "reward_std": 0.33023548871278763, + "rewards/accuracy_reward": 0.8593750298023224, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.0035769873938988894, + "rewards/tag_count_reward": 0.9765625, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 88.43229484558105, + "epoch": 0.5577211394302849, + "grad_norm": 1.83258753969118, + "kl": 0.9951171875, + "learning_rate": 9.737355234729531e-06, + "loss": 0.017, + "reward": 2.743259012699127, + "reward_std": 0.3549557775259018, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.005005000915843993, + "rewards/tag_count_reward": 0.96875, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.44791984558105, + "epoch": 0.56071964017991, + "grad_norm": 1.8795466176250475, + "kl": 1.1171875, + "learning_rate": 9.632337922425106e-06, + "loss": -0.0016, + "reward": 2.803585946559906, + "reward_std": 0.39651068300008774, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.008914108970202506, + "rewards/tag_count_reward": 0.9635416716337204, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.19791984558105, + "epoch": 0.5637181409295352, + "grad_norm": 43.26072438039279, + "kl": 3.529296875, + "learning_rate": 9.527361198546715e-06, + "loss": 0.1745, + "reward": 2.712351143360138, + "reward_std": 0.5067479014396667, + "rewards/accuracy_reward": 0.8385416716337204, + "rewards/reasoning_steps_reward": 0.947916716337204, + "rewards/repetition_penalty_reward": -0.005096889741253108, + "rewards/tag_count_reward": 0.9309895932674408, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completion_length": 91.64062690734863, + "epoch": 0.5667166416791605, + "grad_norm": 11.228523577104813, + "kl": 1.244140625, + "learning_rate": 9.422436652105718e-06, + "loss": 0.1033, + "reward": 2.70283704996109, + "reward_std": 0.4315161928534508, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.00462843308923766, + "rewards/tag_count_reward": 0.9557291716337204, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completion_length": 90.95833778381348, + "epoch": 0.5697151424287856, + "grad_norm": 6.30477846185162, + "kl": 1.2138671875, + "learning_rate": 9.317575866353293e-06, + "loss": 0.1868, + "reward": 2.829277455806732, + "reward_std": 0.42368828505277634, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.004055907833389938, + "rewards/tag_count_reward": 0.9375000149011612, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.19791793823242, + "epoch": 0.5727136431784108, + "grad_norm": 7.430329871632132, + "kl": 2.046875, + "learning_rate": 9.212790417501688e-06, + "loss": 0.1354, + "reward": 2.5388087034225464, + "reward_std": 0.6883069798350334, + "rewards/accuracy_reward": 0.7031250149011612, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.0050282846204936504, + "rewards/tag_count_reward": 0.8945312649011612, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completion_length": 98.67187690734863, + "epoch": 0.5757121439280359, + "grad_norm": 5.377382713696718, + "kl": 1.591796875, + "learning_rate": 9.108091873446264e-06, + "loss": 0.2739, + "reward": 2.752804219722748, + "reward_std": 0.5407753884792328, + "rewards/accuracy_reward": 0.8854166716337204, + "rewards/reasoning_steps_reward": 0.9340278059244156, + "rewards/repetition_penalty_reward": -0.004140403761994094, + "rewards/tag_count_reward": 0.9375000149011612, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completion_length": 103.41666984558105, + "epoch": 0.5787106446776612, + "grad_norm": 4.0000714621638105, + "kl": 1.837890625, + "learning_rate": 9.003491792488438e-06, + "loss": 0.2431, + "reward": 2.7179980874061584, + "reward_std": 0.5703071355819702, + "rewards/accuracy_reward": 0.8437500298023224, + "rewards/reasoning_steps_reward": 0.9461805671453476, + "rewards/repetition_penalty_reward": -0.004224258096655831, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completion_length": 109.62500190734863, + "epoch": 0.5817091454272864, + "grad_norm": 6.784297297705125, + "kl": 1.486328125, + "learning_rate": 8.899001722059687e-06, + "loss": 0.4446, + "reward": 2.69238018989563, + "reward_std": 0.5326372683048248, + "rewards/accuracy_reward": 0.7916667014360428, + "rewards/reasoning_steps_reward": 0.9548610895872116, + "rewards/repetition_penalty_reward": -0.005970706697553396, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.79167175292969, + "epoch": 0.5847076461769115, + "grad_norm": 15.68094738303537, + "kl": 1.1884765625, + "learning_rate": 8.79463319744677e-06, + "loss": 0.553, + "reward": 2.6782643795013428, + "reward_std": 0.4203122928738594, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.9531250447034836, + "rewards/repetition_penalty_reward": -0.004027447925182059, + "rewards/tag_count_reward": 0.9635416716337204, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completion_length": 107.8281307220459, + "epoch": 0.5877061469265368, + "grad_norm": 9.395887357942577, + "kl": 1.21875, + "learning_rate": 8.690397740518279e-06, + "loss": 0.4238, + "reward": 2.8701762557029724, + "reward_std": 0.38676780834794044, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.0035217873519286513, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completion_length": 106.45312690734863, + "epoch": 0.5907046476761619, + "grad_norm": 4.57053771862413, + "kl": 0.8564453125, + "learning_rate": 8.586306858452653e-06, + "loss": 0.3129, + "reward": 2.78133487701416, + "reward_std": 0.38999389111995697, + "rewards/accuracy_reward": 0.833333358168602, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.006859738496132195, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completion_length": 114.46354675292969, + "epoch": 0.5937031484257871, + "grad_norm": 11.162254121408726, + "kl": 1.4287109375, + "learning_rate": 8.48237204246785e-06, + "loss": 0.5309, + "reward": 2.7748392820358276, + "reward_std": 0.4289344698190689, + "rewards/accuracy_reward": 0.8697917014360428, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.00554278859635815, + "rewards/tag_count_reward": 0.9609375, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completion_length": 111.04166793823242, + "epoch": 0.5967016491754122, + "grad_norm": 29.168148040502643, + "kl": 1.78515625, + "learning_rate": 8.378604766552756e-06, + "loss": 0.3664, + "reward": 2.7375897765159607, + "reward_std": 0.32485630363225937, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.0072020008228719234, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completion_length": 125.81250381469727, + "epoch": 0.5997001499250375, + "grad_norm": 7.416359123988612, + "kl": 0.955078125, + "learning_rate": 8.275016486200498e-06, + "loss": 0.623, + "reward": 2.769349992275238, + "reward_std": 0.4332389682531357, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.004955811775289476, + "rewards/tag_count_reward": 0.96875, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.6026986506746627, + "grad_norm": 27.882830639662522, + "kl": 2.0904541015625, + "learning_rate": 8.17161863714381e-06, + "loss": 0.0835, + "reward": 0.0013020833721384406, + "reward_std": 0.0052083334885537624, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": 0.0, + "rewards/tag_count_reward": 0.0013020833721384406, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completion_length": 1014.3333435058594, + "epoch": 0.6056971514242878, + "grad_norm": 0.6042533284398771, + "kl": 0.113037109375, + "learning_rate": 8.06842263409257e-06, + "loss": 0.0419, + "reward": 0.029826793004758656, + "reward_std": 0.11930717201903462, + "rewards/accuracy_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.010416666977107525, + "rewards/repetition_penalty_reward": -0.00012112403055652976, + "rewards/tag_count_reward": 0.014322917093522847, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completion_length": 1019.1822967529297, + "epoch": 0.6086956521739131, + "grad_norm": 1.3832204376438657, + "kl": 0.21142578125, + "learning_rate": 7.965439869473664e-06, + "loss": 0.0271, + "reward": 0.018031382700428367, + "reward_std": 0.06882480159401894, + "rewards/accuracy_reward": 0.0052083334885537624, + "rewards/reasoning_steps_reward": 0.0052083334885537624, + "rewards/repetition_penalty_reward": -0.00019778480054810643, + "rewards/tag_count_reward": 0.007812500232830644, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completion_length": 1024.0, + "epoch": 0.6116941529235382, + "grad_norm": 0.9850061937944327, + "kl": 0.1051025390625, + "learning_rate": 7.862681712173304e-06, + "loss": 0.0042, + "reward": 0.007812500232830644, + "reward_std": 0.027949271723628044, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.0, + "rewards/repetition_penalty_reward": 0.0, + "rewards/tag_count_reward": 0.007812500232830644, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completion_length": 1009.2812652587891, + "epoch": 0.6146926536731634, + "grad_norm": 0.44253170115395524, + "kl": 0.1737060546875, + "learning_rate": 7.760159506281955e-06, + "loss": 0.0639, + "reward": 0.04947916732635349, + "reward_std": 0.19791666930541396, + "rewards/accuracy_reward": 0.015625000465661287, + "rewards/reasoning_steps_reward": 0.015625000465661287, + "rewards/repetition_penalty_reward": 0.0, + "rewards/tag_count_reward": 0.01822916732635349, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 995.3854370117188, + "epoch": 0.6176911544227887, + "grad_norm": 3.8020369247401415, + "kl": 0.369384765625, + "learning_rate": 7.65788456984198e-06, + "loss": 0.1046, + "reward": 0.10146949626505375, + "reward_std": 0.3067319616675377, + "rewards/accuracy_reward": 0.026041667442768812, + "rewards/reasoning_steps_reward": 0.031250000931322575, + "rewards/repetition_penalty_reward": -9.300596138928086e-05, + "rewards/tag_count_reward": 0.044270834885537624, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.5520935058594, + "epoch": 0.6206896551724138, + "grad_norm": 2.4622521421222046, + "kl": 0.35595703125, + "learning_rate": 7.555868193598188e-06, + "loss": 0.1499, + "reward": 0.15076165180653334, + "reward_std": 0.4339568931609392, + "rewards/accuracy_reward": 0.041666666977107525, + "rewards/reasoning_steps_reward": 0.046875000931322575, + "rewards/repetition_penalty_reward": -0.0002800179208861664, + "rewards/tag_count_reward": 0.06250000186264515, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completion_length": 985.7864685058594, + "epoch": 0.623688155922039, + "grad_norm": 508.3775666051304, + "kl": 21.75, + "learning_rate": 7.4541216397513705e-06, + "loss": 1.0583, + "reward": 0.15980212949216366, + "reward_std": 0.5328295826911926, + "rewards/accuracy_reward": 0.03645833441987634, + "rewards/reasoning_steps_reward": 0.057291668839752674, + "rewards/repetition_penalty_reward": -0.0003541221594787203, + "rewards/tag_count_reward": 0.06640625093132257, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completion_length": 980.5989685058594, + "epoch": 0.6266866566716641, + "grad_norm": 16.61142685491831, + "kl": 1.8037109375, + "learning_rate": 7.352656140715006e-06, + "loss": 0.1989, + "reward": 0.16810668725520372, + "reward_std": 0.42627183347940445, + "rewards/accuracy_reward": 0.03645833441987634, + "rewards/reasoning_steps_reward": 0.05381944612599909, + "rewards/repetition_penalty_reward": -0.00029610148339997977, + "rewards/tag_count_reward": 0.07812500139698386, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completion_length": 990.5625152587891, + "epoch": 0.6296851574212894, + "grad_norm": 5.532107109224189, + "kl": 0.662109375, + "learning_rate": 7.2514828978752434e-06, + "loss": 0.1355, + "reward": 0.1534007415175438, + "reward_std": 0.41936828941106796, + "rewards/accuracy_reward": 0.031250000931322575, + "rewards/reasoning_steps_reward": 0.04687500232830644, + "rewards/repetition_penalty_reward": -0.0002450980609864928, + "rewards/tag_count_reward": 0.07552083488553762, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 942.3229370117188, + "epoch": 0.6326836581709145, + "grad_norm": 14.765094590304768, + "kl": 0.7763671875, + "learning_rate": 7.150613080354315e-06, + "loss": 0.262, + "reward": 0.32499945536255836, + "reward_std": 0.7303483635187149, + "rewards/accuracy_reward": 0.06770833535119891, + "rewards/reasoning_steps_reward": 0.118055559694767, + "rewards/repetition_penalty_reward": -0.0013894452131353319, + "rewards/tag_count_reward": 0.1406250037252903, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completion_length": 893.5156402587891, + "epoch": 0.6356821589205397, + "grad_norm": 13.402868555399907, + "kl": 2.154296875, + "learning_rate": 7.050057823777503e-06, + "loss": 0.4338, + "reward": 0.4721004366874695, + "reward_std": 0.9947518408298492, + "rewards/accuracy_reward": 0.13020833767950535, + "rewards/reasoning_steps_reward": 0.1545138955116272, + "rewards/repetition_penalty_reward": -0.0014238738513085991, + "rewards/tag_count_reward": 0.1888020895421505, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completion_length": 882.8437652587891, + "epoch": 0.638680659670165, + "grad_norm": 533.9503112402125, + "kl": 46.984375, + "learning_rate": 6.9498282290438235e-06, + "loss": 2.4742, + "reward": 0.5224730595946312, + "reward_std": 0.9336346387863159, + "rewards/accuracy_reward": 0.1354166716337204, + "rewards/reasoning_steps_reward": 0.1666666716337204, + "rewards/repetition_penalty_reward": -0.0009644359670346603, + "rewards/tag_count_reward": 0.221354179084301, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completion_length": 821.7448120117188, + "epoch": 0.6416791604197901, + "grad_norm": 149.18286801865162, + "kl": 16.4375, + "learning_rate": 6.849935361100522e-06, + "loss": 1.2104, + "reward": 0.7398104518651962, + "reward_std": 1.1720031201839447, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/reasoning_steps_reward": 0.2586805671453476, + "rewards/repetition_penalty_reward": -0.0019430473039392382, + "rewards/tag_count_reward": 0.2747395932674408, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completion_length": 840.9218902587891, + "epoch": 0.6446776611694153, + "grad_norm": 30.355372142343207, + "kl": 3.22265625, + "learning_rate": 6.750390247721549e-06, + "loss": 0.562, + "reward": 0.6818769425153732, + "reward_std": 1.1269759833812714, + "rewards/accuracy_reward": 0.17708333767950535, + "rewards/reasoning_steps_reward": 0.2413194589316845, + "rewards/repetition_penalty_reward": -0.0021508438512682915, + "rewards/tag_count_reward": 0.2656250074505806, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 803.9166870117188, + "epoch": 0.6476761619190404, + "grad_norm": 23.89552805694713, + "kl": 1.5625, + "learning_rate": 6.651203878290139e-06, + "loss": 0.5275, + "reward": 0.8299884647130966, + "reward_std": 1.1851888000965118, + "rewards/accuracy_reward": 0.2031250037252903, + "rewards/reasoning_steps_reward": 0.2934028059244156, + "rewards/repetition_penalty_reward": -0.0011747851967811584, + "rewards/tag_count_reward": 0.334635429084301, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.4583435058594, + "epoch": 0.6506746626686657, + "grad_norm": 30.60183508089706, + "kl": 2.451171875, + "learning_rate": 6.552387202585629e-06, + "loss": 0.7066, + "reward": 1.0522598177194595, + "reward_std": 1.2560299634933472, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.3732639104127884, + "rewards/repetition_penalty_reward": -0.0032958039082586765, + "rewards/tag_count_reward": 0.4114583432674408, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completion_length": 718.8177185058594, + "epoch": 0.6536731634182908, + "grad_norm": 64.25492952390515, + "kl": 7.8125, + "learning_rate": 6.453951129574644e-06, + "loss": 1.02, + "reward": 1.1462399363517761, + "reward_std": 1.311941385269165, + "rewards/accuracy_reward": 0.3229166753590107, + "rewards/reasoning_steps_reward": 0.3888889104127884, + "rewards/repetition_penalty_reward": -0.0017635486146900803, + "rewards/tag_count_reward": 0.4361979216337204, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completion_length": 574.0937652587891, + "epoch": 0.656671664167916, + "grad_norm": 108.73570895032785, + "kl": 14.359375, + "learning_rate": 6.355906526206788e-06, + "loss": 1.8085, + "reward": 1.5105059146881104, + "reward_std": 1.3671056032180786, + "rewards/accuracy_reward": 0.432291679084301, + "rewards/reasoning_steps_reward": 0.5277777910232544, + "rewards/repetition_penalty_reward": -0.004251136677339673, + "rewards/tag_count_reward": 0.5546875149011612, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completion_length": 675.6041870117188, + "epoch": 0.6596701649175413, + "grad_norm": 68.16871801389341, + "kl": 10.140625, + "learning_rate": 6.2582642162149775e-06, + "loss": 1.2516, + "reward": 1.230929583311081, + "reward_std": 1.2748659253120422, + "rewards/accuracy_reward": 0.3072916716337204, + "rewards/reasoning_steps_reward": 0.4531250149011612, + "rewards/repetition_penalty_reward": -0.003445470007136464, + "rewards/tag_count_reward": 0.4739583507180214, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.9896087646484, + "epoch": 0.6626686656671664, + "grad_norm": 37.36448608931068, + "kl": 7.1015625, + "learning_rate": 6.161034978920555e-06, + "loss": 1.2308, + "reward": 1.6033472418785095, + "reward_std": 1.2990168035030365, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.5798611491918564, + "rewards/repetition_penalty_reward": -0.003857686650007963, + "rewards/tag_count_reward": 0.5898437649011612, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completion_length": 613.5521087646484, + "epoch": 0.6656671664167916, + "grad_norm": 42.67953665381653, + "kl": 6.75, + "learning_rate": 6.064229548043272e-06, + "loss": 1.1467, + "reward": 1.5173589289188385, + "reward_std": 1.3208832442760468, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.5625000447034836, + "rewards/repetition_penalty_reward": -0.004776577930897474, + "rewards/tag_count_reward": 0.5429687649011612, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completion_length": 550.3437652587891, + "epoch": 0.6686656671664168, + "grad_norm": 33.134335694451195, + "kl": 5.140625, + "learning_rate": 5.9678586105163535e-06, + "loss": 1.071, + "reward": 1.621566891670227, + "reward_std": 1.2871178090572357, + "rewards/accuracy_reward": 0.4427083432674408, + "rewards/reasoning_steps_reward": 0.6006944626569748, + "rewards/repetition_penalty_reward": -0.005169248324818909, + "rewards/tag_count_reward": 0.583333358168602, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.4739761352539, + "epoch": 0.671664167916042, + "grad_norm": 47.63738327986576, + "kl": 4.26953125, + "learning_rate": 5.8719328053066886e-06, + "loss": 1.1644, + "reward": 1.8401104509830475, + "reward_std": 1.2440669536590576, + "rewards/accuracy_reward": 0.5156250149011612, + "rewards/reasoning_steps_reward": 0.6840278059244156, + "rewards/repetition_penalty_reward": -0.005375679756980389, + "rewards/tag_count_reward": 0.645833358168602, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.96356201171875, + "epoch": 0.6746626686656672, + "grad_norm": 78.62849172701118, + "kl": 9.546875, + "learning_rate": 5.776462722240337e-06, + "loss": 1.7087, + "reward": 1.9351984560489655, + "reward_std": 1.149868130683899, + "rewards/accuracy_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.763888955116272, + "rewards/repetition_penalty_reward": -0.004471685038879514, + "rewards/tag_count_reward": 0.665364608168602, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.96875762939453, + "epoch": 0.6776611694152923, + "grad_norm": 50.31185538870822, + "kl": 9.921875, + "learning_rate": 5.6814589008334475e-06, + "loss": 1.669, + "reward": 1.9726946949958801, + "reward_std": 1.1515787243843079, + "rewards/accuracy_reward": 0.5312500149011612, + "rewards/reasoning_steps_reward": 0.7743055671453476, + "rewards/repetition_penalty_reward": -0.006038067163899541, + "rewards/tag_count_reward": 0.6731770932674408, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.86981201171875, + "epoch": 0.6806596701649176, + "grad_norm": 25.321634983297866, + "kl": 7.1796875, + "learning_rate": 5.58693182912875e-06, + "loss": 1.4453, + "reward": 2.014142394065857, + "reward_std": 1.1831817626953125, + "rewards/accuracy_reward": 0.567708358168602, + "rewards/reasoning_steps_reward": 0.788194477558136, + "rewards/repetition_penalty_reward": -0.0032188262266572565, + "rewards/tag_count_reward": 0.661458358168602, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completion_length": 444.3385543823242, + "epoch": 0.6836581709145427, + "grad_norm": 31.357353768527005, + "kl": 2.78125, + "learning_rate": 5.4928919425377035e-06, + "loss": 1.1606, + "reward": 2.098558932542801, + "reward_std": 1.0963150560855865, + "rewards/accuracy_reward": 0.5885416865348816, + "rewards/reasoning_steps_reward": 0.8263889402151108, + "rewards/repetition_penalty_reward": -0.0038717142306268215, + "rewards/tag_count_reward": 0.6875000149011612, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completion_length": 514.4323043823242, + "epoch": 0.6866566716641679, + "grad_norm": 30.871800901209355, + "kl": 2.44921875, + "learning_rate": 5.399349622688479e-06, + "loss": 0.9849, + "reward": 1.9208540618419647, + "reward_std": 1.0992612540721893, + "rewards/accuracy_reward": 0.4739583358168602, + "rewards/reasoning_steps_reward": 0.8159722685813904, + "rewards/repetition_penalty_reward": -0.005795358796603978, + "rewards/tag_count_reward": 0.6367187649011612, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.44793701171875, + "epoch": 0.6896551724137931, + "grad_norm": 29.451600398259565, + "kl": 2.1015625, + "learning_rate": 5.306315196279864e-06, + "loss": 1.1623, + "reward": 2.2249255180358887, + "reward_std": 1.0277338027954102, + "rewards/accuracy_reward": 0.630208358168602, + "rewards/reasoning_steps_reward": 0.8836805820465088, + "rewards/repetition_penalty_reward": -0.0051092767680529505, + "rewards/tag_count_reward": 0.7161458432674408, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 408.5989685058594, + "epoch": 0.6926536731634183, + "grad_norm": 33.38250055371977, + "kl": 5.451171875, + "learning_rate": 5.213798933941237e-06, + "loss": 1.5075, + "reward": 2.2532132267951965, + "reward_std": 1.0094469636678696, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.8767361044883728, + "rewards/repetition_penalty_reward": -0.003731334174517542, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.53125762939453, + "epoch": 0.6956521739130435, + "grad_norm": 31.956941133841717, + "kl": 6.71875, + "learning_rate": 5.121811049098728e-06, + "loss": 1.5082, + "reward": 2.063460409641266, + "reward_std": 1.0131369531154633, + "rewards/accuracy_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.864583358168602, + "rewards/repetition_penalty_reward": -0.006852172431536019, + "rewards/tag_count_reward": 0.6953125149011612, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.07813262939453, + "epoch": 0.6986506746626686, + "grad_norm": 14.583503473769614, + "kl": 4.49609375, + "learning_rate": 5.030361696847706e-06, + "loss": 1.3138, + "reward": 2.1591392755508423, + "reward_std": 1.082452654838562, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.8767361640930176, + "rewards/repetition_penalty_reward": -0.004055280558532104, + "rewards/tag_count_reward": 0.6822916865348816, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completion_length": 465.6146011352539, + "epoch": 0.7016491754122939, + "grad_norm": 31.93736272659645, + "kl": 2.73046875, + "learning_rate": 4.939460972831684e-06, + "loss": 1.1015, + "reward": 2.054699957370758, + "reward_std": 1.0152440816164017, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.8628472238779068, + "rewards/repetition_penalty_reward": -0.004761879798024893, + "rewards/tag_count_reward": 0.6757812649011612, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.4479293823242, + "epoch": 0.704647676161919, + "grad_norm": 22.78953660465126, + "kl": 1.619140625, + "learning_rate": 4.849118912127817e-06, + "loss": 1.1123, + "reward": 2.1616870760917664, + "reward_std": 1.1319475769996643, + "rewards/accuracy_reward": 0.6354166865348816, + "rewards/reasoning_steps_reward": 0.8315972685813904, + "rewards/repetition_penalty_reward": -0.004545638337731361, + "rewards/tag_count_reward": 0.6992187649011612, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.97918701171875, + "epoch": 0.7076461769115442, + "grad_norm": 32.912310324473026, + "kl": 3.009765625, + "learning_rate": 4.759345488139054e-06, + "loss": 1.2209, + "reward": 2.037692070007324, + "reward_std": 1.0419560819864273, + "rewards/accuracy_reward": 0.5208333507180214, + "rewards/reasoning_steps_reward": 0.8315972238779068, + "rewards/repetition_penalty_reward": -0.0022385247866623104, + "rewards/tag_count_reward": 0.6875000298023224, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.2864685058594, + "epoch": 0.7106446776611695, + "grad_norm": 11.37715164851099, + "kl": 1.5185546875, + "learning_rate": 4.670150611493116e-06, + "loss": 1.1303, + "reward": 2.193199932575226, + "reward_std": 1.067429170012474, + "rewards/accuracy_reward": 0.6406250149011612, + "rewards/reasoning_steps_reward": 0.8437500447034836, + "rewards/repetition_penalty_reward": -0.002112621790729463, + "rewards/tag_count_reward": 0.7109375149011612, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.12500762939453, + "epoch": 0.7136431784107946, + "grad_norm": 21.21508512413439, + "kl": 2.18359375, + "learning_rate": 4.581544128948413e-06, + "loss": 1.1413, + "reward": 2.1001856327056885, + "reward_std": 1.1043085157871246, + "rewards/accuracy_reward": 0.5885416865348816, + "rewards/reasoning_steps_reward": 0.822916716337204, + "rewards/repetition_penalty_reward": -0.005283167352899909, + "rewards/tag_count_reward": 0.6940104365348816, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.89064025878906, + "epoch": 0.7166416791604198, + "grad_norm": 3.7024792757899943, + "kl": 1.119140625, + "learning_rate": 4.493535822306993e-06, + "loss": 1.0115, + "reward": 1.9977717995643616, + "reward_std": 1.0762813091278076, + "rewards/accuracy_reward": 0.546875, + "rewards/reasoning_steps_reward": 0.7881944477558136, + "rewards/repetition_penalty_reward": -0.00396440684562549, + "rewards/tag_count_reward": 0.6666666865348816, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.89583587646484, + "epoch": 0.719640179910045, + "grad_norm": 3.6760487188389717, + "kl": 0.953125, + "learning_rate": 4.406135407334669e-06, + "loss": 1.1086, + "reward": 2.1464386582374573, + "reward_std": 1.1137472093105316, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.8159722238779068, + "rewards/repetition_penalty_reward": -0.002866994822397828, + "rewards/tag_count_reward": 0.7343750298023224, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.5364685058594, + "epoch": 0.7226386806596702, + "grad_norm": 19.529434988342086, + "kl": 1.9482421875, + "learning_rate": 4.319352532688444e-06, + "loss": 1.2755, + "reward": 2.1653665900230408, + "reward_std": 1.1197065114974976, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.8142361342906952, + "rewards/repetition_penalty_reward": -0.003036284673726186, + "rewards/tag_count_reward": 0.7291666865348816, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.8020935058594, + "epoch": 0.7256371814092953, + "grad_norm": 46.6982672472414, + "kl": 2.0419921875, + "learning_rate": 4.2331967788513295e-06, + "loss": 1.2806, + "reward": 2.2072007060050964, + "reward_std": 1.220471739768982, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.7881944924592972, + "rewards/repetition_penalty_reward": -0.0028688511229120195, + "rewards/tag_count_reward": 0.723958358168602, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.3802261352539, + "epoch": 0.7286356821589205, + "grad_norm": 51.973275524782814, + "kl": 2.06640625, + "learning_rate": 4.1476776570747065e-06, + "loss": 1.0713, + "reward": 1.942684918642044, + "reward_std": 1.1059914082288742, + "rewards/accuracy_reward": 0.4687500223517418, + "rewards/reasoning_steps_reward": 0.7656250447034836, + "rewards/repetition_penalty_reward": -0.005231783725321293, + "rewards/tag_count_reward": 0.7135416865348816, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.0104293823242, + "epoch": 0.7316341829085458, + "grad_norm": 4.890418211224268, + "kl": 1.2119140625, + "learning_rate": 4.0628046083283134e-06, + "loss": 1.0296, + "reward": 1.890177071094513, + "reward_std": 1.1783712059259415, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.7187500149011612, + "rewards/repetition_penalty_reward": -0.0030522070010192692, + "rewards/tag_count_reward": 0.6744791865348816, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.3073043823242, + "epoch": 0.7346326836581709, + "grad_norm": 5.532289024750771, + "kl": 1.078125, + "learning_rate": 3.9785870022580075e-06, + "loss": 1.1278, + "reward": 2.0230748057365417, + "reward_std": 1.2535961270332336, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.7326389104127884, + "rewards/repetition_penalty_reward": -0.005137050000485033, + "rewards/tag_count_reward": 0.7122395932674408, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.17189025878906, + "epoch": 0.7376311844077961, + "grad_norm": 9.03974459360704, + "kl": 0.9912109375, + "learning_rate": 3.895034136151388e-06, + "loss": 1.2011, + "reward": 2.1917267441749573, + "reward_std": 1.1604805290699005, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.7812499850988388, + "rewards/repetition_penalty_reward": -0.004887988092377782, + "rewards/tag_count_reward": 0.7539062649011612, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.46876525878906, + "epoch": 0.7406296851574213, + "grad_norm": 10.803578128694179, + "kl": 0.986328125, + "learning_rate": 3.8121552339114166e-06, + "loss": 1.1353, + "reward": 2.1688408851623535, + "reward_std": 1.189782053232193, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.7847222536802292, + "rewards/repetition_penalty_reward": -0.00390218710526824, + "rewards/tag_count_reward": 0.7421875298023224, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.3073043823242, + "epoch": 0.7436281859070465, + "grad_norm": 15.262258125639379, + "kl": 0.8876953125, + "learning_rate": 3.729959445038136e-06, + "loss": 1.1792, + "reward": 2.193057656288147, + "reward_std": 1.1223880648612976, + "rewards/accuracy_reward": 0.6458333656191826, + "rewards/reasoning_steps_reward": 0.7951389104127884, + "rewards/repetition_penalty_reward": -0.004425037943292409, + "rewards/tag_count_reward": 0.7565104365348816, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.93751525878906, + "epoch": 0.7466266866566716, + "grad_norm": 2160.279743864228, + "kl": 34.333984375, + "learning_rate": 3.6484558436185936e-06, + "loss": 4.9009, + "reward": 2.1113908290863037, + "reward_std": 1.0678627490997314, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.8263889253139496, + "rewards/repetition_penalty_reward": -0.005362731404602528, + "rewards/tag_count_reward": 0.7278645932674408, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.54689025878906, + "epoch": 0.7496251874062968, + "grad_norm": 43.212145640693585, + "kl": 1.115234375, + "learning_rate": 3.5676534273251072e-06, + "loss": 1.1604, + "reward": 2.2231311798095703, + "reward_std": 1.064414381980896, + "rewards/accuracy_reward": 0.6406250074505806, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.004299435357097536, + "rewards/tag_count_reward": 0.739583358168602, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.95314025878906, + "epoch": 0.7526236881559221, + "grad_norm": 1377.7529772591768, + "kl": 48.345703125, + "learning_rate": 3.487561116421958e-06, + "loss": 4.8422, + "reward": 2.2985642552375793, + "reward_std": 0.997696116566658, + "rewards/accuracy_reward": 0.6822917014360428, + "rewards/reasoning_steps_reward": 0.8715277910232544, + "rewards/repetition_penalty_reward": -0.003953163628466427, + "rewards/tag_count_reward": 0.7486979365348816, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.4739646911621, + "epoch": 0.7556221889055472, + "grad_norm": 237.6223200520559, + "kl": 7.603515625, + "learning_rate": 3.408187752780624e-06, + "loss": 1.9096, + "reward": 2.386752665042877, + "reward_std": 0.99474136531353, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.901041716337204, + "rewards/repetition_penalty_reward": -0.01298706023953855, + "rewards/tag_count_reward": 0.7903646230697632, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.8541793823242, + "epoch": 0.7586206896551724, + "grad_norm": 103.11957636266683, + "kl": 1.3759765625, + "learning_rate": 3.329542098903674e-06, + "loss": 1.2511, + "reward": 2.4556241035461426, + "reward_std": 0.754611574113369, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9288194626569748, + "rewards/repetition_penalty_reward": -0.010955846635624766, + "rewards/tag_count_reward": 0.829427108168602, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.80209350585938, + "epoch": 0.7616191904047976, + "grad_norm": 72.74924199958468, + "kl": 1.1376953125, + "learning_rate": 3.2516328369574247e-06, + "loss": 1.1836, + "reward": 2.5425453782081604, + "reward_std": 0.6245445907115936, + "rewards/accuracy_reward": 0.713541679084301, + "rewards/reasoning_steps_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.005631837877444923, + "rewards/tag_count_reward": 0.8867187649011612, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.1458396911621, + "epoch": 0.7646176911544228, + "grad_norm": 18.047951176172486, + "kl": 0.7626953125, + "learning_rate": 3.174468567813461e-06, + "loss": 1.2569, + "reward": 2.4596019983291626, + "reward_std": 0.9773845970630646, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.8854167014360428, + "rewards/repetition_penalty_reward": -0.003939674003049731, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.9635467529297, + "epoch": 0.767616191904048, + "grad_norm": 2617.8075543625873, + "kl": 1.8046875, + "learning_rate": 3.0980578100991356e-06, + "loss": 0.9846, + "reward": 2.420164167881012, + "reward_std": 0.901138424873352, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.8559027910232544, + "rewards/repetition_penalty_reward": -0.0034470059908926487, + "rewards/tag_count_reward": 0.8281250149011612, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.65104293823242, + "epoch": 0.7706146926536732, + "grad_norm": 10.063702176948555, + "kl": 0.7568359375, + "learning_rate": 3.022408999257148e-06, + "loss": 1.0871, + "reward": 2.4764610528945923, + "reward_std": 0.8722782582044601, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.8697917014360428, + "rewards/repetition_penalty_reward": -0.004007782437838614, + "rewards/tag_count_reward": 0.845052108168602, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.68750762939453, + "epoch": 0.7736131934032984, + "grad_norm": 57.97300190458974, + "kl": 0.9248046875, + "learning_rate": 2.947530486614303e-06, + "loss": 1.1472, + "reward": 2.488058924674988, + "reward_std": 0.8399006128311157, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9062500149011612, + "rewards/repetition_penalty_reward": -0.008034905651584268, + "rewards/tag_count_reward": 0.860677108168602, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.46875762939453, + "epoch": 0.7766116941529235, + "grad_norm": 911.2077972417934, + "kl": 33.609375, + "learning_rate": 2.8734305384595598e-06, + "loss": 1.6782, + "reward": 2.371084213256836, + "reward_std": 0.9771549105644226, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.8663194626569748, + "rewards/repetition_penalty_reward": -0.005651986633893102, + "rewards/tag_count_reward": 0.8281250149011612, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.38542556762695, + "epoch": 0.7796101949025487, + "grad_norm": 326.19482464697984, + "kl": 3.037109375, + "learning_rate": 2.8001173351314625e-06, + "loss": 1.277, + "reward": 2.5145729780197144, + "reward_std": 0.8064324706792831, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.007128427678253502, + "rewards/tag_count_reward": 0.8619791716337204, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.31771087646484, + "epoch": 0.782608695652174, + "grad_norm": 38.273912583736674, + "kl": 1.0126953125, + "learning_rate": 2.7275989701150684e-06, + "loss": 1.3591, + "reward": 2.555409252643585, + "reward_std": 0.9130035191774368, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.8854166865348816, + "rewards/repetition_penalty_reward": -0.004486680845730007, + "rewards/tag_count_reward": 0.8619791865348816, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.36458587646484, + "epoch": 0.7856071964017991, + "grad_norm": 63.967595082731656, + "kl": 0.875, + "learning_rate": 2.6558834491484576e-06, + "loss": 1.0617, + "reward": 2.4956788420677185, + "reward_std": 0.7778256386518478, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9062500149011612, + "rewards/repetition_penalty_reward": -0.005623297765851021, + "rewards/tag_count_reward": 0.8867187798023224, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.95312881469727, + "epoch": 0.7886056971514243, + "grad_norm": 195.88629364008162, + "kl": 1.525390625, + "learning_rate": 2.5849786893389296e-06, + "loss": 1.5584, + "reward": 2.52648663520813, + "reward_std": 0.8215616047382355, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9131944626569748, + "rewards/repetition_penalty_reward": -0.0038954283227212727, + "rewards/tag_count_reward": 0.8880208432674408, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.9270896911621, + "epoch": 0.7916041979010495, + "grad_norm": 111.05889429266634, + "kl": 2.357421875, + "learning_rate": 2.514892518288988e-06, + "loss": 1.6347, + "reward": 2.5823854207992554, + "reward_std": 0.9507413059473038, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.8750000149011612, + "rewards/repetition_penalty_reward": -0.004854308412177488, + "rewards/tag_count_reward": 0.8632812649011612, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.2187614440918, + "epoch": 0.7946026986506747, + "grad_norm": 71.84249826406462, + "kl": 2.79296875, + "learning_rate": 2.445632673232208e-06, + "loss": 1.4097, + "reward": 2.507424294948578, + "reward_std": 0.8585045337677002, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.8802083432674408, + "rewards/repetition_penalty_reward": -0.002992400841321796, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5520935058594, + "epoch": 0.7976011994002998, + "grad_norm": 952.7545080808868, + "kl": 22.484375, + "learning_rate": 2.3772068001790682e-06, + "loss": 3.905, + "reward": 2.385596811771393, + "reward_std": 0.9874599426984787, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.8645833730697632, + "rewards/repetition_penalty_reward": -0.0063302937196567655, + "rewards/tag_count_reward": 0.8294270932674408, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.85938262939453, + "epoch": 0.800599700149925, + "grad_norm": 345.0356082309632, + "kl": 10.75, + "learning_rate": 2.309622453072867e-06, + "loss": 2.8112, + "reward": 2.638276517391205, + "reward_std": 0.6796813756227493, + "rewards/accuracy_reward": 0.8229166716337204, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.005386793913203292, + "rewards/tag_count_reward": 0.8971354514360428, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.25000381469727, + "epoch": 0.8035982008995503, + "grad_norm": 243.7268824601945, + "kl": 3.1484375, + "learning_rate": 2.2428870929558012e-06, + "loss": 1.6183, + "reward": 2.373494029045105, + "reward_std": 0.9536427110433578, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.8663194626569748, + "rewards/repetition_penalty_reward": -0.005846357671543956, + "rewards/tag_count_reward": 0.8411458432674408, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.04688262939453, + "epoch": 0.8065967016491754, + "grad_norm": 203.47240043545727, + "kl": 2.35546875, + "learning_rate": 2.177008087145286e-06, + "loss": 1.358, + "reward": 2.610089957714081, + "reward_std": 0.7462358474731445, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.923611119389534, + "rewards/repetition_penalty_reward": -0.006229604536201805, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.5156364440918, + "epoch": 0.8095952023988006, + "grad_norm": 104.0415374329618, + "kl": 4.044921875, + "learning_rate": 2.111992708420646e-06, + "loss": 1.7589, + "reward": 2.612178146839142, + "reward_std": 0.8539304882287979, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.89930559694767, + "rewards/repetition_penalty_reward": -0.004575358587317169, + "rewards/tag_count_reward": 0.8789062798023224, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.5260467529297, + "epoch": 0.8125937031484258, + "grad_norm": 162.74223326789044, + "kl": 11.875, + "learning_rate": 2.047848134220213e-06, + "loss": 2.9084, + "reward": 2.4650405645370483, + "reward_std": 0.8374519795179367, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9027778059244156, + "rewards/repetition_penalty_reward": -0.008049842552281916, + "rewards/tag_count_reward": 0.872395858168602, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.6197929382324, + "epoch": 0.815592203898051, + "grad_norm": 103.66672953735151, + "kl": 8.3125, + "learning_rate": 1.984581445848981e-06, + "loss": 2.2752, + "reward": 2.4658045768737793, + "reward_std": 0.9265211671590805, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.8697917014360428, + "rewards/repetition_penalty_reward": -0.00554958607244771, + "rewards/tag_count_reward": 0.8359375298023224, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.3697967529297, + "epoch": 0.8185907046476761, + "grad_norm": 106.41271848435063, + "kl": 6.5703125, + "learning_rate": 1.9221996276968523e-06, + "loss": 2.0641, + "reward": 2.389641523361206, + "reward_std": 1.0108384490013123, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.8628472536802292, + "rewards/repetition_penalty_reward": -0.007059881230816245, + "rewards/tag_count_reward": 0.8203125149011612, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.31250762939453, + "epoch": 0.8215892053973014, + "grad_norm": 78.45170527728588, + "kl": 3.9296875, + "learning_rate": 1.8607095664675868e-06, + "loss": 1.7482, + "reward": 2.5906487703323364, + "reward_std": 0.8199838548898697, + "rewards/accuracy_reward": 0.8020833432674408, + "rewards/reasoning_steps_reward": 0.9062500149011612, + "rewards/repetition_penalty_reward": -0.007007573178270832, + "rewards/tag_count_reward": 0.8893229365348816, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.67708587646484, + "epoch": 0.8245877061469266, + "grad_norm": 28.349825823882362, + "kl": 5.97265625, + "learning_rate": 1.8001180504185401e-06, + "loss": 1.9051, + "reward": 2.504952549934387, + "reward_std": 0.9115692526102066, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.8923611491918564, + "rewards/repetition_penalty_reward": -0.0032940262462943792, + "rewards/tag_count_reward": 0.860677108168602, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.79688262939453, + "epoch": 0.8275862068965517, + "grad_norm": 82.66659054217332, + "kl": 9.0390625, + "learning_rate": 1.7404317686112638e-06, + "loss": 2.4944, + "reward": 2.659183382987976, + "reward_std": 0.785422757267952, + "rewards/accuracy_reward": 0.8541667014360428, + "rewards/reasoning_steps_reward": 0.923611119389534, + "rewards/repetition_penalty_reward": -0.007917315931990743, + "rewards/tag_count_reward": 0.8893229365348816, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.1197967529297, + "epoch": 0.8305847076461769, + "grad_norm": 115.53315284387001, + "kl": 9.71875, + "learning_rate": 1.6816573101730637e-06, + "loss": 2.136, + "reward": 2.557182252407074, + "reward_std": 0.806065671145916, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.8923610895872116, + "rewards/repetition_penalty_reward": -0.004449750704225153, + "rewards/tag_count_reward": 0.8723958432674408, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.59375762939453, + "epoch": 0.8335832083958021, + "grad_norm": 55.03228946038301, + "kl": 5.38671875, + "learning_rate": 1.6238011635695849e-06, + "loss": 1.6557, + "reward": 2.512112319469452, + "reward_std": 0.8234718143939972, + "rewards/accuracy_reward": 0.7968750298023224, + "rewards/reasoning_steps_reward": 0.8802083730697632, + "rewards/repetition_penalty_reward": -0.0048148492351174355, + "rewards/tag_count_reward": 0.8398437649011612, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.9635543823242, + "epoch": 0.8365817091454273, + "grad_norm": 64.0876715179611, + "kl": 6.34375, + "learning_rate": 1.5668697158885104e-06, + "loss": 2.0553, + "reward": 2.3902793526649475, + "reward_std": 0.9917967170476913, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.8541666567325592, + "rewards/repetition_penalty_reward": -0.0029499368683900684, + "rewards/tag_count_reward": 0.8411458432674408, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.36980056762695, + "epoch": 0.8395802098950524, + "grad_norm": 71.70526659015454, + "kl": 8.18359375, + "learning_rate": 1.5108692521344526e-06, + "loss": 2.302, + "reward": 2.5640260577201843, + "reward_std": 0.7752020061016083, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9097222536802292, + "rewards/repetition_penalty_reward": -0.005852422269526869, + "rewards/tag_count_reward": 0.8893229365348816, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.5260467529297, + "epoch": 0.8425787106446777, + "grad_norm": 41.429427085551154, + "kl": 7.34765625, + "learning_rate": 1.4558059545351144e-06, + "loss": 2.4269, + "reward": 2.5451850295066833, + "reward_std": 0.7376502603292465, + "rewards/accuracy_reward": 0.7447917014360428, + "rewards/reasoning_steps_reward": 0.9184028208255768, + "rewards/repetition_penalty_reward": -0.004728195344796404, + "rewards/tag_count_reward": 0.8867187798023224, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.7864646911621, + "epoch": 0.8455772113943029, + "grad_norm": 83.02833138285867, + "kl": 2.6708984375, + "learning_rate": 1.4016859018587958e-06, + "loss": 1.3215, + "reward": 2.6956359148025513, + "reward_std": 0.533905953168869, + "rewards/accuracy_reward": 0.8437500298023224, + "rewards/reasoning_steps_reward": 0.9357638955116272, + "rewards/repetition_penalty_reward": -0.005753145087510347, + "rewards/tag_count_reward": 0.9218750149011612, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.3854217529297, + "epoch": 0.848575712143928, + "grad_norm": 85.4845823002127, + "kl": 5.06640625, + "learning_rate": 1.3485150687433168e-06, + "loss": 2.0253, + "reward": 2.631519079208374, + "reward_std": 0.8587917536497116, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.9062499850988388, + "rewards/repetition_penalty_reward": -0.0038975586649030447, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.18230056762695, + "epoch": 0.8515742128935532, + "grad_norm": 74.49346765573262, + "kl": 10.109375, + "learning_rate": 1.2962993250364541e-06, + "loss": 2.5552, + "reward": 2.4909926652908325, + "reward_std": 0.8654274046421051, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.8906250447034836, + "rewards/repetition_penalty_reward": -0.009007335815113038, + "rewards/tag_count_reward": 0.8697916716337204, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.6614646911621, + "epoch": 0.8545727136431784, + "grad_norm": 172.140865328852, + "kl": 10.84375, + "learning_rate": 1.2450444351479196e-06, + "loss": 2.3458, + "reward": 2.4398276805877686, + "reward_std": 0.7756916880607605, + "rewards/accuracy_reward": 0.6614583507180214, + "rewards/reasoning_steps_reward": 0.9114583432674408, + "rewards/repetition_penalty_reward": -0.006786908605135977, + "rewards/tag_count_reward": 0.8736979365348816, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.40625762939453, + "epoch": 0.8575712143928036, + "grad_norm": 33.813378116083186, + "kl": 4.0390625, + "learning_rate": 1.1947560574130013e-06, + "loss": 1.2931, + "reward": 2.638872265815735, + "reward_std": 0.5787665322422981, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9392361640930176, + "rewards/repetition_penalty_reward": -0.006093142030294985, + "rewards/tag_count_reward": 0.9088541865348816, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.8020935058594, + "epoch": 0.8605697151424287, + "grad_norm": 29.38596871781673, + "kl": 7.890625, + "learning_rate": 1.1454397434679022e-06, + "loss": 2.2911, + "reward": 2.4633301496505737, + "reward_std": 1.00086210668087, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.8715277910232544, + "rewards/repetition_penalty_reward": -0.0019476997549645603, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.26562881469727, + "epoch": 0.863568215892054, + "grad_norm": 22.60777483816971, + "kl": 5.671875, + "learning_rate": 1.0971009376368614e-06, + "loss": 1.9524, + "reward": 2.4915042221546173, + "reward_std": 0.6768720299005508, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.923611119389534, + "rewards/repetition_penalty_reward": -0.00632570160087198, + "rewards/tag_count_reward": 0.9023437798023224, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.11458587646484, + "epoch": 0.8665667166416792, + "grad_norm": 203.49207627713312, + "kl": 12.6796875, + "learning_rate": 1.049744976331124e-06, + "loss": 3.1976, + "reward": 2.5564926862716675, + "reward_std": 0.9073340892791748, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.0047051976434886456, + "rewards/tag_count_reward": 0.868489608168602, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.89062881469727, + "epoch": 0.8695652173913043, + "grad_norm": 124.52484685260382, + "kl": 9.375, + "learning_rate": 1.0033770874598226e-06, + "loss": 2.8325, + "reward": 2.5556147694587708, + "reward_std": 0.8008880615234375, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9201388955116272, + "rewards/repetition_penalty_reward": -0.003847024345304817, + "rewards/tag_count_reward": 0.8945312798023224, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.3177146911621, + "epoch": 0.8725637181409296, + "grad_norm": 58.07106802782898, + "kl": 4.95703125, + "learning_rate": 9.580023898528346e-07, + "loss": 2.0119, + "reward": 2.5350549817085266, + "reward_std": 0.7193208187818527, + "rewards/accuracy_reward": 0.729166679084301, + "rewards/reasoning_steps_reward": 0.923611119389534, + "rewards/repetition_penalty_reward": -0.012254099652636796, + "rewards/tag_count_reward": 0.8945312649011612, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.4583396911621, + "epoch": 0.8755622188905547, + "grad_norm": 56.388506410538476, + "kl": 5.796875, + "learning_rate": 9.136258926956887e-07, + "loss": 1.9009, + "reward": 2.553084135055542, + "reward_std": 0.7310795336961746, + "rewards/accuracy_reward": 0.7031250149011612, + "rewards/reasoning_steps_reward": 0.9531250298023224, + "rewards/repetition_penalty_reward": -0.006811692088376731, + "rewards/tag_count_reward": 0.9036458432674408, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.4270896911621, + "epoch": 0.8785607196401799, + "grad_norm": 28.637264291998722, + "kl": 9.59375, + "learning_rate": 8.702524949765645e-07, + "loss": 2.5645, + "reward": 2.4492486715316772, + "reward_std": 0.9317169636487961, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.8854167014360428, + "rewards/repetition_penalty_reward": -0.003876364848110825, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.4166717529297, + "epoch": 0.881559220389805, + "grad_norm": 62.774529626199765, + "kl": 5.64453125, + "learning_rate": 8.278869849454718e-07, + "loss": 2.3349, + "reward": 2.748142123222351, + "reward_std": 0.7123552411794662, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9340278208255768, + "rewards/repetition_penalty_reward": -0.003594138892367482, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.51042556762695, + "epoch": 0.8845577211394303, + "grad_norm": 128.72898033686099, + "kl": 11.6328125, + "learning_rate": 7.865340395856325e-07, + "loss": 2.9018, + "reward": 2.4993616342544556, + "reward_std": 0.9201382249593735, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.8836805820465088, + "rewards/repetition_penalty_reward": -0.0041107177385129035, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.9895896911621, + "epoch": 0.8875562218890555, + "grad_norm": 19.58108496310109, + "kl": 6.859375, + "learning_rate": 7.461982240971799e-07, + "loss": 2.4699, + "reward": 2.625687062740326, + "reward_std": 0.8259201794862747, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9322916716337204, + "rewards/repetition_penalty_reward": -0.0032192860962823033, + "rewards/tag_count_reward": 0.8893229216337204, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.17708587646484, + "epoch": 0.8905547226386806, + "grad_norm": 33.259888962364116, + "kl": 6.0, + "learning_rate": 7.068839913931646e-07, + "loss": 2.0135, + "reward": 2.681620717048645, + "reward_std": 0.7299316972494125, + "rewards/accuracy_reward": 0.8645833432674408, + "rewards/reasoning_steps_reward": 0.9322916865348816, + "rewards/repetition_penalty_reward": -0.004577249084832147, + "rewards/tag_count_reward": 0.8893229365348816, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.8229217529297, + "epoch": 0.8935532233883059, + "grad_norm": 47.82236374571516, + "kl": 7.6875, + "learning_rate": 6.685956816079753e-07, + "loss": 2.4039, + "reward": 2.5991902351379395, + "reward_std": 0.863490641117096, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9062500149011612, + "rewards/repetition_penalty_reward": -0.004976554249878973, + "rewards/tag_count_reward": 0.8645833730697632, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.1666717529297, + "epoch": 0.896551724137931, + "grad_norm": 21.908160424151905, + "kl": 6.2890625, + "learning_rate": 6.313375216182039e-07, + "loss": 2.1573, + "reward": 2.705993890762329, + "reward_std": 0.7558012455701828, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.9288194626569748, + "rewards/repetition_penalty_reward": -0.00277355604339391, + "rewards/tag_count_reward": 0.9049479365348816, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.30730056762695, + "epoch": 0.8995502248875562, + "grad_norm": 122.95452439926706, + "kl": 8.9375, + "learning_rate": 5.951136245760181e-07, + "loss": 2.666, + "reward": 2.5854055881500244, + "reward_std": 0.6643490642309189, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.0061743835103698075, + "rewards/tag_count_reward": 0.9127604365348816, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.95314025878906, + "epoch": 0.9025487256371814, + "grad_norm": 295.5147824374305, + "kl": 15.828125, + "learning_rate": 5.599279894550824e-07, + "loss": 3.1455, + "reward": 2.4131381511688232, + "reward_std": 0.8852957636117935, + "rewards/accuracy_reward": 0.7031250149011612, + "rewards/reasoning_steps_reward": 0.8819444477558136, + "rewards/repetition_penalty_reward": -0.003962728020269424, + "rewards/tag_count_reward": 0.8320312649011612, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.93750762939453, + "epoch": 0.9055472263868066, + "grad_norm": 70.53830308263373, + "kl": 8.5390625, + "learning_rate": 5.257845006090911e-07, + "loss": 2.4792, + "reward": 2.5973342061042786, + "reward_std": 0.7405329048633575, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.9322916865348816, + "rewards/repetition_penalty_reward": -0.005530498514417559, + "rewards/tag_count_reward": 0.884114608168602, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.29167556762695, + "epoch": 0.9085457271364318, + "grad_norm": 22.013195631274943, + "kl": 6.3125, + "learning_rate": 4.926869273429447e-07, + "loss": 1.8309, + "reward": 2.465520918369293, + "reward_std": 0.7833161950111389, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9062500298023224, + "rewards/repetition_penalty_reward": -0.009739560075104237, + "rewards/tag_count_reward": 0.8554687798023224, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.58854293823242, + "epoch": 0.9115442278860569, + "grad_norm": 88.95393283353653, + "kl": 5.8515625, + "learning_rate": 4.606389234966424e-07, + "loss": 1.9041, + "reward": 2.496855139732361, + "reward_std": 0.8814668357372284, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.88368059694767, + "rewards/repetition_penalty_reward": -0.00531507984851487, + "rewards/tag_count_reward": 0.8476562649011612, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.03125381469727, + "epoch": 0.9145427286356822, + "grad_norm": 152.53165269997837, + "kl": 4.96484375, + "learning_rate": 4.2964402704190555e-07, + "loss": 2.153, + "reward": 2.586493730545044, + "reward_std": 0.8061617463827133, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.920138880610466, + "rewards/repetition_penalty_reward": -0.0042182166944257915, + "rewards/tag_count_reward": 0.8841145932674408, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.83333587646484, + "epoch": 0.9175412293853074, + "grad_norm": 88.75542795978963, + "kl": 6.0703125, + "learning_rate": 3.997056596916038e-07, + "loss": 2.1542, + "reward": 2.656059443950653, + "reward_std": 0.79727503657341, + "rewards/accuracy_reward": 0.8645833432674408, + "rewards/reasoning_steps_reward": 0.91493059694767, + "rewards/repetition_penalty_reward": -0.0036628567904699594, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.17188262939453, + "epoch": 0.9205397301349325, + "grad_norm": 75.82177154852495, + "kl": 8.06640625, + "learning_rate": 3.708271265220087e-07, + "loss": 2.1738, + "reward": 2.6760451197624207, + "reward_std": 0.7257892712950706, + "rewards/accuracy_reward": 0.880208358168602, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.006246598088182509, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.62500381469727, + "epoch": 0.9235382308845578, + "grad_norm": 167.48098467184784, + "kl": 10.15625, + "learning_rate": 3.430116156079277e-07, + "loss": 2.5838, + "reward": 2.539145290851593, + "reward_std": 0.8195622712373734, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.006427670712582767, + "rewards/tag_count_reward": 0.8736979216337204, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.46875381469727, + "epoch": 0.9265367316341829, + "grad_norm": 188.4999317880181, + "kl": 12.703125, + "learning_rate": 3.1626219767075584e-07, + "loss": 2.8945, + "reward": 2.4836193919181824, + "reward_std": 0.8600171506404877, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9079861342906952, + "rewards/repetition_penalty_reward": -0.01551259565167129, + "rewards/tag_count_reward": 0.8515625149011612, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.3020896911621, + "epoch": 0.9295352323838081, + "grad_norm": 61.9794266708475, + "kl": 9.3046875, + "learning_rate": 2.905818257394799e-07, + "loss": 2.2241, + "reward": 2.4837332367897034, + "reward_std": 0.7677433341741562, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9079861491918564, + "rewards/repetition_penalty_reward": -0.004982162558007985, + "rewards/tag_count_reward": 0.8515625149011612, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.09375762939453, + "epoch": 0.9325337331334332, + "grad_norm": 38.9834806127014, + "kl": 6.96875, + "learning_rate": 2.659733348246685e-07, + "loss": 2.2927, + "reward": 2.5947685837745667, + "reward_std": 0.8158188462257385, + "rewards/accuracy_reward": 0.8072916716337204, + "rewards/reasoning_steps_reward": 0.9149305671453476, + "rewards/repetition_penalty_reward": -0.005057800153736025, + "rewards/tag_count_reward": 0.8776041716337204, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.89063262939453, + "epoch": 0.9355322338830585, + "grad_norm": 93.39130786178272, + "kl": 5.546875, + "learning_rate": 2.4243944160550757e-07, + "loss": 1.9383, + "reward": 2.5787097811698914, + "reward_std": 0.834644541144371, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9010417014360428, + "rewards/repetition_penalty_reward": -0.007227733498439193, + "rewards/tag_count_reward": 0.8671875149011612, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.5572967529297, + "epoch": 0.9385307346326837, + "grad_norm": 75.42828643371548, + "kl": 5.5546875, + "learning_rate": 2.199827441298863e-07, + "loss": 1.8684, + "reward": 2.61844664812088, + "reward_std": 0.8282945156097412, + "rewards/accuracy_reward": 0.8437500149011612, + "rewards/reasoning_steps_reward": 0.9062500298023224, + "rewards/repetition_penalty_reward": -0.006553410203196108, + "rewards/tag_count_reward": 0.8750000149011612, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.90625762939453, + "epoch": 0.9415292353823088, + "grad_norm": 55.21000753237673, + "kl": 6.890625, + "learning_rate": 1.986057215275816e-07, + "loss": 2.2089, + "reward": 2.4655805230140686, + "reward_std": 0.8631928116083145, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9010416716337204, + "rewards/repetition_penalty_reward": -0.004471502033993602, + "rewards/tag_count_reward": 0.8554687649011612, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.2916717529297, + "epoch": 0.9445277361319341, + "grad_norm": 13.351484586247325, + "kl": 4.96875, + "learning_rate": 1.7831073373657527e-07, + "loss": 1.2923, + "reward": 2.658263087272644, + "reward_std": 0.573756992816925, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9236111491918564, + "rewards/repetition_penalty_reward": -0.00753560135490261, + "rewards/tag_count_reward": 0.9088541716337204, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.89583587646484, + "epoch": 0.9475262368815592, + "grad_norm": 14.409679464629958, + "kl": 7.03125, + "learning_rate": 1.5910002124251979e-07, + "loss": 2.3063, + "reward": 2.6654553413391113, + "reward_std": 0.7150935083627701, + "rewards/accuracy_reward": 0.8281250298023224, + "rewards/reasoning_steps_reward": 0.9444444924592972, + "rewards/repetition_penalty_reward": -0.002947573288111016, + "rewards/tag_count_reward": 0.895833358168602, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.00521087646484, + "epoch": 0.9505247376311844, + "grad_norm": 157.41546141813902, + "kl": 12.15625, + "learning_rate": 1.4097570483140642e-07, + "loss": 2.5569, + "reward": 2.467340648174286, + "reward_std": 0.888208270072937, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.00878784217638895, + "rewards/tag_count_reward": 0.8424479365348816, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.60938262939453, + "epoch": 0.9535232383808095, + "grad_norm": 141.14810050857622, + "kl": 9.65625, + "learning_rate": 1.239397853554336e-07, + "loss": 3.0646, + "reward": 2.610135555267334, + "reward_std": 0.7995207458734512, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9253472685813904, + "rewards/repetition_penalty_reward": -0.005315949267242104, + "rewards/tag_count_reward": 0.8932291716337204, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.1302146911621, + "epoch": 0.9565217391304348, + "grad_norm": 104.39513312177709, + "kl": 9.4296875, + "learning_rate": 1.0799414351212234e-07, + "loss": 2.6495, + "reward": 2.5906622409820557, + "reward_std": 0.7361035495996475, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.9218750596046448, + "rewards/repetition_penalty_reward": -0.006994032271904871, + "rewards/tag_count_reward": 0.8893229365348816, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.72396087646484, + "epoch": 0.95952023988006, + "grad_norm": 172.54963273214497, + "kl": 12.796875, + "learning_rate": 9.314053963669245e-08, + "loss": 2.6928, + "reward": 2.501668691635132, + "reward_std": 0.9300008714199066, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.8750000596046448, + "rewards/repetition_penalty_reward": -0.006144002894870937, + "rewards/tag_count_reward": 0.8359375298023224, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.2656364440918, + "epoch": 0.9625187406296851, + "grad_norm": 58.41749025051054, + "kl": 8.359375, + "learning_rate": 7.938061350773241e-08, + "loss": 2.5917, + "reward": 2.5556360483169556, + "reward_std": 0.8327264338731766, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.006864078342914581, + "rewards/tag_count_reward": 0.880208358168602, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.64583587646484, + "epoch": 0.9655172413793104, + "grad_norm": 21.274250522963346, + "kl": 6.21875, + "learning_rate": 6.671588416617081e-08, + "loss": 2.0353, + "reward": 2.5732468962669373, + "reward_std": 0.6762567013502121, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.934027761220932, + "rewards/repetition_penalty_reward": -0.00661420589312911, + "rewards/tag_count_reward": 0.911458358168602, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.5416717529297, + "epoch": 0.9685157421289355, + "grad_norm": 30.694601447554664, + "kl": 9.1796875, + "learning_rate": 5.5147749747582744e-08, + "loss": 2.2922, + "reward": 2.4823118448257446, + "reward_std": 0.845875695347786, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.8975694477558136, + "rewards/repetition_penalty_reward": -0.0037993842852301896, + "rewards/tag_count_reward": 0.8541667014360428, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.02605056762695, + "epoch": 0.9715142428785607, + "grad_norm": 37.717634935459415, + "kl": 6.8359375, + "learning_rate": 4.467748732783994e-08, + "loss": 2.167, + "reward": 2.6481770277023315, + "reward_std": 0.7722356021404266, + "rewards/accuracy_reward": 0.8385416716337204, + "rewards/reasoning_steps_reward": 0.9270833283662796, + "rewards/repetition_penalty_reward": -0.0015625922533217818, + "rewards/tag_count_reward": 0.884114608168602, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.2916717529297, + "epoch": 0.974512743628186, + "grad_norm": 116.25145619245001, + "kl": 4.23828125, + "learning_rate": 3.530625278212685e-08, + "loss": 1.9012, + "reward": 2.604085922241211, + "reward_std": 0.7135767489671707, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.9392361491918564, + "rewards/repetition_penalty_reward": -0.004421164951054379, + "rewards/tag_count_reward": 0.9088541865348816, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.10937881469727, + "epoch": 0.9775112443778111, + "grad_norm": 47.023072554995835, + "kl": 6.8359375, + "learning_rate": 2.7035080657338287e-08, + "loss": 2.2076, + "reward": 2.577496290206909, + "reward_std": 0.8721825927495956, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9010417014360428, + "rewards/repetition_penalty_reward": -0.005837142816744745, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.28125762939453, + "epoch": 0.9805097451274363, + "grad_norm": 91.66318973005008, + "kl": 6.0703125, + "learning_rate": 1.986488405786524e-08, + "loss": 2.2354, + "reward": 2.548638343811035, + "reward_std": 0.8752985298633575, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9079861640930176, + "rewards/repetition_penalty_reward": -0.006483202683739364, + "rewards/tag_count_reward": 0.8658854365348816, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.97396087646484, + "epoch": 0.9835082458770614, + "grad_norm": 93.30979532325884, + "kl": 5.15625, + "learning_rate": 1.3796454544796612e-08, + "loss": 2.018, + "reward": 2.5490445494651794, + "reward_std": 0.7066824734210968, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.921875, + "rewards/repetition_penalty_reward": -0.005643073527608067, + "rewards/tag_count_reward": 0.8984375149011612, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.05208587646484, + "epoch": 0.9865067466266867, + "grad_norm": 55.95983139614906, + "kl": 5.65625, + "learning_rate": 8.83046204853133e-09, + "loss": 2.0135, + "reward": 2.6783204674720764, + "reward_std": 0.7424749433994293, + "rewards/accuracy_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9201388955116272, + "rewards/repetition_penalty_reward": -0.0035372423008084297, + "rewards/tag_count_reward": 0.8919271230697632, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.3541717529297, + "epoch": 0.9895052473763118, + "grad_norm": 45.74323780759467, + "kl": 6.8671875, + "learning_rate": 4.967454794823079e-09, + "loss": 2.1326, + "reward": 2.388830304145813, + "reward_std": 0.7917287796735764, + "rewards/accuracy_reward": 0.6354167014360428, + "rewards/reasoning_steps_reward": 0.8993055522441864, + "rewards/repetition_penalty_reward": -0.006569080462213606, + "rewards/tag_count_reward": 0.8606771230697632, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.32291793823242, + "epoch": 0.992503748125937, + "grad_norm": 18.948649605644253, + "kl": 6.3515625, + "learning_rate": 2.2078592442553725e-09, + "loss": 1.9433, + "reward": 2.566007614135742, + "reward_std": 0.7248884737491608, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.923611119389534, + "rewards/repetition_penalty_reward": -0.006041057640686631, + "rewards/tag_count_reward": 0.8828125149011612, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.53646087646484, + "epoch": 0.9955022488755623, + "grad_norm": 54.47996371401269, + "kl": 5.13671875, + "learning_rate": 5.519800451625479e-10, + "loss": 1.9846, + "reward": 2.375428855419159, + "reward_std": 0.6744664013385773, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.937500074505806, + "rewards/repetition_penalty_reward": -0.00477961910655722, + "rewards/tag_count_reward": 0.9010416716337204, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.6927146911621, + "epoch": 0.9985007496251874, + "grad_norm": 58.000699042461555, + "kl": 7.21875, + "learning_rate": 0.0, + "loss": 2.3714, + "reward": 2.6160709261894226, + "reward_std": 0.8584436923265457, + "rewards/accuracy_reward": 0.8437500149011612, + "rewards/reasoning_steps_reward": 0.9062500149011612, + "rewards/repetition_penalty_reward": -0.00632495793979615, + "rewards/tag_count_reward": 0.8723958432674408, + "step": 333 + }, + { + "epoch": 0.9985007496251874, + "step": 333, + "total_flos": 0.0, + "train_loss": 0.6876773679259796, + "train_runtime": 4819.3131, + "train_samples_per_second": 0.83, + "train_steps_per_second": 0.069 + } + ], + "logging_steps": 1, + "max_steps": 333, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}