diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,5353 +1,10681 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9985007496251874, + "epoch": 0.9988751406074241, "eval_steps": 500, - "global_step": 333, + "global_step": 666, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, - "completion_length": 125.27083587646484, - "epoch": 0.0029985007496251873, - "grad_norm": 2.316068982886434, + "completion_length": 150.88541984558105, + "epoch": 0.0014998125234345708, + "grad_norm": 2.014207151760731, "kl": 0.0, - "learning_rate": 5.882352941176471e-07, - "loss": -0.139, - "reward": 0.4294887036085129, - "reward_std": 0.6044371202588081, - "rewards/accuracy_reward": 0.09375000419095159, - "rewards/reasoning_steps_reward": 0.0920138955116272, - "rewards/repetition_penalty_reward": -0.05184812843799591, - "rewards/tag_count_reward": 0.295572929084301, + "learning_rate": 2.9850746268656716e-07, + "loss": -0.123, + "reward": 0.39659371972084045, + "reward_std": 0.6371014267206192, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/reasoning_steps_reward": 0.04861111380159855, + "rewards/repetition_penalty_reward": -0.05305909365415573, + "rewards/tag_count_reward": 0.2760416716337204, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 129.07812881469727, - "epoch": 0.005997001499250375, - "grad_norm": 2.528986939979676, + "completion_length": 139.43750381469727, + "epoch": 0.0029996250468691415, + "grad_norm": 2.2602676300267803, "kl": 0.0, - "learning_rate": 1.1764705882352942e-06, - "loss": -0.1784, - "reward": 0.4756753593683243, - "reward_std": 0.6413701921701431, - "rewards/accuracy_reward": 0.11979167070239782, - "rewards/reasoning_steps_reward": 0.1041666753590107, - "rewards/repetition_penalty_reward": -0.04776214715093374, - "rewards/tag_count_reward": 0.299479179084301, + "learning_rate": 5.970149253731343e-07, + "loss": -0.0563, + "reward": 0.29565349593758583, + "reward_std": 0.5291136056184769, + "rewards/accuracy_reward": 0.07812500046566129, + "rewards/reasoning_steps_reward": 0.039930558297783136, + "rewards/repetition_penalty_reward": -0.04505832493305206, + "rewards/tag_count_reward": 0.2226562537252903, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 116.48437881469727, - "epoch": 0.008995502248875561, - "grad_norm": 2.8435255560528976, - "kl": 0.00046443939208984375, - "learning_rate": 1.7647058823529414e-06, - "loss": -0.1512, - "reward": 0.4948234558105469, - "reward_std": 0.6535268872976303, - "rewards/accuracy_reward": 0.1510416679084301, - "rewards/reasoning_steps_reward": 0.07118056109175086, - "rewards/repetition_penalty_reward": -0.04380502179265022, - "rewards/tag_count_reward": 0.3164062649011612, + "completion_length": 153.2708396911621, + "epoch": 0.0044994375703037125, + "grad_norm": 2.1173606964196554, + "kl": 0.000335693359375, + "learning_rate": 8.955223880597015e-07, + "loss": -0.1131, + "reward": 0.4513879381120205, + "reward_std": 0.6719020158052444, + "rewards/accuracy_reward": 0.14583333721384406, + "rewards/reasoning_steps_reward": 0.07986111380159855, + "rewards/repetition_penalty_reward": -0.047744009643793106, + "rewards/tag_count_reward": 0.2734375037252903, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 113.15625190734863, - "epoch": 0.01199400299850075, - "grad_norm": 2.6137621112730782, - "kl": 0.0006656646728515625, - "learning_rate": 2.3529411764705885e-06, - "loss": -0.0962, - "reward": 0.39275161921977997, - "reward_std": 0.5565674006938934, - "rewards/accuracy_reward": 0.0937500037252903, - "rewards/reasoning_steps_reward": 0.0590277835726738, - "rewards/repetition_penalty_reward": -0.0386720122769475, - "rewards/tag_count_reward": 0.2786458358168602, + "completion_length": 143.56250381469727, + "epoch": 0.005999250093738283, + "grad_norm": 2.1085551568397918, + "kl": 0.00038623809814453125, + "learning_rate": 1.1940298507462686e-06, + "loss": -0.0492, + "reward": 0.2926744148135185, + "reward_std": 0.5166807025671005, + "rewards/accuracy_reward": 0.07812500046566129, + "rewards/reasoning_steps_reward": 0.0451388917863369, + "rewards/repetition_penalty_reward": -0.04933948162943125, + "rewards/tag_count_reward": 0.2187500037252903, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 122.72396278381348, - "epoch": 0.014992503748125937, - "grad_norm": 2.3547773173814814, - "kl": 0.0024394989013671875, - "learning_rate": 2.9411764705882355e-06, - "loss": -0.125, - "reward": 0.5311619490385056, - "reward_std": 0.61662757396698, - "rewards/accuracy_reward": 0.14062500139698386, - "rewards/reasoning_steps_reward": 0.0885416716337204, - "rewards/repetition_penalty_reward": -0.05737974401563406, - "rewards/tag_count_reward": 0.3593750074505806, + "completion_length": 139.6354217529297, + "epoch": 0.0074990626171728535, + "grad_norm": 2.0783939318892255, + "kl": 0.0004963874816894531, + "learning_rate": 1.4925373134328358e-06, + "loss": -0.0786, + "reward": 0.33502432703971863, + "reward_std": 0.5550277233123779, + "rewards/accuracy_reward": 0.08854166977107525, + "rewards/reasoning_steps_reward": 0.055555559694767, + "rewards/repetition_penalty_reward": -0.044749997556209564, + "rewards/tag_count_reward": 0.2356770858168602, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 138.2447967529297, - "epoch": 0.017991004497751123, - "grad_norm": 2.1062805811826757, - "kl": 0.055267333984375, - "learning_rate": 3.529411764705883e-06, - "loss": -0.0627, - "reward": 0.7481685727834702, - "reward_std": 0.7483109384775162, - "rewards/accuracy_reward": 0.2708333432674408, - "rewards/reasoning_steps_reward": 0.06076389271765947, - "rewards/repetition_penalty_reward": -0.06129324156790972, - "rewards/tag_count_reward": 0.4778645858168602, + "completion_length": 141.94271087646484, + "epoch": 0.008998875140607425, + "grad_norm": 2.140600840808783, + "kl": 0.001529693603515625, + "learning_rate": 1.791044776119403e-06, + "loss": -0.1144, + "reward": 0.3174091763794422, + "reward_std": 0.5731675401329994, + "rewards/accuracy_reward": 0.08854166697710752, + "rewards/reasoning_steps_reward": 0.04513889132067561, + "rewards/repetition_penalty_reward": -0.05325053818523884, + "rewards/tag_count_reward": 0.2369791753590107, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 150.7708396911621, - "epoch": 0.020989505247376312, - "grad_norm": 91.80938835633462, - "kl": 2.9296875, - "learning_rate": 4.11764705882353e-06, - "loss": 0.0604, - "reward": 1.0586681962013245, - "reward_std": 0.6929789483547211, - "rewards/accuracy_reward": 0.3489583358168602, - "rewards/reasoning_steps_reward": 0.1371527910232544, - "rewards/repetition_penalty_reward": -0.06676589138805866, - "rewards/tag_count_reward": 0.6393229365348816, + "completion_length": 139.45833778381348, + "epoch": 0.010498687664041995, + "grad_norm": 2.0946221841916577, + "kl": 0.012420654296875, + "learning_rate": 2.08955223880597e-06, + "loss": -0.0705, + "reward": 0.4593450725078583, + "reward_std": 0.5785461291670799, + "rewards/accuracy_reward": 0.09895833488553762, + "rewards/reasoning_steps_reward": 0.032986113568767905, + "rewards/repetition_penalty_reward": -0.05020354688167572, + "rewards/tag_count_reward": 0.377604179084301, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 154.0104217529297, - "epoch": 0.0239880059970015, - "grad_norm": 80.12581730059306, - "kl": 2.7734375, - "learning_rate": 4.705882352941177e-06, - "loss": 0.0618, - "reward": 1.2028335630893707, - "reward_std": 0.6846612095832825, - "rewards/accuracy_reward": 0.4687500074505806, - "rewards/reasoning_steps_reward": 0.1232638955116272, - "rewards/repetition_penalty_reward": -0.06626365892589092, - "rewards/tag_count_reward": 0.6770833730697632, + "completion_length": 160.0104217529297, + "epoch": 0.011998500187476566, + "grad_norm": 1.610019846974552, + "kl": 0.0306396484375, + "learning_rate": 2.3880597014925373e-06, + "loss": -0.0826, + "reward": 0.5668669790029526, + "reward_std": 0.695038303732872, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/reasoning_steps_reward": 0.08506944868713617, + "rewards/repetition_penalty_reward": -0.053358727134764194, + "rewards/tag_count_reward": 0.3893229216337204, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 161.4791717529297, - "epoch": 0.026986506746626688, - "grad_norm": 25.478259616973777, - "kl": 0.8330078125, - "learning_rate": 5.294117647058824e-06, - "loss": -0.0184, - "reward": 1.3118138909339905, - "reward_std": 0.7422287911176682, - "rewards/accuracy_reward": 0.4843750149011612, - "rewards/reasoning_steps_reward": 0.18055557273328304, - "rewards/repetition_penalty_reward": -0.07056461833417416, - "rewards/tag_count_reward": 0.7174479365348816, + "completion_length": 171.60937881469727, + "epoch": 0.013498312710911136, + "grad_norm": 19.671391688045556, + "kl": 0.759765625, + "learning_rate": 2.686567164179105e-06, + "loss": -0.1139, + "reward": 0.9202403426170349, + "reward_std": 0.7923817485570908, + "rewards/accuracy_reward": 0.3020833432674408, + "rewards/reasoning_steps_reward": 0.1649305671453476, + "rewards/repetition_penalty_reward": -0.07541941851377487, + "rewards/tag_count_reward": 0.5286458432674408, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 165.47916793823242, - "epoch": 0.029985007496251874, - "grad_norm": 3.3317610347487396, - "kl": 0.205810546875, - "learning_rate": 5.882352941176471e-06, - "loss": -0.0374, - "reward": 1.234879344701767, - "reward_std": 0.7370101809501648, - "rewards/accuracy_reward": 0.432291679084301, - "rewards/reasoning_steps_reward": 0.1458333469927311, - "rewards/repetition_penalty_reward": -0.06069365330040455, - "rewards/tag_count_reward": 0.7174479365348816, + "completion_length": 173.39062881469727, + "epoch": 0.014998125234345707, + "grad_norm": 116.39843362025961, + "kl": 3.4140625, + "learning_rate": 2.9850746268656716e-06, + "loss": 0.0701, + "reward": 0.9044180512428284, + "reward_std": 0.7367634326219559, + "rewards/accuracy_reward": 0.2968750074505806, + "rewards/reasoning_steps_reward": 0.06423611333593726, + "rewards/repetition_penalty_reward": -0.06867224909365177, + "rewards/tag_count_reward": 0.6119791716337204, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 178.56771087646484, - "epoch": 0.03298350824587706, - "grad_norm": 1.2483918606755664, - "kl": 0.08367919921875, - "learning_rate": 6.470588235294119e-06, - "loss": -0.064, - "reward": 1.2948878109455109, - "reward_std": 0.7816713899374008, - "rewards/accuracy_reward": 0.3489583432674408, - "rewards/reasoning_steps_reward": 0.2638889104127884, - "rewards/repetition_penalty_reward": -0.07577204331755638, - "rewards/tag_count_reward": 0.7578125149011612, + "completion_length": 165.9322967529297, + "epoch": 0.016497937757780277, + "grad_norm": 16.02276467950822, + "kl": 0.72412109375, + "learning_rate": 3.283582089552239e-06, + "loss": 0.0225, + "reward": 1.0981639921665192, + "reward_std": 0.7406069040298462, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.07812500465661287, + "rewards/repetition_penalty_reward": -0.06459642946720123, + "rewards/tag_count_reward": 0.6679687649011612, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 193.20312881469727, - "epoch": 0.035982008995502246, - "grad_norm": 1.2630629002924778, - "kl": 0.05889892578125, - "learning_rate": 7.058823529411766e-06, - "loss": -0.0437, - "reward": 1.472415030002594, - "reward_std": 0.7569083422422409, - "rewards/accuracy_reward": 0.473958358168602, - "rewards/reasoning_steps_reward": 0.2899305671453476, - "rewards/repetition_penalty_reward": -0.061005206778645515, - "rewards/tag_count_reward": 0.7695312649011612, + "completion_length": 188.2135467529297, + "epoch": 0.01799775028121485, + "grad_norm": 5.745682328164949, + "kl": 0.293212890625, + "learning_rate": 3.582089552238806e-06, + "loss": 0.0254, + "reward": 1.068365916609764, + "reward_std": 0.7423798739910126, + "rewards/accuracy_reward": 0.3593750149011612, + "rewards/reasoning_steps_reward": 0.1111111156642437, + "rewards/repetition_penalty_reward": -0.0635785311460495, + "rewards/tag_count_reward": 0.661458358168602, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 209.9791717529297, - "epoch": 0.038980509745127435, - "grad_norm": 1.1307015314913393, - "kl": 0.04443359375, - "learning_rate": 7.647058823529411e-06, - "loss": -0.0193, - "reward": 1.6883811056613922, - "reward_std": 0.7850492298603058, - "rewards/accuracy_reward": 0.5625000223517418, - "rewards/reasoning_steps_reward": 0.392361119389534, - "rewards/repetition_penalty_reward": -0.06986541766673326, - "rewards/tag_count_reward": 0.8033854216337204, + "completion_length": 160.97916793823242, + "epoch": 0.01949756280464942, + "grad_norm": 4.696162571842331, + "kl": 0.228271484375, + "learning_rate": 3.8805970149253735e-06, + "loss": -0.0185, + "reward": 1.1401186883449554, + "reward_std": 0.7335168719291687, + "rewards/accuracy_reward": 0.3645833395421505, + "rewards/reasoning_steps_reward": 0.10416667349636555, + "rewards/repetition_penalty_reward": -0.05519386660307646, + "rewards/tag_count_reward": 0.7265625149011612, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 222.8385467529297, - "epoch": 0.041979010494752625, - "grad_norm": 1.0685540714109272, - "kl": 0.08837890625, - "learning_rate": 8.23529411764706e-06, - "loss": -0.0437, - "reward": 1.8284152746200562, - "reward_std": 0.7906496375799179, - "rewards/accuracy_reward": 0.5468750149011612, - "rewards/reasoning_steps_reward": 0.5121527835726738, - "rewards/repetition_penalty_reward": -0.07045630738139153, - "rewards/tag_count_reward": 0.8398437649011612, + "completion_length": 149.4947967529297, + "epoch": 0.02099737532808399, + "grad_norm": 4.440887208265822, + "kl": 0.205322265625, + "learning_rate": 4.17910447761194e-06, + "loss": 0.0925, + "reward": 1.1221649795770645, + "reward_std": 0.6743175089359283, + "rewards/accuracy_reward": 0.3958333469927311, + "rewards/reasoning_steps_reward": 0.039930558763444424, + "rewards/repetition_penalty_reward": -0.06490109767764807, + "rewards/tag_count_reward": 0.7513020932674408, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 244.41667556762695, - "epoch": 0.044977511244377814, - "grad_norm": 0.9074510920159916, - "kl": 0.08544921875, - "learning_rate": 8.823529411764707e-06, - "loss": -0.0056, - "reward": 2.034987300634384, - "reward_std": 0.6369659751653671, - "rewards/accuracy_reward": 0.5000000074505806, - "rewards/reasoning_steps_reward": 0.7569444626569748, - "rewards/repetition_penalty_reward": -0.10607173293828964, - "rewards/tag_count_reward": 0.884114608168602, + "completion_length": 164.8072967529297, + "epoch": 0.02249718785151856, + "grad_norm": 4.632359176117392, + "kl": 0.2489013671875, + "learning_rate": 4.477611940298508e-06, + "loss": 0.0844, + "reward": 1.2497529983520508, + "reward_std": 0.6609881520271301, + "rewards/accuracy_reward": 0.432291679084301, + "rewards/reasoning_steps_reward": 0.09027777938172221, + "rewards/repetition_penalty_reward": -0.0488581582903862, + "rewards/tag_count_reward": 0.7760416865348816, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 245.24480438232422, - "epoch": 0.047976011994003, - "grad_norm": 0.958833819569195, - "kl": 0.107177734375, - "learning_rate": 9.411764705882354e-06, - "loss": 0.064, - "reward": 2.3543498516082764, - "reward_std": 0.5774905681610107, - "rewards/accuracy_reward": 0.6822916865348816, - "rewards/reasoning_steps_reward": 0.833333358168602, - "rewards/repetition_penalty_reward": -0.09486887603998184, - "rewards/tag_count_reward": 0.9335937649011612, + "completion_length": 159.5104217529297, + "epoch": 0.023997000374953132, + "grad_norm": 1.5383616347232285, + "kl": 0.078369140625, + "learning_rate": 4.7761194029850745e-06, + "loss": 0.016, + "reward": 1.3225627541542053, + "reward_std": 0.6928769499063492, + "rewards/accuracy_reward": 0.5156250149011612, + "rewards/reasoning_steps_reward": 0.0642361156642437, + "rewards/repetition_penalty_reward": -0.04896502383053303, + "rewards/tag_count_reward": 0.7916667014360428, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 224.5833396911621, - "epoch": 0.050974512743628186, - "grad_norm": 32.488470313797116, - "kl": 0.901611328125, - "learning_rate": 1e-05, - "loss": 0.0729, - "reward": 2.285028785467148, - "reward_std": 0.5724412277340889, - "rewards/accuracy_reward": 0.5885416716337204, - "rewards/reasoning_steps_reward": 0.885416716337204, - "rewards/repetition_penalty_reward": -0.0938775297254324, - "rewards/tag_count_reward": 0.9049479365348816, + "completion_length": 163.390625, + "epoch": 0.0254968128983877, + "grad_norm": 1.3506574427021751, + "kl": 0.06396484375, + "learning_rate": 5.074626865671642e-06, + "loss": 0.0088, + "reward": 1.286087691783905, + "reward_std": 0.7203228771686554, + "rewards/accuracy_reward": 0.4531250223517418, + "rewards/reasoning_steps_reward": 0.12500000558793545, + "rewards/repetition_penalty_reward": -0.055058157071471214, + "rewards/tag_count_reward": 0.7630208432674408, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 207.55729293823242, - "epoch": 0.053973013493253376, - "grad_norm": 1.0958281725310235, - "kl": 0.14453125, - "learning_rate": 1.0588235294117648e-05, - "loss": -0.0121, - "reward": 2.4053784608840942, - "reward_std": 0.49531228840351105, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.8975695073604584, - "rewards/repetition_penalty_reward": -0.08724310249090195, - "rewards/tag_count_reward": 0.9492187649011612, + "completion_length": 178.83854293823242, + "epoch": 0.02699662542182227, + "grad_norm": 1.2350533486266992, + "kl": 0.0546875, + "learning_rate": 5.37313432835821e-06, + "loss": -0.0441, + "reward": 1.1719821691513062, + "reward_std": 0.7089956551790237, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.1684027947485447, + "rewards/repetition_penalty_reward": -0.06282693240791559, + "rewards/tag_count_reward": 0.7122395932674408, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 215.67708587646484, - "epoch": 0.05697151424287856, - "grad_norm": 1.0829252560391265, - "kl": 0.15185546875, - "learning_rate": 1.1176470588235295e-05, - "loss": -0.0053, - "reward": 2.3993316292762756, - "reward_std": 0.49921783059835434, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.08981773070991039, - "rewards/tag_count_reward": 0.8815104365348816, + "completion_length": 200.53125762939453, + "epoch": 0.028496437945256844, + "grad_norm": 1.121999866578993, + "kl": 0.059814453125, + "learning_rate": 5.671641791044776e-06, + "loss": -0.0504, + "reward": 1.23805570602417, + "reward_std": 0.6833942234516144, + "rewards/accuracy_reward": 0.3333333507180214, + "rewards/reasoning_steps_reward": 0.2274305708706379, + "rewards/repetition_penalty_reward": -0.07401033584028482, + "rewards/tag_count_reward": 0.7513020932674408, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 244.6354217529297, - "epoch": 0.05997001499250375, - "grad_norm": 1.1922172194368157, - "kl": 0.1689453125, - "learning_rate": 1.1764705882352942e-05, - "loss": 0.1106, - "reward": 2.2201938033103943, - "reward_std": 0.5487363934516907, - "rewards/accuracy_reward": 0.4947916716337204, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.09664654545485973, - "rewards/tag_count_reward": 0.8776042014360428, + "completion_length": 210.5833396911621, + "epoch": 0.029996250468691414, + "grad_norm": 2.277302963689603, + "kl": 0.07037353515625, + "learning_rate": 5.970149253731343e-06, + "loss": 0.0108, + "reward": 1.2767416834831238, + "reward_std": 0.7041554003953934, + "rewards/accuracy_reward": 0.3645833432674408, + "rewards/reasoning_steps_reward": 0.2361111268401146, + "rewards/repetition_penalty_reward": -0.06483818404376507, + "rewards/tag_count_reward": 0.7408854365348816, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 308.5833435058594, - "epoch": 0.06296851574212893, - "grad_norm": 0.9900748090272767, - "kl": 0.1285400390625, - "learning_rate": 1.235294117647059e-05, - "loss": 0.1992, - "reward": 2.4696518182754517, - "reward_std": 0.5737268030643463, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.97743059694767, - "rewards/repetition_penalty_reward": -0.10543505474925041, - "rewards/tag_count_reward": 0.899739608168602, + "completion_length": 206.15625381469727, + "epoch": 0.031496062992125984, + "grad_norm": 1.051251380495841, + "kl": 0.10986328125, + "learning_rate": 6.2686567164179116e-06, + "loss": -0.0285, + "reward": 1.291599839925766, + "reward_std": 0.6519357860088348, + "rewards/accuracy_reward": 0.3229166828095913, + "rewards/reasoning_steps_reward": 0.2812500260770321, + "rewards/repetition_penalty_reward": -0.07298353686928749, + "rewards/tag_count_reward": 0.7604166865348816, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 292.5416717529297, - "epoch": 0.06596701649175413, - "grad_norm": 0.9436079749621433, - "kl": 0.150146484375, - "learning_rate": 1.2941176470588238e-05, - "loss": 0.2047, - "reward": 2.5528025031089783, - "reward_std": 0.5636586248874664, - "rewards/accuracy_reward": 0.7447917014360428, - "rewards/reasoning_steps_reward": 0.9670138955116272, - "rewards/repetition_penalty_reward": -0.10822184756398201, - "rewards/tag_count_reward": 0.9492187649011612, + "completion_length": 248.79687881469727, + "epoch": 0.032995875515560553, + "grad_norm": 0.9759846536663065, + "kl": 0.0806884765625, + "learning_rate": 6.567164179104478e-06, + "loss": -0.0597, + "reward": 1.6020659506320953, + "reward_std": 0.7493701279163361, + "rewards/accuracy_reward": 0.4322916865348816, + "rewards/reasoning_steps_reward": 0.5225694701075554, + "rewards/repetition_penalty_reward": -0.07805563136935234, + "rewards/tag_count_reward": 0.7252604365348816, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 299.51563262939453, - "epoch": 0.06896551724137931, - "grad_norm": 3.8939799298403286, - "kl": 0.23046875, - "learning_rate": 1.3529411764705885e-05, - "loss": 0.1481, - "reward": 2.538209021091461, - "reward_std": 0.5256823599338531, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.9809028208255768, - "rewards/repetition_penalty_reward": -0.10154793784022331, - "rewards/tag_count_reward": 0.9453125298023224, + "completion_length": 262.93750762939453, + "epoch": 0.03449568803899512, + "grad_norm": 0.8719127454418464, + "kl": 0.0849609375, + "learning_rate": 6.865671641791045e-06, + "loss": -0.0692, + "reward": 1.6442042291164398, + "reward_std": 0.6580450236797333, + "rewards/accuracy_reward": 0.3645833432674408, + "rewards/reasoning_steps_reward": 0.614583395421505, + "rewards/repetition_penalty_reward": -0.09017078392207623, + "rewards/tag_count_reward": 0.755208358168602, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 331.1458511352539, - "epoch": 0.07196401799100449, - "grad_norm": 1.4799872075073064, - "kl": 0.28955078125, - "learning_rate": 1.4117647058823532e-05, - "loss": 0.2011, - "reward": 2.411967933177948, - "reward_std": 0.7274067103862762, - "rewards/accuracy_reward": 0.6822916716337204, - "rewards/reasoning_steps_reward": 0.9496527910232544, - "rewards/repetition_penalty_reward": -0.09758076071739197, - "rewards/tag_count_reward": 0.8776041716337204, + "completion_length": 255.6718864440918, + "epoch": 0.0359955005624297, + "grad_norm": 1.7487963120056436, + "kl": 0.219970703125, + "learning_rate": 7.164179104477612e-06, + "loss": -0.0599, + "reward": 1.8432446718215942, + "reward_std": 0.7495845705270767, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.6840277910232544, + "rewards/repetition_penalty_reward": -0.08817902393639088, + "rewards/tag_count_reward": 0.7473958432674408, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 348.09375762939453, - "epoch": 0.07496251874062969, - "grad_norm": 1.8716934303767792, - "kl": 0.21533203125, - "learning_rate": 1.4705882352941179e-05, - "loss": 0.2849, - "reward": 2.35833877325058, - "reward_std": 0.6804773062467575, - "rewards/accuracy_reward": 0.6093750223517418, - "rewards/reasoning_steps_reward": 0.9583333879709244, - "rewards/repetition_penalty_reward": -0.10780714452266693, - "rewards/tag_count_reward": 0.8984375149011612, + "completion_length": 292.53125762939453, + "epoch": 0.03749531308586427, + "grad_norm": 0.9847582761123604, + "kl": 0.1385498046875, + "learning_rate": 7.46268656716418e-06, + "loss": -0.0537, + "reward": 1.8635078966617584, + "reward_std": 0.6817265152931213, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.7881944626569748, + "rewards/repetition_penalty_reward": -0.10697820782661438, + "rewards/tag_count_reward": 0.7864583432674408, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 403.9166793823242, - "epoch": 0.07796101949025487, - "grad_norm": 10.274951887282484, - "kl": 0.6142578125, - "learning_rate": 1.5294117647058822e-05, - "loss": 0.3997, - "reward": 2.2658557891845703, - "reward_std": 0.7715227752923965, - "rewards/accuracy_reward": 0.6093750223517418, - "rewards/reasoning_steps_reward": 0.94618059694767, - "rewards/repetition_penalty_reward": -0.0852727573364973, - "rewards/tag_count_reward": 0.7955729514360428, + "completion_length": 291.9322967529297, + "epoch": 0.03899512560929884, + "grad_norm": 0.8461790712043904, + "kl": 0.1474609375, + "learning_rate": 7.761194029850747e-06, + "loss": 0.0, + "reward": 1.9677788615226746, + "reward_std": 0.653272807598114, + "rewards/accuracy_reward": 0.4739583432674408, + "rewards/reasoning_steps_reward": 0.8489583730697632, + "rewards/repetition_penalty_reward": -0.10774205438792706, + "rewards/tag_count_reward": 0.7526041865348816, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 455.1614685058594, - "epoch": 0.08095952023988005, - "grad_norm": 6.823619814725695, - "kl": 1.068359375, - "learning_rate": 1.5882352941176473e-05, - "loss": 0.4192, - "reward": 1.9341484606266022, - "reward_std": 0.9543884545564651, - "rewards/accuracy_reward": 0.4635416716337204, - "rewards/reasoning_steps_reward": 0.8663194328546524, - "rewards/repetition_penalty_reward": -0.06888982094824314, - "rewards/tag_count_reward": 0.673177108168602, + "completion_length": 271.35939025878906, + "epoch": 0.04049493813273341, + "grad_norm": 6.973505953549008, + "kl": 0.4765625, + "learning_rate": 8.059701492537314e-06, + "loss": 0.0103, + "reward": 2.150063157081604, + "reward_std": 0.5991375297307968, + "rewards/accuracy_reward": 0.5260416865348816, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.10123893804848194, + "rewards/tag_count_reward": 0.829427108168602, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 369.0052185058594, - "epoch": 0.08395802098950525, - "grad_norm": 5.114037986110308, - "kl": 0.4541015625, - "learning_rate": 1.647058823529412e-05, - "loss": 0.3594, - "reward": 2.284946322441101, - "reward_std": 0.8626722097396851, - "rewards/accuracy_reward": 0.6562500149011612, - "rewards/reasoning_steps_reward": 0.890625, - "rewards/repetition_penalty_reward": -0.0783350057899952, - "rewards/tag_count_reward": 0.8164062798023224, + "completion_length": 261.28125762939453, + "epoch": 0.04199475065616798, + "grad_norm": 3.850773043534155, + "kl": 0.293212890625, + "learning_rate": 8.35820895522388e-06, + "loss": -0.0304, + "reward": 2.154005229473114, + "reward_std": 0.621285080909729, + "rewards/accuracy_reward": 0.5312500298023224, + "rewards/reasoning_steps_reward": 0.859375, + "rewards/repetition_penalty_reward": -0.1012031976133585, + "rewards/tag_count_reward": 0.8645833730697632, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 344.8229217529297, - "epoch": 0.08695652173913043, - "grad_norm": 1009.5340319486562, - "kl": 16.3125, - "learning_rate": 1.7058823529411767e-05, - "loss": 1.4554, - "reward": 2.209560751914978, - "reward_std": 0.7678176611661911, - "rewards/accuracy_reward": 0.5781250149011612, - "rewards/reasoning_steps_reward": 0.897569477558136, - "rewards/repetition_penalty_reward": -0.06561294477432966, - "rewards/tag_count_reward": 0.7994791865348816, + "completion_length": 229.0260467529297, + "epoch": 0.04349456317960255, + "grad_norm": 1.4645034098283287, + "kl": 0.23095703125, + "learning_rate": 8.656716417910447e-06, + "loss": 0.0797, + "reward": 2.0209468007087708, + "reward_std": 0.6779871582984924, + "rewards/accuracy_reward": 0.5885416865348816, + "rewards/reasoning_steps_reward": 0.7534722685813904, + "rewards/repetition_penalty_reward": -0.10622344352304935, + "rewards/tag_count_reward": 0.7851562798023224, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 356.0572967529297, - "epoch": 0.08995502248875563, - "grad_norm": 23.197023290991723, - "kl": 0.953125, - "learning_rate": 1.7647058823529414e-05, - "loss": 0.2222, - "reward": 1.9122081696987152, - "reward_std": 0.8025388270616531, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.8281250298023224, - "rewards/repetition_penalty_reward": -0.06175030395388603, - "rewards/tag_count_reward": 0.7083333432674408, + "completion_length": 232.93229293823242, + "epoch": 0.04499437570303712, + "grad_norm": 1.247184264877055, + "kl": 0.162841796875, + "learning_rate": 8.955223880597016e-06, + "loss": -0.0625, + "reward": 2.040347546339035, + "reward_std": 0.6728687584400177, + "rewards/accuracy_reward": 0.5729166865348816, + "rewards/reasoning_steps_reward": 0.7795138955116272, + "rewards/repetition_penalty_reward": -0.10374973341822624, + "rewards/tag_count_reward": 0.7916666865348816, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 326.0104217529297, - "epoch": 0.09295352323838081, - "grad_norm": 8.878208799628512, - "kl": 0.9375, - "learning_rate": 1.823529411764706e-05, - "loss": 0.1336, - "reward": 1.718822568655014, - "reward_std": 0.798391655087471, - "rewards/accuracy_reward": 0.3125000149011612, - "rewards/reasoning_steps_reward": 0.8263888955116272, - "rewards/repetition_penalty_reward": -0.058087206445634365, - "rewards/tag_count_reward": 0.638020858168602, + "completion_length": 248.26562881469727, + "epoch": 0.046494188226471694, + "grad_norm": 0.8484087335607122, + "kl": 0.172607421875, + "learning_rate": 9.253731343283582e-06, + "loss": 0.0142, + "reward": 2.083979547023773, + "reward_std": 0.5710583031177521, + "rewards/accuracy_reward": 0.4687500074505806, + "rewards/reasoning_steps_reward": 0.9479167014360428, + "rewards/repetition_penalty_reward": -0.10221851244568825, + "rewards/tag_count_reward": 0.7695312649011612, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 302.17189025878906, - "epoch": 0.095952023988006, - "grad_norm": 5.191420623812987, - "kl": 2.138671875, - "learning_rate": 1.8823529411764708e-05, - "loss": -0.0731, - "reward": 1.7180909216403961, - "reward_std": 0.795787900686264, - "rewards/accuracy_reward": 0.2604166753590107, - "rewards/reasoning_steps_reward": 0.8732639402151108, - "rewards/repetition_penalty_reward": -0.04579801578074694, - "rewards/tag_count_reward": 0.630208358168602, + "completion_length": 252.73959350585938, + "epoch": 0.047994000749906264, + "grad_norm": 1.1744348492144583, + "kl": 0.1875, + "learning_rate": 9.552238805970149e-06, + "loss": 0.0776, + "reward": 2.3965033292770386, + "reward_std": 0.5822078287601471, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.09785447083413601, + "rewards/tag_count_reward": 0.8294270932674408, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 329.6666793823242, - "epoch": 0.09895052473763119, - "grad_norm": 20.842747943141333, - "kl": 4.875, - "learning_rate": 1.9411764705882355e-05, - "loss": -0.1732, - "reward": 1.663686603307724, - "reward_std": 0.7371216714382172, - "rewards/accuracy_reward": 0.2239583432674408, - "rewards/reasoning_steps_reward": 0.8593750298023224, - "rewards/repetition_penalty_reward": -0.06417796947062016, - "rewards/tag_count_reward": 0.6445312649011612, + "completion_length": 250.71875762939453, + "epoch": 0.049493813273340834, + "grad_norm": 0.9115161284883908, + "kl": 0.181884765625, + "learning_rate": 9.850746268656717e-06, + "loss": 0.038, + "reward": 2.35109943151474, + "reward_std": 0.5347126573324203, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.09681737795472145, + "rewards/tag_count_reward": 0.8906250298023224, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 326.96875762939453, - "epoch": 0.10194902548725637, - "grad_norm": 3.7039312187876177, - "kl": 4.06640625, - "learning_rate": 2e-05, - "loss": -0.3241, - "reward": 1.5329826474189758, - "reward_std": 0.797045961022377, - "rewards/accuracy_reward": 0.1770833358168602, - "rewards/reasoning_steps_reward": 0.819444477558136, - "rewards/repetition_penalty_reward": -0.058597257360816, - "rewards/tag_count_reward": 0.595052108168602, + "completion_length": 256.7604217529297, + "epoch": 0.0509936257967754, + "grad_norm": 0.9316669826597189, + "kl": 0.2060546875, + "learning_rate": 1.0149253731343284e-05, + "loss": 0.1298, + "reward": 2.2372400164604187, + "reward_std": 0.5037485882639885, + "rewards/accuracy_reward": 0.4427083432674408, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.09348933398723602, + "rewards/tag_count_reward": 0.8984375149011612, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 469.40625762939453, - "epoch": 0.10494752623688156, - "grad_norm": 19.322333158500665, - "kl": 2.0654296875, - "learning_rate": 1.9999448019954837e-05, - "loss": -0.1809, - "reward": 1.9347608387470245, - "reward_std": 0.8773187696933746, - "rewards/accuracy_reward": 0.4270833507180214, - "rewards/reasoning_steps_reward": 0.8923610895872116, - "rewards/repetition_penalty_reward": -0.11254816874861717, - "rewards/tag_count_reward": 0.7278645932674408, + "completion_length": 231.1458396911621, + "epoch": 0.05249343832020997, + "grad_norm": 0.978595163631655, + "kl": 0.194580078125, + "learning_rate": 1.0447761194029851e-05, + "loss": 0.0872, + "reward": 2.379105567932129, + "reward_std": 0.48672880232334137, + "rewards/accuracy_reward": 0.5781250223517418, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.07315144501626492, + "rewards/tag_count_reward": 0.9036458432674408, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 522.7396087646484, - "epoch": 0.10794602698650675, - "grad_norm": 3.6713399077901303, - "kl": 0.58203125, - "learning_rate": 1.9997792140755746e-05, - "loss": -0.0915, - "reward": 2.4146523475646973, - "reward_std": 0.6919413357973099, - "rewards/accuracy_reward": 0.7500000149011612, - "rewards/reasoning_steps_reward": 0.9340278208255768, - "rewards/repetition_penalty_reward": -0.15739644691348076, - "rewards/tag_count_reward": 0.888020858168602, + "completion_length": 206.8645896911621, + "epoch": 0.05399325084364454, + "grad_norm": 1.001370277577607, + "kl": 0.2109375, + "learning_rate": 1.074626865671642e-05, + "loss": -0.0238, + "reward": 2.4131481647491455, + "reward_std": 0.5277487263083458, + "rewards/accuracy_reward": 0.5937500149011612, + "rewards/reasoning_steps_reward": 0.9531250298023224, + "rewards/repetition_penalty_reward": -0.07122688181698322, + "rewards/tag_count_reward": 0.9375000149011612, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 433.7916793823242, - "epoch": 0.11094452773613193, - "grad_norm": 2.113817065645351, - "kl": 0.7890625, - "learning_rate": 1.999503254520518e-05, - "loss": -0.1104, - "reward": 2.4119739532470703, - "reward_std": 0.7027525901794434, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.18524829670786858, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 204.64583587646484, + "epoch": 0.05549306336707911, + "grad_norm": 0.9668083665156728, + "kl": 0.22216796875, + "learning_rate": 1.1044776119402986e-05, + "loss": -0.017, + "reward": 2.486513674259186, + "reward_std": 0.5187593251466751, + "rewards/accuracy_reward": 0.7031250298023224, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.06166350655257702, + "rewards/tag_count_reward": 0.9283854514360428, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 304.2708435058594, - "epoch": 0.11394302848575712, - "grad_norm": 3.3062068674445064, - "kl": 2.06640625, - "learning_rate": 1.999116953795147e-05, - "loss": -0.1854, - "reward": 2.173910915851593, - "reward_std": 0.8623018711805344, - "rewards/accuracy_reward": 0.6614583507180214, - "rewards/reasoning_steps_reward": 0.845486119389534, - "rewards/repetition_penalty_reward": -0.18329406157135963, - "rewards/tag_count_reward": 0.8502604365348816, + "completion_length": 221.57812881469727, + "epoch": 0.05699287589051369, + "grad_norm": 0.9933532091220145, + "kl": 0.27978515625, + "learning_rate": 1.1343283582089553e-05, + "loss": 0.0533, + "reward": 2.315297782421112, + "reward_std": 0.467474602162838, + "rewards/accuracy_reward": 0.5156250111758709, + "rewards/reasoning_steps_reward": 0.9461806118488312, + "rewards/repetition_penalty_reward": -0.05796616990119219, + "rewards/tag_count_reward": 0.911458358168602, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 262.3020935058594, - "epoch": 0.11694152923538231, - "grad_norm": 2.487067537690022, - "kl": 1.455078125, - "learning_rate": 1.9986203545455205e-05, - "loss": -0.1053, - "reward": 2.481999635696411, - "reward_std": 0.6076074242591858, - "rewards/accuracy_reward": 0.8906250298023224, - "rewards/reasoning_steps_reward": 0.8750000447034836, - "rewards/repetition_penalty_reward": -0.23544833436608315, - "rewards/tag_count_reward": 0.9518229514360428, + "completion_length": 211.67708587646484, + "epoch": 0.05849268841394826, + "grad_norm": 1.1014497191190673, + "kl": 0.24658203125, + "learning_rate": 1.1641791044776121e-05, + "loss": -0.0132, + "reward": 2.4762988686561584, + "reward_std": 0.4729772359132767, + "rewards/accuracy_reward": 0.7343750298023224, + "rewards/reasoning_steps_reward": 0.871527835726738, + "rewards/repetition_penalty_reward": -0.07231233641505241, + "rewards/tag_count_reward": 0.9427083432674408, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 254.07813262939453, - "epoch": 0.1199400299850075, - "grad_norm": 12.184820883902836, - "kl": 3.974609375, - "learning_rate": 1.9980135115942135e-05, - "loss": -0.0478, - "reward": 2.3865994215011597, - "reward_std": 0.5614848285913467, - "rewards/accuracy_reward": 0.770833358168602, - "rewards/reasoning_steps_reward": 0.876736119389534, - "rewards/repetition_penalty_reward": -0.19586599990725517, - "rewards/tag_count_reward": 0.934895858168602, + "completion_length": 255.4895896911621, + "epoch": 0.05999250093738283, + "grad_norm": 0.876579700760602, + "kl": 0.27001953125, + "learning_rate": 1.1940298507462686e-05, + "loss": 0.05, + "reward": 2.4912882447242737, + "reward_std": 0.5008950978517532, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9253472834825516, + "rewards/repetition_penalty_reward": -0.07598620746284723, + "rewards/tag_count_reward": 0.9752604365348816, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 237.33334350585938, - "epoch": 0.12293853073463268, - "grad_norm": 6.144455968970083, - "kl": 3.984375, - "learning_rate": 1.9972964919342664e-05, - "loss": -0.0385, - "reward": 2.3252296447753906, - "reward_std": 0.7344638109207153, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.8819445073604584, - "rewards/repetition_penalty_reward": -0.1699962317943573, - "rewards/tag_count_reward": 0.8632812649011612, + "completion_length": 302.0885543823242, + "epoch": 0.0614923134608174, + "grad_norm": 1.5357965494048256, + "kl": 0.3916015625, + "learning_rate": 1.2238805970149255e-05, + "loss": 0.1043, + "reward": 2.368771195411682, + "reward_std": 0.578594297170639, + "rewards/accuracy_reward": 0.5833333507180214, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.08912823535501957, + "rewards/tag_count_reward": 0.9231770932674408, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 291.4791717529297, - "epoch": 0.12593703148425786, - "grad_norm": 4998.4510260355355, - "kl": 91.7841796875, - "learning_rate": 1.9964693747217873e-05, - "loss": 4.7855, - "reward": 2.487729072570801, - "reward_std": 0.477145679295063, - "rewards/accuracy_reward": 0.8697916865348816, - "rewards/reasoning_steps_reward": 0.9340278208255768, - "rewards/repetition_penalty_reward": -0.18067368865013123, - "rewards/tag_count_reward": 0.864583358168602, + "completion_length": 320.3645935058594, + "epoch": 0.06299212598425197, + "grad_norm": 1.015169753494364, + "kl": 0.4111328125, + "learning_rate": 1.2537313432835823e-05, + "loss": 0.0663, + "reward": 2.4001752734184265, + "reward_std": 0.6187782883644104, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.9392361640930176, + "rewards/repetition_penalty_reward": -0.10286298580467701, + "rewards/tag_count_reward": 0.9023437649011612, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 284.6145935058594, - "epoch": 0.12893553223388307, - "grad_norm": 5.6325713646688405, - "kl": 1.0654296875, - "learning_rate": 1.9955322512672162e-05, - "loss": -0.064, - "reward": 2.384717285633087, - "reward_std": 0.6374871581792831, - "rewards/accuracy_reward": 0.8385416865348816, - "rewards/reasoning_steps_reward": 0.901041716337204, - "rewards/repetition_penalty_reward": -0.1426265835762024, - "rewards/tag_count_reward": 0.7877604216337204, + "completion_length": 327.3854293823242, + "epoch": 0.06449193850768654, + "grad_norm": 1.6703329299260001, + "kl": 0.4345703125, + "learning_rate": 1.2835820895522388e-05, + "loss": 0.1761, + "reward": 2.34128737449646, + "reward_std": 0.6598222628235817, + "rewards/accuracy_reward": 0.5937500298023224, + "rewards/reasoning_steps_reward": 0.9392361044883728, + "rewards/repetition_penalty_reward": -0.09274039790034294, + "rewards/tag_count_reward": 0.9010416716337204, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 353.10939025878906, - "epoch": 0.13193403298350825, - "grad_norm": 1.0900138361408922, - "kl": 0.6494140625, - "learning_rate": 1.9944852250252416e-05, - "loss": -0.0128, - "reward": 2.404056489467621, - "reward_std": 0.5440548211336136, - "rewards/accuracy_reward": 0.7447916865348816, - "rewards/reasoning_steps_reward": 0.8906250298023224, - "rewards/repetition_penalty_reward": -0.13500603288412094, - "rewards/tag_count_reward": 0.9036458432674408, + "completion_length": 308.81250762939453, + "epoch": 0.06599175103112111, + "grad_norm": 1.8118029753094556, + "kl": 0.40234375, + "learning_rate": 1.3134328358208957e-05, + "loss": 0.2046, + "reward": 2.240744113922119, + "reward_std": 0.6155901998281479, + "rewards/accuracy_reward": 0.5520833432674408, + "rewards/reasoning_steps_reward": 0.8923611044883728, + "rewards/repetition_penalty_reward": -0.08260660991072655, + "rewards/tag_count_reward": 0.8789062649011612, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 435.9635543823242, - "epoch": 0.13493253373313344, - "grad_norm": 447.68957414171246, - "kl": 15.2900390625, - "learning_rate": 1.993328411583383e-05, - "loss": 1.0339, - "reward": 2.6905240416526794, - "reward_std": 0.36615417525172234, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9184028059244156, - "rewards/repetition_penalty_reward": -0.13022266328334808, - "rewards/tag_count_reward": 0.9648437649011612, + "completion_length": 364.7135467529297, + "epoch": 0.06749156355455568, + "grad_norm": 5.936213660745324, + "kl": 0.47021484375, + "learning_rate": 1.3432835820895525e-05, + "loss": 0.425, + "reward": 1.964487075805664, + "reward_std": 0.7735159993171692, + "rewards/accuracy_reward": 0.4583333507180214, + "rewards/reasoning_steps_reward": 0.8750000149011612, + "rewards/repetition_penalty_reward": -0.08629421889781952, + "rewards/tag_count_reward": 0.7174479365348816, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 583.7187805175781, - "epoch": 0.13793103448275862, - "grad_norm": 1.610843891213588, - "kl": 0.5166015625, - "learning_rate": 1.992061938649227e-05, - "loss": 0.0019, - "reward": 2.6286553740501404, - "reward_std": 0.3213835656642914, - "rewards/accuracy_reward": 0.770833358168602, - "rewards/reasoning_steps_reward": 0.9774305522441864, - "rewards/repetition_penalty_reward": -0.10788994282484055, - "rewards/tag_count_reward": 0.9882812649011612, + "completion_length": 377.81251525878906, + "epoch": 0.06899137607799025, + "grad_norm": 68.04443222745681, + "kl": 3.87890625, + "learning_rate": 1.373134328358209e-05, + "loss": 0.5361, + "reward": 1.82472363114357, + "reward_std": 0.8007150739431381, + "rewards/accuracy_reward": 0.4479166716337204, + "rewards/reasoning_steps_reward": 0.8715278059244156, + "rewards/repetition_penalty_reward": -0.07545002363622189, + "rewards/tag_count_reward": 0.5807291716337204, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 726.1250152587891, - "epoch": 0.1409295352323838, - "grad_norm": 7.039405614303865, - "kl": 0.6123046875, - "learning_rate": 1.9906859460363307e-05, - "loss": -0.0022, - "reward": 2.555482268333435, - "reward_std": 0.44716860353946686, - "rewards/accuracy_reward": 0.7187500074505806, - "rewards/reasoning_steps_reward": 0.9600694179534912, - "rewards/repetition_penalty_reward": -0.06474354676902294, - "rewards/tag_count_reward": 0.9414062649011612, + "completion_length": 373.68750762939453, + "epoch": 0.07049118860142482, + "grad_norm": 383.1306772268743, + "kl": 10.515625, + "learning_rate": 1.4029850746268658e-05, + "loss": 0.602, + "reward": 1.422120749950409, + "reward_std": 0.7384027689695358, + "rewards/accuracy_reward": 0.2135416716337204, + "rewards/reasoning_steps_reward": 0.777777835726738, + "rewards/repetition_penalty_reward": -0.049667539075016975, + "rewards/tag_count_reward": 0.4804687649011612, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 823.7031402587891, - "epoch": 0.14392803598200898, - "grad_norm": 7.8638528575359885, - "kl": 0.5986328125, - "learning_rate": 1.989200585648788e-05, - "loss": 0.0195, - "reward": 2.400337427854538, - "reward_std": 0.5809096917510033, - "rewards/accuracy_reward": 0.7031250223517418, - "rewards/reasoning_steps_reward": 0.9097222238779068, - "rewards/repetition_penalty_reward": -0.05625995807349682, - "rewards/tag_count_reward": 0.8437500149011612, + "completion_length": 383.4479293823242, + "epoch": 0.0719910011248594, + "grad_norm": 127.08044008975428, + "kl": 78.771484375, + "learning_rate": 1.4328358208955224e-05, + "loss": 0.5014, + "reward": 1.5611045956611633, + "reward_std": 0.8897948116064072, + "rewards/accuracy_reward": 0.3072916716337204, + "rewards/reasoning_steps_reward": 0.751736119389534, + "rewards/repetition_penalty_reward": -0.05781908147037029, + "rewards/tag_count_reward": 0.559895858168602, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 958.1198120117188, - "epoch": 0.1469265367316342, - "grad_norm": 3.4063309297902227, - "kl": 0.56640625, - "learning_rate": 1.9876060214644568e-05, - "loss": 0.0231, - "reward": 1.8466069400310516, - "reward_std": 0.7590629458427429, - "rewards/accuracy_reward": 0.432291679084301, - "rewards/reasoning_steps_reward": 0.8385417014360428, - "rewards/repetition_penalty_reward": -0.034903590101748705, - "rewards/tag_count_reward": 0.6106770932674408, + "completion_length": 343.1770935058594, + "epoch": 0.07349081364829396, + "grad_norm": 8.004059356580026, + "kl": 1.6875, + "learning_rate": 1.4626865671641792e-05, + "loss": 0.2767, + "reward": 1.384813278913498, + "reward_std": 0.8601708263158798, + "rewards/accuracy_reward": 0.2187500037252903, + "rewards/reasoning_steps_reward": 0.7118055671453476, + "rewards/repetition_penalty_reward": -0.06787777040153742, + "rewards/tag_count_reward": 0.5221354439854622, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 996.1250305175781, - "epoch": 0.14992503748125938, - "grad_norm": 3.75487802054811, - "kl": 0.9189453125, - "learning_rate": 1.9859024295168593e-05, - "loss": 0.0443, - "reward": 1.636008232831955, - "reward_std": 0.7502989023923874, - "rewards/accuracy_reward": 0.3906250149011612, - "rewards/reasoning_steps_reward": 0.788194477558136, - "rewards/repetition_penalty_reward": -0.0336966784670949, - "rewards/tag_count_reward": 0.4908854439854622, + "completion_length": 351.7135543823242, + "epoch": 0.07499062617172854, + "grad_norm": 1.7830772174928224, + "kl": 1.0224609375, + "learning_rate": 1.492537313432836e-05, + "loss": 0.1119, + "reward": 1.2045287191867828, + "reward_std": 0.8166554719209671, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/reasoning_steps_reward": 0.6302083432674408, + "rewards/repetition_penalty_reward": -0.06630465760827065, + "rewards/tag_count_reward": 0.4739583432674408, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 1002.0833435058594, - "epoch": 0.15292353823088456, - "grad_norm": 26.36570320352729, - "kl": 2.140625, - "learning_rate": 1.9840899978757483e-05, - "loss": 0.0805, - "reward": 1.6898488700389862, - "reward_std": 0.7471358329057693, - "rewards/accuracy_reward": 0.3489583432674408, - "rewards/reasoning_steps_reward": 0.8315972834825516, - "rewards/repetition_penalty_reward": -0.046696340665221214, - "rewards/tag_count_reward": 0.5559896007180214, + "completion_length": 410.0677185058594, + "epoch": 0.0764904386951631, + "grad_norm": 1.780458421762445, + "kl": 1.1845703125, + "learning_rate": 1.5223880597014925e-05, + "loss": -0.0193, + "reward": 1.3819158375263214, + "reward_std": 0.8455617725849152, + "rewards/accuracy_reward": 0.2708333395421505, + "rewards/reasoning_steps_reward": 0.7777777910232544, + "rewards/repetition_penalty_reward": -0.13153912127017975, + "rewards/tag_count_reward": 0.4648437649011612, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 990.2968902587891, - "epoch": 0.15592203898050974, - "grad_norm": 29.24774607573346, - "kl": 6.390625, - "learning_rate": 1.9821689266263425e-05, - "loss": 0.1745, - "reward": 1.9025542736053467, - "reward_std": 0.8440426588058472, - "rewards/accuracy_reward": 0.5052083507180214, - "rewards/reasoning_steps_reward": 0.7934027910232544, - "rewards/repetition_penalty_reward": -0.06272357050329447, - "rewards/tag_count_reward": 0.6666666716337204, + "completion_length": 583.6302185058594, + "epoch": 0.07799025121859768, + "grad_norm": 0.8595799417367336, + "kl": 0.94970703125, + "learning_rate": 1.5522388059701494e-05, + "loss": -0.1912, + "reward": 1.392962396144867, + "reward_std": 0.8328078836202621, + "rewards/accuracy_reward": 0.31250000558793545, + "rewards/reasoning_steps_reward": 0.8055555820465088, + "rewards/repetition_penalty_reward": -0.19254108518362045, + "rewards/tag_count_reward": 0.4674479365348816, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 564.7864761352539, - "epoch": 0.15892053973013492, - "grad_norm": 21.634969699176207, - "kl": 1.97265625, - "learning_rate": 1.980139427847242e-05, - "loss": 0.6096, - "reward": 2.039624661207199, - "reward_std": 0.7389360666275024, - "rewards/accuracy_reward": 0.5468750074505806, - "rewards/reasoning_steps_reward": 0.8038195073604584, - "rewards/repetition_penalty_reward": -0.05976778268814087, - "rewards/tag_count_reward": 0.7486979365348816, + "completion_length": 761.7812805175781, + "epoch": 0.07949006374203224, + "grad_norm": 1.9426702160291227, + "kl": 0.73828125, + "learning_rate": 1.582089552238806e-05, + "loss": -0.0632, + "reward": 1.2566802203655243, + "reward_std": 0.7984166294336319, + "rewards/accuracy_reward": 0.2864583395421505, + "rewards/reasoning_steps_reward": 0.8125000149011612, + "rewards/repetition_penalty_reward": -0.2914969325065613, + "rewards/tag_count_reward": 0.4492187574505806, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 626.5000152587891, - "epoch": 0.1619190404797601, - "grad_norm": 20.599440681644708, - "kl": 1.03125, - "learning_rate": 1.9780017255870114e-05, - "loss": 0.5382, - "reward": 1.7361580431461334, - "reward_std": 0.7883689254522324, - "rewards/accuracy_reward": 0.3593750074505806, - "rewards/reasoning_steps_reward": 0.7899305671453476, - "rewards/repetition_penalty_reward": -0.047262136824429035, - "rewards/tag_count_reward": 0.634114608168602, + "completion_length": 769.3333587646484, + "epoch": 0.08098987626546682, + "grad_norm": 0.606126420750735, + "kl": 0.58154296875, + "learning_rate": 1.6119402985074627e-05, + "loss": -0.0516, + "reward": 1.2068730890750885, + "reward_std": 0.5995621234178543, + "rewards/accuracy_reward": 0.17708333861082792, + "rewards/reasoning_steps_reward": 0.8819444924592972, + "rewards/repetition_penalty_reward": -0.34304021298885345, + "rewards/tag_count_reward": 0.490885429084301, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 813.6041717529297, - "epoch": 0.16491754122938532, - "grad_norm": 14.859517097725304, - "kl": 2.419921875, - "learning_rate": 1.9757560558394493e-05, - "loss": 0.3132, - "reward": 1.3911511600017548, - "reward_std": 0.7518036961555481, - "rewards/accuracy_reward": 0.2187500074505806, - "rewards/reasoning_steps_reward": 0.6961806118488312, - "rewards/repetition_penalty_reward": -0.026383677031844854, - "rewards/tag_count_reward": 0.5026041865348816, + "completion_length": 632.9479370117188, + "epoch": 0.0824896887889014, + "grad_norm": 141.4492030747302, + "kl": 1.0986328125, + "learning_rate": 1.6417910447761197e-05, + "loss": -0.0583, + "reward": 1.5605631172657013, + "reward_std": 0.6802570223808289, + "rewards/accuracy_reward": 0.4010416716337204, + "rewards/reasoning_steps_reward": 0.88368059694767, + "rewards/repetition_penalty_reward": -0.2957737147808075, + "rewards/tag_count_reward": 0.5716145932674408, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 910.8177337646484, - "epoch": 0.1679160419790105, - "grad_norm": 6.499919813214984, - "kl": 0.9853515625, - "learning_rate": 1.9734026665175335e-05, - "loss": 0.0658, - "reward": 1.1412139385938644, - "reward_std": 0.6940838098526001, - "rewards/accuracy_reward": 0.1197916679084301, - "rewards/reasoning_steps_reward": 0.6545139253139496, - "rewards/repetition_penalty_reward": -0.0172062402125448, - "rewards/tag_count_reward": 0.3841145858168602, + "completion_length": 510.9791793823242, + "epoch": 0.08398950131233596, + "grad_norm": 0.9471168937555826, + "kl": 0.603515625, + "learning_rate": 1.671641791044776e-05, + "loss": -0.0603, + "reward": 1.6753197610378265, + "reward_std": 0.5637442171573639, + "rewards/accuracy_reward": 0.3958333507180214, + "rewards/reasoning_steps_reward": 0.9253472238779068, + "rewards/repetition_penalty_reward": -0.22789209336042404, + "rewards/tag_count_reward": 0.5820312798023224, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 949.7083587646484, - "epoch": 0.17091454272863568, - "grad_norm": 1.0108146959506905, - "kl": 0.8154296875, - "learning_rate": 1.9709418174260523e-05, - "loss": 0.0444, - "reward": 1.0093085765838623, - "reward_std": 0.6726640909910202, - "rewards/accuracy_reward": 0.07812500186264515, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.014997066929936409, - "rewards/tag_count_reward": 0.3489583432674408, + "completion_length": 433.4479293823242, + "epoch": 0.08548931383577053, + "grad_norm": 12471.246658659455, + "kl": 123.4453125, + "learning_rate": 1.701492537313433e-05, + "loss": 7.2604, + "reward": 1.5392451286315918, + "reward_std": 0.656680166721344, + "rewards/accuracy_reward": 0.3645833469927311, + "rewards/reasoning_steps_reward": 0.84375, + "rewards/repetition_penalty_reward": -0.2185674048960209, + "rewards/tag_count_reward": 0.5494791865348816, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 923.5104370117188, - "epoch": 0.17391304347826086, - "grad_norm": 2.2827458536380156, - "kl": 0.9248046875, - "learning_rate": 1.9683737802329242e-05, - "loss": 0.0701, - "reward": 1.2827005088329315, - "reward_std": 0.6604696810245514, - "rewards/accuracy_reward": 0.11458333721384406, - "rewards/reasoning_steps_reward": 0.7690972834825516, - "rewards/repetition_penalty_reward": -0.021553035592660308, - "rewards/tag_count_reward": 0.4205729216337204, + "completion_length": 334.6198043823242, + "epoch": 0.0869891263592051, + "grad_norm": 169.80984697224983, + "kl": 123.8564453125, + "learning_rate": 1.7313432835820894e-05, + "loss": 0.1593, + "reward": 1.3166283071041107, + "reward_std": 0.6126263588666916, + "rewards/accuracy_reward": 0.1979166716337204, + "rewards/reasoning_steps_reward": 0.819444477558136, + "rewards/repetition_penalty_reward": -0.1720869541168213, + "rewards/tag_count_reward": 0.471354179084301, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 832.3229370117188, - "epoch": 0.17691154422788605, - "grad_norm": 11.547641471792147, - "kl": 1.482421875, - "learning_rate": 1.9656988384392075e-05, - "loss": 0.2098, - "reward": 1.5504461526870728, - "reward_std": 0.5935858860611916, - "rewards/accuracy_reward": 0.16666667256504297, - "rewards/reasoning_steps_reward": 0.855902835726738, - "rewards/repetition_penalty_reward": -0.029415032360702753, - "rewards/tag_count_reward": 0.5572916865348816, + "completion_length": 297.93750762939453, + "epoch": 0.08848893888263967, + "grad_norm": 9.047081897906422, + "kl": 2.00390625, + "learning_rate": 1.7611940298507464e-05, + "loss": -0.1778, + "reward": 1.1133750975131989, + "reward_std": 0.5235892608761787, + "rewards/accuracy_reward": 0.06770833488553762, + "rewards/reasoning_steps_reward": 0.8072916865348816, + "rewards/repetition_penalty_reward": -0.13141660019755363, + "rewards/tag_count_reward": 0.369791679084301, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 811.9479217529297, - "epoch": 0.17991004497751126, - "grad_norm": 7.13025211784621, - "kl": 2.056640625, - "learning_rate": 1.9629172873477995e-05, - "loss": 0.1859, - "reward": 1.8182637393474579, - "reward_std": 0.7368517369031906, - "rewards/accuracy_reward": 0.3593750111758709, - "rewards/reasoning_steps_reward": 0.8611111044883728, - "rewards/repetition_penalty_reward": -0.0441494369879365, - "rewards/tag_count_reward": 0.641927108168602, + "completion_length": 355.6614761352539, + "epoch": 0.08998875140607424, + "grad_norm": 8.682581929106801, + "kl": 1.0380859375, + "learning_rate": 1.791044776119403e-05, + "loss": -0.1428, + "reward": 1.3278673589229584, + "reward_std": 0.5868176072835922, + "rewards/accuracy_reward": 0.2031250074505806, + "rewards/reasoning_steps_reward": 0.824652835726738, + "rewards/repetition_penalty_reward": -0.19079595804214478, + "rewards/tag_count_reward": 0.490885429084301, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 754.9479370117188, - "epoch": 0.18290854572713644, - "grad_norm": 8.766884713406547, - "kl": 1.634765625, - "learning_rate": 1.96002943403084e-05, - "loss": 0.2092, - "reward": 2.166029632091522, - "reward_std": 0.7226460427045822, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9079861789941788, - "rewards/repetition_penalty_reward": -0.06487324088811874, - "rewards/tag_count_reward": 0.6770833730697632, + "completion_length": 389.0104217529297, + "epoch": 0.09148856392950881, + "grad_norm": 2.638836671205147, + "kl": 0.8349609375, + "learning_rate": 1.8208955223880598e-05, + "loss": -0.0578, + "reward": 1.3582229912281036, + "reward_std": 0.5021841153502464, + "rewards/accuracy_reward": 0.2187500074505806, + "rewards/reasoning_steps_reward": 0.862847238779068, + "rewards/repetition_penalty_reward": -0.26764511317014694, + "rewards/tag_count_reward": 0.5442708432674408, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 641.5833587646484, - "epoch": 0.18590704647676162, - "grad_norm": 5.967296694311895, - "kl": 1.23828125, - "learning_rate": 1.9570355972958098e-05, - "loss": 0.2609, - "reward": 1.9569042026996613, - "reward_std": 0.6188908591866493, - "rewards/accuracy_reward": 0.4114583432674408, - "rewards/reasoning_steps_reward": 0.9444444626569748, - "rewards/repetition_penalty_reward": -0.07347787916660309, - "rewards/tag_count_reward": 0.6744792014360428, + "completion_length": 338.09376525878906, + "epoch": 0.09298837645294339, + "grad_norm": 3.079165996915571, + "kl": 0.822265625, + "learning_rate": 1.8507462686567165e-05, + "loss": -0.0992, + "reward": 1.3493062555789948, + "reward_std": 0.4447471499443054, + "rewards/accuracy_reward": 0.15104167303070426, + "rewards/reasoning_steps_reward": 0.8923611491918564, + "rewards/repetition_penalty_reward": -0.22144031897187233, + "rewards/tag_count_reward": 0.5273437649011612, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 655.6041870117188, - "epoch": 0.1889055472263868, - "grad_norm": 8.21428152429746, - "kl": 1.9296875, - "learning_rate": 1.953936107650336e-05, - "loss": 0.2559, - "reward": 1.7195740938186646, - "reward_std": 0.5279941856861115, - "rewards/accuracy_reward": 0.2187500037252903, - "rewards/reasoning_steps_reward": 0.9461805820465088, - "rewards/repetition_penalty_reward": -0.10290857404470444, - "rewards/tag_count_reward": 0.657552108168602, + "completion_length": 354.5, + "epoch": 0.09448818897637795, + "grad_norm": 4.4024040515363145, + "kl": 0.748046875, + "learning_rate": 1.8805970149253735e-05, + "loss": -0.0358, + "reward": 1.5263647735118866, + "reward_std": 0.6498845219612122, + "rewards/accuracy_reward": 0.3229166753590107, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.2323157861828804, + "rewards/tag_count_reward": 0.5468750149011612, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 768.5729217529297, - "epoch": 0.191904047976012, - "grad_norm": 4.853080558533273, - "kl": 3.99609375, - "learning_rate": 1.9507313072657057e-05, - "loss": 0.2929, - "reward": 1.5277346670627594, - "reward_std": 0.40065091848373413, - "rewards/accuracy_reward": 0.0677083358168602, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.15542514994740486, - "rewards/tag_count_reward": 0.6640625149011612, + "completion_length": 343.4166717529297, + "epoch": 0.09598800149981253, + "grad_norm": 14.853099160994677, + "kl": 0.8349609375, + "learning_rate": 1.9104477611940298e-05, + "loss": -0.1063, + "reward": 1.2628207504749298, + "reward_std": 0.58592738956213, + "rewards/accuracy_reward": 0.17187500651925802, + "rewards/reasoning_steps_reward": 0.8315972536802292, + "rewards/repetition_penalty_reward": -0.20809948071837425, + "rewards/tag_count_reward": 0.4674479216337204, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 745.9583587646484, - "epoch": 0.19490254872563717, - "grad_norm": 2.314098183884576, - "kl": 3.66015625, - "learning_rate": 1.9474215499390912e-05, - "loss": 0.3051, - "reward": 1.5480964481830597, - "reward_std": 0.35690316557884216, - "rewards/accuracy_reward": 0.015625000465661287, - "rewards/reasoning_steps_reward": 0.9253472238779068, - "rewards/repetition_penalty_reward": -0.1090216189622879, - "rewards/tag_count_reward": 0.716145858168602, + "completion_length": 316.5364685058594, + "epoch": 0.09748781402324709, + "grad_norm": 2.3731241908019913, + "kl": 0.974609375, + "learning_rate": 1.9402985074626868e-05, + "loss": -0.1689, + "reward": 1.2171970903873444, + "reward_std": 0.6599665582180023, + "rewards/accuracy_reward": 0.18750000931322575, + "rewards/reasoning_steps_reward": 0.8020833730697632, + "rewards/repetition_penalty_reward": -0.21509456261992455, + "rewards/tag_count_reward": 0.4427083432674408, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 594.9427261352539, - "epoch": 0.19790104947526238, - "grad_norm": 2.348236466880056, - "kl": 2.451171875, - "learning_rate": 1.944007201054492e-05, - "loss": 0.3924, - "reward": 1.5447391867637634, - "reward_std": 0.509847991168499, - "rewards/accuracy_reward": 0.03645833441987634, - "rewards/reasoning_steps_reward": 0.8750000149011612, - "rewards/repetition_penalty_reward": -0.10630252212285995, - "rewards/tag_count_reward": 0.739583358168602, + "completion_length": 365.17708587646484, + "epoch": 0.09898762654668167, + "grad_norm": 1.1098635397425487, + "kl": 0.8447265625, + "learning_rate": 1.9701492537313435e-05, + "loss": -0.0102, + "reward": 1.2291882634162903, + "reward_std": 0.40925049781799316, + "rewards/accuracy_reward": 0.06770833395421505, + "rewards/reasoning_steps_reward": 0.942708358168602, + "rewards/repetition_penalty_reward": -0.2408638447523117, + "rewards/tag_count_reward": 0.459635429084301, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 620.7760620117188, - "epoch": 0.20089955022488756, - "grad_norm": 0.6607962588027059, - "kl": 4.9921875, - "learning_rate": 1.9404886375423982e-05, - "loss": 0.3226, - "reward": 1.5875399112701416, - "reward_std": 0.30489787086844444, - "rewards/accuracy_reward": 0.015625000465661287, - "rewards/reasoning_steps_reward": 0.9479166865348816, - "rewards/repetition_penalty_reward": -0.11298095621168613, - "rewards/tag_count_reward": 0.7369792014360428, + "completion_length": 352.6770935058594, + "epoch": 0.10048743907011623, + "grad_norm": 0.9724287484197509, + "kl": 0.814453125, + "learning_rate": 2e-05, + "loss": -0.068, + "reward": 1.5917953252792358, + "reward_std": 0.6212150007486343, + "rewards/accuracy_reward": 0.3645833469927311, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.24153802171349525, + "rewards/tag_count_reward": 0.5104166865348816, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 544.3229293823242, - "epoch": 0.20389805097451275, - "grad_norm": 1.2843597560384168, - "kl": 4.6953125, - "learning_rate": 1.93686624783818e-05, - "loss": 0.2727, - "reward": 1.6841119825839996, - "reward_std": 0.3194814845919609, - "rewards/accuracy_reward": 0.03125000046566129, - "rewards/reasoning_steps_reward": 0.9565972238779068, - "rewards/repetition_penalty_reward": -0.11493314802646637, - "rewards/tag_count_reward": 0.8111979514360428, + "completion_length": 379.45314025878906, + "epoch": 0.1019872515935508, + "grad_norm": 0.8395794709698925, + "kl": 0.4814453125, + "learning_rate": 1.9999862464405377e-05, + "loss": -0.0332, + "reward": 1.6506008505821228, + "reward_std": 0.4853805750608444, + "rewards/accuracy_reward": 0.3489583358168602, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.29254162311553955, + "rewards/tag_count_reward": 0.6080729365348816, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 547.8958435058594, - "epoch": 0.20689655172413793, - "grad_norm": 0.709620329405288, - "kl": 4.9921875, - "learning_rate": 1.9331404318392028e-05, - "loss": 0.2179, - "reward": 1.8003253638744354, - "reward_std": 0.37343159317970276, - "rewards/accuracy_reward": 0.08854166837409139, - "rewards/reasoning_steps_reward": 0.9565972685813904, - "rewards/repetition_penalty_reward": -0.1015844214707613, - "rewards/tag_count_reward": 0.856770858168602, + "completion_length": 406.4166793823242, + "epoch": 0.10348706411698538, + "grad_norm": 1.5362392749572744, + "kl": 0.56005859375, + "learning_rate": 1.9999449861404716e-05, + "loss": -0.0482, + "reward": 1.6696705520153046, + "reward_std": 0.5340171456336975, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.3342357203364372, + "rewards/tag_count_reward": 0.602864608168602, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 495.00000762939453, - "epoch": 0.2098950524737631, - "grad_norm": 0.7619813319396578, - "kl": 4.671875, - "learning_rate": 1.9293116008606838e-05, - "loss": 0.188, - "reward": 1.8160299956798553, - "reward_std": 0.27780742943286896, - "rewards/accuracy_reward": 0.046875000931322575, - "rewards/reasoning_steps_reward": 0.9739583432674408, - "rewards/repetition_penalty_reward": -0.1188659518957138, - "rewards/tag_count_reward": 0.9140625298023224, + "completion_length": 373.52083587646484, + "epoch": 0.10498687664041995, + "grad_norm": 1.5205558305374207, + "kl": 0.63134765625, + "learning_rate": 1.999876220234753e-05, + "loss": -0.0912, + "reward": 1.6751901507377625, + "reward_std": 0.4232308343052864, + "rewards/accuracy_reward": 0.36458334140479565, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.26100781559944153, + "rewards/tag_count_reward": 0.5872395932674408, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 646.7083511352539, - "epoch": 0.2128935532233883, - "grad_norm": 0.7272889048910283, - "kl": 5.125, - "learning_rate": 1.925380177590282e-05, - "loss": 0.212, - "reward": 1.8391913771629333, - "reward_std": 0.2926352843642235, - "rewards/accuracy_reward": 0.03125000046566129, - "rewards/reasoning_steps_reward": 0.9635417014360428, - "rewards/repetition_penalty_reward": -0.09830864518880844, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 632.9062805175781, + "epoch": 0.10648668916385452, + "grad_norm": 0.41857006484500325, + "kl": 0.51123046875, + "learning_rate": 1.999779950614934e-05, + "loss": -0.0632, + "reward": 1.9554068744182587, + "reward_std": 0.36940491758286953, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.3766243904829025, + "rewards/tag_count_reward": 0.696614608168602, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 691.5677108764648, - "epoch": 0.2158920539730135, - "grad_norm": 0.5588946822791705, - "kl": 5.1640625, - "learning_rate": 1.921346596041437e-05, - "loss": 0.2451, - "reward": 1.953030288219452, - "reward_std": 0.3250604011118412, - "rewards/accuracy_reward": 0.09895833348855376, - "rewards/reasoning_steps_reward": 0.9583333879709244, - "rewards/repetition_penalty_reward": -0.061292664147913456, - "rewards/tag_count_reward": 0.9570312798023224, + "completion_length": 740.0573120117188, + "epoch": 0.10798650168728909, + "grad_norm": 0.9159052896835216, + "kl": 1.2783203125, + "learning_rate": 1.999656179929115e-05, + "loss": -0.0142, + "reward": 1.6608175337314606, + "reward_std": 0.33733372390270233, + "rewards/accuracy_reward": 0.338541679084301, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.3639221265912056, + "rewards/tag_count_reward": 0.696614608168602, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 690.0364837646484, - "epoch": 0.21889055472263869, - "grad_norm": 0.5154663707678571, - "kl": 4.48828125, - "learning_rate": 1.917211301505453e-05, - "loss": 0.2857, - "reward": 1.984506070613861, - "reward_std": 0.4579034373164177, - "rewards/accuracy_reward": 0.2083333395421505, - "rewards/reasoning_steps_reward": 0.8940972983837128, - "rewards/repetition_penalty_reward": -0.043705823831260204, - "rewards/tag_count_reward": 0.9257812798023224, + "completion_length": 640.7708435058594, + "epoch": 0.10948631421072366, + "grad_norm": 0.6371368439400827, + "kl": 0.52587890625, + "learning_rate": 1.9995049115818706e-05, + "loss": -0.0003, + "reward": 2.032588928937912, + "reward_std": 0.3413342162966728, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.2942340075969696, + "rewards/tag_count_reward": 0.7330729365348816, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 421.1666793823242, - "epoch": 0.22188905547226387, - "grad_norm": 0.7720729416463754, - "kl": 2.33984375, - "learning_rate": 1.9129747505023438e-05, - "loss": 0.2752, - "reward": 2.0761736631393433, - "reward_std": 0.5404656231403351, - "rewards/accuracy_reward": 0.27604167349636555, - "rewards/reasoning_steps_reward": 0.8906250894069672, - "rewards/repetition_penalty_reward": -0.041013902984559536, - "rewards/tag_count_reward": 0.950520858168602, + "completion_length": 540.7708435058594, + "epoch": 0.11098612673415822, + "grad_norm": 0.48623664555771756, + "kl": 0.60498046875, + "learning_rate": 1.9993261497341575e-05, + "loss": -0.0128, + "reward": 2.286080002784729, + "reward_std": 0.3919960707426071, + "rewards/accuracy_reward": 0.8020833730697632, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.2516804449260235, + "rewards/tag_count_reward": 0.7408854365348816, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 286.5260429382324, - "epoch": 0.22488755622188905, - "grad_norm": 0.8315806740092944, - "kl": 1.33984375, - "learning_rate": 1.9086374107304312e-05, - "loss": 0.2576, - "reward": 2.031211197376251, - "reward_std": 0.42878295481204987, - "rewards/accuracy_reward": 0.18229167303070426, - "rewards/reasoning_steps_reward": 0.9322916269302368, - "rewards/repetition_penalty_reward": -0.03389316704124212, - "rewards/tag_count_reward": 0.950520858168602, + "completion_length": 480.89064025878906, + "epoch": 0.1124859392575928, + "grad_norm": 0.596999402158007, + "kl": 0.3017578125, + "learning_rate": 1.9991198993031992e-05, + "loss": -0.0083, + "reward": 2.0890829265117645, + "reward_std": 0.5263374149799347, + "rewards/accuracy_reward": 0.5781250074505806, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.20649003237485886, + "rewards/tag_count_reward": 0.7330729216337204, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 235.10938262939453, - "epoch": 0.22788605697151423, - "grad_norm": 0.9080987838709209, - "kl": 0.7822265625, - "learning_rate": 1.9041997610147166e-05, - "loss": 0.1866, - "reward": 2.127915918827057, - "reward_std": 0.4003848433494568, - "rewards/accuracy_reward": 0.244791679084301, - "rewards/reasoning_steps_reward": 0.9739583879709244, - "rewards/repetition_penalty_reward": -0.0491675129160285, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 388.0833511352539, + "epoch": 0.11398575178102738, + "grad_norm": 7.4143713553392665, + "kl": 1.03271484375, + "learning_rate": 1.99888616596235e-05, + "loss": -0.019, + "reward": 2.0290270149707794, + "reward_std": 0.5157302021980286, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.16845569387078285, + "rewards/tag_count_reward": 0.7096354365348816, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 257.5625114440918, - "epoch": 0.23088455772113944, - "grad_norm": 1.013137501122327, - "kl": 0.517578125, - "learning_rate": 1.8996622912540182e-05, - "loss": 0.1469, - "reward": 2.171510338783264, - "reward_std": 0.43282945454120636, - "rewards/accuracy_reward": 0.260416679084301, - "rewards/reasoning_steps_reward": 0.9947916567325592, - "rewards/repetition_penalty_reward": -0.048541837371885777, - "rewards/tag_count_reward": 0.9648437798023224, + "completion_length": 372.8698043823242, + "epoch": 0.11548556430446194, + "grad_norm": 1.6227089625901896, + "kl": 0.443359375, + "learning_rate": 1.9986249561409415e-05, + "loss": -0.1403, + "reward": 2.0930177569389343, + "reward_std": 0.5763500481843948, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.14135734736919403, + "rewards/tag_count_reward": 0.6666666716337204, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 375.3698043823242, - "epoch": 0.23388305847076463, - "grad_norm": 0.8031277081942563, - "kl": 0.859375, - "learning_rate": 1.8950255023668876e-05, - "loss": 0.4293, - "reward": 2.1424886882305145, - "reward_std": 0.5950686857104301, - "rewards/accuracy_reward": 0.354166679084301, - "rewards/reasoning_steps_reward": 0.9635417014360428, - "rewards/repetition_penalty_reward": -0.046313464641571045, - "rewards/tag_count_reward": 0.8710937649011612, + "completion_length": 461.5208435058594, + "epoch": 0.11698537682789652, + "grad_norm": 0.510885474337652, + "kl": 0.268310546875, + "learning_rate": 1.998336277024103e-05, + "loss": -0.0721, + "reward": 2.0055829286575317, + "reward_std": 0.4140588045120239, + "rewards/accuracy_reward": 0.5260416828095913, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.22531990334391594, + "rewards/tag_count_reward": 0.7239583432674408, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 491.32814025878906, - "epoch": 0.2368815592203898, - "grad_norm": 1.7806633395846703, - "kl": 0.951171875, - "learning_rate": 1.8902899062363142e-05, - "loss": 0.6398, - "reward": 1.9999366104602814, - "reward_std": 0.7513662576675415, - "rewards/accuracy_reward": 0.3437500074505806, - "rewards/reasoning_steps_reward": 0.9409722983837128, - "rewards/repetition_penalty_reward": -0.032181489281356335, - "rewards/tag_count_reward": 0.747395858168602, + "completion_length": 501.5573043823242, + "epoch": 0.11848518935133108, + "grad_norm": 0.47404049136724163, + "kl": 0.241943359375, + "learning_rate": 1.998020136552566e-05, + "loss": 0.001, + "reward": 2.024559497833252, + "reward_std": 0.4507194012403488, + "rewards/accuracy_reward": 0.5572916716337204, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.2545204684138298, + "rewards/tag_count_reward": 0.7408854365348816, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 629.9583587646484, - "epoch": 0.239880059970015, - "grad_norm": 0.894829793618965, - "kl": 0.8115234375, - "learning_rate": 1.8854560256532098e-05, - "loss": 0.5427, - "reward": 1.8730204403400421, - "reward_std": 0.8077119290828705, - "rewards/accuracy_reward": 0.354166679084301, - "rewards/reasoning_steps_reward": 0.9253472834825516, - "rewards/repetition_penalty_reward": -0.02628516126424074, - "rewards/tag_count_reward": 0.6197916865348816, + "completion_length": 470.04688262939453, + "epoch": 0.11998500187476566, + "grad_norm": 0.6494805682984701, + "kl": 0.2578125, + "learning_rate": 1.9976765434224426e-05, + "loss": 0.0393, + "reward": 2.306118667125702, + "reward_std": 0.3345252051949501, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9826389253139496, + "rewards/repetition_penalty_reward": -0.2064681462943554, + "rewards/tag_count_reward": 0.7486979365348816, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 446.9895935058594, - "epoch": 0.24287856071964017, - "grad_norm": 1.5621375006184983, - "kl": 0.9130859375, - "learning_rate": 1.8805243942587e-05, - "loss": 0.7422, - "reward": 2.229804575443268, - "reward_std": 0.8122627884149551, - "rewards/accuracy_reward": 0.5520833358168602, - "rewards/reasoning_steps_reward": 0.9565972536802292, - "rewards/repetition_penalty_reward": -0.04970931261777878, - "rewards/tag_count_reward": 0.7708333432674408, + "completion_length": 371.0416717529297, + "epoch": 0.12148481439820022, + "grad_norm": 0.6158896116207796, + "kl": 0.27783203125, + "learning_rate": 1.9973055070849912e-05, + "loss": -0.0077, + "reward": 2.2738314270973206, + "reward_std": 0.3718957081437111, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9652778059244156, + "rewards/repetition_penalty_reward": -0.1354569736868143, + "rewards/tag_count_reward": 0.7513020932674408, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 326.78126525878906, - "epoch": 0.24587706146926536, - "grad_norm": 0.734038354451238, - "kl": 0.5283203125, - "learning_rate": 1.8754955564852082e-05, - "loss": 0.4824, - "reward": 2.3457452058792114, - "reward_std": 0.41162994503974915, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9930555671453476, - "rewards/repetition_penalty_reward": -0.08611250855028629, - "rewards/tag_count_reward": 0.8971354216337204, + "completion_length": 331.3958435058594, + "epoch": 0.1229846269216348, + "grad_norm": 0.681584270450499, + "kl": 0.27490234375, + "learning_rate": 1.996907037746352e-05, + "loss": -0.0158, + "reward": 2.2777538299560547, + "reward_std": 0.45179731398820877, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9565972685813904, + "rewards/repetition_penalty_reward": -0.07597886584699154, + "rewards/tag_count_reward": 0.7981770932674408, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 268.6093864440918, - "epoch": 0.24887556221889057, - "grad_norm": 0.7617786864490775, - "kl": 0.4599609375, - "learning_rate": 1.870370067496355e-05, - "loss": 0.4915, - "reward": 2.696290135383606, - "reward_std": 0.42367345839738846, - "rewards/accuracy_reward": 0.8697916716337204, - "rewards/reasoning_steps_reward": 0.9756944477558136, - "rewards/repetition_penalty_reward": -0.09581072442233562, - "rewards/tag_count_reward": 0.9466145932674408, + "completion_length": 326.64063262939453, + "epoch": 0.12448443944506937, + "grad_norm": 0.6980728390227965, + "kl": 0.236083984375, + "learning_rate": 1.9964811463672685e-05, + "loss": -0.0154, + "reward": 2.332102954387665, + "reward_std": 0.472077339887619, + "rewards/accuracy_reward": 0.5468750149011612, + "rewards/reasoning_steps_reward": 0.9427083730697632, + "rewards/repetition_penalty_reward": -0.07024090643972158, + "rewards/tag_count_reward": 0.9127604365348816, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 251.5833396911621, - "epoch": 0.2518740629685157, - "grad_norm": 0.8354783555661154, - "kl": 0.4912109375, - "learning_rate": 1.8651484931256685e-05, - "loss": 0.3193, - "reward": 2.5308874249458313, - "reward_std": 0.34281000867486, - "rewards/accuracy_reward": 0.6770833432674408, - "rewards/reasoning_steps_reward": 0.9843750298023224, - "rewards/repetition_penalty_reward": -0.09281065501272678, - "rewards/tag_count_reward": 0.9622395932674408, + "completion_length": 345.2291793823242, + "epoch": 0.12598425196850394, + "grad_norm": 0.6918666028590195, + "kl": 0.29150390625, + "learning_rate": 1.996027844662785e-05, + "loss": -0.0238, + "reward": 2.347783863544464, + "reward_std": 0.497940331697464, + "rewards/accuracy_reward": 0.479166679084301, + "rewards/reasoning_steps_reward": 0.9878472536802292, + "rewards/repetition_penalty_reward": -0.06454263348132372, + "rewards/tag_count_reward": 0.9453125149011612, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 217.48438262939453, - "epoch": 0.25487256371814093, - "grad_norm": 0.8219462123752692, - "kl": 0.4619140625, - "learning_rate": 1.8598314098141208e-05, - "loss": 0.3273, - "reward": 2.6751975417137146, - "reward_std": 0.2594960853457451, - "rewards/accuracy_reward": 0.8072916865348816, - "rewards/reasoning_steps_reward": 0.970486119389534, - "rewards/repetition_penalty_reward": -0.0817470382899046, - "rewards/tag_count_reward": 0.9791667014360428, + "completion_length": 352.5729293823242, + "epoch": 0.1274840644919385, + "grad_norm": 0.662339973250043, + "kl": 0.207763671875, + "learning_rate": 1.9955471451019264e-05, + "loss": 0.0105, + "reward": 2.5184255242347717, + "reward_std": 0.423542745411396, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.06534186284989119, + "rewards/tag_count_reward": 0.9622395932674408, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 238.35937881469727, - "epoch": 0.25787106446776614, - "grad_norm": 0.8013857932799926, - "kl": 0.5302734375, - "learning_rate": 1.8544194045464888e-05, - "loss": 0.5448, - "reward": 2.324418604373932, - "reward_std": 0.5969930738210678, - "rewards/accuracy_reward": 0.5625000223517418, - "rewards/reasoning_steps_reward": 0.8784722536802292, - "rewards/repetition_penalty_reward": -0.06837661191821098, - "rewards/tag_count_reward": 0.9518229365348816, + "completion_length": 373.8958435058594, + "epoch": 0.1289838770153731, + "grad_norm": 1.1704532474263813, + "kl": 0.46435546875, + "learning_rate": 1.995039060907352e-05, + "loss": -0.0028, + "reward": 2.421631157398224, + "reward_std": 0.5489099845290184, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.05753568932414055, + "rewards/tag_count_reward": 0.911458358168602, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 253.87500762939453, - "epoch": 0.2608695652173913, - "grad_norm": 1.2404216424832606, - "kl": 0.623046875, - "learning_rate": 1.848913074786555e-05, - "loss": 0.3312, - "reward": 2.4448606371879578, - "reward_std": 0.5359718501567841, - "rewards/accuracy_reward": 0.6927083507180214, - "rewards/reasoning_steps_reward": 0.8732638955116272, - "rewards/repetition_penalty_reward": -0.07163260504603386, - "rewards/tag_count_reward": 0.950520858168602, + "completion_length": 353.6979217529297, + "epoch": 0.13048368953880765, + "grad_norm": 3.3058755481715214, + "kl": 1.44921875, + "learning_rate": 1.994503606054994e-05, + "loss": -0.0166, + "reward": 2.3031901717185974, + "reward_std": 0.5215350016951561, + "rewards/accuracy_reward": 0.510416679084301, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.0635634008795023, + "rewards/tag_count_reward": 0.8945312798023224, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 266.10939025878906, - "epoch": 0.2638680659670165, - "grad_norm": 2.28058295704287, - "kl": 0.61572265625, - "learning_rate": 1.843313028411149e-05, - "loss": 0.5956, - "reward": 2.511428415775299, - "reward_std": 0.48727843910455704, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9253472536802292, - "rewards/repetition_penalty_reward": -0.05845023598521948, - "rewards/tag_count_reward": 0.9361979365348816, + "completion_length": 340.1927185058594, + "epoch": 0.13198350206224221, + "grad_norm": 1.3397859193343156, + "kl": 0.47265625, + "learning_rate": 1.9939407952736737e-05, + "loss": -0.0518, + "reward": 2.3962929248809814, + "reward_std": 0.5894087105989456, + "rewards/accuracy_reward": 0.598958358168602, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.07028705440461636, + "rewards/tag_count_reward": 0.8867187649011612, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 290.9114685058594, - "epoch": 0.26686656671664166, - "grad_norm": 259.99242737782754, - "kl": 3.0830078125, - "learning_rate": 1.8376198836430415e-05, - "loss": 0.6364, - "reward": 2.46868097782135, - "reward_std": 0.635415643453598, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9461806118488312, - "rewards/repetition_penalty_reward": -0.06083299312740564, - "rewards/tag_count_reward": 0.9166667014360428, + "completion_length": 319.2916793823242, + "epoch": 0.13348331458567678, + "grad_norm": 1.6274329590710197, + "kl": 0.83447265625, + "learning_rate": 1.9933506440446932e-05, + "loss": -0.0982, + "reward": 2.4367164373397827, + "reward_std": 0.6716105788946152, + "rewards/accuracy_reward": 0.6458333507180214, + "rewards/reasoning_steps_reward": 0.9548611789941788, + "rewards/repetition_penalty_reward": -0.051998937502503395, + "rewards/tag_count_reward": 0.888020858168602, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 379.1302185058594, - "epoch": 0.2698650674662669, - "grad_norm": 1.9382632692598936, - "kl": 0.8193359375, - "learning_rate": 1.8318342689826938e-05, - "loss": 0.7494, - "reward": 2.3082011342048645, - "reward_std": 0.7488896995782852, - "rewards/accuracy_reward": 0.5572916716337204, - "rewards/reasoning_steps_reward": 0.9392361342906952, - "rewards/repetition_penalty_reward": -0.04249336663633585, - "rewards/tag_count_reward": 0.8541667014360428, + "completion_length": 329.4791717529297, + "epoch": 0.13498312710911137, + "grad_norm": 0.7057852059401287, + "kl": 0.335693359375, + "learning_rate": 1.992733168601413e-05, + "loss": -0.069, + "reward": 2.477515757083893, + "reward_std": 0.4706941097974777, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.07369956281036139, + "rewards/tag_count_reward": 0.9244791865348816, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 344.7395935058594, - "epoch": 0.272863568215892, - "grad_norm": 1.4041925703207834, - "kl": 0.7041015625, - "learning_rate": 1.8259568231388737e-05, - "loss": 0.7354, - "reward": 2.346260666847229, - "reward_std": 0.6538338512182236, - "rewards/accuracy_reward": 0.552083358168602, - "rewards/reasoning_steps_reward": 0.9739583730697632, - "rewards/repetition_penalty_reward": -0.04696857463568449, - "rewards/tag_count_reward": 0.8671875149011612, + "completion_length": 321.5677185058594, + "epoch": 0.13648293963254593, + "grad_norm": 0.7018868149602894, + "kl": 0.37060546875, + "learning_rate": 1.9920883859288035e-05, + "loss": -0.0407, + "reward": 2.4766435027122498, + "reward_std": 0.5056409984827042, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.07326984778046608, + "rewards/tag_count_reward": 0.9231770932674408, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 329.8073043823242, - "epoch": 0.27586206896551724, - "grad_norm": 0.9388360459997904, - "kl": 0.697265625, - "learning_rate": 1.819988194958146e-05, - "loss": 0.7864, - "reward": 2.366865336894989, - "reward_std": 0.7087779939174652, - "rewards/accuracy_reward": 0.5937500149011612, - "rewards/reasoning_steps_reward": 0.947916716337204, - "rewards/repetition_penalty_reward": -0.053707641549408436, - "rewards/tag_count_reward": 0.8789062798023224, + "completion_length": 309.5208435058594, + "epoch": 0.1379827521559805, + "grad_norm": 1.1625720671949755, + "kl": 0.3447265625, + "learning_rate": 1.991416313762978e-05, + "loss": -0.0604, + "reward": 2.4806445837020874, + "reward_std": 0.46803582459688187, + "rewards/accuracy_reward": 0.677083358168602, + "rewards/reasoning_steps_reward": 0.9531250447034836, + "rewards/repetition_penalty_reward": -0.08445966802537441, + "rewards/tag_count_reward": 0.9348958432674408, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 323.0104293823242, - "epoch": 0.27886056971514245, - "grad_norm": 1.0904070944040622, - "kl": 0.67578125, - "learning_rate": 1.8139290433532415e-05, - "loss": 0.7764, - "reward": 2.373181402683258, - "reward_std": 0.7174654752016068, - "rewards/accuracy_reward": 0.6093750298023224, - "rewards/reasoning_steps_reward": 0.9392361044883728, - "rewards/repetition_penalty_reward": -0.05433602724224329, - "rewards/tag_count_reward": 0.8789062798023224, + "completion_length": 333.87500762939453, + "epoch": 0.13948256467941508, + "grad_norm": 0.7197532890419, + "kl": 0.40478515625, + "learning_rate": 1.990716970590706e-05, + "loss": -0.0581, + "reward": 2.5884640216827393, + "reward_std": 0.3948858380317688, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.08688333258032799, + "rewards/tag_count_reward": 0.9531250149011612, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 286.31250762939453, - "epoch": 0.2818590704647676, - "grad_norm": 1.345138349799746, - "kl": 0.6591796875, - "learning_rate": 1.807780037230315e-05, - "loss": 0.9459, - "reward": 2.401767611503601, - "reward_std": 0.7818585783243179, - "rewards/accuracy_reward": 0.6718750149011612, - "rewards/reasoning_steps_reward": 0.9218750447034836, - "rewards/repetition_penalty_reward": -0.052659488283097744, - "rewards/tag_count_reward": 0.8606770932674408, + "completion_length": 331.9166793823242, + "epoch": 0.14098237720284965, + "grad_norm": 3.2153625010582005, + "kl": 5.23779296875, + "learning_rate": 1.989990375648903e-05, + "loss": -0.0484, + "reward": 2.4388928413391113, + "reward_std": 0.33317605406045914, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.09322533197700977, + "rewards/tag_count_reward": 0.950520858168602, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 320.9583435058594, - "epoch": 0.2848575712143928, - "grad_norm": 1.6639693291434057, - "kl": 0.837890625, - "learning_rate": 1.8015418554151024e-05, - "loss": 0.7482, - "reward": 2.097085416316986, - "reward_std": 0.9631160348653793, - "rewards/accuracy_reward": 0.536458358168602, - "rewards/reasoning_steps_reward": 0.8246527761220932, - "rewards/repetition_penalty_reward": -0.04397374298423529, - "rewards/tag_count_reward": 0.7799479216337204, + "completion_length": 354.98439025878906, + "epoch": 0.1424821897262842, + "grad_norm": 1.7303354539555662, + "kl": 0.7705078125, + "learning_rate": 1.9892365489241023e-05, + "loss": -0.0393, + "reward": 2.3029019832611084, + "reward_std": 0.4447196274995804, + "rewards/accuracy_reward": 0.4843750149011612, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.10855649225413799, + "rewards/tag_count_reward": 0.9479166716337204, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 107.17187881469727, - "epoch": 0.28785607196401797, - "grad_norm": 2.6809348147477285, - "kl": 1.669921875, - "learning_rate": 1.7952151865779792e-05, - "loss": -0.1937, - "reward": 1.646718680858612, - "reward_std": 1.0644067823886871, - "rewards/accuracy_reward": 0.3437500037252903, - "rewards/reasoning_steps_reward": 0.7274306118488312, - "rewards/repetition_penalty_reward": -0.03253482934087515, - "rewards/tag_count_reward": 0.6080729514360428, + "completion_length": 365.9635467529297, + "epoch": 0.1439820022497188, + "grad_norm": 0.6730986080559331, + "kl": 0.39892578125, + "learning_rate": 1.988455511151906e-05, + "loss": -0.0247, + "reward": 2.3117390871047974, + "reward_std": 0.43109431117773056, + "rewards/accuracy_reward": 0.4687500111758709, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.11968471482396126, + "rewards/tag_count_reward": 0.9817708432674408, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 137.05208587646484, - "epoch": 0.2908545727136432, - "grad_norm": 1.661973040336361, - "kl": 0.6298828125, - "learning_rate": 1.7888007291579357e-05, - "loss": -0.0513, - "reward": 2.4063791632652283, - "reward_std": 0.647190622985363, - "rewards/accuracy_reward": 0.6197916865348816, - "rewards/reasoning_steps_reward": 0.9531250596046448, - "rewards/repetition_penalty_reward": -0.0636730408295989, - "rewards/tag_count_reward": 0.8971354365348816, + "completion_length": 367.5104293823242, + "epoch": 0.14548181477315336, + "grad_norm": 0.6453267932676297, + "kl": 0.283203125, + "learning_rate": 1.987647283816412e-05, + "loss": -0.016, + "reward": 2.5260453820228577, + "reward_std": 0.3044511452317238, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.11284366995096207, + "rewards/tag_count_reward": 0.9947916716337204, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 151.68750381469727, - "epoch": 0.2938530734632684, - "grad_norm": 2.7127088778027684, - "kl": 0.8984375, - "learning_rate": 1.7822991912854716e-05, - "loss": 0.195, - "reward": 2.5627546310424805, - "reward_std": 0.4789634570479393, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.9826389104127884, - "rewards/repetition_penalty_reward": -0.060509427450597286, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 338.60938262939453, + "epoch": 0.14698162729658792, + "grad_norm": 0.702216348224918, + "kl": 0.31689453125, + "learning_rate": 1.9868118891496268e-05, + "loss": 0.013, + "reward": 2.7034881114959717, + "reward_std": 0.254051860421896, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.09599110670387745, + "rewards/tag_count_reward": 0.9973958432674408, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 211.39062881469727, - "epoch": 0.29685157421289354, - "grad_norm": 18.504039023965966, - "kl": 1.9638671875, - "learning_rate": 1.77571129070442e-05, - "loss": 0.3326, - "reward": 2.2352594137191772, - "reward_std": 0.7260878309607506, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.890625, - "rewards/repetition_penalty_reward": -0.06291783228516579, - "rewards/tag_count_reward": 0.82421875, + "completion_length": 336.6822967529297, + "epoch": 0.1484814398200225, + "grad_norm": 0.6294487017254564, + "kl": 0.28369140625, + "learning_rate": 1.98594935013085e-05, + "loss": -0.0063, + "reward": 2.4194209575653076, + "reward_std": 0.31747131049633026, + "rewards/accuracy_reward": 0.5312500074505806, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.09012776426970959, + "rewards/tag_count_reward": 0.9921875149011612, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 410.6458435058594, - "epoch": 0.29985007496251875, - "grad_norm": 7.369461971351607, - "kl": 1.18359375, - "learning_rate": 1.7690377546927134e-05, - "loss": 0.182, - "reward": 2.507060259580612, - "reward_std": 0.49995335936546326, - "rewards/accuracy_reward": 0.6458333488553762, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.055439687334001064, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 336.93750762939453, + "epoch": 0.14998125234345708, + "grad_norm": 0.7622200900254438, + "kl": 0.2880859375, + "learning_rate": 1.985059690486045e-05, + "loss": 0.0147, + "reward": 2.602359890937805, + "reward_std": 0.4236067906022072, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9878472536802292, + "rewards/repetition_penalty_reward": -0.07949773035943508, + "rewards/tag_count_reward": 0.985677108168602, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 614.0625152587891, - "epoch": 0.3028485757121439, - "grad_norm": 4.425874093822116, - "kl": 1.77734375, - "learning_rate": 1.7622793199820935e-05, - "loss": 0.2128, - "reward": 2.6125723719596863, - "reward_std": 0.4479042589664459, - "rewards/accuracy_reward": 0.7239583432674408, - "rewards/reasoning_steps_reward": 0.987847238779068, - "rewards/repetition_penalty_reward": -0.0510562164708972, - "rewards/tag_count_reward": 0.9518229216337204, + "completion_length": 311.07814025878906, + "epoch": 0.15148106486689164, + "grad_norm": 0.820138039709133, + "kl": 0.572998046875, + "learning_rate": 1.9841429346871863e-05, + "loss": -0.023, + "reward": 2.39324152469635, + "reward_std": 0.5194854661822319, + "rewards/accuracy_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0703001944348216, + "rewards/tag_count_reward": 0.9739583432674408, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 817.4895935058594, - "epoch": 0.3058470764617691, - "grad_norm": 4.565018933297784, - "kl": 3.43359375, - "learning_rate": 1.7554367326767793e-05, - "loss": 0.1219, - "reward": 2.5981725454330444, - "reward_std": 0.4292430207133293, - "rewards/accuracy_reward": 0.6875000223517418, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.043320574797689915, - "rewards/tag_count_reward": 0.9609375298023224, + "completion_length": 294.9427185058594, + "epoch": 0.1529808773903262, + "grad_norm": 0.6430598505979571, + "kl": 0.318359375, + "learning_rate": 1.9831991079515836e-05, + "loss": -0.0646, + "reward": 2.543990731239319, + "reward_std": 0.49773871898651123, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.91493059694767, + "rewards/repetition_penalty_reward": -0.07536709867417812, + "rewards/tag_count_reward": 0.98046875, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 140.87500381469727, - "epoch": 0.30884557721139433, - "grad_norm": 14.749084867590199, - "kl": 1.5126953125, - "learning_rate": 1.7485107481711014e-05, - "loss": 0.3689, - "reward": 2.522024154663086, - "reward_std": 0.3764337971806526, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9878472238779068, - "rewards/repetition_penalty_reward": -0.07910444028675556, - "rewards/tag_count_reward": 0.9674479365348816, + "completion_length": 309.2604293823242, + "epoch": 0.1544806899137608, + "grad_norm": 0.9425563472766899, + "kl": 0.55615234375, + "learning_rate": 1.982228236241192e-05, + "loss": -0.063, + "reward": 2.535941779613495, + "reward_std": 0.4504154324531555, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.94618059694767, + "rewards/repetition_penalty_reward": -0.07039502263069153, + "rewards/tag_count_reward": 0.977864608168602, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 140.06771278381348, - "epoch": 0.3118440779610195, - "grad_norm": 1.5854423021252229, - "kl": 0.75390625, - "learning_rate": 1.7415021310661073e-05, - "loss": 0.2421, - "reward": 2.5152770280838013, - "reward_std": 0.5358111336827278, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.989583358168602, - "rewards/repetition_penalty_reward": -0.08628544956445694, - "rewards/tag_count_reward": 0.9453125149011612, + "completion_length": 309.23439025878906, + "epoch": 0.15598050243719536, + "grad_norm": 0.8058787849857703, + "kl": 0.38916015625, + "learning_rate": 1.9812303462618945e-05, + "loss": -0.0416, + "reward": 2.438301682472229, + "reward_std": 0.46069950610399246, + "rewards/accuracy_reward": 0.5729166939854622, + "rewards/reasoning_steps_reward": 0.9652778059244156, + "rewards/repetition_penalty_reward": -0.05692401435226202, + "rewards/tag_count_reward": 0.9570312649011612, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 167.8229217529297, - "epoch": 0.3148425787106447, - "grad_norm": 2.0606435436365245, - "kl": 0.578125, - "learning_rate": 1.7344116550851546e-05, - "loss": 0.477, - "reward": 2.623923897743225, - "reward_std": 0.48354143649339676, - "rewards/accuracy_reward": 0.7343750149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.07659695856273174, - "rewards/tag_count_reward": 0.9713541865348816, + "completion_length": 318.7916717529297, + "epoch": 0.15748031496062992, + "grad_norm": 0.8045229402138098, + "kl": 0.427734375, + "learning_rate": 1.9802054654627694e-05, + "loss": -0.0673, + "reward": 2.3380337357521057, + "reward_std": 0.5075135007500648, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9670139402151108, + "rewards/repetition_penalty_reward": -0.06778228841722012, + "rewards/tag_count_reward": 0.938802108168602, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 142.77083587646484, - "epoch": 0.31784107946026985, - "grad_norm": 10.522238174054582, - "kl": 1.28515625, - "learning_rate": 1.7272401029884932e-05, - "loss": 0.2673, - "reward": 2.4795719981193542, - "reward_std": 0.5011198297142982, - "rewards/accuracy_reward": 0.5937500149011612, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.08379613049328327, - "rewards/tag_count_reward": 0.9765625149011612, + "completion_length": 338.3854217529297, + "epoch": 0.15898012748406448, + "grad_norm": 0.7209652038046875, + "kl": 0.27587890625, + "learning_rate": 1.9791536220353355e-05, + "loss": -0.0348, + "reward": 2.4589534401893616, + "reward_std": 0.45359161496162415, + "rewards/accuracy_reward": 0.578125, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.06665424816310406, + "rewards/tag_count_reward": 0.9648437649011612, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 134.3802146911621, - "epoch": 0.32083958020989506, - "grad_norm": 3.3181439930079146, - "kl": 0.7978515625, - "learning_rate": 1.719988266486854e-05, - "loss": 0.1636, - "reward": 2.5852218866348267, - "reward_std": 0.4785446897149086, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.08665317296981812, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 311.0208511352539, + "epoch": 0.16047994000749907, + "grad_norm": 0.6649826820375959, + "kl": 0.36474609375, + "learning_rate": 1.9780748449127745e-05, + "loss": -0.061, + "reward": 2.545065224170685, + "reward_std": 0.5411981120705605, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.07950077578425407, + "rewards/tag_count_reward": 0.9648437649011612, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 152.4479217529297, - "epoch": 0.3238380809595202, - "grad_norm": 3.2954339112996336, - "kl": 0.8662109375, - "learning_rate": 1.7126569461540445e-05, - "loss": 0.1118, - "reward": 2.564746856689453, - "reward_std": 0.34381402283906937, - "rewards/accuracy_reward": 0.661458358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.08759699948132038, - "rewards/tag_count_reward": 0.9908854365348816, + "completion_length": 315.9896011352539, + "epoch": 0.16197975253093364, + "grad_norm": 0.8786847698949797, + "kl": 0.3779296875, + "learning_rate": 1.976969163769137e-05, + "loss": -0.083, + "reward": 2.484778046607971, + "reward_std": 0.5565851628780365, + "rewards/accuracy_reward": 0.6354166865348816, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.06513523031026125, + "rewards/tag_count_reward": 0.9440104365348816, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 170.9166717529297, - "epoch": 0.3268365817091454, - "grad_norm": 5.1235193996536745, - "kl": 1.318359375, - "learning_rate": 1.70524695133857e-05, - "loss": 0.2634, - "reward": 2.745190441608429, - "reward_std": 0.3354305140674114, - "rewards/accuracy_reward": 0.8697916865348816, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.09465333260595798, - "rewards/tag_count_reward": 0.9752604365348816, + "completion_length": 327.48438262939453, + "epoch": 0.1634795650543682, + "grad_norm": 0.8374810187766324, + "kl": 0.47119140625, + "learning_rate": 1.9758366090185255e-05, + "loss": -0.0833, + "reward": 2.4872137904167175, + "reward_std": 0.546069398522377, + "rewards/accuracy_reward": 0.677083358168602, + "rewards/reasoning_steps_reward": 0.9600694626569748, + "rewards/repetition_penalty_reward": -0.07962668687105179, + "rewards/tag_count_reward": 0.9296875298023224, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 157.3489646911621, - "epoch": 0.32983508245877063, - "grad_norm": 6.042829717081394, - "kl": 0.63818359375, - "learning_rate": 1.6977591000742855e-05, - "loss": 0.1922, - "reward": 2.687567353248596, - "reward_std": 0.2190675288438797, - "rewards/accuracy_reward": 0.7916667014360428, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.08326606638729572, - "rewards/tag_count_reward": 0.9843750149011612, + "completion_length": 266.3177185058594, + "epoch": 0.1649793775778028, + "grad_norm": 8.874417749876018, + "kl": 1.828125, + "learning_rate": 1.974677211814259e-05, + "loss": -0.1294, + "reward": 1.922803819179535, + "reward_std": 0.5993900671601295, + "rewards/accuracy_reward": 0.2812500111758709, + "rewards/reasoning_steps_reward": 0.9583333283662796, + "rewards/repetition_penalty_reward": -0.0719878925010562, + "rewards/tag_count_reward": 0.7552083432674408, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 149.07813262939453, - "epoch": 0.3328335832083958, - "grad_norm": 27.456070537172923, - "kl": 1.2197265625, - "learning_rate": 1.6901942189900867e-05, - "loss": 0.0855, - "reward": 2.70491623878479, - "reward_std": 0.4226943477988243, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.07372974790632725, - "rewards/tag_count_reward": 0.981770858168602, + "completion_length": 282.6145935058594, + "epoch": 0.16647919010123735, + "grad_norm": 3.0395634773886893, + "kl": 1.529296875, + "learning_rate": 1.973491004048014e-05, + "loss": -0.1461, + "reward": 2.2567337453365326, + "reward_std": 0.6766453832387924, + "rewards/accuracy_reward": 0.5364583507180214, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.07963782362639904, + "rewards/tag_count_reward": 0.845052108168602, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 152.4583396911621, - "epoch": 0.335832083958021, - "grad_norm": 1317.4357368028043, - "kl": 24.67529296875, - "learning_rate": 1.6825531432186545e-05, - "loss": 1.4049, - "reward": 2.6002301573753357, - "reward_std": 0.3956380560994148, - "rewards/accuracy_reward": 0.6979166716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0807595532387495, - "rewards/tag_count_reward": 0.9830729365348816, + "completion_length": 294.3958435058594, + "epoch": 0.1679790026246719, + "grad_norm": 0.7041219895358697, + "kl": 0.29052734375, + "learning_rate": 1.9722780183489477e-05, + "loss": -0.0136, + "reward": 2.4400432407855988, + "reward_std": 0.19339026510715485, + "rewards/accuracy_reward": 0.546875013038516, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.08990472927689552, + "rewards/tag_count_reward": 0.9882812649011612, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 151.70313262939453, - "epoch": 0.33883058470764615, - "grad_norm": 186.82025106193726, - "kl": 5.892578125, - "learning_rate": 1.6748367163042577e-05, - "loss": 0.1938, - "reward": 2.6722583174705505, - "reward_std": 0.47787418961524963, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.07123132981359959, - "rewards/tag_count_reward": 0.9518229216337204, + "completion_length": 285.1770935058594, + "epoch": 0.16947881514810648, + "grad_norm": 0.8587640795737388, + "kl": 0.6201171875, + "learning_rate": 1.9710382880828028e-05, + "loss": -0.0167, + "reward": 2.425099015235901, + "reward_std": 0.20590725913643837, + "rewards/accuracy_reward": 0.5677083507180214, + "rewards/reasoning_steps_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.09443240240216255, + "rewards/tag_count_reward": 0.9830729365348816, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 151.92188262939453, - "epoch": 0.34182908545727136, - "grad_norm": 4.951400788715416, - "kl": 0.75830078125, - "learning_rate": 1.6670457901096328e-05, - "loss": 0.0216, - "reward": 2.5308876037597656, - "reward_std": 0.4789142981171608, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.07197706028819084, - "rewards/tag_count_reward": 0.9361979514360428, + "completion_length": 303.3541793823242, + "epoch": 0.17097862767154107, + "grad_norm": 0.6234593444589465, + "kl": 0.29541015625, + "learning_rate": 1.969771847350987e-05, + "loss": 0.0059, + "reward": 2.361898422241211, + "reward_std": 0.21231437101960182, + "rewards/accuracy_reward": 0.44791667722165585, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.07560167275369167, + "rewards/tag_count_reward": 0.9947916716337204, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 154.68750381469727, - "epoch": 0.3448275862068966, - "grad_norm": 1.1162974968117996, - "kl": 0.4677734375, - "learning_rate": 1.659181224721938e-05, - "loss": 0.021, - "reward": 2.730695605278015, - "reward_std": 0.3208945095539093, - "rewards/accuracy_reward": 0.8385416865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.07008570153266191, - "rewards/tag_count_reward": 0.9622395932674408, + "completion_length": 261.2395935058594, + "epoch": 0.17247844019497563, + "grad_norm": 0.7078512660636744, + "kl": 0.2978515625, + "learning_rate": 1.968478730989636e-05, + "loss": -0.0062, + "reward": 2.1943055987358093, + "reward_std": 0.16293694078922272, + "rewards/accuracy_reward": 0.2760416716337204, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.05569446366280317, + "rewards/tag_count_reward": 0.9895833432674408, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 166.71875381469727, - "epoch": 0.34782608695652173, - "grad_norm": 2.052554622098234, - "kl": 0.48193359375, - "learning_rate": 1.6512438883578047e-05, - "loss": 0.0718, - "reward": 2.7236560583114624, - "reward_std": 0.2550913393497467, - "rewards/accuracy_reward": 0.8437500298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0706147812306881, - "rewards/tag_count_reward": 0.9505208432674408, + "completion_length": 253.6718864440918, + "epoch": 0.1739782527184102, + "grad_norm": 0.7740364590084335, + "kl": 0.3125, + "learning_rate": 1.9671589745686563e-05, + "loss": 0.0356, + "reward": 2.4594351053237915, + "reward_std": 0.15120337810367346, + "rewards/accuracy_reward": 0.5312500102445483, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.052283622324466705, + "rewards/tag_count_reward": 0.9960937649011612, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 150.95833587646484, - "epoch": 0.35082458770614694, - "grad_norm": 7.239442859707766, - "kl": 1.1904296875, - "learning_rate": 1.6432346572674897e-05, - "loss": 0.0808, - "reward": 2.7389387488365173, - "reward_std": 0.36363864317536354, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.06835305411368608, - "rewards/tag_count_reward": 0.9531250149011612, + "completion_length": 240.8385467529297, + "epoch": 0.17547806524184478, + "grad_norm": 0.6815939277095767, + "kl": 0.5048828125, + "learning_rate": 1.965812614390743e-05, + "loss": 0.014, + "reward": 2.592436134815216, + "reward_std": 0.151106015779078, + "rewards/accuracy_reward": 0.6614583507180214, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.0533972904086113, + "rewards/tag_count_reward": 0.989583358168602, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 159.48958587646484, - "epoch": 0.3538230884557721, - "grad_norm": 2.1682395821362865, - "kl": 0.53564453125, - "learning_rate": 1.6351544156381413e-05, - "loss": 0.0727, - "reward": 2.877605438232422, - "reward_std": 0.2183693777769804, - "rewards/accuracy_reward": 0.9687500149011612, - "rewards/reasoning_steps_reward": 0.9965277761220932, - "rewards/repetition_penalty_reward": -0.055120449513196945, - "rewards/tag_count_reward": 0.9674479365348816, + "completion_length": 229.6510467529297, + "epoch": 0.17697787776527935, + "grad_norm": 0.8414344996361253, + "kl": 0.3369140625, + "learning_rate": 1.9644396874903865e-05, + "loss": 0.0194, + "reward": 2.4946166276931763, + "reward_std": 0.23435868322849274, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.998263880610466, + "rewards/repetition_penalty_reward": -0.04401196166872978, + "rewards/tag_count_reward": 0.9986979216337204, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 157.2395896911621, - "epoch": 0.3568215892053973, - "grad_norm": 38.71837348464348, - "kl": 2.7021484375, - "learning_rate": 1.6270040554961866e-05, - "loss": 0.2602, - "reward": 2.6591410040855408, - "reward_std": 0.34208307787775993, - "rewards/accuracy_reward": 0.739583358168602, + "completion_length": 213.03125762939453, + "epoch": 0.1784776902887139, + "grad_norm": 0.7595325300854338, + "kl": 0.33984375, + "learning_rate": 1.9630402316328506e-05, + "loss": 0.0338, + "reward": 2.496553957462311, + "reward_std": 0.13166913157328963, + "rewards/accuracy_reward": 0.526041679084301, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.05440086964517832, - "rewards/tag_count_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.022977421525865793, + "rewards/tag_count_reward": 0.993489608168602, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 145.41666793823242, - "epoch": 0.3598200899550225, - "grad_norm": 4.316214986822223, - "kl": 0.68115234375, - "learning_rate": 1.6187844766088586e-05, - "loss": 0.2187, - "reward": 2.7976441979408264, - "reward_std": 0.3110164441168308, - "rewards/accuracy_reward": 0.8802083432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.043501587584614754, - "rewards/tag_count_reward": 0.9609375149011612, + "completion_length": 192.5364646911621, + "epoch": 0.17997750281214847, + "grad_norm": 0.8887334027661878, + "kl": 0.33837890625, + "learning_rate": 1.9616142853131342e-05, + "loss": 0.0245, + "reward": 2.5528977513313293, + "reward_std": 0.1918736957013607, + "rewards/accuracy_reward": 0.5781250176951289, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.018716785591095686, + "rewards/tag_count_reward": 0.9986979216337204, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 130.14062690734863, - "epoch": 0.36281859070464767, - "grad_norm": 1.648081178543267, - "kl": 0.45947265625, - "learning_rate": 1.6104965863848615e-05, - "loss": -0.0071, - "reward": 2.7758957743644714, - "reward_std": 0.303048393689096, - "rewards/accuracy_reward": 0.8593750149011612, - "rewards/reasoning_steps_reward": 0.9930555671453476, - "rewards/repetition_penalty_reward": -0.027055577840656042, - "rewards/tag_count_reward": 0.9505208432674408, + "completion_length": 184.10416793823242, + "epoch": 0.18147731533558306, + "grad_norm": 0.945237692955735, + "kl": 0.3544921875, + "learning_rate": 1.9601618877549113e-05, + "loss": 0.0562, + "reward": 2.4059919714927673, + "reward_std": 0.2110268771648407, + "rewards/accuracy_reward": 0.43229167722165585, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.01718513877131045, + "rewards/tag_count_reward": 0.99609375, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 125.65104675292969, - "epoch": 0.3658170914542729, - "grad_norm": 1.2095122817688286, - "kl": 0.59716796875, - "learning_rate": 1.6021412997741994e-05, - "loss": -0.0131, - "reward": 2.820314347743988, - "reward_std": 0.2720828726887703, - "rewards/accuracy_reward": 0.8854166865348816, - "rewards/reasoning_steps_reward": 0.9843750149011612, - "rewards/repetition_penalty_reward": -0.027342125307768583, - "rewards/tag_count_reward": 0.977864608168602, + "completion_length": 196.5833396911621, + "epoch": 0.18297712785901762, + "grad_norm": 0.8305782528862219, + "kl": 0.35009765625, + "learning_rate": 1.9586830789094548e-05, + "loss": 0.0135, + "reward": 2.6578049659729004, + "reward_std": 0.27065123803913593, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.023184617515653372, + "rewards/tag_count_reward": 0.9934895932674408, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 149.05729293823242, - "epoch": 0.36881559220389803, - "grad_norm": 58.69921900530141, - "kl": 5.037109375, - "learning_rate": 1.593719539167169e-05, - "loss": 0.309, - "reward": 2.668303608894348, - "reward_std": 0.38671524077653885, - "rewards/accuracy_reward": 0.7343750149011612, - "rewards/reasoning_steps_reward": 0.986111119389534, - "rewards/repetition_penalty_reward": -0.01963039650581777, - "rewards/tag_count_reward": 0.9674479365348816, + "completion_length": 190.0885467529297, + "epoch": 0.1844769403824522, + "grad_norm": 0.8502534823244641, + "kl": 0.3662109375, + "learning_rate": 1.9571778994545356e-05, + "loss": -0.0024, + "reward": 2.5200935006141663, + "reward_std": 0.20754290046170354, + "rewards/accuracy_reward": 0.541666679084301, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.012458672048524022, + "rewards/tag_count_reward": 0.99609375, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 124.10937881469727, - "epoch": 0.37181409295352325, - "grad_norm": 13.742825745337187, - "kl": 0.82275390625, - "learning_rate": 1.5852322342925294e-05, - "loss": 0.0778, - "reward": 2.820654571056366, - "reward_std": 0.292279414832592, - "rewards/accuracy_reward": 0.8750000149011612, + "completion_length": 193.5520896911621, + "epoch": 0.18597675290588678, + "grad_norm": 0.9748005418103824, + "kl": 0.376953125, + "learning_rate": 1.9556463907933038e-05, + "loss": 0.0298, + "reward": 2.628620147705078, + "reward_std": 0.28626001439988613, + "rewards/accuracy_reward": 0.6562500298023224, "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.021793504944071174, - "rewards/tag_count_reward": 0.9726562649011612, + "rewards/repetition_penalty_reward": -0.01460911170579493, + "rewards/tag_count_reward": 0.9921875149011612, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 139.92187881469727, - "epoch": 0.3748125937031484, - "grad_norm": 2.232033331871603, - "kl": 0.484375, - "learning_rate": 1.5766803221148676e-05, - "loss": 0.105, - "reward": 2.7213205695152283, - "reward_std": 0.32770272716879845, - "rewards/accuracy_reward": 0.7760416865348816, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.0208670892752707, - "rewards/tag_count_reward": 0.9713541865348816, + "completion_length": 192.9114646911621, + "epoch": 0.18747656542932134, + "grad_norm": 1.0358678308364058, + "kl": 0.47607421875, + "learning_rate": 1.9540885950531507e-05, + "loss": 0.01, + "reward": 2.6429734230041504, + "reward_std": 0.3361320048570633, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.0232592502143234, + "rewards/tag_count_reward": 0.9856770932674408, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 123.58854293823242, - "epoch": 0.3778110944527736, - "grad_norm": 1.0178638016852966, - "kl": 0.49951171875, - "learning_rate": 1.568064746731156e-05, - "loss": 0.011, - "reward": 2.6907403469085693, - "reward_std": 0.35912006720900536, - "rewards/accuracy_reward": 0.7447916865348816, - "rewards/reasoning_steps_reward": 0.9843750149011612, - "rewards/repetition_penalty_reward": -0.01498887687921524, - "rewards/tag_count_reward": 0.9765625149011612, + "completion_length": 201.3958396911621, + "epoch": 0.1889763779527559, + "grad_norm": 1.1639527894884278, + "kl": 0.4892578125, + "learning_rate": 1.9525045550845482e-05, + "loss": 0.0132, + "reward": 2.66941100358963, + "reward_std": 0.25102632492780685, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.03241182304918766, + "rewards/tag_count_reward": 0.977864608168602, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 119.31250190734863, - "epoch": 0.3808095952023988, - "grad_norm": 1.1770522092684346, - "kl": 0.888671875, - "learning_rate": 1.5593864592665333e-05, - "loss": -0.0463, - "reward": 2.7778323888778687, - "reward_std": 0.38024942576885223, - "rewards/accuracy_reward": 0.848958358168602, - "rewards/reasoning_steps_reward": 0.9756944626569748, - "rewards/repetition_penalty_reward": -0.014268482336774468, - "rewards/tag_count_reward": 0.9674479365348816, + "completion_length": 204.21875762939453, + "epoch": 0.19047619047619047, + "grad_norm": 2.368193480039129, + "kl": 1.4140625, + "learning_rate": 1.9508943144598726e-05, + "loss": 0.0439, + "reward": 2.4418792724609375, + "reward_std": 0.3257411792874336, + "rewards/accuracy_reward": 0.5260416716337204, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.020794388838112354, + "rewards/tag_count_reward": 0.9557291865348816, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 121.88021087646484, - "epoch": 0.383808095952024, - "grad_norm": 1.1780540061687166, - "kl": 0.45703125, - "learning_rate": 1.550646417769301e-05, - "loss": -0.0279, - "reward": 2.692775547504425, - "reward_std": 0.4350905865430832, - "rewards/accuracy_reward": 0.755208358168602, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.012953592464327812, - "rewards/tag_count_reward": 0.9713541865348816, + "completion_length": 183.81250762939453, + "epoch": 0.19197600299962506, + "grad_norm": 11.16928341041206, + "kl": 3.96875, + "learning_rate": 1.9492579174722043e-05, + "loss": 0.0888, + "reward": 2.5279927849769592, + "reward_std": 0.21637535840272903, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.021920496597886086, + "rewards/tag_count_reward": 0.9804687649011612, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 113.96354484558105, - "epoch": 0.3868065967016492, - "grad_norm": 1.695442486056536, - "kl": 0.6533203125, - "learning_rate": 1.541845587105159e-05, - "loss": -0.0497, - "reward": 2.6655062437057495, - "reward_std": 0.5277450531721115, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.965277835726738, - "rewards/repetition_penalty_reward": -0.014615388121455908, - "rewards/tag_count_reward": 0.9492187649011612, + "completion_length": 203.21875381469727, + "epoch": 0.19347581552305962, + "grad_norm": 84.32921564323993, + "kl": 12.12060546875, + "learning_rate": 1.9475954091341098e-05, + "loss": 0.4114, + "reward": 2.7423276901245117, + "reward_std": 0.2567145712673664, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.02937377756461501, + "rewards/tag_count_reward": 0.9817708432674408, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 119.38020896911621, - "epoch": 0.38980509745127434, - "grad_norm": 1.53256816336331, - "kl": 0.7158203125, - "learning_rate": 1.532984938850689e-05, - "loss": -0.0148, - "reward": 2.6938071250915527, - "reward_std": 0.4943315237760544, - "rewards/accuracy_reward": 0.7760416865348816, - "rewards/reasoning_steps_reward": 0.9756944924592972, - "rewards/repetition_penalty_reward": -0.009752006619237363, - "rewards/tag_count_reward": 0.9518229365348816, + "completion_length": 215.53125762939453, + "epoch": 0.19497562804649418, + "grad_norm": 0.8646472823122007, + "kl": 0.4267578125, + "learning_rate": 1.9459068351764032e-05, + "loss": 0.0533, + "reward": 2.3658406734466553, + "reward_std": 0.14664249867200851, + "rewards/accuracy_reward": 0.4114583432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.029992765747010708, + "rewards/tag_count_reward": 0.9843750149011612, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 120.53646087646484, - "epoch": 0.39280359820089955, - "grad_norm": 2.1945480451348214, - "kl": 0.9951171875, - "learning_rate": 1.524065451186095e-05, - "loss": -0.0313, - "reward": 2.531009793281555, - "reward_std": 0.5116675943136215, - "rewards/accuracy_reward": 0.6302083432674408, - "rewards/reasoning_steps_reward": 0.9670139253139496, - "rewards/repetition_penalty_reward": -0.016733432421460748, - "rewards/tag_count_reward": 0.9505208432674408, + "completion_length": 222.41666793823242, + "epoch": 0.19647544056992877, + "grad_norm": 0.8326546294074292, + "kl": 0.48095703125, + "learning_rate": 1.94419224204689e-05, + "loss": 0.0102, + "reward": 2.5197873711586, + "reward_std": 0.3018568679690361, + "rewards/accuracy_reward": 0.5885416828095913, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.03056000219658017, + "rewards/tag_count_reward": 0.973958358168602, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 123.78125381469727, - "epoch": 0.39580209895052476, - "grad_norm": 4.5129778094997155, - "kl": 1.5009765625, - "learning_rate": 1.5150881087872184e-05, - "loss": 0.0509, - "reward": 2.7841535210609436, - "reward_std": 0.3951072469353676, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9774305820465088, - "rewards/repetition_penalty_reward": -0.013589567271992564, - "rewards/tag_count_reward": 0.966145858168602, + "completion_length": 221.04687881469727, + "epoch": 0.19797525309336333, + "grad_norm": 0.8968322010706142, + "kl": 0.5380859375, + "learning_rate": 1.9424516769090863e-05, + "loss": -0.0605, + "reward": 2.4514987468719482, + "reward_std": 0.38813910633325577, + "rewards/accuracy_reward": 0.546875, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.03070627013221383, + "rewards/tag_count_reward": 0.970052108168602, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 127.78646087646484, - "epoch": 0.3988005997001499, - "grad_norm": 38.945437112042384, - "kl": 6.130859375, - "learning_rate": 1.5060539027168317e-05, - "loss": 0.2991, - "reward": 2.6191622018814087, - "reward_std": 0.470632191747427, - "rewards/accuracy_reward": 0.7031250149011612, - "rewards/reasoning_steps_reward": 0.9774305671453476, - "rewards/repetition_penalty_reward": -0.013216342777013779, - "rewards/tag_count_reward": 0.9518229365348816, + "completion_length": 216.1197967529297, + "epoch": 0.1994750656167979, + "grad_norm": 0.720816161798331, + "kl": 0.40869140625, + "learning_rate": 1.9406851876409254e-05, + "loss": -0.0202, + "reward": 2.6190152168273926, + "reward_std": 0.29873134195804596, + "rewards/accuracy_reward": 0.7031250298023224, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.0354987857863307, + "rewards/tag_count_reward": 0.9843750149011612, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 122.30208587646484, - "epoch": 0.4017991004497751, - "grad_norm": 11.506961978299184, - "kl": 2.6162109375, - "learning_rate": 1.4969638303152296e-05, - "loss": 0.0406, - "reward": 2.60586279630661, - "reward_std": 0.49396244436502457, - "rewards/accuracy_reward": 0.692708358168602, - "rewards/reasoning_steps_reward": 0.9687500596046448, - "rewards/repetition_penalty_reward": -0.019137236289680004, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 218.9479217529297, + "epoch": 0.20097487814023246, + "grad_norm": 0.8292765779680333, + "kl": 0.4521484375, + "learning_rate": 1.938892822833437e-05, + "loss": -0.0563, + "reward": 2.720021903514862, + "reward_std": 0.4130494073033333, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.02650598995387554, + "rewards/tag_count_reward": 0.9791666865348816, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 119.50521278381348, - "epoch": 0.4047976011994003, - "grad_norm": 2.0647869792670006, - "kl": 0.6298828125, - "learning_rate": 1.4878188950901275e-05, - "loss": 0.0324, - "reward": 2.8466954231262207, - "reward_std": 0.35728102922439575, - "rewards/accuracy_reward": 0.9010416865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.011811425443738699, - "rewards/tag_count_reward": 0.9713541865348816, + "completion_length": 248.96875381469727, + "epoch": 0.20247469066366705, + "grad_norm": 0.751372103948477, + "kl": 0.578125, + "learning_rate": 1.9370746317894135e-05, + "loss": -0.0156, + "reward": 2.5825566053390503, + "reward_std": 0.4179469347000122, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.03332882048562169, + "rewards/tag_count_reward": 0.9804687649011612, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 120.55729293823242, - "epoch": 0.4077961019490255, - "grad_norm": 1.2964235131560873, - "kl": 0.46630859375, - "learning_rate": 1.4786201066058767e-05, - "loss": 0.023, - "reward": 2.9288823008537292, - "reward_std": 0.20188137842342257, - "rewards/accuracy_reward": 0.9531250149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.011221994878724217, - "rewards/tag_count_reward": 0.9921875149011612, + "completion_length": 247.97917556762695, + "epoch": 0.2039745031871016, + "grad_norm": 2.8917497533599517, + "kl": 0.42822265625, + "learning_rate": 1.9352306645220518e-05, + "loss": -0.0077, + "reward": 2.4527002573013306, + "reward_std": 0.44423961639404297, + "rewards/accuracy_reward": 0.5312500223517418, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.029070657677948475, + "rewards/tag_count_reward": 0.9661458432674408, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 121.47917175292969, - "epoch": 0.4107946026986507, - "grad_norm": 4.427417919453853, - "kl": 0.8115234375, - "learning_rate": 1.4693684803720139e-05, - "loss": 0.0257, - "reward": 2.752627432346344, - "reward_std": 0.28887180984020233, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.9878472238779068, - "rewards/repetition_penalty_reward": -0.017771947663277388, - "rewards/tag_count_reward": 0.9856770932674408, + "completion_length": 244.0677146911621, + "epoch": 0.20547431571053618, + "grad_norm": 0.7664671252381398, + "kl": 0.3759765625, + "learning_rate": 1.9333609717535788e-05, + "loss": 0.015, + "reward": 2.645112633705139, + "reward_std": 0.43032628297805786, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.03500907029956579, + "rewards/tag_count_reward": 0.98046875, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 116.52604484558105, - "epoch": 0.41379310344827586, - "grad_norm": 3.187742864763733, - "kl": 0.6494140625, - "learning_rate": 1.4600650377311523e-05, - "loss": 0.0343, - "reward": 2.9099594950675964, - "reward_std": 0.214247893425636, - "rewards/accuracy_reward": 0.9375000149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.011915652547031641, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 236.29687881469727, + "epoch": 0.20697412823397077, + "grad_norm": 0.9030694840003886, + "kl": 0.48681640625, + "learning_rate": 1.931465604913856e-05, + "loss": -0.0409, + "reward": 2.5029749870300293, + "reward_std": 0.49775829538702965, + "rewards/accuracy_reward": 0.6197916865348816, + "rewards/reasoning_steps_reward": 0.9565972685813904, + "rewards/repetition_penalty_reward": -0.03565355762839317, + "rewards/tag_count_reward": 0.962239608168602, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 116.72396087646484, - "epoch": 0.41679160419790107, - "grad_norm": 1.8697959272406102, - "kl": 0.705078125, - "learning_rate": 1.4507108057462297e-05, - "loss": -0.0029, - "reward": 2.6899372935295105, - "reward_std": 0.2231074832379818, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9878472238779068, - "rewards/repetition_penalty_reward": -0.010149642825126648, - "rewards/tag_count_reward": 0.9830729365348816, + "completion_length": 248.0729217529297, + "epoch": 0.20847394075740533, + "grad_norm": 2.8562687232727018, + "kl": 3.19384765625, + "learning_rate": 1.9295446161389644e-05, + "loss": -0.0617, + "reward": 2.431870937347412, + "reward_std": 0.6133489608764648, + "rewards/accuracy_reward": 0.5729166865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.030802744440734386, + "rewards/tag_count_reward": 0.9453125149011612, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 111.42708778381348, - "epoch": 0.4197901049475262, - "grad_norm": 23.21366760717317, - "kl": 1.552734375, - "learning_rate": 1.4413068170871252e-05, - "loss": 0.0225, - "reward": 2.7771247029304504, - "reward_std": 0.2754965058993548, - "rewards/accuracy_reward": 0.8229167014360428, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.008465770864859223, - "rewards/tag_count_reward": 0.9765625149011612, + "completion_length": 227.87500762939453, + "epoch": 0.2099737532808399, + "grad_norm": 11.214953837004993, + "kl": 1.908203125, + "learning_rate": 1.9275980582697707e-05, + "loss": -0.079, + "reward": 2.5050920844078064, + "reward_std": 0.5771966800093651, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.925347313284874, + "rewards/repetition_penalty_reward": -0.032234320882707834, + "rewards/tag_count_reward": 0.934895858168602, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 116.60937881469727, - "epoch": 0.42278860569715143, - "grad_norm": 1.2588277588625658, - "kl": 0.47802734375, - "learning_rate": 1.4318541099166556e-05, - "loss": 0.03, - "reward": 2.9449267387390137, - "reward_std": 0.10798337496817112, - "rewards/accuracy_reward": 0.9635416716337204, - "rewards/reasoning_steps_reward": 0.9965277761220932, - "rewards/repetition_penalty_reward": -0.009934437868651003, - "rewards/tag_count_reward": 0.9947916716337204, + "completion_length": 244.82813262939453, + "epoch": 0.21147356580427445, + "grad_norm": 0.8549661150364268, + "kl": 0.64990234375, + "learning_rate": 1.9256259848504737e-05, + "loss": -0.0524, + "reward": 2.30450901389122, + "reward_std": 0.440264068543911, + "rewards/accuracy_reward": 0.432291679084301, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.03359871078282595, + "rewards/tag_count_reward": 0.954427108168602, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 107.51041793823242, - "epoch": 0.4257871064467766, - "grad_norm": 1.6180185537864251, - "kl": 0.7041015625, - "learning_rate": 1.4223537277759667e-05, - "loss": -0.0281, - "reward": 2.9196689128875732, - "reward_std": 0.2676460010698065, - "rewards/accuracy_reward": 0.9635416865348816, - "rewards/reasoning_steps_reward": 0.9809027910232544, - "rewards/repetition_penalty_reward": -0.005244338244665414, - "rewards/tag_count_reward": 0.98046875, + "completion_length": 231.68750381469727, + "epoch": 0.21297337832770905, + "grad_norm": 1.9836172250089459, + "kl": 1.09423828125, + "learning_rate": 1.9236284501271317e-05, + "loss": -0.0897, + "reward": 2.265152394771576, + "reward_std": 0.5349541902542114, + "rewards/accuracy_reward": 0.4427083358168602, + "rewards/reasoning_steps_reward": 0.9288195073604584, + "rewards/repetition_penalty_reward": -0.03215676499530673, + "rewards/tag_count_reward": 0.9257812649011612, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 106.96354293823242, - "epoch": 0.4287856071964018, - "grad_norm": 3.173672388607253, - "kl": 0.861328125, - "learning_rate": 1.4128067194693316e-05, - "loss": 0.0193, - "reward": 2.866965413093567, - "reward_std": 0.36002135276794434, - "rewards/accuracy_reward": 0.911458358168602, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.006298626307398081, - "rewards/tag_count_reward": 0.9687500149011612, + "completion_length": 229.2760467529297, + "epoch": 0.2144731908511436, + "grad_norm": 2.657790488829322, + "kl": 2.5712890625, + "learning_rate": 1.9216055090461693e-05, + "loss": -0.1423, + "reward": 2.298185646533966, + "reward_std": 0.6972773224115372, + "rewards/accuracy_reward": 0.526041679084301, + "rewards/reasoning_steps_reward": 0.911458358168602, + "rewards/repetition_penalty_reward": -0.033845747821033, + "rewards/tag_count_reward": 0.8945312798023224, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 110.04166984558105, - "epoch": 0.431784107946027, - "grad_norm": 3.72381753576245, - "kl": 1.1474609375, - "learning_rate": 1.4032141389483648e-05, - "loss": 0.0526, - "reward": 2.784775972366333, - "reward_std": 0.2770255096256733, - "rewards/accuracy_reward": 0.817708358168602, - "rewards/reasoning_steps_reward": 0.9913194328546524, - "rewards/repetition_penalty_reward": -0.007324688020162284, - "rewards/tag_count_reward": 0.9830729365348816, + "completion_length": 219.67187881469727, + "epoch": 0.21597300337457817, + "grad_norm": 0.8727878748564548, + "kl": 0.5927734375, + "learning_rate": 1.9195572172528678e-05, + "loss": -0.097, + "reward": 2.390109956264496, + "reward_std": 0.429065003991127, + "rewards/accuracy_reward": 0.557291679084301, + "rewards/reasoning_steps_reward": 0.9357638955116272, + "rewards/repetition_penalty_reward": -0.03914357628673315, + "rewards/tag_count_reward": 0.9361979365348816, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 100.05208396911621, - "epoch": 0.43478260869565216, - "grad_norm": 98.84817232879341, - "kl": 7.1748046875, - "learning_rate": 1.3935770451956732e-05, - "loss": 0.2566, - "reward": 2.7451387643814087, - "reward_std": 0.23619456216692924, - "rewards/accuracy_reward": 0.770833358168602, - "rewards/reasoning_steps_reward": 0.9947916865348816, - "rewards/repetition_penalty_reward": -0.006163446931168437, - "rewards/tag_count_reward": 0.9856770932674408, + "completion_length": 231.10937881469727, + "epoch": 0.21747281589801276, + "grad_norm": 0.7543285200149539, + "kl": 0.5830078125, + "learning_rate": 1.9174836310898334e-05, + "loss": -0.0843, + "reward": 2.436844766139984, + "reward_std": 0.527190275490284, + "rewards/accuracy_reward": 0.604166679084301, + "rewards/reasoning_steps_reward": 0.9618056118488312, + "rewards/repetition_penalty_reward": -0.04188787518069148, + "rewards/tag_count_reward": 0.9127604216337204, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 95.88541984558105, - "epoch": 0.43778110944527737, - "grad_norm": 1.6790256708529574, - "kl": 0.71337890625, - "learning_rate": 1.3838965021079447e-05, - "loss": 0.022, - "reward": 2.582995355129242, - "reward_std": 0.33666881918907166, - "rewards/accuracy_reward": 0.630208358168602, - "rewards/reasoning_steps_reward": 0.9756944477558136, - "rewards/repetition_penalty_reward": -0.0033762191596906632, - "rewards/tag_count_reward": 0.9804687649011612, + "completion_length": 229.2083396911621, + "epoch": 0.21897262842144732, + "grad_norm": 0.639298152517526, + "kl": 0.673828125, + "learning_rate": 1.9153848075954465e-05, + "loss": -0.1119, + "reward": 2.3503913283348083, + "reward_std": 0.4797208532691002, + "rewards/accuracy_reward": 0.5416666679084301, + "rewards/reasoning_steps_reward": 0.954861119389534, + "rewards/repetition_penalty_reward": -0.06150110438466072, + "rewards/tag_count_reward": 0.915364608168602, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 99.12500190734863, - "epoch": 0.4407796101949025, - "grad_norm": 1.000815515911012, - "kl": 0.4951171875, - "learning_rate": 1.3741735783785022e-05, - "loss": 0.014, - "reward": 2.7670981884002686, - "reward_std": 0.24415395595133305, - "rewards/accuracy_reward": 0.7864583432674408, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.0037352032377384603, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 237.23958587646484, + "epoch": 0.2204724409448819, + "grad_norm": 3.076436673284199, + "kl": 0.697265625, + "learning_rate": 1.9132608045022954e-05, + "loss": -0.0704, + "reward": 2.4564713835716248, + "reward_std": 0.4616401568055153, + "rewards/accuracy_reward": 0.6197916716337204, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.0834592841565609, + "rewards/tag_count_reward": 0.9375000149011612, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 90.77604293823242, - "epoch": 0.44377811094452774, - "grad_norm": 10.482881482497314, - "kl": 1.677734375, - "learning_rate": 1.3644093473793213e-05, - "loss": 0.0242, - "reward": 2.8607059121131897, - "reward_std": 0.39441923797130585, - "rewards/accuracy_reward": 0.9166667014360428, - "rewards/reasoning_steps_reward": 0.987847238779068, - "rewards/repetition_penalty_reward": -0.004745513964735437, - "rewards/tag_count_reward": 0.9609375149011612, + "completion_length": 217.3697967529297, + "epoch": 0.22197225346831645, + "grad_norm": 0.7063771066104143, + "kl": 0.52490234375, + "learning_rate": 1.9111116802355853e-05, + "loss": -0.0912, + "reward": 2.410871744155884, + "reward_std": 0.5071974396705627, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.060916513204574585, + "rewards/tag_count_reward": 0.923177108168602, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 98.33333587646484, - "epoch": 0.44677661169415295, - "grad_norm": 3.0619014190201397, - "kl": 0.70263671875, - "learning_rate": 1.3546048870425356e-05, - "loss": -0.0034, - "reward": 2.7546103596687317, - "reward_std": 0.3782291766256094, - "rewards/accuracy_reward": 0.802083358168602, - "rewards/reasoning_steps_reward": 0.9826389104127884, - "rewards/repetition_penalty_reward": -0.007976488093845546, - "rewards/tag_count_reward": 0.9778645932674408, + "completion_length": 220.9895896911621, + "epoch": 0.22347206599175104, + "grad_norm": 0.759921740363777, + "kl": 0.7099609375, + "learning_rate": 1.9089374939115335e-05, + "loss": -0.1306, + "reward": 2.3488033413887024, + "reward_std": 0.6183192133903503, + "rewards/accuracy_reward": 0.5677083507180214, + "rewards/reasoning_steps_reward": 0.939236119389534, + "rewards/repetition_penalty_reward": -0.06569322943687439, + "rewards/tag_count_reward": 0.907552108168602, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 96.68750381469727, - "epoch": 0.4497751124437781, - "grad_norm": 1.757178598271745, - "kl": 0.6220703125, - "learning_rate": 1.3447612797414371e-05, - "loss": -0.0372, - "reward": 2.696329712867737, - "reward_std": 0.48958031833171844, - "rewards/accuracy_reward": 0.78125, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.003757232567295432, - "rewards/tag_count_reward": 0.9466146230697632, + "completion_length": 217.5833396911621, + "epoch": 0.2249718785151856, + "grad_norm": 5.333380760593154, + "kl": 1.005859375, + "learning_rate": 1.906738305335741e-05, + "loss": -0.1023, + "reward": 2.319095730781555, + "reward_std": 0.6061732321977615, + "rewards/accuracy_reward": 0.5572916865348816, + "rewards/reasoning_steps_reward": 0.9461806118488312, + "rewards/repetition_penalty_reward": -0.09062658622860909, + "rewards/tag_count_reward": 0.9062500149011612, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 96.88021087646484, - "epoch": 0.4527736131934033, - "grad_norm": 1.0864550515764144, - "kl": 0.47802734375, - "learning_rate": 1.3348796121709862e-05, - "loss": -0.0041, - "reward": 2.914130389690399, - "reward_std": 0.26327061653137207, - "rewards/accuracy_reward": 0.942708358168602, - "rewards/reasoning_steps_reward": 0.991319477558136, - "rewards/repetition_penalty_reward": -0.004272434976883233, - "rewards/tag_count_reward": 0.9843750149011612, + "completion_length": 221.75000762939453, + "epoch": 0.22647169103862017, + "grad_norm": 1.2824333531797762, + "kl": 2.69970703125, + "learning_rate": 1.90451417500155e-05, + "loss": -0.091, + "reward": 2.622887670993805, + "reward_std": 0.5151065196841955, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.949652835726738, + "rewards/repetition_penalty_reward": -0.06895278673619032, + "rewards/tag_count_reward": 0.934895858168602, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 98.10416793823242, - "epoch": 0.45577211394302847, - "grad_norm": 1.1731563120826676, - "kl": 0.47802734375, - "learning_rate": 1.3249609752278454e-05, - "loss": 0.006, - "reward": 2.5750681161880493, - "reward_std": 0.31880153343081474, - "rewards/accuracy_reward": 0.6093750074505806, - "rewards/reasoning_steps_reward": 0.9930555671453476, - "rewards/repetition_penalty_reward": -0.006529179809149355, - "rewards/tag_count_reward": 0.9791666716337204, + "completion_length": 220.57292556762695, + "epoch": 0.22797150356205476, + "grad_norm": 0.7007261870507696, + "kl": 0.51171875, + "learning_rate": 1.902265164088378e-05, + "loss": -0.0408, + "reward": 2.4209659099578857, + "reward_std": 0.49653469771146774, + "rewards/accuracy_reward": 0.5729166865348816, + "rewards/reasoning_steps_reward": 0.97743059694767, + "rewards/repetition_penalty_reward": -0.08120433799922466, + "rewards/tag_count_reward": 0.9518229365348816, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 92.29166984558105, - "epoch": 0.4587706146926537, - "grad_norm": 1.1346690177946404, - "kl": 0.45361328125, - "learning_rate": 1.315006463889948e-05, - "loss": -0.0176, - "reward": 2.9095540046691895, - "reward_std": 0.2783464193344116, - "rewards/accuracy_reward": 0.9531250298023224, - "rewards/reasoning_steps_reward": 0.9826389104127884, - "rewards/repetition_penalty_reward": -0.0027724849642254412, - "rewards/tag_count_reward": 0.9765625149011612, + "completion_length": 232.13541793823242, + "epoch": 0.22947131608548932, + "grad_norm": 0.65651651084759, + "kl": 0.345703125, + "learning_rate": 1.899991334460036e-05, + "loss": 0.0085, + "reward": 2.626923680305481, + "reward_std": 0.23435234278440475, + "rewards/accuracy_reward": 0.7343750298023224, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.08835414797067642, + "rewards/tag_count_reward": 0.9895833432674408, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 96.83333587646484, - "epoch": 0.4617691154422789, - "grad_norm": 1.1933914174647031, - "kl": 0.48486328125, - "learning_rate": 1.3050171770956176e-05, - "loss": 0.0015, - "reward": 2.72132408618927, - "reward_std": 0.32843077182769775, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.9895833432674408, - "rewards/repetition_penalty_reward": -0.009144885872956365, - "rewards/tag_count_reward": 0.9752604365348816, + "completion_length": 218.23437881469727, + "epoch": 0.23097112860892388, + "grad_norm": 0.7528729019247309, + "kl": 0.63623046875, + "learning_rate": 1.8976927486630252e-05, + "loss": 0.013, + "reward": 2.2777963876724243, + "reward_std": 0.3943594992160797, + "rewards/accuracy_reward": 0.4218750149011612, + "rewards/reasoning_steps_reward": 0.9704861044883728, + "rewards/repetition_penalty_reward": -0.08722100034356117, + "rewards/tag_count_reward": 0.97265625, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 99.83333587646484, - "epoch": 0.46476761619190404, - "grad_norm": 1.2021972493346804, - "kl": 0.47314453125, - "learning_rate": 1.2949942176222497e-05, - "loss": -0.0459, - "reward": 2.7491848468780518, - "reward_std": 0.4561289846897125, - "rewards/accuracy_reward": 0.8229167014360428, - "rewards/reasoning_steps_reward": 0.9791666716337204, - "rewards/repetition_penalty_reward": -0.0034193213214166462, - "rewards/tag_count_reward": 0.9505208432674408, + "completion_length": 210.3385467529297, + "epoch": 0.23247094113235844, + "grad_norm": 9.645470554535006, + "kl": 1.740234375, + "learning_rate": 1.8953694699248193e-05, + "loss": -0.0036, + "reward": 2.56173574924469, + "reward_std": 0.3356763105839491, + "rewards/accuracy_reward": 0.6927083507180214, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.09060808829963207, + "rewards/tag_count_reward": 0.9752604365348816, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 106.64062690734863, - "epoch": 0.46776611694152925, - "grad_norm": 1.3131511244620084, - "kl": 0.56982421875, - "learning_rate": 1.2849386919645686e-05, - "loss": -0.0388, - "reward": 2.743965268135071, - "reward_std": 0.39171791821718216, - "rewards/accuracy_reward": 0.8281250149011612, - "rewards/reasoning_steps_reward": 0.9826389104127884, - "rewards/repetition_penalty_reward": -0.010809163737576455, - "rewards/tag_count_reward": 0.9440104216337204, + "completion_length": 206.6614646911621, + "epoch": 0.23397075365579303, + "grad_norm": 0.7853069216863842, + "kl": 0.40625, + "learning_rate": 1.893021562152122e-05, + "loss": 0.0268, + "reward": 2.8448190689086914, + "reward_std": 0.22713683173060417, + "rewards/accuracy_reward": 0.9531250149011612, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.05405241996049881, + "rewards/tag_count_reward": 0.9596354365348816, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 103.53125190734863, - "epoch": 0.4707646176911544, - "grad_norm": 1.69232758602469, - "kl": 0.56396484375, - "learning_rate": 1.2748517102124755e-05, - "loss": -0.0245, - "reward": 2.7549294233322144, - "reward_std": 0.3383819945156574, - "rewards/accuracy_reward": 0.817708358168602, - "rewards/reasoning_steps_reward": 0.9826389402151108, - "rewards/repetition_penalty_reward": -0.006355439778417349, - "rewards/tag_count_reward": 0.9609375149011612, + "completion_length": 181.40625762939453, + "epoch": 0.2354705661792276, + "grad_norm": 2.706381634571779, + "kl": 0.9560546875, + "learning_rate": 1.8906490899291125e-05, + "loss": 0.0151, + "reward": 2.746555209159851, + "reward_std": 0.21183432638645172, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0498858280479908, + "rewards/tag_count_reward": 0.9908854365348816, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 102.05208587646484, - "epoch": 0.4737631184407796, - "grad_norm": 2.265769890602018, - "kl": 0.73681640625, - "learning_rate": 1.2647343859284997e-05, - "loss": -0.0689, - "reward": 2.662193477153778, - "reward_std": 0.4981583207845688, - "rewards/accuracy_reward": 0.786458358168602, - "rewards/reasoning_steps_reward": 0.9722222536802292, - "rewards/repetition_penalty_reward": -0.011851702001877129, - "rewards/tag_count_reward": 0.9153645932674408, + "completion_length": 176.8020896911621, + "epoch": 0.23697037870266216, + "grad_norm": 1.1526188899297036, + "kl": 0.47314453125, + "learning_rate": 1.888252118515666e-05, + "loss": -0.0084, + "reward": 2.248090386390686, + "reward_std": 0.29650406911969185, + "rewards/accuracy_reward": 0.33333334140479565, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.03576376847922802, + "rewards/tag_count_reward": 0.9817708432674408, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 103.22916984558105, - "epoch": 0.4767616191904048, - "grad_norm": 2.255759107812268, - "kl": 0.79541015625, - "learning_rate": 1.2545878360248633e-05, - "loss": 0.0144, - "reward": 2.8301729559898376, - "reward_std": 0.3695299196988344, - "rewards/accuracy_reward": 0.9010416865348816, - "rewards/reasoning_steps_reward": 0.9982638955116272, - "rewards/repetition_penalty_reward": -0.009236796642653644, - "rewards/tag_count_reward": 0.9401041865348816, + "completion_length": 167.71875381469727, + "epoch": 0.23847019122609675, + "grad_norm": 0.8634079765155277, + "kl": 0.38037109375, + "learning_rate": 1.88583071384556e-05, + "loss": 0.015, + "reward": 2.783277213573456, + "reward_std": 0.27950316295027733, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03746931627392769, + "rewards/tag_count_reward": 0.99609375, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 100.39062881469727, - "epoch": 0.47976011994003, - "grad_norm": 7.190371202010685, - "kl": 1.99609375, - "learning_rate": 1.2444131806401818e-05, - "loss": 0.0233, - "reward": 2.587311804294586, - "reward_std": 0.4335063770413399, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9722222536802292, - "rewards/repetition_penalty_reward": -0.007306311279535294, - "rewards/tag_count_reward": 0.9348958432674408, + "completion_length": 159.5208396911621, + "epoch": 0.2399700037495313, + "grad_norm": 0.9199485039081772, + "kl": 0.43212890625, + "learning_rate": 1.883384942524661e-05, + "loss": 0.0147, + "reward": 2.60649836063385, + "reward_std": 0.3520687147974968, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.03456068178638816, + "rewards/tag_count_reward": 0.9830729365348816, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 106.15104484558105, - "epoch": 0.4827586206896552, - "grad_norm": 11.57931701299926, - "kl": 2.580078125, - "learning_rate": 1.2342115430158024e-05, - "loss": 0.0864, - "reward": 2.5142497420310974, - "reward_std": 0.592901311814785, - "rewards/accuracy_reward": 0.6510417014360428, - "rewards/reasoning_steps_reward": 0.9722222238779068, - "rewards/repetition_penalty_reward": -0.012660016654990613, - "rewards/tag_count_reward": 0.903645858168602, + "completion_length": 157.79687881469727, + "epoch": 0.24146981627296588, + "grad_norm": 0.9081570917337833, + "kl": 0.4970703125, + "learning_rate": 1.880914871829092e-05, + "loss": 0.003, + "reward": 2.5711691975593567, + "reward_std": 0.3647758923470974, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.034733695443719625, + "rewards/tag_count_reward": 0.9791666716337204, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 102.13541793823242, - "epoch": 0.48575712143928035, - "grad_norm": 4.161233572872993, - "kl": 1.421875, - "learning_rate": 1.223984049371805e-05, - "loss": 0.0328, - "reward": 2.5102869868278503, - "reward_std": 0.34895218163728714, - "rewards/accuracy_reward": 0.5937500223517418, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0066401001531630754, - "rewards/tag_count_reward": 0.9440104365348816, + "completion_length": 160.578125, + "epoch": 0.24296962879640044, + "grad_norm": 0.7819084575897781, + "kl": 0.42578125, + "learning_rate": 1.8784205697033803e-05, + "loss": 0.0206, + "reward": 2.7083481550216675, + "reward_std": 0.23164555057883263, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9913194477558136, + "rewards/repetition_penalty_reward": -0.04208600614219904, + "rewards/tag_count_reward": 0.98828125, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 100.828125, - "epoch": 0.48875562218890556, - "grad_norm": 3.9809594705586053, - "kl": 1.162109375, - "learning_rate": 1.2137318287826699e-05, - "loss": -0.0078, - "reward": 2.5699517726898193, - "reward_std": 0.4455106034874916, - "rewards/accuracy_reward": 0.6770833507180214, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.015117747941985726, - "rewards/tag_count_reward": 0.942708358168602, + "completion_length": 152.55208587646484, + "epoch": 0.24446944131983503, + "grad_norm": 1.4454320643452467, + "kl": 1.4462890625, + "learning_rate": 1.875902104758592e-05, + "loss": -0.0319, + "reward": 2.8053765892982483, + "reward_std": 0.314240001142025, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.028824928449466825, + "rewards/tag_count_reward": 0.9817708432674408, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 102.94270896911621, - "epoch": 0.4917541229385307, - "grad_norm": 19.775528338909474, - "kl": 2.1142578125, - "learning_rate": 1.2034560130526341e-05, - "loss": 0.0267, - "reward": 2.7591124176979065, - "reward_std": 0.3192651905119419, - "rewards/accuracy_reward": 0.8281250298023224, - "rewards/reasoning_steps_reward": 0.9687500447034836, - "rewards/repetition_penalty_reward": -0.006512661639135331, - "rewards/tag_count_reward": 0.9687500149011612, + "completion_length": 141.1666717529297, + "epoch": 0.2459692538432696, + "grad_norm": 16.262069244582136, + "kl": 6.43603515625, + "learning_rate": 1.873359546270442e-05, + "loss": 0.0621, + "reward": 2.3994747400283813, + "reward_std": 0.2881123125553131, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.02630652580410242, + "rewards/tag_count_reward": 0.9830729365348816, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 106.59896278381348, - "epoch": 0.4947526236881559, - "grad_norm": 1.1276005185146887, - "kl": 0.5009765625, - "learning_rate": 1.1931577365907433e-05, - "loss": 0.0095, - "reward": 2.7071834206581116, - "reward_std": 0.24475602060556412, - "rewards/accuracy_reward": 0.7500000149011612, - "rewards/reasoning_steps_reward": 0.9826388955116272, - "rewards/repetition_penalty_reward": -0.009830599068664014, - "rewards/tag_count_reward": 0.984375, + "completion_length": 152.3072967529297, + "epoch": 0.24746906636670415, + "grad_norm": 29.040919544310924, + "kl": 5.2421875, + "learning_rate": 1.8707929641773876e-05, + "loss": 0.2231, + "reward": 2.7308311462402344, + "reward_std": 0.3806469663977623, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.02481132885441184, + "rewards/tag_count_reward": 0.9726562649011612, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 103.80729675292969, - "epoch": 0.49775112443778113, - "grad_norm": 1.4396819065845348, - "kl": 0.5517578125, - "learning_rate": 1.1828381362856195e-05, - "loss": 0.0061, - "reward": 2.7072086334228516, - "reward_std": 0.2500213086605072, - "rewards/accuracy_reward": 0.7552083432674408, + "completion_length": 137.9166717529297, + "epoch": 0.24896887889013875, + "grad_norm": 4.111363572085073, + "kl": 1.9384765625, + "learning_rate": 1.8682024290787092e-05, + "loss": -0.0177, + "reward": 2.639627754688263, + "reward_std": 0.49519332498311996, + "rewards/accuracy_reward": 0.723958358168602, "rewards/reasoning_steps_reward": 0.9809028059244156, - "rewards/repetition_penalty_reward": -0.008069265051744878, - "rewards/tag_count_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.022264669416472316, + "rewards/tag_count_reward": 0.9570312649011612, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 105.23437690734863, - "epoch": 0.5007496251874063, - "grad_norm": 1.4162738213578512, - "kl": 0.58447265625, - "learning_rate": 1.1724983513799505e-05, - "loss": -0.028, - "reward": 2.83200603723526, - "reward_std": 0.2547433339059353, - "rewards/accuracy_reward": 0.880208358168602, - "rewards/reasoning_steps_reward": 0.9774305820465088, - "rewards/repetition_penalty_reward": -0.006101653270889074, - "rewards/tag_count_reward": 0.98046875, + "completion_length": 143.21875762939453, + "epoch": 0.2504686914135733, + "grad_norm": 2.038792986981373, + "kl": 1.0732421875, + "learning_rate": 1.8655880122325633e-05, + "loss": -0.0366, + "reward": 2.5853430032730103, + "reward_std": 0.3307228982448578, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.020993857644498348, + "rewards/tag_count_reward": 0.9570312649011612, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 102.34896087646484, - "epoch": 0.5037481259370314, - "grad_norm": 1.5741520276879495, - "kl": 0.5859375, - "learning_rate": 1.1621395233447247e-05, - "loss": -0.0284, - "reward": 2.856131374835968, - "reward_std": 0.336722657084465, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.970486119389534, - "rewards/repetition_penalty_reward": -0.00628192734438926, - "rewards/tag_count_reward": 0.9752604216337204, + "completion_length": 137.98438262939453, + "epoch": 0.25196850393700787, + "grad_norm": 1.1481401231898494, + "kl": 1.109375, + "learning_rate": 1.862949785554025e-05, + "loss": -0.0102, + "reward": 2.74522465467453, + "reward_std": 0.24203419662080705, + "rewards/accuracy_reward": 0.7864583432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.013021895661950111, + "rewards/tag_count_reward": 0.9856770932674408, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 103.50521087646484, - "epoch": 0.5067466266866567, - "grad_norm": 3.143223791327549, - "kl": 0.8681640625, - "learning_rate": 1.1517627957532155e-05, - "loss": -0.0218, - "reward": 2.8190484642982483, - "reward_std": 0.3480563126504421, - "rewards/accuracy_reward": 0.927083358168602, - "rewards/reasoning_steps_reward": 0.9409722536802292, - "rewards/repetition_penalty_reward": -0.012548819300718606, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 139.8385467529297, + "epoch": 0.25346831646044243, + "grad_norm": 0.9880924032467467, + "kl": 0.47119140625, + "learning_rate": 1.8602878216131093e-05, + "loss": -0.0024, + "reward": 2.5487694144248962, + "reward_std": 0.3845925033092499, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.016334847081452608, + "rewards/tag_count_reward": 0.9765625, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 98.671875, - "epoch": 0.5097451274362819, - "grad_norm": 4.087516775956481, - "kl": 1.3857421875, - "learning_rate": 1.1413693141547354e-05, - "loss": -0.0334, - "reward": 2.7752469778060913, - "reward_std": 0.4824451133608818, - "rewards/accuracy_reward": 0.9010416865348816, - "rewards/reasoning_steps_reward": 0.9531250149011612, - "rewards/repetition_penalty_reward": -0.009909308631904423, - "rewards/tag_count_reward": 0.9309895932674408, + "completion_length": 144.5000057220459, + "epoch": 0.254968128983877, + "grad_norm": 1.1326948326278583, + "kl": 0.64599609375, + "learning_rate": 1.8576021936327747e-05, + "loss": -0.0373, + "reward": 2.54432612657547, + "reward_std": 0.4256473332643509, + "rewards/accuracy_reward": 0.598958358168602, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.016003758762963116, + "rewards/tag_count_reward": 0.9804687649011612, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 108.43750190734863, - "epoch": 0.512743628185907, - "grad_norm": 1.808660970560849, - "kl": 1.2373046875, - "learning_rate": 1.1309602259481726e-05, - "loss": -0.0589, - "reward": 2.6380687952041626, - "reward_std": 0.5042757764458656, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.97743059694767, - "rewards/repetition_penalty_reward": -0.013841008301824331, - "rewards/tag_count_reward": 0.9453125, + "completion_length": 141.0885467529297, + "epoch": 0.25646794150731156, + "grad_norm": 0.907847690585872, + "kl": 0.45703125, + "learning_rate": 1.8548929754869095e-05, + "loss": -0.036, + "reward": 2.566925883293152, + "reward_std": 0.34289751946926117, + "rewards/accuracy_reward": 0.619791679084301, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.022049905732274055, + "rewards/tag_count_reward": 0.9882812649011612, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 107.00000190734863, - "epoch": 0.5157421289355323, - "grad_norm": 3.2045053936261754, - "kl": 1.26953125, - "learning_rate": 1.1205366802553231e-05, - "loss": -0.0773, - "reward": 2.749893009662628, - "reward_std": 0.5450254082679749, - "rewards/accuracy_reward": 0.8437500298023224, - "rewards/reasoning_steps_reward": 0.9670138955116272, - "rewards/repetition_penalty_reward": -0.007485455018468201, - "rewards/tag_count_reward": 0.9466145932674408, + "completion_length": 145.02605056762695, + "epoch": 0.2579677540307462, + "grad_norm": 4.163866220713108, + "kl": 1.8134765625, + "learning_rate": 1.8521602416982998e-05, + "loss": -0.0077, + "reward": 2.6473976969718933, + "reward_std": 0.26540281693451107, + "rewards/accuracy_reward": 0.7135416716337204, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.01926900539547205, + "rewards/tag_count_reward": 0.9791666865348816, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 104.08854484558105, - "epoch": 0.5187406296851574, - "grad_norm": 4.461811464537438, - "kl": 1.9453125, - "learning_rate": 1.1100998277940316e-05, - "loss": -0.0487, - "reward": 2.8390066623687744, - "reward_std": 0.4234722927212715, - "rewards/accuracy_reward": 0.927083358168602, - "rewards/reasoning_steps_reward": 0.9687500149011612, - "rewards/repetition_penalty_reward": -0.007347506354562938, - "rewards/tag_count_reward": 0.9505208432674408, + "completion_length": 151.84896087646484, + "epoch": 0.25946756655418074, + "grad_norm": 0.7222410832833253, + "kl": 0.3779296875, + "learning_rate": 1.8494040674365785e-05, + "loss": -0.0015, + "reward": 2.90548974275589, + "reward_std": 0.15629624016582966, + "rewards/accuracy_reward": 0.9375000149011612, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.01812157710082829, + "rewards/tag_count_reward": 0.9947916865348816, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 110.59895896911621, - "epoch": 0.5217391304347826, - "grad_norm": 1.1431834037063382, - "kl": 0.70361328125, - "learning_rate": 1.0996508207511565e-05, - "loss": -0.0495, - "reward": 2.7981215715408325, - "reward_std": 0.32188424095511436, - "rewards/accuracy_reward": 0.880208358168602, - "rewards/reasoning_steps_reward": 0.9809027910232544, - "rewards/repetition_penalty_reward": -0.02262500289361924, - "rewards/tag_count_reward": 0.9596354216337204, + "completion_length": 167.0052146911621, + "epoch": 0.2609673790776153, + "grad_norm": 0.7895271947524055, + "kl": 0.3896484375, + "learning_rate": 1.8466245285161593e-05, + "loss": -0.0036, + "reward": 2.6482080221176147, + "reward_std": 0.2084397617727518, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.02236495795659721, + "rewards/tag_count_reward": 0.9934895932674408, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 116.19270896911621, - "epoch": 0.5247376311844077, - "grad_norm": 6.089832729503498, - "kl": 1.47265625, - "learning_rate": 1.089190812655374e-05, - "loss": 0.0148, - "reward": 2.733491837978363, - "reward_std": 0.37758616358041763, - "rewards/accuracy_reward": 0.8020833432674408, - "rewards/reasoning_steps_reward": 0.9843750298023224, - "rewards/repetition_penalty_reward": -0.019112566020339727, - "rewards/tag_count_reward": 0.9661458432674408, + "completion_length": 174.53125762939453, + "epoch": 0.26246719160104987, + "grad_norm": 0.8431262926543861, + "kl": 0.537109375, + "learning_rate": 1.8438217013941494e-05, + "loss": -0.0388, + "reward": 2.708277404308319, + "reward_std": 0.4050016924738884, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03130600415170193, + "rewards/tag_count_reward": 0.9791666716337204, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 122.48437690734863, - "epoch": 0.527736131934033, - "grad_norm": 1.0305128960164511, - "kl": 0.55419921875, - "learning_rate": 1.0787209582498315e-05, - "loss": 0.0177, - "reward": 2.8952815532684326, - "reward_std": 0.21657648496329784, - "rewards/accuracy_reward": 0.9322916865348816, - "rewards/reasoning_steps_reward": 0.9947916865348816, - "rewards/repetition_penalty_reward": -0.0161769341211766, - "rewards/tag_count_reward": 0.9843750149011612, + "completion_length": 179.11979293823242, + "epoch": 0.26396700412448443, + "grad_norm": 0.8034495401351996, + "kl": 0.494140625, + "learning_rate": 1.8409956631682475e-05, + "loss": -0.0131, + "reward": 2.5661210417747498, + "reward_std": 0.3050253167748451, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.023722857236862183, + "rewards/tag_count_reward": 0.9804687649011612, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 122.35416984558105, - "epoch": 0.5307346326836582, - "grad_norm": 1.3250160837320035, - "kl": 0.54052734375, - "learning_rate": 1.0682424133646712e-05, - "loss": 0.0118, - "reward": 2.8939477801322937, - "reward_std": 0.1932453876361251, - "rewards/accuracy_reward": 0.9218750149011612, - "rewards/reasoning_steps_reward": 0.9947916716337204, - "rewards/repetition_penalty_reward": -0.012302407994866371, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 201.26562881469727, + "epoch": 0.265466816647919, + "grad_norm": 0.9139688015624915, + "kl": 0.6591796875, + "learning_rate": 1.838146491574624e-05, + "loss": -0.0279, + "reward": 2.6243019104003906, + "reward_std": 0.411740280687809, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.024569710716605186, + "rewards/tag_count_reward": 0.9752604216337204, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 116.97916984558105, - "epoch": 0.5337331334332833, - "grad_norm": 1.1202124937220295, - "kl": 0.50048828125, - "learning_rate": 1.0577563347894286e-05, - "loss": 0.0343, - "reward": 2.8213730454444885, - "reward_std": 0.12381599424406886, - "rewards/accuracy_reward": 0.8489583432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017168688587844372, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 203.03125762939453, + "epoch": 0.26696662917135355, + "grad_norm": 0.7129835639250315, + "kl": 0.44970703125, + "learning_rate": 1.83527426498578e-05, + "loss": 0.0649, + "reward": 2.529602825641632, + "reward_std": 0.2042294256389141, + "rewards/accuracy_reward": 0.5677083395421505, + "rewards/reasoning_steps_reward": 0.9965277910232544, + "rewards/repetition_penalty_reward": -0.030727183911949396, + "rewards/tag_count_reward": 0.9960937649011612, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 115.73958587646484, - "epoch": 0.5367316341829086, - "grad_norm": 0.9845571322277763, - "kl": 0.4287109375, - "learning_rate": 1.0472638801453287e-05, - "loss": 0.0274, - "reward": 2.886799395084381, - "reward_std": 0.19121930375695229, - "rewards/accuracy_reward": 0.9010417014360428, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011638134252279997, - "rewards/tag_count_reward": 0.9973958432674408, + "completion_length": 228.640625, + "epoch": 0.26846644169478817, + "grad_norm": 1.3351814365191201, + "kl": 1.0810546875, + "learning_rate": 1.832379062408394e-05, + "loss": -0.035, + "reward": 2.6204107999801636, + "reward_std": 0.32189762964844704, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.031064926646649837, + "rewards/tag_count_reward": 0.9674479365348816, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 106.58333396911621, - "epoch": 0.5397301349325337, - "grad_norm": 1.3925075814052417, - "kl": 0.671875, - "learning_rate": 1.0367662077574898e-05, - "loss": 0.006, - "reward": 2.823054552078247, - "reward_std": 0.2618987523019314, - "rewards/accuracy_reward": 0.8593750149011612, - "rewards/reasoning_steps_reward": 0.998263880610466, - "rewards/repetition_penalty_reward": -0.009844740270636976, - "rewards/tag_count_reward": 0.9752604514360428, + "completion_length": 221.95833587646484, + "epoch": 0.26996625421822273, + "grad_norm": 0.6416068587211241, + "kl": 0.63525390625, + "learning_rate": 1.8294609634811482e-05, + "loss": -0.0238, + "reward": 2.7204131484031677, + "reward_std": 0.26637247391045094, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.02785079344175756, + "rewards/tag_count_reward": 0.9791666865348816, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 105.11979484558105, - "epoch": 0.5427286356821589, - "grad_norm": 2.280358299836913, - "kl": 0.8330078125, - "learning_rate": 1.0262644765270472e-05, - "loss": 0.0481, - "reward": 2.8652560710906982, - "reward_std": 0.35385340452194214, - "rewards/accuracy_reward": 0.9062500298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.008007904718397185, - "rewards/tag_count_reward": 0.973958358168602, + "completion_length": 228.32812881469727, + "epoch": 0.2714660667416573, + "grad_norm": 13.370041013613847, + "kl": 2.455078125, + "learning_rate": 1.8265200484725364e-05, + "loss": -0.0158, + "reward": 2.6316198110580444, + "reward_std": 0.4319092929363251, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.031140658538788557, + "rewards/tag_count_reward": 0.9492187649011612, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 98.85416793823242, - "epoch": 0.545727136431784, - "grad_norm": 8.675677318683208, - "kl": 1.4765625, - "learning_rate": 1.0157598458032165e-05, - "loss": 0.056, - "reward": 2.7417566776275635, - "reward_std": 0.3702938035130501, - "rewards/accuracy_reward": 0.7812500149011612, - "rewards/reasoning_steps_reward": 0.9947916865348816, - "rewards/repetition_penalty_reward": -0.004337083431892097, - "rewards/tag_count_reward": 0.9700520932674408, + "completion_length": 219.8541717529297, + "epoch": 0.27296587926509186, + "grad_norm": 6.890078416473243, + "kl": 4.005859375, + "learning_rate": 1.823556398278657e-05, + "loss": -0.0641, + "reward": 2.506425619125366, + "reward_std": 0.6498684138059616, + "rewards/accuracy_reward": 0.6614583432674408, + "rewards/reasoning_steps_reward": 0.9340277761220932, + "rewards/repetition_penalty_reward": -0.026560600381344557, + "rewards/tag_count_reward": 0.9375000149011612, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 94.06771087646484, - "epoch": 0.5487256371814093, - "grad_norm": 9.74056119805903, - "kl": 1.44775390625, - "learning_rate": 1.0052534752553063e-05, - "loss": 0.0459, - "reward": 2.8742672204971313, - "reward_std": 0.2908545406535268, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9774305671453476, - "rewards/repetition_penalty_reward": -0.0042049614421557635, - "rewards/tag_count_reward": 0.984375, + "completion_length": 224.7083396911621, + "epoch": 0.2744656917885264, + "grad_norm": 0.8460594296171978, + "kl": 0.955078125, + "learning_rate": 1.820570094420989e-05, + "loss": -0.0508, + "reward": 2.5241443514823914, + "reward_std": 0.5859170779585838, + "rewards/accuracy_reward": 0.6458333507180214, + "rewards/reasoning_steps_reward": 0.9565972536802292, + "rewards/repetition_penalty_reward": -0.026202970184385777, + "rewards/tag_count_reward": 0.9479167014360428, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 94.85937690734863, - "epoch": 0.5517241379310345, - "grad_norm": 6.447606788332282, - "kl": 1.68603515625, - "learning_rate": 9.947465247446942e-06, - "loss": 0.0884, - "reward": 2.8351686000823975, - "reward_std": 0.41117405891418457, - "rewards/accuracy_reward": 0.9010416865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.007279429468326271, - "rewards/tag_count_reward": 0.9622395932674408, + "completion_length": 227.98438262939453, + "epoch": 0.275965504311961, + "grad_norm": 2.8022393456487182, + "kl": 3.2421875, + "learning_rate": 1.817561219044148e-05, + "loss": -0.1265, + "reward": 2.574957013130188, + "reward_std": 0.7494899779558182, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.91493059694767, + "rewards/repetition_penalty_reward": -0.030077794566750526, + "rewards/tag_count_reward": 0.9192708432674408, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 92.75000381469727, - "epoch": 0.5547226386806596, - "grad_norm": 38.497156677104286, - "kl": 2.2099609375, - "learning_rate": 9.842401541967838e-06, - "loss": 0.1526, - "reward": 2.821943938732147, - "reward_std": 0.33023548871278763, - "rewards/accuracy_reward": 0.8593750298023224, - "rewards/reasoning_steps_reward": 0.989583358168602, - "rewards/repetition_penalty_reward": -0.0035769873938988894, - "rewards/tag_count_reward": 0.9765625, + "completion_length": 212.43750381469727, + "epoch": 0.27746531683539555, + "grad_norm": 1.6154380778220612, + "kl": 1.8671875, + "learning_rate": 1.814529854913626e-05, + "loss": -0.1303, + "reward": 2.467576324939728, + "reward_std": 0.7319738119840622, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.9184027910232544, + "rewards/repetition_penalty_reward": -0.027649471536278725, + "rewards/tag_count_reward": 0.9153645932674408, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 88.43229484558105, - "epoch": 0.5577211394302849, - "grad_norm": 1.83258753969118, - "kl": 0.9951171875, - "learning_rate": 9.737355234729531e-06, - "loss": 0.017, - "reward": 2.743259012699127, - "reward_std": 0.3549557775259018, - "rewards/accuracy_reward": 0.8072916865348816, - "rewards/reasoning_steps_reward": 0.9722222536802292, - "rewards/repetition_penalty_reward": -0.005005000915843993, - "rewards/tag_count_reward": 0.96875, + "completion_length": 225.0989646911621, + "epoch": 0.27896512935883017, + "grad_norm": 11.447938130338011, + "kl": 5.7109375, + "learning_rate": 1.8114760854135168e-05, + "loss": -0.0203, + "reward": 2.524846374988556, + "reward_std": 0.6509635746479034, + "rewards/accuracy_reward": 0.7031250149011612, + "rewards/reasoning_steps_reward": 0.9392361640930176, + "rewards/repetition_penalty_reward": -0.025066897738724947, + "rewards/tag_count_reward": 0.9075520932674408, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 90.44791984558105, - "epoch": 0.56071964017991, - "grad_norm": 1.8795466176250475, - "kl": 1.1171875, - "learning_rate": 9.632337922425106e-06, - "loss": -0.0016, - "reward": 2.803585946559906, - "reward_std": 0.39651068300008774, - "rewards/accuracy_reward": 0.8750000149011612, - "rewards/reasoning_steps_reward": 0.973958358168602, - "rewards/repetition_penalty_reward": -0.008914108970202506, - "rewards/tag_count_reward": 0.9635416716337204, + "completion_length": 190.2239646911621, + "epoch": 0.28046494188226473, + "grad_norm": 1.2909186023075674, + "kl": 2.044921875, + "learning_rate": 1.808399994544222e-05, + "loss": -0.163, + "reward": 2.393586277961731, + "reward_std": 0.7894448935985565, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.8593750298023224, + "rewards/repetition_penalty_reward": -0.01917426590807736, + "rewards/tag_count_reward": 0.8763021230697632, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 91.19791984558105, - "epoch": 0.5637181409295352, - "grad_norm": 43.26072438039279, - "kl": 3.529296875, - "learning_rate": 9.527361198546715e-06, - "loss": 0.1745, - "reward": 2.712351143360138, - "reward_std": 0.5067479014396667, - "rewards/accuracy_reward": 0.8385416716337204, - "rewards/reasoning_steps_reward": 0.947916716337204, - "rewards/repetition_penalty_reward": -0.005096889741253108, - "rewards/tag_count_reward": 0.9309895932674408, + "completion_length": 216.42708587646484, + "epoch": 0.2819647544056993, + "grad_norm": 1.5283509521120155, + "kl": 1.4580078125, + "learning_rate": 1.805301666920138e-05, + "loss": -0.0956, + "reward": 2.425880491733551, + "reward_std": 0.6612659990787506, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9236111491918564, + "rewards/repetition_penalty_reward": -0.027678666170686483, + "rewards/tag_count_reward": 0.9049479514360428, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 91.64062690734863, - "epoch": 0.5667166416791605, - "grad_norm": 11.228523577104813, - "kl": 1.244140625, - "learning_rate": 9.422436652105718e-06, - "loss": 0.1033, - "reward": 2.70283704996109, - "reward_std": 0.4315161928534508, - "rewards/accuracy_reward": 0.786458358168602, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.00462843308923766, - "rewards/tag_count_reward": 0.9557291716337204, + "completion_length": 211.4895896911621, + "epoch": 0.28346456692913385, + "grad_norm": 0.7866126552096253, + "kl": 0.4970703125, + "learning_rate": 1.802181187767332e-05, + "loss": -0.05, + "reward": 2.5987101793289185, + "reward_std": 0.4513225108385086, + "rewards/accuracy_reward": 0.697916679084301, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.03410231741145253, + "rewards/tag_count_reward": 0.9609375149011612, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 90.95833778381348, - "epoch": 0.5697151424287856, - "grad_norm": 6.30477846185162, - "kl": 1.2138671875, - "learning_rate": 9.317575866353293e-06, - "loss": 0.1868, - "reward": 2.829277455806732, - "reward_std": 0.42368828505277634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9791667014360428, - "rewards/repetition_penalty_reward": -0.004055907833389938, - "rewards/tag_count_reward": 0.9375000149011612, + "completion_length": 210.44271087646484, + "epoch": 0.2849643794525684, + "grad_norm": 1.8848248356549524, + "kl": 0.82763671875, + "learning_rate": 1.7990386429211945e-05, + "loss": 0.0075, + "reward": 2.6982374787330627, + "reward_std": 0.35126618295907974, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.02789103239774704, + "rewards/tag_count_reward": 0.9726562798023224, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 93.19791793823242, - "epoch": 0.5727136431784108, - "grad_norm": 7.430329871632132, - "kl": 2.046875, - "learning_rate": 9.212790417501688e-06, - "loss": 0.1354, - "reward": 2.5388087034225464, - "reward_std": 0.6883069798350334, - "rewards/accuracy_reward": 0.7031250149011612, - "rewards/reasoning_steps_reward": 0.9461805820465088, - "rewards/repetition_penalty_reward": -0.0050282846204936504, - "rewards/tag_count_reward": 0.8945312649011612, + "completion_length": 207.36980056762695, + "epoch": 0.286464191976003, + "grad_norm": 0.709217032508712, + "kl": 0.396484375, + "learning_rate": 1.7958741188240808e-05, + "loss": 0.003, + "reward": 2.6602558493614197, + "reward_std": 0.3209294006228447, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.03635886125266552, + "rewards/tag_count_reward": 0.9882812649011612, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 98.67187690734863, - "epoch": 0.5757121439280359, - "grad_norm": 5.377382713696718, - "kl": 1.591796875, - "learning_rate": 9.108091873446264e-06, - "loss": 0.2739, - "reward": 2.752804219722748, - "reward_std": 0.5407753884792328, - "rewards/accuracy_reward": 0.8854166716337204, - "rewards/reasoning_steps_reward": 0.9340278059244156, - "rewards/repetition_penalty_reward": -0.004140403761994094, - "rewards/tag_count_reward": 0.9375000149011612, + "completion_length": 196.33855056762695, + "epoch": 0.2879640044994376, + "grad_norm": 0.690421758006153, + "kl": 0.44775390625, + "learning_rate": 1.7926877025229313e-05, + "loss": 0.0006, + "reward": 2.72603303194046, + "reward_std": 0.37794774770736694, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03481765231117606, + "rewards/tag_count_reward": 0.9778645932674408, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 103.41666984558105, - "epoch": 0.5787106446776612, - "grad_norm": 4.0000714621638105, - "kl": 1.837890625, - "learning_rate": 9.003491792488438e-06, - "loss": 0.2431, - "reward": 2.7179980874061584, - "reward_std": 0.5703071355819702, - "rewards/accuracy_reward": 0.8437500298023224, - "rewards/reasoning_steps_reward": 0.9461805671453476, - "rewards/repetition_penalty_reward": -0.004224258096655831, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 209.93750381469727, + "epoch": 0.28946381702287216, + "grad_norm": 0.6642189099853063, + "kl": 0.52783203125, + "learning_rate": 1.789479481666878e-05, + "loss": 0.0102, + "reward": 2.6431049704551697, + "reward_std": 0.30012810230255127, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.03528041858226061, + "rewards/tag_count_reward": 0.9804687649011612, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 109.62500190734863, - "epoch": 0.5817091454272864, - "grad_norm": 6.784297297705125, - "kl": 1.486328125, - "learning_rate": 8.899001722059687e-06, - "loss": 0.4446, - "reward": 2.69238018989563, - "reward_std": 0.5326372683048248, - "rewards/accuracy_reward": 0.7916667014360428, - "rewards/reasoning_steps_reward": 0.9548610895872116, - "rewards/repetition_penalty_reward": -0.005970706697553396, - "rewards/tag_count_reward": 0.9518229216337204, + "completion_length": 194.14063262939453, + "epoch": 0.2909636295463067, + "grad_norm": 3.7403729265839174, + "kl": 1.55224609375, + "learning_rate": 1.786249544504834e-05, + "loss": -0.0579, + "reward": 2.4264142513275146, + "reward_std": 0.48074235022068024, + "rewards/accuracy_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.03973169345408678, + "rewards/tag_count_reward": 0.9713542014360428, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 115.79167175292969, - "epoch": 0.5847076461769115, - "grad_norm": 15.68094738303537, - "kl": 1.1884765625, - "learning_rate": 8.79463319744677e-06, - "loss": 0.553, - "reward": 2.6782643795013428, - "reward_std": 0.4203122928738594, - "rewards/accuracy_reward": 0.7656250149011612, - "rewards/reasoning_steps_reward": 0.9531250447034836, - "rewards/repetition_penalty_reward": -0.004027447925182059, - "rewards/tag_count_reward": 0.9635416716337204, + "completion_length": 199.9322967529297, + "epoch": 0.2924634420697413, + "grad_norm": 5.690002700824813, + "kl": 16.08984375, + "learning_rate": 1.7829979798830646e-05, + "loss": -0.0034, + "reward": 2.7136351466178894, + "reward_std": 0.36977453902363777, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.03766712639480829, + "rewards/tag_count_reward": 0.970052108168602, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 107.8281307220459, - "epoch": 0.5877061469265368, - "grad_norm": 9.395887357942577, - "kl": 1.21875, - "learning_rate": 8.690397740518279e-06, - "loss": 0.4238, - "reward": 2.8701762557029724, - "reward_std": 0.38676780834794044, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9687500149011612, - "rewards/repetition_penalty_reward": -0.0035217873519286513, - "rewards/tag_count_reward": 0.9674479365348816, + "completion_length": 185.40104293823242, + "epoch": 0.29396325459317585, + "grad_norm": 1.0626617195260077, + "kl": 2.44140625, + "learning_rate": 1.779724877242745e-05, + "loss": -0.0171, + "reward": 2.54072505235672, + "reward_std": 0.3664104826748371, + "rewards/accuracy_reward": 0.6406250149011612, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.046948717907071114, + "rewards/tag_count_reward": 0.9661458432674408, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 106.45312690734863, - "epoch": 0.5907046476761619, - "grad_norm": 4.57053771862413, - "kl": 0.8564453125, - "learning_rate": 8.586306858452653e-06, - "loss": 0.3129, - "reward": 2.78133487701416, - "reward_std": 0.38999389111995697, - "rewards/accuracy_reward": 0.833333358168602, - "rewards/reasoning_steps_reward": 0.9809028059244156, - "rewards/repetition_penalty_reward": -0.006859738496132195, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 187.1822967529297, + "epoch": 0.2954630671166104, + "grad_norm": 2.4457099387311376, + "kl": 4.498046875, + "learning_rate": 1.776430326617498e-05, + "loss": 0.0135, + "reward": 2.7412848472595215, + "reward_std": 0.20034414064139128, + "rewards/accuracy_reward": 0.796875, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.039965324103832245, + "rewards/tag_count_reward": 0.989583358168602, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 114.46354675292969, - "epoch": 0.5937031484257871, - "grad_norm": 11.162254121408726, - "kl": 1.4287109375, - "learning_rate": 8.48237204246785e-06, - "loss": 0.5309, - "reward": 2.7748392820358276, - "reward_std": 0.4289344698190689, - "rewards/accuracy_reward": 0.8697917014360428, - "rewards/reasoning_steps_reward": 0.9496528059244156, - "rewards/repetition_penalty_reward": -0.00554278859635815, - "rewards/tag_count_reward": 0.9609375, + "completion_length": 199.6354217529297, + "epoch": 0.296962879640045, + "grad_norm": 0.7391409818658768, + "kl": 0.830078125, + "learning_rate": 1.77311441863092e-05, + "loss": -0.053, + "reward": 2.715912103652954, + "reward_std": 0.4153308942914009, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.982638880610466, + "rewards/repetition_penalty_reward": -0.04927895776927471, + "rewards/tag_count_reward": 0.9700520932674408, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 111.04166793823242, - "epoch": 0.5967016491754122, - "grad_norm": 29.168148040502643, - "kl": 1.78515625, - "learning_rate": 8.378604766552756e-06, - "loss": 0.3664, - "reward": 2.7375897765159607, - "reward_std": 0.32485630363225937, - "rewards/accuracy_reward": 0.7812500149011612, - "rewards/reasoning_steps_reward": 0.9895833432674408, - "rewards/repetition_penalty_reward": -0.0072020008228719234, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 185.04687881469727, + "epoch": 0.2984626921634796, + "grad_norm": 0.7197244320446806, + "kl": 0.33837890625, + "learning_rate": 1.769777244494086e-05, + "loss": 0.0279, + "reward": 2.6263469457626343, + "reward_std": 0.35684073716402054, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.042055959813296795, + "rewards/tag_count_reward": 0.984375, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 125.81250381469727, - "epoch": 0.5997001499250375, - "grad_norm": 7.416359123988612, - "kl": 0.955078125, - "learning_rate": 8.275016486200498e-06, - "loss": 0.623, - "reward": 2.769349992275238, - "reward_std": 0.4332389682531357, - "rewards/accuracy_reward": 0.8385416865348816, - "rewards/reasoning_steps_reward": 0.9670139104127884, - "rewards/repetition_penalty_reward": -0.004955811775289476, - "rewards/tag_count_reward": 0.96875, + "completion_length": 187.06250762939453, + "epoch": 0.29996250468691416, + "grad_norm": 0.7272305025229391, + "kl": 0.3505859375, + "learning_rate": 1.7664188960030422e-05, + "loss": 0.0127, + "reward": 2.607051908969879, + "reward_std": 0.35574449226260185, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.03313909750431776, + "rewards/tag_count_reward": 0.9804687649011612, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 1024.0, - "epoch": 0.6026986506746627, - "grad_norm": 27.882830639662522, - "kl": 2.0904541015625, - "learning_rate": 8.17161863714381e-06, - "loss": 0.0835, - "reward": 0.0013020833721384406, - "reward_std": 0.0052083334885537624, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": 0.0, - "rewards/tag_count_reward": 0.0013020833721384406, + "completion_length": 172.6354217529297, + "epoch": 0.3014623172103487, + "grad_norm": 1.8628860952936386, + "kl": 0.56298828125, + "learning_rate": 1.7630394655362798e-05, + "loss": -0.0152, + "reward": 2.3712204694747925, + "reward_std": 0.8324102908372879, + "rewards/accuracy_reward": 0.6302083432674408, + "rewards/reasoning_steps_reward": 0.9062500596046448, + "rewards/repetition_penalty_reward": -0.035029674880206585, + "rewards/tag_count_reward": 0.8697916865348816, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 1014.3333435058594, - "epoch": 0.6056971514242878, - "grad_norm": 0.6042533284398771, - "kl": 0.113037109375, - "learning_rate": 8.06842263409257e-06, - "loss": 0.0419, - "reward": 0.029826793004758656, - "reward_std": 0.11930717201903462, - "rewards/accuracy_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.010416666977107525, - "rewards/repetition_penalty_reward": -0.00012112403055652976, - "rewards/tag_count_reward": 0.014322917093522847, + "completion_length": 163.6354217529297, + "epoch": 0.3029621297337833, + "grad_norm": 1.0953361912511788, + "kl": 0.48681640625, + "learning_rate": 1.7596390460521946e-05, + "loss": -0.0349, + "reward": 2.3163662552833557, + "reward_std": 0.572496585547924, + "rewards/accuracy_reward": 0.463541679084301, + "rewards/reasoning_steps_reward": 0.9496527910232544, + "rewards/repetition_penalty_reward": -0.025213704211637378, + "rewards/tag_count_reward": 0.9283854365348816, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 1019.1822967529297, - "epoch": 0.6086956521739131, - "grad_norm": 1.3832204376438657, - "kl": 0.21142578125, - "learning_rate": 7.965439869473664e-06, - "loss": 0.0271, - "reward": 0.018031382700428367, - "reward_std": 0.06882480159401894, - "rewards/accuracy_reward": 0.0052083334885537624, - "rewards/reasoning_steps_reward": 0.0052083334885537624, - "rewards/repetition_penalty_reward": -0.00019778480054810643, - "rewards/tag_count_reward": 0.007812500232830644, + "completion_length": 153.9427146911621, + "epoch": 0.30446194225721784, + "grad_norm": 1.9105972336877117, + "kl": 0.41455078125, + "learning_rate": 1.7562177310865296e-05, + "loss": 0.1019, + "reward": 2.7670071125030518, + "reward_std": 0.4459151364862919, + "rewards/accuracy_reward": 0.8593750298023224, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.021621438674628735, + "rewards/tag_count_reward": 0.9622395932674408, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 1024.0, - "epoch": 0.6116941529235382, - "grad_norm": 0.9850061937944327, - "kl": 0.1051025390625, - "learning_rate": 7.862681712173304e-06, - "loss": 0.0042, - "reward": 0.007812500232830644, - "reward_std": 0.027949271723628044, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.0, - "rewards/repetition_penalty_reward": 0.0, - "rewards/tag_count_reward": 0.007812500232830644, + "completion_length": 147.7395896911621, + "epoch": 0.3059617547806524, + "grad_norm": 1.1424371878407156, + "kl": 0.4150390625, + "learning_rate": 1.7527756147498026e-05, + "loss": 0.007, + "reward": 2.516347885131836, + "reward_std": 0.38063713908195496, + "rewards/accuracy_reward": 0.609375, + "rewards/reasoning_steps_reward": 0.9687500447034836, + "rewards/repetition_penalty_reward": -0.024016746319830418, + "rewards/tag_count_reward": 0.9622395932674408, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 1009.2812652587891, - "epoch": 0.6146926536731634, - "grad_norm": 0.44253170115395524, - "kl": 0.1737060546875, - "learning_rate": 7.760159506281955e-06, - "loss": 0.0639, - "reward": 0.04947916732635349, - "reward_std": 0.19791666930541396, - "rewards/accuracy_reward": 0.015625000465661287, - "rewards/reasoning_steps_reward": 0.015625000465661287, - "rewards/repetition_penalty_reward": 0.0, - "rewards/tag_count_reward": 0.01822916732635349, + "completion_length": 142.0208396911621, + "epoch": 0.30746156730408697, + "grad_norm": 0.8745105132943645, + "kl": 0.33447265625, + "learning_rate": 1.7493127917247168e-05, + "loss": 0.017, + "reward": 2.7387551069259644, + "reward_std": 0.25662550423294306, + "rewards/accuracy_reward": 0.8020833432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.022095664869993925, + "rewards/tag_count_reward": 0.9726562649011612, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 995.3854370117188, - "epoch": 0.6176911544227887, - "grad_norm": 3.8020369247401415, - "kl": 0.369384765625, - "learning_rate": 7.65788456984198e-06, - "loss": 0.1046, - "reward": 0.10146949626505375, - "reward_std": 0.3067319616675377, - "rewards/accuracy_reward": 0.026041667442768812, - "rewards/reasoning_steps_reward": 0.031250000931322575, - "rewards/repetition_penalty_reward": -9.300596138928086e-05, - "rewards/tag_count_reward": 0.044270834885537624, + "completion_length": 144.41667556762695, + "epoch": 0.3089613798275216, + "grad_norm": 0.841666968800414, + "kl": 0.3515625, + "learning_rate": 1.7458293572635573e-05, + "loss": -0.0625, + "reward": 2.5762619972229004, + "reward_std": 0.39808357134461403, + "rewards/accuracy_reward": 0.6406250149011612, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.025734632275998592, + "rewards/tag_count_reward": 0.9752604216337204, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 980.5520935058594, - "epoch": 0.6206896551724138, - "grad_norm": 2.4622521421222046, - "kl": 0.35595703125, - "learning_rate": 7.555868193598188e-06, - "loss": 0.1499, - "reward": 0.15076165180653334, - "reward_std": 0.4339568931609392, - "rewards/accuracy_reward": 0.041666666977107525, - "rewards/reasoning_steps_reward": 0.046875000931322575, - "rewards/repetition_penalty_reward": -0.0002800179208861664, - "rewards/tag_count_reward": 0.06250000186264515, + "completion_length": 145.5989646911621, + "epoch": 0.31046119235095615, + "grad_norm": 0.8971484142091678, + "kl": 0.4033203125, + "learning_rate": 1.7423254071855696e-05, + "loss": 0.0317, + "reward": 2.6069366335868835, + "reward_std": 0.4525081217288971, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.020233482588082552, + "rewards/tag_count_reward": 0.9518229365348816, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 985.7864685058594, - "epoch": 0.623688155922039, - "grad_norm": 508.3775666051304, - "kl": 21.75, - "learning_rate": 7.4541216397513705e-06, - "loss": 1.0583, - "reward": 0.15980212949216366, - "reward_std": 0.5328295826911926, - "rewards/accuracy_reward": 0.03645833441987634, - "rewards/reasoning_steps_reward": 0.057291668839752674, - "rewards/repetition_penalty_reward": -0.0003541221594787203, - "rewards/tag_count_reward": 0.06640625093132257, + "completion_length": 136.8541717529297, + "epoch": 0.3119610048743907, + "grad_norm": 0.7957437127746465, + "kl": 0.4130859375, + "learning_rate": 1.7388010378743255e-05, + "loss": -0.0478, + "reward": 2.5852027535438538, + "reward_std": 0.3762405589222908, + "rewards/accuracy_reward": 0.6614583432674408, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.015057688346132636, + "rewards/tag_count_reward": 0.970052108168602, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 980.5989685058594, - "epoch": 0.6266866566716641, - "grad_norm": 16.61142685491831, - "kl": 1.8037109375, - "learning_rate": 7.352656140715006e-06, - "loss": 0.1989, - "reward": 0.16810668725520372, - "reward_std": 0.42627183347940445, - "rewards/accuracy_reward": 0.03645833441987634, - "rewards/reasoning_steps_reward": 0.05381944612599909, - "rewards/repetition_penalty_reward": -0.00029610148339997977, - "rewards/tag_count_reward": 0.07812500139698386, + "completion_length": 141.765625, + "epoch": 0.3134608173978253, + "grad_norm": 0.7389331807682286, + "kl": 0.45556640625, + "learning_rate": 1.735256346275071e-05, + "loss": -0.0314, + "reward": 2.709416389465332, + "reward_std": 0.4428819492459297, + "rewards/accuracy_reward": 0.7968750298023224, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.022788605885580182, + "rewards/tag_count_reward": 0.9700520932674408, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 990.5625152587891, - "epoch": 0.6296851574212894, - "grad_norm": 5.532107109224189, - "kl": 0.662109375, - "learning_rate": 7.2514828978752434e-06, - "loss": 0.1355, - "reward": 0.1534007415175438, - "reward_std": 0.41936828941106796, - "rewards/accuracy_reward": 0.031250000931322575, - "rewards/reasoning_steps_reward": 0.04687500232830644, - "rewards/repetition_penalty_reward": -0.0002450980609864928, - "rewards/tag_count_reward": 0.07552083488553762, + "completion_length": 144.4114646911621, + "epoch": 0.31496062992125984, + "grad_norm": 0.8326796660055926, + "kl": 0.36376953125, + "learning_rate": 1.7316914298920592e-05, + "loss": -0.0442, + "reward": 2.6346763372421265, + "reward_std": 0.4088163301348686, + "rewards/accuracy_reward": 0.6979167014360428, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.01853547664359212, + "rewards/tag_count_reward": 0.9830729216337204, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 942.3229370117188, - "epoch": 0.6326836581709145, - "grad_norm": 14.765094590304768, - "kl": 0.7763671875, - "learning_rate": 7.150613080354315e-06, - "loss": 0.262, - "reward": 0.32499945536255836, - "reward_std": 0.7303483635187149, - "rewards/accuracy_reward": 0.06770833535119891, - "rewards/reasoning_steps_reward": 0.118055559694767, - "rewards/repetition_penalty_reward": -0.0013894452131353319, - "rewards/tag_count_reward": 0.1406250037252903, + "completion_length": 153.4947967529297, + "epoch": 0.3164604424446944, + "grad_norm": 0.8162698667548337, + "kl": 0.4296875, + "learning_rate": 1.7281063867858687e-05, + "loss": -0.0033, + "reward": 2.47624808549881, + "reward_std": 0.4159582331776619, + "rewards/accuracy_reward": 0.5520833507180214, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.025488153100013733, + "rewards/tag_count_reward": 0.9687500149011612, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 893.5156402587891, - "epoch": 0.6356821589205397, - "grad_norm": 13.402868555399907, - "kl": 2.154296875, - "learning_rate": 7.050057823777503e-06, - "loss": 0.4338, - "reward": 0.4721004366874695, - "reward_std": 0.9947518408298492, - "rewards/accuracy_reward": 0.13020833767950535, - "rewards/reasoning_steps_reward": 0.1545138955116272, - "rewards/repetition_penalty_reward": -0.0014238738513085991, - "rewards/tag_count_reward": 0.1888020895421505, + "completion_length": 173.90625381469727, + "epoch": 0.31796025496812896, + "grad_norm": 0.9992737614199144, + "kl": 0.435546875, + "learning_rate": 1.7245013155707076e-05, + "loss": 0.0243, + "reward": 2.806881844997406, + "reward_std": 0.2670608460903168, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9895833283662796, + "rewards/repetition_penalty_reward": -0.0316599381621927, + "rewards/tag_count_reward": 0.9947916716337204, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 882.8437652587891, - "epoch": 0.638680659670165, - "grad_norm": 533.9503112402125, - "kl": 46.984375, - "learning_rate": 6.9498282290438235e-06, - "loss": 2.4742, - "reward": 0.5224730595946312, - "reward_std": 0.9336346387863159, - "rewards/accuracy_reward": 0.1354166716337204, - "rewards/reasoning_steps_reward": 0.1666666716337204, - "rewards/repetition_penalty_reward": -0.0009644359670346603, - "rewards/tag_count_reward": 0.221354179084301, + "completion_length": 187.99480056762695, + "epoch": 0.3194600674915636, + "grad_norm": 15.20697089186289, + "kl": 1.06494140625, + "learning_rate": 1.7208763154116973e-05, + "loss": 0.1191, + "reward": 2.6377468705177307, + "reward_std": 0.27828123420476913, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.028485802467912436, + "rewards/tag_count_reward": 0.9856770932674408, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 821.7448120117188, - "epoch": 0.6416791604197901, - "grad_norm": 149.18286801865162, - "kl": 16.4375, - "learning_rate": 6.849935361100522e-06, - "loss": 1.2104, - "reward": 0.7398104518651962, - "reward_std": 1.1720031201839447, - "rewards/accuracy_reward": 0.2083333432674408, - "rewards/reasoning_steps_reward": 0.2586805671453476, - "rewards/repetition_penalty_reward": -0.0019430473039392382, - "rewards/tag_count_reward": 0.2747395932674408, + "completion_length": 301.7708435058594, + "epoch": 0.32095988001499814, + "grad_norm": 20.978080391787007, + "kl": 1.6455078125, + "learning_rate": 1.7172314860221494e-05, + "loss": 0.2695, + "reward": 2.531617820262909, + "reward_std": 0.30965887755155563, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.037392658181488514, + "rewards/tag_count_reward": 0.9752604216337204, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 840.9218902587891, - "epoch": 0.6446776611694153, - "grad_norm": 30.355372142343207, - "kl": 3.22265625, - "learning_rate": 6.750390247721549e-06, - "loss": 0.562, - "reward": 0.6818769425153732, - "reward_std": 1.1269759833812714, - "rewards/accuracy_reward": 0.17708333767950535, - "rewards/reasoning_steps_reward": 0.2413194589316845, - "rewards/repetition_penalty_reward": -0.0021508438512682915, - "rewards/tag_count_reward": 0.2656250074505806, + "completion_length": 445.5572967529297, + "epoch": 0.3224596925384327, + "grad_norm": 1.4225466343917932, + "kl": 1.603515625, + "learning_rate": 1.713566927660818e-05, + "loss": 0.0372, + "reward": 2.569119691848755, + "reward_std": 0.31232137233018875, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.03374499548226595, + "rewards/tag_count_reward": 0.9882812649011612, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 803.9166870117188, - "epoch": 0.6476761619190404, - "grad_norm": 23.89552805694713, - "kl": 1.5625, - "learning_rate": 6.651203878290139e-06, - "loss": 0.5275, - "reward": 0.8299884647130966, - "reward_std": 1.1851888000965118, - "rewards/accuracy_reward": 0.2031250037252903, - "rewards/reasoning_steps_reward": 0.2934028059244156, - "rewards/repetition_penalty_reward": -0.0011747851967811584, - "rewards/tag_count_reward": 0.334635429084301, + "completion_length": 265.4427146911621, + "epoch": 0.32395950506186727, + "grad_norm": 5.079986124787856, + "kl": 1.427734375, + "learning_rate": 1.7098827411291474e-05, + "loss": 0.1262, + "reward": 2.6483259201049805, + "reward_std": 0.43077797442674637, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.03570183180272579, + "rewards/tag_count_reward": 0.973958358168602, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 718.4583435058594, - "epoch": 0.6506746626686657, - "grad_norm": 30.60183508089706, - "kl": 2.451171875, - "learning_rate": 6.552387202585629e-06, - "loss": 0.7066, - "reward": 1.0522598177194595, - "reward_std": 1.2560299634933472, - "rewards/accuracy_reward": 0.2708333432674408, - "rewards/reasoning_steps_reward": 0.3732639104127884, - "rewards/repetition_penalty_reward": -0.0032958039082586765, - "rewards/tag_count_reward": 0.4114583432674408, + "completion_length": 216.0260467529297, + "epoch": 0.32545931758530183, + "grad_norm": 8.779280281061196, + "kl": 0.5869140625, + "learning_rate": 1.7061790277684935e-05, + "loss": 0.0993, + "reward": 2.6443240642547607, + "reward_std": 0.3985915258526802, + "rewards/accuracy_reward": 0.7447917014360428, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03753373399376869, + "rewards/tag_count_reward": 0.9648437649011612, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 718.8177185058594, - "epoch": 0.6536731634182908, - "grad_norm": 64.25492952390515, - "kl": 7.8125, - "learning_rate": 6.453951129574644e-06, - "loss": 1.02, - "reward": 1.1462399363517761, - "reward_std": 1.311941385269165, - "rewards/accuracy_reward": 0.3229166753590107, - "rewards/reasoning_steps_reward": 0.3888889104127884, - "rewards/repetition_penalty_reward": -0.0017635486146900803, - "rewards/tag_count_reward": 0.4361979216337204, + "completion_length": 196.91146087646484, + "epoch": 0.3269591301087364, + "grad_norm": 8.182772315553562, + "kl": 0.6435546875, + "learning_rate": 1.7024558894573408e-05, + "loss": 0.135, + "reward": 2.516254484653473, + "reward_std": 0.3703230023384094, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.03148850845173001, + "rewards/tag_count_reward": 0.9765625149011612, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 574.0937652587891, - "epoch": 0.656671664167916, - "grad_norm": 108.73570895032785, - "kl": 14.359375, - "learning_rate": 6.355906526206788e-06, - "loss": 1.8085, - "reward": 1.5105059146881104, - "reward_std": 1.3671056032180786, - "rewards/accuracy_reward": 0.432291679084301, - "rewards/reasoning_steps_reward": 0.5277777910232544, - "rewards/repetition_penalty_reward": -0.004251136677339673, - "rewards/tag_count_reward": 0.5546875149011612, + "completion_length": 202.86458587646484, + "epoch": 0.32845894263217096, + "grad_norm": 121464435.65252294, + "kl": 3831808.0, + "learning_rate": 1.698713428608497e-05, + "loss": 331049.0938, + "reward": 2.498434364795685, + "reward_std": 0.5815113484859467, + "rewards/accuracy_reward": 0.6197916865348816, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.02500331262126565, + "rewards/tag_count_reward": 0.934895858168602, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 675.6041870117188, - "epoch": 0.6596701649175413, - "grad_norm": 68.16871801389341, - "kl": 10.140625, - "learning_rate": 6.2582642162149775e-06, - "loss": 1.2516, - "reward": 1.230929583311081, - "reward_std": 1.2748659253120422, - "rewards/accuracy_reward": 0.3072916716337204, - "rewards/reasoning_steps_reward": 0.4531250149011612, - "rewards/repetition_penalty_reward": -0.003445470007136464, - "rewards/tag_count_reward": 0.4739583507180214, + "completion_length": 234.6614646911621, + "epoch": 0.3299587551556056, + "grad_norm": 5166178.874158303, + "kl": 174346.3125, + "learning_rate": 1.694951748166278e-05, + "loss": 24312.8574, + "reward": 2.523725211620331, + "reward_std": 0.5246653333306313, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9687500447034836, + "rewards/repetition_penalty_reward": -0.043983266688883305, + "rewards/tag_count_reward": 0.942708358168602, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 565.9896087646484, - "epoch": 0.6626686656671664, - "grad_norm": 37.36448608931068, - "kl": 7.1015625, - "learning_rate": 6.161034978920555e-06, - "loss": 1.2308, - "reward": 1.6033472418785095, - "reward_std": 1.2990168035030365, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.5798611491918564, - "rewards/repetition_penalty_reward": -0.003857686650007963, - "rewards/tag_count_reward": 0.5898437649011612, + "completion_length": 223.1145896911621, + "epoch": 0.33145856767904014, + "grad_norm": 431.25017200680634, + "kl": 21.69677734375, + "learning_rate": 1.6911709516036755e-05, + "loss": 3.13, + "reward": 2.538953959941864, + "reward_std": 0.5608213990926743, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.9600694924592972, + "rewards/repetition_penalty_reward": -0.029188551474362612, + "rewards/tag_count_reward": 0.9361979216337204, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 613.5521087646484, - "epoch": 0.6656671664167916, - "grad_norm": 42.67953665381653, - "kl": 6.75, - "learning_rate": 6.064229548043272e-06, - "loss": 1.1467, - "reward": 1.5173589289188385, - "reward_std": 1.3208832442760468, - "rewards/accuracy_reward": 0.416666679084301, - "rewards/reasoning_steps_reward": 0.5625000447034836, - "rewards/repetition_penalty_reward": -0.004776577930897474, - "rewards/tag_count_reward": 0.5429687649011612, + "completion_length": 276.19792556762695, + "epoch": 0.3329583802024747, + "grad_norm": 16.73765270319652, + "kl": 0.51953125, + "learning_rate": 1.6873711429195095e-05, + "loss": 0.3655, + "reward": 2.327203631401062, + "reward_std": 0.42651342228055, + "rewards/accuracy_reward": 0.4739583358168602, + "rewards/reasoning_steps_reward": 0.9635417014360428, + "rewards/repetition_penalty_reward": -0.03868194296956062, + "rewards/tag_count_reward": 0.9283854365348816, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 550.3437652587891, - "epoch": 0.6686656671664168, - "grad_norm": 33.134335694451195, - "kl": 5.140625, - "learning_rate": 5.9678586105163535e-06, - "loss": 1.071, - "reward": 1.621566891670227, - "reward_std": 1.2871178090572357, - "rewards/accuracy_reward": 0.4427083432674408, - "rewards/reasoning_steps_reward": 0.6006944626569748, - "rewards/repetition_penalty_reward": -0.005169248324818909, - "rewards/tag_count_reward": 0.583333358168602, + "completion_length": 263.83333587646484, + "epoch": 0.33445819272590926, + "grad_norm": 16.356024789507156, + "kl": 0.49072265625, + "learning_rate": 1.6835524266355698e-05, + "loss": 0.4278, + "reward": 2.46044385433197, + "reward_std": 0.501777321100235, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.034347846638411283, + "rewards/tag_count_reward": 0.9166666865348816, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 479.4739761352539, - "epoch": 0.671664167916042, - "grad_norm": 47.63738327986576, - "kl": 4.26953125, - "learning_rate": 5.8719328053066886e-06, - "loss": 1.1644, - "reward": 1.8401104509830475, - "reward_std": 1.2440669536590576, - "rewards/accuracy_reward": 0.5156250149011612, - "rewards/reasoning_steps_reward": 0.6840278059244156, - "rewards/repetition_penalty_reward": -0.005375679756980389, - "rewards/tag_count_reward": 0.645833358168602, + "completion_length": 249.49480438232422, + "epoch": 0.3359580052493438, + "grad_norm": 2132.408739618289, + "kl": 61.46533203125, + "learning_rate": 1.6797149077937395e-05, + "loss": 6.8572, + "reward": 2.410385847091675, + "reward_std": 0.3996973782777786, + "rewards/accuracy_reward": 0.5364583507180214, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.038832977414131165, + "rewards/tag_count_reward": 0.9283854365348816, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 488.96356201171875, - "epoch": 0.6746626686656672, - "grad_norm": 78.62849172701118, - "kl": 9.546875, - "learning_rate": 5.776462722240337e-06, - "loss": 1.7087, - "reward": 1.9351984560489655, - "reward_std": 1.149868130683899, - "rewards/accuracy_reward": 0.5104166865348816, - "rewards/reasoning_steps_reward": 0.763888955116272, - "rewards/repetition_penalty_reward": -0.004471685038879514, - "rewards/tag_count_reward": 0.665364608168602, + "completion_length": 238.79167556762695, + "epoch": 0.3374578177727784, + "grad_norm": 1.5282888356192013, + "kl": 0.42333984375, + "learning_rate": 1.6758586919531054e-05, + "loss": 0.4067, + "reward": 2.5930500626564026, + "reward_std": 0.40433521568775177, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9687499701976776, + "rewards/repetition_penalty_reward": -0.039762526750564575, + "rewards/tag_count_reward": 0.934895858168602, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 463.96875762939453, - "epoch": 0.6776611694152923, - "grad_norm": 50.31185538870822, - "kl": 9.921875, - "learning_rate": 5.6814589008334475e-06, - "loss": 1.669, - "reward": 1.9726946949958801, - "reward_std": 1.1515787243843079, - "rewards/accuracy_reward": 0.5312500149011612, - "rewards/reasoning_steps_reward": 0.7743055671453476, - "rewards/repetition_penalty_reward": -0.006038067163899541, - "rewards/tag_count_reward": 0.6731770932674408, + "completion_length": 230.0520896911621, + "epoch": 0.33895763029621295, + "grad_norm": 0.8528067604314957, + "kl": 0.41748046875, + "learning_rate": 1.671983885187055e-05, + "loss": 0.2675, + "reward": 2.4493818879127502, + "reward_std": 0.3525548577308655, + "rewards/accuracy_reward": 0.5468750074505806, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.028916888870298862, + "rewards/tag_count_reward": 0.9557291865348816, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 484.86981201171875, - "epoch": 0.6806596701649176, - "grad_norm": 25.321634983297866, - "kl": 7.1796875, - "learning_rate": 5.58693182912875e-06, - "loss": 1.4453, - "reward": 2.014142394065857, - "reward_std": 1.1831817626953125, - "rewards/accuracy_reward": 0.567708358168602, - "rewards/reasoning_steps_reward": 0.788194477558136, - "rewards/repetition_penalty_reward": -0.0032188262266572565, - "rewards/tag_count_reward": 0.661458358168602, + "completion_length": 258.1666793823242, + "epoch": 0.34045744281964757, + "grad_norm": 2.1183154515503513, + "kl": 0.462890625, + "learning_rate": 1.6680905940803596e-05, + "loss": 0.7181, + "reward": 2.582502543926239, + "reward_std": 0.494766004383564, + "rewards/accuracy_reward": 0.697916679084301, + "rewards/reasoning_steps_reward": 0.9739583730697632, + "rewards/repetition_penalty_reward": -0.02947685099206865, + "rewards/tag_count_reward": 0.9401041865348816, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 444.3385543823242, - "epoch": 0.6836581709145427, - "grad_norm": 31.357353768527005, - "kl": 2.78125, - "learning_rate": 5.4928919425377035e-06, - "loss": 1.1606, - "reward": 2.098558932542801, - "reward_std": 1.0963150560855865, - "rewards/accuracy_reward": 0.5885416865348816, - "rewards/reasoning_steps_reward": 0.8263889402151108, - "rewards/repetition_penalty_reward": -0.0038717142306268215, - "rewards/tag_count_reward": 0.6875000149011612, + "completion_length": 244.5989646911621, + "epoch": 0.34195725534308213, + "grad_norm": 4.12408868442433, + "kl": 0.43505859375, + "learning_rate": 1.66417892572624e-05, + "loss": 0.3667, + "reward": 2.5001248717308044, + "reward_std": 0.44768981635570526, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04154195077717304, + "rewards/tag_count_reward": 0.9635416865348816, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 514.4323043823242, - "epoch": 0.6866566716641679, - "grad_norm": 30.871800901209355, - "kl": 2.44921875, - "learning_rate": 5.399349622688479e-06, - "loss": 0.9849, - "reward": 1.9208540618419647, - "reward_std": 1.0992612540721893, - "rewards/accuracy_reward": 0.4739583358168602, - "rewards/reasoning_steps_reward": 0.8159722685813904, - "rewards/repetition_penalty_reward": -0.005795358796603978, - "rewards/tag_count_reward": 0.6367187649011612, + "completion_length": 252.33855056762695, + "epoch": 0.3434570678665167, + "grad_norm": 29.415667963205827, + "kl": 0.52783203125, + "learning_rate": 1.6602489877234235e-05, + "loss": 0.4238, + "reward": 2.625883162021637, + "reward_std": 0.4509083032608032, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9618055671453476, + "rewards/repetition_penalty_reward": -0.03514129016548395, + "rewards/tag_count_reward": 0.9492187649011612, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 407.44793701171875, - "epoch": 0.6896551724137931, - "grad_norm": 29.451600398259565, - "kl": 2.1015625, - "learning_rate": 5.306315196279864e-06, - "loss": 1.1623, - "reward": 2.2249255180358887, - "reward_std": 1.0277338027954102, - "rewards/accuracy_reward": 0.630208358168602, - "rewards/reasoning_steps_reward": 0.8836805820465088, - "rewards/repetition_penalty_reward": -0.0051092767680529505, - "rewards/tag_count_reward": 0.7161458432674408, + "completion_length": 259.85417556762695, + "epoch": 0.34495688038995126, + "grad_norm": 18.06563359145896, + "kl": 0.6884765625, + "learning_rate": 1.656300888173181e-05, + "loss": 0.6687, + "reward": 2.646353542804718, + "reward_std": 0.518807977437973, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.028559860307723284, + "rewards/tag_count_reward": 0.938802108168602, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 408.5989685058594, - "epoch": 0.6926536731634183, - "grad_norm": 33.38250055371977, - "kl": 5.451171875, - "learning_rate": 5.213798933941237e-06, - "loss": 1.5075, - "reward": 2.2532132267951965, - "reward_std": 1.0094469636678696, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.8767361044883728, - "rewards/repetition_penalty_reward": -0.003731334174517542, - "rewards/tag_count_reward": 0.7343750298023224, + "completion_length": 231.1197967529297, + "epoch": 0.3464566929133858, + "grad_norm": 15.68283618975048, + "kl": 3.099609375, + "learning_rate": 1.6523347356763572e-05, + "loss": 0.8542, + "reward": 2.695122182369232, + "reward_std": 0.4237038269639015, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.034478533547371626, + "rewards/tag_count_reward": 0.9570312649011612, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 442.53125762939453, - "epoch": 0.6956521739130435, - "grad_norm": 31.956941133841717, - "kl": 6.71875, - "learning_rate": 5.121811049098728e-06, - "loss": 1.5082, - "reward": 2.063460409641266, - "reward_std": 1.0131369531154633, - "rewards/accuracy_reward": 0.5104166865348816, - "rewards/reasoning_steps_reward": 0.864583358168602, - "rewards/repetition_penalty_reward": -0.006852172431536019, - "rewards/tag_count_reward": 0.6953125149011612, + "completion_length": 302.8698081970215, + "epoch": 0.3479565054368204, + "grad_norm": 193.87550647562585, + "kl": 9.875, + "learning_rate": 1.6483506393303807e-05, + "loss": 1.5385, + "reward": 2.4312188625335693, + "reward_std": 0.5264012217521667, + "rewards/accuracy_reward": 0.598958358168602, + "rewards/reasoning_steps_reward": 0.9496528208255768, + "rewards/repetition_penalty_reward": -0.03275693487375975, + "rewards/tag_count_reward": 0.915364608168602, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 461.07813262939453, - "epoch": 0.6986506746626686, - "grad_norm": 14.583503473769614, - "kl": 4.49609375, - "learning_rate": 5.030361696847706e-06, - "loss": 1.3138, - "reward": 2.1591392755508423, - "reward_std": 1.082452654838562, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.8767361640930176, - "rewards/repetition_penalty_reward": -0.004055280558532104, - "rewards/tag_count_reward": 0.6822916865348816, + "completion_length": 264.13542556762695, + "epoch": 0.34945631796025495, + "grad_norm": 36.22860317274317, + "kl": 0.8671875, + "learning_rate": 1.644348708726263e-05, + "loss": 0.6915, + "reward": 2.4658846259117126, + "reward_std": 0.6284756064414978, + "rewards/accuracy_reward": 0.5885416716337204, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.023264775052666664, + "rewards/tag_count_reward": 0.9388020932674408, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 465.6146011352539, - "epoch": 0.7016491754122939, - "grad_norm": 31.93736272659645, - "kl": 2.73046875, - "learning_rate": 4.939460972831684e-06, - "loss": 1.1015, - "reward": 2.054699957370758, - "reward_std": 1.0152440816164017, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.8628472238779068, - "rewards/repetition_penalty_reward": -0.004761879798024893, - "rewards/tag_count_reward": 0.6757812649011612, + "completion_length": 286.3333435058594, + "epoch": 0.35095613048368957, + "grad_norm": 9.424469802346994, + "kl": 2.83984375, + "learning_rate": 1.640329053945585e-05, + "loss": 0.8706, + "reward": 2.417014181613922, + "reward_std": 0.6688820198178291, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.940972238779068, + "rewards/repetition_penalty_reward": -0.023958142613992095, + "rewards/tag_count_reward": 0.895833358168602, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 424.4479293823242, - "epoch": 0.704647676161919, - "grad_norm": 22.78953660465126, - "kl": 1.619140625, - "learning_rate": 4.849118912127817e-06, - "loss": 1.1123, - "reward": 2.1616870760917664, - "reward_std": 1.1319475769996643, - "rewards/accuracy_reward": 0.6354166865348816, - "rewards/reasoning_steps_reward": 0.8315972685813904, - "rewards/repetition_penalty_reward": -0.004545638337731361, - "rewards/tag_count_reward": 0.6992187649011612, + "completion_length": 234.5885467529297, + "epoch": 0.35245594300712413, + "grad_norm": 194.2142920957737, + "kl": 16.7421875, + "learning_rate": 1.6362917855574694e-05, + "loss": 1.9856, + "reward": 2.713364839553833, + "reward_std": 0.46920711547136307, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.029690792318433523, + "rewards/tag_count_reward": 0.9479166716337204, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 432.97918701171875, - "epoch": 0.7076461769115442, - "grad_norm": 32.912310324473026, - "kl": 3.009765625, - "learning_rate": 4.759345488139054e-06, - "loss": 1.2209, - "reward": 2.037692070007324, - "reward_std": 1.0419560819864273, - "rewards/accuracy_reward": 0.5208333507180214, - "rewards/reasoning_steps_reward": 0.8315972238779068, - "rewards/repetition_penalty_reward": -0.0022385247866623104, - "rewards/tag_count_reward": 0.6875000298023224, + "completion_length": 288.375, + "epoch": 0.3539557555305587, + "grad_norm": 27.915108177299196, + "kl": 5.3359375, + "learning_rate": 1.6322370146155372e-05, + "loss": 1.2636, + "reward": 2.4878005385398865, + "reward_std": 0.5347049832344055, + "rewards/accuracy_reward": 0.630208358168602, + "rewards/reasoning_steps_reward": 0.9652778059244156, + "rewards/repetition_penalty_reward": -0.028258674778044224, + "rewards/tag_count_reward": 0.9205729514360428, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 415.2864685058594, - "epoch": 0.7106446776611695, - "grad_norm": 11.37715164851099, - "kl": 1.5185546875, - "learning_rate": 4.670150611493116e-06, - "loss": 1.1303, - "reward": 2.193199932575226, - "reward_std": 1.067429170012474, - "rewards/accuracy_reward": 0.6406250149011612, - "rewards/reasoning_steps_reward": 0.8437500447034836, - "rewards/repetition_penalty_reward": -0.002112621790729463, - "rewards/tag_count_reward": 0.7109375149011612, + "completion_length": 230.7604217529297, + "epoch": 0.35545556805399325, + "grad_norm": 17.59840442919455, + "kl": 0.9892578125, + "learning_rate": 1.6281648526548556e-05, + "loss": 0.6013, + "reward": 2.604456603527069, + "reward_std": 0.3973645493388176, + "rewards/accuracy_reward": 0.734375, + "rewards/reasoning_steps_reward": 0.9392361789941788, + "rewards/repetition_penalty_reward": -0.030092121567577124, + "rewards/tag_count_reward": 0.9609375149011612, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 432.12500762939453, - "epoch": 0.7136431784107946, - "grad_norm": 21.21508512413439, - "kl": 2.18359375, - "learning_rate": 4.581544128948413e-06, - "loss": 1.1413, - "reward": 2.1001856327056885, - "reward_std": 1.1043085157871246, - "rewards/accuracy_reward": 0.5885416865348816, - "rewards/reasoning_steps_reward": 0.822916716337204, - "rewards/repetition_penalty_reward": -0.005283167352899909, - "rewards/tag_count_reward": 0.6940104365348816, + "completion_length": 259.2395935058594, + "epoch": 0.3569553805774278, + "grad_norm": 24.885038866025347, + "kl": 1.595703125, + "learning_rate": 1.6240754116888673e-05, + "loss": 0.8111, + "reward": 2.5495967864990234, + "reward_std": 0.5210294723510742, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.027226188685745, + "rewards/tag_count_reward": 0.9414062798023224, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 455.89064025878906, - "epoch": 0.7166416791604198, - "grad_norm": 3.7024792757899943, - "kl": 1.119140625, - "learning_rate": 4.493535822306993e-06, - "loss": 1.0115, - "reward": 1.9977717995643616, - "reward_std": 1.0762813091278076, - "rewards/accuracy_reward": 0.546875, - "rewards/reasoning_steps_reward": 0.7881944477558136, - "rewards/repetition_penalty_reward": -0.00396440684562549, - "rewards/tag_count_reward": 0.6666666865348816, + "completion_length": 241.46875762939453, + "epoch": 0.3584551931008624, + "grad_norm": 563.2951701790159, + "kl": 37.115234375, + "learning_rate": 1.6199688042063118e-05, + "loss": 4.5597, + "reward": 2.6697729229927063, + "reward_std": 0.3740931283682585, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.03378618019632995, + "rewards/tag_count_reward": 0.95703125, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 377.89583587646484, - "epoch": 0.719640179910045, - "grad_norm": 3.6760487188389717, - "kl": 0.953125, - "learning_rate": 4.406135407334669e-06, - "loss": 1.1086, - "reward": 2.1464386582374573, - "reward_std": 1.1137472093105316, - "rewards/accuracy_reward": 0.5989583432674408, - "rewards/reasoning_steps_reward": 0.8159722238779068, - "rewards/repetition_penalty_reward": -0.002866994822397828, - "rewards/tag_count_reward": 0.7343750298023224, + "completion_length": 207.78646087646484, + "epoch": 0.35995500562429694, + "grad_norm": 194.22792255309713, + "kl": 11.2109375, + "learning_rate": 1.6158451431681292e-05, + "loss": 2.2416, + "reward": 2.570555090904236, + "reward_std": 0.43018101900815964, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.0240629562176764, + "rewards/tag_count_reward": 0.9713541716337204, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 384.5364685058594, - "epoch": 0.7226386806596702, - "grad_norm": 19.529434988342086, - "kl": 1.9482421875, - "learning_rate": 4.319352532688444e-06, - "loss": 1.2755, - "reward": 2.1653665900230408, - "reward_std": 1.1197065114974976, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.8142361342906952, - "rewards/repetition_penalty_reward": -0.003036284673726186, - "rewards/tag_count_reward": 0.7291666865348816, + "completion_length": 227.88021087646484, + "epoch": 0.36145481814773156, + "grad_norm": 16.659702726061884, + "kl": 1.123046875, + "learning_rate": 1.6117045420043545e-05, + "loss": 0.5796, + "reward": 2.6110920906066895, + "reward_std": 0.44603806734085083, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.03126898966729641, + "rewards/tag_count_reward": 0.9635416865348816, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 377.8020935058594, - "epoch": 0.7256371814092953, - "grad_norm": 46.6982672472414, - "kl": 2.0419921875, - "learning_rate": 4.2331967788513295e-06, - "loss": 1.2806, - "reward": 2.2072007060050964, - "reward_std": 1.220471739768982, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.7881944924592972, - "rewards/repetition_penalty_reward": -0.0028688511229120195, - "rewards/tag_count_reward": 0.723958358168602, + "completion_length": 225.95833587646484, + "epoch": 0.3629546306711661, + "grad_norm": 7.320890528168202, + "kl": 2.55078125, + "learning_rate": 1.6075471146109957e-05, + "loss": 0.7141, + "reward": 2.701682925224304, + "reward_std": 0.4787740185856819, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9652778059244156, + "rewards/repetition_penalty_reward": -0.026615716982632875, + "rewards/tag_count_reward": 0.966145858168602, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 396.3802261352539, - "epoch": 0.7286356821589205, - "grad_norm": 51.973275524782814, - "kl": 2.06640625, - "learning_rate": 4.1476776570747065e-06, - "loss": 1.0713, - "reward": 1.942684918642044, - "reward_std": 1.1059914082288742, - "rewards/accuracy_reward": 0.4687500223517418, - "rewards/reasoning_steps_reward": 0.7656250447034836, - "rewards/repetition_penalty_reward": -0.005231783725321293, - "rewards/tag_count_reward": 0.7135416865348816, + "completion_length": 247.0260467529297, + "epoch": 0.3644544431946007, + "grad_norm": 112.81198496662604, + "kl": 10.6669921875, + "learning_rate": 1.603372975346903e-05, + "loss": 1.84, + "reward": 2.496858596801758, + "reward_std": 0.453279048204422, + "rewards/accuracy_reward": 0.6093750074505806, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.02614493854343891, + "rewards/tag_count_reward": 0.9518229365348816, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 428.0104293823242, - "epoch": 0.7316341829085458, - "grad_norm": 4.890418211224268, - "kl": 1.2119140625, - "learning_rate": 4.0628046083283134e-06, - "loss": 1.0296, - "reward": 1.890177071094513, - "reward_std": 1.1783712059259415, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.7187500149011612, - "rewards/repetition_penalty_reward": -0.0030522070010192692, - "rewards/tag_count_reward": 0.6744791865348816, + "completion_length": 246.4895896911621, + "epoch": 0.36595425571803525, + "grad_norm": 21.266417808536865, + "kl": 2.26171875, + "learning_rate": 1.599182239030621e-05, + "loss": 0.7589, + "reward": 2.5812110900878906, + "reward_std": 0.5132277607917786, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.025559830013662577, + "rewards/tag_count_reward": 0.9505208432674408, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 387.3073043823242, - "epoch": 0.7346326836581709, - "grad_norm": 5.532289024750771, - "kl": 1.078125, - "learning_rate": 3.9785870022580075e-06, - "loss": 1.1278, - "reward": 2.0230748057365417, - "reward_std": 1.2535961270332336, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.7326389104127884, - "rewards/repetition_penalty_reward": -0.005137050000485033, - "rewards/tag_count_reward": 0.7122395932674408, + "completion_length": 256.6614646911621, + "epoch": 0.3674540682414698, + "grad_norm": 256.9075653283988, + "kl": 19.359375, + "learning_rate": 1.594975020937233e-05, + "loss": 2.8854, + "reward": 2.6606767177581787, + "reward_std": 0.455551914870739, + "rewards/accuracy_reward": 0.7760417014360428, + "rewards/reasoning_steps_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.025521302595734596, + "rewards/tag_count_reward": 0.9414062649011612, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 348.17189025878906, - "epoch": 0.7376311844077961, - "grad_norm": 9.03974459360704, - "kl": 0.9912109375, - "learning_rate": 3.895034136151388e-06, - "loss": 1.2011, - "reward": 2.1917267441749573, - "reward_std": 1.1604805290699005, - "rewards/accuracy_reward": 0.661458358168602, - "rewards/reasoning_steps_reward": 0.7812499850988388, - "rewards/repetition_penalty_reward": -0.004887988092377782, - "rewards/tag_count_reward": 0.7539062649011612, + "completion_length": 260.8906364440918, + "epoch": 0.3689538807649044, + "grad_norm": 54.830877036198636, + "kl": 7.4140625, + "learning_rate": 1.590751436795186e-05, + "loss": 1.4602, + "reward": 2.603227972984314, + "reward_std": 0.5533816516399384, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.025678453966975212, + "rewards/tag_count_reward": 0.946614608168602, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 371.46876525878906, - "epoch": 0.7406296851574213, - "grad_norm": 10.803578128694179, - "kl": 0.986328125, - "learning_rate": 3.8121552339114166e-06, - "loss": 1.1353, - "reward": 2.1688408851623535, - "reward_std": 1.189782053232193, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.7847222536802292, - "rewards/repetition_penalty_reward": -0.00390218710526824, - "rewards/tag_count_reward": 0.7421875298023224, + "completion_length": 281.8541679382324, + "epoch": 0.37045369328833894, + "grad_norm": 30.40863534370435, + "kl": 0.8173828125, + "learning_rate": 1.5865116027831123e-05, + "loss": 0.6622, + "reward": 2.464226543903351, + "reward_std": 0.5860697254538536, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.030131183564662933, + "rewards/tag_count_reward": 0.9231770932674408, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 352.3073043823242, - "epoch": 0.7436281859070465, - "grad_norm": 15.262258125639379, - "kl": 0.8876953125, - "learning_rate": 3.729959445038136e-06, - "loss": 1.1792, - "reward": 2.193057656288147, - "reward_std": 1.1223880648612976, - "rewards/accuracy_reward": 0.6458333656191826, - "rewards/reasoning_steps_reward": 0.7951389104127884, - "rewards/repetition_penalty_reward": -0.004425037943292409, - "rewards/tag_count_reward": 0.7565104365348816, + "completion_length": 252.2604217529297, + "epoch": 0.37195350581177355, + "grad_norm": 23.77354715994608, + "kl": 0.6748046875, + "learning_rate": 1.5822556355266302e-05, + "loss": 0.6163, + "reward": 2.6034846901893616, + "reward_std": 0.5462495759129524, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02498761680908501, + "rewards/tag_count_reward": 0.9479166865348816, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 383.93751525878906, - "epoch": 0.7466266866566716, - "grad_norm": 2160.279743864228, - "kl": 34.333984375, - "learning_rate": 3.6484558436185936e-06, - "loss": 4.9009, - "reward": 2.1113908290863037, - "reward_std": 1.0678627490997314, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 0.8263889253139496, - "rewards/repetition_penalty_reward": -0.005362731404602528, - "rewards/tag_count_reward": 0.7278645932674408, + "completion_length": 221.90105056762695, + "epoch": 0.3734533183352081, + "grad_norm": 14.516331745406989, + "kl": 1.0048828125, + "learning_rate": 1.577983652095137e-05, + "loss": 0.4184, + "reward": 2.332468032836914, + "reward_std": 0.43112215772271156, + "rewards/accuracy_reward": 0.4322916716337204, + "rewards/reasoning_steps_reward": 0.967013955116272, + "rewards/repetition_penalty_reward": -0.03168142307549715, + "rewards/tag_count_reward": 0.9648437649011612, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 367.54689025878906, - "epoch": 0.7496251874062968, - "grad_norm": 43.212145640693585, - "kl": 1.115234375, - "learning_rate": 3.5676534273251072e-06, - "loss": 1.1604, - "reward": 2.2231311798095703, - "reward_std": 1.064414381980896, - "rewards/accuracy_reward": 0.6406250074505806, - "rewards/reasoning_steps_reward": 0.8472222685813904, - "rewards/repetition_penalty_reward": -0.004299435357097536, - "rewards/tag_count_reward": 0.739583358168602, + "completion_length": 211.7447967529297, + "epoch": 0.3749531308586427, + "grad_norm": 46.523365934279404, + "kl": 4.62451171875, + "learning_rate": 1.5736957699985887e-05, + "loss": 0.9192, + "reward": 2.580030918121338, + "reward_std": 0.3245397238060832, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.031948281452059746, + "rewards/tag_count_reward": 0.9713541716337204, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 352.95314025878906, - "epoch": 0.7526236881559221, - "grad_norm": 1377.7529772591768, - "kl": 48.345703125, - "learning_rate": 3.487561116421958e-06, - "loss": 4.8422, - "reward": 2.2985642552375793, - "reward_std": 0.997696116566658, - "rewards/accuracy_reward": 0.6822917014360428, - "rewards/reasoning_steps_reward": 0.8715277910232544, - "rewards/repetition_penalty_reward": -0.003953163628466427, - "rewards/tag_count_reward": 0.7486979365348816, + "completion_length": 229.50521087646484, + "epoch": 0.37645294338207724, + "grad_norm": 46.335938002602354, + "kl": 4.9765625, + "learning_rate": 1.5693921071842688e-05, + "loss": 1.1019, + "reward": 2.6523364782333374, + "reward_std": 0.4070918932557106, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.03212546557188034, + "rewards/tag_count_reward": 0.9674479216337204, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 307.4739646911621, - "epoch": 0.7556221889055472, - "grad_norm": 237.6223200520559, - "kl": 7.603515625, - "learning_rate": 3.408187752780624e-06, - "loss": 1.9096, - "reward": 2.386752665042877, - "reward_std": 0.99474136531353, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.901041716337204, - "rewards/repetition_penalty_reward": -0.01298706023953855, - "rewards/tag_count_reward": 0.7903646230697632, + "completion_length": 237.23438262939453, + "epoch": 0.3779527559055118, + "grad_norm": 19.678447582967497, + "kl": 1.244140625, + "learning_rate": 1.5650727820335417e-05, + "loss": 0.624, + "reward": 2.570410132408142, + "reward_std": 0.5100410208106041, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.032888590823858976, + "rewards/tag_count_reward": 0.9453125149011612, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 272.8541793823242, - "epoch": 0.7586206896551724, - "grad_norm": 103.11957636266683, - "kl": 1.3759765625, - "learning_rate": 3.329542098903674e-06, - "loss": 1.2511, - "reward": 2.4556241035461426, - "reward_std": 0.754611574113369, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9288194626569748, - "rewards/repetition_penalty_reward": -0.010955846635624766, - "rewards/tag_count_reward": 0.829427108168602, + "completion_length": 247.0572967529297, + "epoch": 0.37945256842894637, + "grad_norm": 10.000060356187625, + "kl": 1.7998046875, + "learning_rate": 1.5607379133585978e-05, + "loss": 0.4729, + "reward": 2.7505863904953003, + "reward_std": 0.4299619048833847, + "rewards/accuracy_reward": 0.859375, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.02675750060006976, + "rewards/tag_count_reward": 0.9544270932674408, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 216.80209350585938, - "epoch": 0.7616191904047976, - "grad_norm": 72.74924199958468, - "kl": 1.1376953125, - "learning_rate": 3.2516328369574247e-06, - "loss": 1.1836, - "reward": 2.5425453782081604, - "reward_std": 0.6245445907115936, - "rewards/accuracy_reward": 0.713541679084301, - "rewards/reasoning_steps_reward": 0.9479166865348816, - "rewards/repetition_penalty_reward": -0.005631837877444923, - "rewards/tag_count_reward": 0.8867187649011612, + "completion_length": 255.86980056762695, + "epoch": 0.38095238095238093, + "grad_norm": 41.51304950449503, + "kl": 6.734375, + "learning_rate": 1.5563876203991856e-05, + "loss": 1.5806, + "reward": 2.535602629184723, + "reward_std": 0.5331927165389061, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.02299123560078442, + "rewards/tag_count_reward": 0.9335937649011612, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 288.1458396911621, - "epoch": 0.7646176911544228, - "grad_norm": 18.047951176172486, - "kl": 0.7626953125, - "learning_rate": 3.174468567813461e-06, - "loss": 1.2569, - "reward": 2.4596019983291626, - "reward_std": 0.9773845970630646, - "rewards/accuracy_reward": 0.755208358168602, - "rewards/reasoning_steps_reward": 0.8854167014360428, - "rewards/repetition_penalty_reward": -0.003939674003049731, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 231.61979293823242, + "epoch": 0.38245219347581555, + "grad_norm": 18.09973578264405, + "kl": 4.19921875, + "learning_rate": 1.55202202281933e-05, + "loss": 0.9442, + "reward": 2.68158096075058, + "reward_std": 0.38162920624017715, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9826389253139496, + "rewards/repetition_penalty_reward": -0.025016394443809986, + "rewards/tag_count_reward": 0.9583333432674408, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 279.9635467529297, - "epoch": 0.767616191904048, - "grad_norm": 2617.8075543625873, - "kl": 1.8046875, - "learning_rate": 3.0980578100991356e-06, - "loss": 0.9846, - "reward": 2.420164167881012, - "reward_std": 0.901138424873352, + "completion_length": 236.7083396911621, + "epoch": 0.3839520059992501, + "grad_norm": 23.810294289503528, + "kl": 1.501953125, + "learning_rate": 1.5476412407040445e-05, + "loss": 0.6148, + "reward": 2.6228463649749756, + "reward_std": 0.43365930393338203, "rewards/accuracy_reward": 0.739583358168602, - "rewards/reasoning_steps_reward": 0.8559027910232544, - "rewards/repetition_penalty_reward": -0.0034470059908926487, - "rewards/tag_count_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.02689332398585975, + "rewards/tag_count_reward": 0.946614608168602, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 244.65104293823242, - "epoch": 0.7706146926536732, - "grad_norm": 10.063702176948555, - "kl": 0.7568359375, - "learning_rate": 3.022408999257148e-06, - "loss": 1.0871, - "reward": 2.4764610528945923, - "reward_std": 0.8722782582044601, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.8697917014360428, - "rewards/repetition_penalty_reward": -0.004007782437838614, - "rewards/tag_count_reward": 0.845052108168602, + "completion_length": 292.3333435058594, + "epoch": 0.3854518185226847, + "grad_norm": 40.613619721197324, + "kl": 2.0693359375, + "learning_rate": 1.5432453945560223e-05, + "loss": 0.8716, + "reward": 2.567777395248413, + "reward_std": 0.5245895758271217, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.028576852288097143, + "rewards/tag_count_reward": 0.9140625298023224, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 236.68750762939453, - "epoch": 0.7736131934032984, - "grad_norm": 57.97300190458974, - "kl": 0.9248046875, - "learning_rate": 2.947530486614303e-06, - "loss": 1.1472, - "reward": 2.488058924674988, - "reward_std": 0.8399006128311157, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9062500149011612, - "rewards/repetition_penalty_reward": -0.008034905651584268, - "rewards/tag_count_reward": 0.860677108168602, + "completion_length": 249.99480438232422, + "epoch": 0.38695163104611924, + "grad_norm": 21.07252470454373, + "kl": 3.251953125, + "learning_rate": 1.5388346052923268e-05, + "loss": 1.0205, + "reward": 2.599771797657013, + "reward_std": 0.545002818107605, + "rewards/accuracy_reward": 0.7187500074505806, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.028700455324724317, + "rewards/tag_count_reward": 0.9375000298023224, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 290.46875762939453, - "epoch": 0.7766116941529235, - "grad_norm": 911.2077972417934, - "kl": 33.609375, - "learning_rate": 2.8734305384595598e-06, - "loss": 1.6782, - "reward": 2.371084213256836, - "reward_std": 0.9771549105644226, - "rewards/accuracy_reward": 0.6822916865348816, - "rewards/reasoning_steps_reward": 0.8663194626569748, - "rewards/repetition_penalty_reward": -0.005651986633893102, - "rewards/tag_count_reward": 0.8281250149011612, + "completion_length": 210.9166717529297, + "epoch": 0.3884514435695538, + "grad_norm": 142.3585656392472, + "kl": 10.0546875, + "learning_rate": 1.534408994241063e-05, + "loss": 1.7735, + "reward": 2.710343062877655, + "reward_std": 0.4035199508070946, + "rewards/accuracy_reward": 0.7916666716337204, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.03227862901985645, + "rewards/tag_count_reward": 0.9752604216337204, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 237.38542556762695, - "epoch": 0.7796101949025487, - "grad_norm": 326.19482464697984, - "kl": 3.037109375, - "learning_rate": 2.8001173351314625e-06, - "loss": 1.277, - "reward": 2.5145729780197144, - "reward_std": 0.8064324706792831, - "rewards/accuracy_reward": 0.770833358168602, - "rewards/reasoning_steps_reward": 0.8888888955116272, - "rewards/repetition_penalty_reward": -0.007128427678253502, - "rewards/tag_count_reward": 0.8619791716337204, + "completion_length": 237.3958396911621, + "epoch": 0.38995125609298836, + "grad_norm": 377.6752575854368, + "kl": 24.46875, + "learning_rate": 1.5299686831380395e-05, + "loss": 3.8202, + "reward": 2.407591462135315, + "reward_std": 0.3708171471953392, + "rewards/accuracy_reward": 0.5208333414047956, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.04206139035522938, + "rewards/tag_count_reward": 0.958333358168602, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 247.31771087646484, - "epoch": 0.782608695652174, - "grad_norm": 38.273912583736674, - "kl": 1.0126953125, - "learning_rate": 2.7275989701150684e-06, - "loss": 1.3591, - "reward": 2.555409252643585, - "reward_std": 0.9130035191774368, - "rewards/accuracy_reward": 0.8125000149011612, - "rewards/reasoning_steps_reward": 0.8854166865348816, - "rewards/repetition_penalty_reward": -0.004486680845730007, - "rewards/tag_count_reward": 0.8619791865348816, + "completion_length": 232.5989646911621, + "epoch": 0.3914510686164229, + "grad_norm": 930.2326367615931, + "kl": 29.53125, + "learning_rate": 1.5255137941234228e-05, + "loss": 5.1202, + "reward": 2.6035396456718445, + "reward_std": 0.47209347784519196, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01972440304234624, + "rewards/tag_count_reward": 0.9635417014360428, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 211.36458587646484, - "epoch": 0.7856071964017991, - "grad_norm": 63.967595082731656, - "kl": 0.875, - "learning_rate": 2.6558834491484576e-06, - "loss": 1.0617, - "reward": 2.4956788420677185, - "reward_std": 0.7778256386518478, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9062500149011612, - "rewards/repetition_penalty_reward": -0.005623297765851021, - "rewards/tag_count_reward": 0.8867187798023224, + "completion_length": 253.92188262939453, + "epoch": 0.39295088113985754, + "grad_norm": 56.25971418456335, + "kl": 7.8515625, + "learning_rate": 1.5210444497383745e-05, + "loss": 1.6828, + "reward": 2.4639751315116882, + "reward_std": 0.519036740064621, + "rewards/accuracy_reward": 0.5885416865348816, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.020400056848302484, + "rewards/tag_count_reward": 0.9375000149011612, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 203.95312881469727, - "epoch": 0.7886056971514243, - "grad_norm": 195.88629364008162, - "kl": 1.525390625, - "learning_rate": 2.5849786893389296e-06, - "loss": 1.5584, - "reward": 2.52648663520813, - "reward_std": 0.8215616047382355, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9131944626569748, - "rewards/repetition_penalty_reward": -0.0038954283227212727, - "rewards/tag_count_reward": 0.8880208432674408, + "completion_length": 206.9895896911621, + "epoch": 0.3944506936632921, + "grad_norm": 37.53129740951539, + "kl": 0.9560546875, + "learning_rate": 1.5165607729216822e-05, + "loss": 0.5769, + "reward": 2.6286195516586304, + "reward_std": 0.4291228875517845, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.97743059694767, + "rewards/repetition_penalty_reward": -0.025894425809383392, + "rewards/tag_count_reward": 0.9687500149011612, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 234.9270896911621, - "epoch": 0.7916041979010495, - "grad_norm": 111.05889429266634, - "kl": 2.357421875, - "learning_rate": 2.514892518288988e-06, - "loss": 1.6347, - "reward": 2.5823854207992554, - "reward_std": 0.9507413059473038, - "rewards/accuracy_reward": 0.848958358168602, - "rewards/reasoning_steps_reward": 0.8750000149011612, - "rewards/repetition_penalty_reward": -0.004854308412177488, - "rewards/tag_count_reward": 0.8632812649011612, + "completion_length": 224.0260467529297, + "epoch": 0.39595050618672667, + "grad_norm": 40.41532757864787, + "kl": 0.7841796875, + "learning_rate": 1.5120628870063772e-05, + "loss": 0.5366, + "reward": 2.569106698036194, + "reward_std": 0.5048164129257202, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9548611342906952, + "rewards/repetition_penalty_reward": -0.02377540967427194, + "rewards/tag_count_reward": 0.9609375149011612, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 236.2187614440918, - "epoch": 0.7946026986506747, - "grad_norm": 71.84249826406462, - "kl": 2.79296875, - "learning_rate": 2.445632673232208e-06, - "loss": 1.4097, - "reward": 2.507424294948578, - "reward_std": 0.8585045337677002, - "rewards/accuracy_reward": 0.770833358168602, - "rewards/reasoning_steps_reward": 0.8802083432674408, - "rewards/repetition_penalty_reward": -0.002992400841321796, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 232.37500381469727, + "epoch": 0.39745031871016123, + "grad_norm": 36.78339467024414, + "kl": 1.271484375, + "learning_rate": 1.5075509157163422e-05, + "loss": 0.6666, + "reward": 2.532996416091919, + "reward_std": 0.406702384352684, + "rewards/accuracy_reward": 0.614583358168602, + "rewards/reasoning_steps_reward": 0.9739583730697632, + "rewards/repetition_penalty_reward": -0.017784894444048405, + "rewards/tag_count_reward": 0.962239608168602, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 287.5520935058594, - "epoch": 0.7976011994002998, - "grad_norm": 952.7545080808868, - "kl": 22.484375, - "learning_rate": 2.3772068001790682e-06, - "loss": 3.905, - "reward": 2.385596811771393, - "reward_std": 0.9874599426984787, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.8645833730697632, - "rewards/repetition_penalty_reward": -0.0063302937196567655, - "rewards/tag_count_reward": 0.8294270932674408, + "completion_length": 269.87500762939453, + "epoch": 0.3989501312335958, + "grad_norm": 5.827263683756955, + "kl": 5.0390625, + "learning_rate": 1.503024983162908e-05, + "loss": 1.1664, + "reward": 2.5889384150505066, + "reward_std": 0.5250253453850746, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.025211102329194546, + "rewards/tag_count_reward": 0.9283854216337204, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 205.85938262939453, - "epoch": 0.800599700149925, - "grad_norm": 345.0356082309632, - "kl": 10.75, - "learning_rate": 2.309622453072867e-06, - "loss": 2.8112, - "reward": 2.638276517391205, - "reward_std": 0.6796813756227493, - "rewards/accuracy_reward": 0.8229166716337204, - "rewards/reasoning_steps_reward": 0.9236111342906952, - "rewards/repetition_penalty_reward": -0.005386793913203292, - "rewards/tag_count_reward": 0.8971354514360428, + "completion_length": 237.25000381469727, + "epoch": 0.40044994375703036, + "grad_norm": 50.76618742939356, + "kl": 6.140625, + "learning_rate": 1.49848521384144e-05, + "loss": 1.4081, + "reward": 2.7229838967323303, + "reward_std": 0.47072865813970566, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.03396061668172479, + "rewards/tag_count_reward": 0.9635416865348816, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 261.25000381469727, - "epoch": 0.8035982008995503, - "grad_norm": 243.7268824601945, - "kl": 3.1484375, - "learning_rate": 2.2428870929558012e-06, - "loss": 1.6183, - "reward": 2.373494029045105, - "reward_std": 0.9536427110433578, - "rewards/accuracy_reward": 0.6718750149011612, - "rewards/reasoning_steps_reward": 0.8663194626569748, - "rewards/repetition_penalty_reward": -0.005846357671543956, - "rewards/tag_count_reward": 0.8411458432674408, + "completion_length": 216.90625762939453, + "epoch": 0.4019497562804649, + "grad_norm": 5.736957116926136, + "kl": 3.244140625, + "learning_rate": 1.4939317326279125e-05, + "loss": 1.0251, + "reward": 2.5634613633155823, + "reward_std": 0.4809972904622555, + "rewards/accuracy_reward": 0.6718750298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02681647054851055, + "rewards/tag_count_reward": 0.953125, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 212.04688262939453, - "epoch": 0.8065967016491754, - "grad_norm": 203.47240043545727, - "kl": 2.35546875, - "learning_rate": 2.177008087145286e-06, - "loss": 1.358, - "reward": 2.610089957714081, - "reward_std": 0.7462358474731445, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.923611119389534, - "rewards/repetition_penalty_reward": -0.006229604536201805, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 234.33855056762695, + "epoch": 0.40344956880389954, + "grad_norm": 23.899798832527537, + "kl": 4.408203125, + "learning_rate": 1.489364664775475e-05, + "loss": 0.9419, + "reward": 2.684835433959961, + "reward_std": 0.39333367347717285, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.03261255333200097, + "rewards/tag_count_reward": 0.9622395932674408, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 220.5156364440918, - "epoch": 0.8095952023988006, - "grad_norm": 104.0415374329618, - "kl": 4.044921875, - "learning_rate": 2.111992708420646e-06, - "loss": 1.7589, - "reward": 2.612178146839142, - "reward_std": 0.8539304882287979, - "rewards/accuracy_reward": 0.8385416865348816, - "rewards/reasoning_steps_reward": 0.89930559694767, - "rewards/repetition_penalty_reward": -0.004575358587317169, - "rewards/tag_count_reward": 0.8789062798023224, + "completion_length": 237.65625762939453, + "epoch": 0.4049493813273341, + "grad_norm": 2.2657313852356853, + "kl": 2.919921875, + "learning_rate": 1.4847841359110058e-05, + "loss": 0.7915, + "reward": 2.613522946834564, + "reward_std": 0.3725850097835064, + "rewards/accuracy_reward": 0.7135416939854622, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.027970252325758338, + "rewards/tag_count_reward": 0.9557291865348816, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 228.5260467529297, - "epoch": 0.8125937031484258, - "grad_norm": 162.74223326789044, - "kl": 11.875, - "learning_rate": 2.047848134220213e-06, - "loss": 2.9084, - "reward": 2.4650405645370483, - "reward_std": 0.8374519795179367, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.9027778059244156, - "rewards/repetition_penalty_reward": -0.008049842552281916, - "rewards/tag_count_reward": 0.872395858168602, + "completion_length": 230.23438262939453, + "epoch": 0.40644919385076866, + "grad_norm": 11.972730389593089, + "kl": 3.646484375, + "learning_rate": 1.480190272031657e-05, + "loss": 0.8564, + "reward": 2.5715832710266113, + "reward_std": 0.4187803417444229, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9704861640930176, + "rewards/repetition_penalty_reward": -0.030413302592933178, + "rewards/tag_count_reward": 0.954427108168602, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 274.6197929382324, - "epoch": 0.815592203898051, - "grad_norm": 103.66672953735151, - "kl": 8.3125, - "learning_rate": 1.984581445848981e-06, - "loss": 2.2752, - "reward": 2.4658045768737793, - "reward_std": 0.9265211671590805, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.8697917014360428, - "rewards/repetition_penalty_reward": -0.00554958607244771, - "rewards/tag_count_reward": 0.8359375298023224, + "completion_length": 203.9166717529297, + "epoch": 0.4079490063742032, + "grad_norm": 17.38669681328992, + "kl": 1.04296875, + "learning_rate": 1.475583199501389e-05, + "loss": 0.4368, + "reward": 2.6774919033050537, + "reward_std": 0.440396323800087, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.03170954994857311, + "rewards/tag_count_reward": 0.9765625149011612, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 287.3697967529297, - "epoch": 0.8185907046476761, - "grad_norm": 106.41271848435063, - "kl": 6.5703125, - "learning_rate": 1.9221996276968523e-06, - "loss": 2.0641, - "reward": 2.389641523361206, - "reward_std": 1.0108384490013123, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.8628472536802292, - "rewards/repetition_penalty_reward": -0.007059881230816245, - "rewards/tag_count_reward": 0.8203125149011612, + "completion_length": 230.77605056762695, + "epoch": 0.4094488188976378, + "grad_norm": 14.315967716823838, + "kl": 1.7646484375, + "learning_rate": 1.4709630450474936e-05, + "loss": 0.679, + "reward": 2.7193071246147156, + "reward_std": 0.4441717490553856, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.03676935099065304, + "rewards/tag_count_reward": 0.9557291865348816, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 218.31250762939453, - "epoch": 0.8215892053973014, - "grad_norm": 78.45170527728588, - "kl": 3.9296875, - "learning_rate": 1.8607095664675868e-06, - "loss": 1.7482, - "reward": 2.5906487703323364, - "reward_std": 0.8199838548898697, - "rewards/accuracy_reward": 0.8020833432674408, - "rewards/reasoning_steps_reward": 0.9062500149011612, - "rewards/repetition_penalty_reward": -0.007007573178270832, - "rewards/tag_count_reward": 0.8893229365348816, + "completion_length": 216.60938262939453, + "epoch": 0.41094863142107235, + "grad_norm": 12.679917824503232, + "kl": 2.2265625, + "learning_rate": 1.466329935757109e-05, + "loss": 0.6142, + "reward": 2.5890852212905884, + "reward_std": 0.3576386868953705, + "rewards/accuracy_reward": 0.6614583507180214, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.03157464787364006, + "rewards/tag_count_reward": 0.9713541865348816, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 255.67708587646484, - "epoch": 0.8245877061469266, - "grad_norm": 28.349825823882362, - "kl": 5.97265625, - "learning_rate": 1.8001180504185401e-06, - "loss": 1.9051, - "reward": 2.504952549934387, - "reward_std": 0.9115692526102066, - "rewards/accuracy_reward": 0.755208358168602, - "rewards/reasoning_steps_reward": 0.8923611491918564, - "rewards/repetition_penalty_reward": -0.0032940262462943792, - "rewards/tag_count_reward": 0.860677108168602, + "completion_length": 210.6302146911621, + "epoch": 0.4124484439445069, + "grad_norm": 6.724833772098711, + "kl": 1.8125, + "learning_rate": 1.4616839990737232e-05, + "loss": 0.4751, + "reward": 2.6611116528511047, + "reward_std": 0.3041349947452545, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.028124571312218904, + "rewards/tag_count_reward": 0.9739583432674408, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 223.79688262939453, - "epoch": 0.8275862068965517, - "grad_norm": 82.66659054217332, - "kl": 9.0390625, - "learning_rate": 1.7404317686112638e-06, - "loss": 2.4944, - "reward": 2.659183382987976, - "reward_std": 0.785422757267952, - "rewards/accuracy_reward": 0.8541667014360428, - "rewards/reasoning_steps_reward": 0.923611119389534, - "rewards/repetition_penalty_reward": -0.007917315931990743, - "rewards/tag_count_reward": 0.8893229365348816, + "completion_length": 241.07812881469727, + "epoch": 0.41394825646794153, + "grad_norm": 12.361659642014535, + "kl": 3.11328125, + "learning_rate": 1.4570253627936693e-05, + "loss": 0.8863, + "reward": 2.647101581096649, + "reward_std": 0.40448398888111115, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.030849804636090994, + "rewards/tag_count_reward": 0.9453125149011612, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 225.1197967529297, - "epoch": 0.8305847076461769, - "grad_norm": 115.53315284387001, - "kl": 9.71875, - "learning_rate": 1.6816573101730637e-06, - "loss": 2.136, - "reward": 2.557182252407074, - "reward_std": 0.806065671145916, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.8923610895872116, - "rewards/repetition_penalty_reward": -0.004449750704225153, - "rewards/tag_count_reward": 0.8723958432674408, + "completion_length": 203.4791717529297, + "epoch": 0.4154480689913761, + "grad_norm": 10.28432474158588, + "kl": 0.76171875, + "learning_rate": 1.4523541550626093e-05, + "loss": 0.349, + "reward": 2.594405770301819, + "reward_std": 0.1998637057840824, + "rewards/accuracy_reward": 0.6666667014360428, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.04101098608225584, + "rewards/tag_count_reward": 0.984375, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 259.59375762939453, - "epoch": 0.8335832083958021, - "grad_norm": 55.03228946038301, - "kl": 5.38671875, - "learning_rate": 1.6238011635695849e-06, - "loss": 1.6557, - "reward": 2.512112319469452, - "reward_std": 0.8234718143939972, - "rewards/accuracy_reward": 0.7968750298023224, - "rewards/reasoning_steps_reward": 0.8802083730697632, - "rewards/repetition_penalty_reward": -0.0048148492351174355, - "rewards/tag_count_reward": 0.8398437649011612, + "completion_length": 200.7708396911621, + "epoch": 0.41694788151481066, + "grad_norm": 8.121334172436347, + "kl": 0.89990234375, + "learning_rate": 1.4476705043720099e-05, + "loss": 0.3742, + "reward": 2.8078061938285828, + "reward_std": 0.304847190156579, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.02552723279222846, + "rewards/tag_count_reward": 0.984375, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 262.9635543823242, - "epoch": 0.8365817091454273, - "grad_norm": 64.0876715179611, - "kl": 6.34375, - "learning_rate": 1.5668697158885104e-06, - "loss": 2.0553, - "reward": 2.3902793526649475, - "reward_std": 0.9917967170476913, - "rewards/accuracy_reward": 0.6979166865348816, - "rewards/reasoning_steps_reward": 0.8541666567325592, - "rewards/repetition_penalty_reward": -0.0029499368683900684, - "rewards/tag_count_reward": 0.8411458432674408, + "completion_length": 205.4895896911621, + "epoch": 0.4184476940382452, + "grad_norm": 8.922984245945559, + "kl": 1.37744140625, + "learning_rate": 1.4429745395556073e-05, + "loss": 0.5918, + "reward": 2.665529727935791, + "reward_std": 0.33949872478842735, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.024140472058206797, + "rewards/tag_count_reward": 0.9674479365348816, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 214.36980056762695, - "epoch": 0.8395802098950524, - "grad_norm": 71.70526659015454, - "kl": 8.18359375, - "learning_rate": 1.5108692521344526e-06, - "loss": 2.302, - "reward": 2.5640260577201843, - "reward_std": 0.7752020061016083, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9097222536802292, - "rewards/repetition_penalty_reward": -0.005852422269526869, - "rewards/tag_count_reward": 0.8893229365348816, + "completion_length": 197.20312881469727, + "epoch": 0.4199475065616798, + "grad_norm": 11.885955592817725, + "kl": 1.419921875, + "learning_rate": 1.4382663897858647e-05, + "loss": 0.3527, + "reward": 2.600801944732666, + "reward_std": 0.2686375230550766, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.030708489008247852, + "rewards/tag_count_reward": 0.985677108168602, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 207.5260467529297, - "epoch": 0.8425787106446777, - "grad_norm": 41.429427085551154, - "kl": 7.34765625, - "learning_rate": 1.4558059545351144e-06, - "loss": 2.4269, - "reward": 2.5451850295066833, - "reward_std": 0.7376502603292465, - "rewards/accuracy_reward": 0.7447917014360428, - "rewards/reasoning_steps_reward": 0.9184028208255768, - "rewards/repetition_penalty_reward": -0.004728195344796404, - "rewards/tag_count_reward": 0.8867187798023224, + "completion_length": 277.1823043823242, + "epoch": 0.42144731908511435, + "grad_norm": 34.196754104798, + "kl": 5.865234375, + "learning_rate": 1.4335461845704173e-05, + "loss": 1.1386, + "reward": 2.416446477174759, + "reward_std": 0.47412654757499695, + "rewards/accuracy_reward": 0.5729166865348816, + "rewards/reasoning_steps_reward": 0.9548611342906952, + "rewards/repetition_penalty_reward": -0.024091816041618586, + "rewards/tag_count_reward": 0.9127604365348816, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 176.7864646911621, - "epoch": 0.8455772113943029, - "grad_norm": 83.02833138285867, - "kl": 2.6708984375, - "learning_rate": 1.4016859018587958e-06, - "loss": 1.3215, - "reward": 2.6956359148025513, - "reward_std": 0.533905953168869, - "rewards/accuracy_reward": 0.8437500298023224, - "rewards/reasoning_steps_reward": 0.9357638955116272, - "rewards/repetition_penalty_reward": -0.005753145087510347, - "rewards/tag_count_reward": 0.9218750149011612, + "completion_length": 220.23958587646484, + "epoch": 0.4229471316085489, + "grad_norm": 7.827483997656616, + "kl": 2.251953125, + "learning_rate": 1.428814053748512e-05, + "loss": 0.72, + "reward": 2.669346511363983, + "reward_std": 0.410087987780571, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.02857015887275338, + "rewards/tag_count_reward": 0.9739583432674408, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 224.3854217529297, - "epoch": 0.848575712143928, - "grad_norm": 85.4845823002127, - "kl": 5.06640625, - "learning_rate": 1.3485150687433168e-06, - "loss": 2.0253, - "reward": 2.631519079208374, - "reward_std": 0.8587917536497116, - "rewards/accuracy_reward": 0.8593750149011612, - "rewards/reasoning_steps_reward": 0.9062499850988388, - "rewards/repetition_penalty_reward": -0.0038975586649030447, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 195.53125, + "epoch": 0.42444694413198353, + "grad_norm": 3.0528671937109757, + "kl": 0.5556640625, + "learning_rate": 1.4240701274874331e-05, + "loss": 0.1773, + "reward": 2.6338695287704468, + "reward_std": 0.2932188091799617, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.03236319124698639, + "rewards/tag_count_reward": 0.9908854216337204, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 234.18230056762695, - "epoch": 0.8515742128935532, - "grad_norm": 74.49346765573262, - "kl": 10.109375, - "learning_rate": 1.2962993250364541e-06, - "loss": 2.5552, - "reward": 2.4909926652908325, - "reward_std": 0.8654274046421051, - "rewards/accuracy_reward": 0.739583358168602, - "rewards/reasoning_steps_reward": 0.8906250447034836, - "rewards/repetition_penalty_reward": -0.009007335815113038, - "rewards/tag_count_reward": 0.8697916716337204, + "completion_length": 196.04687881469727, + "epoch": 0.4259467566554181, + "grad_norm": 16.521107944862717, + "kl": 1.06494140625, + "learning_rate": 1.419314536278925e-05, + "loss": 0.6407, + "reward": 2.675931394100189, + "reward_std": 0.3121615252457559, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.030665938276797533, + "rewards/tag_count_reward": 0.9687500149011612, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 238.6614646911621, - "epoch": 0.8545727136431784, - "grad_norm": 172.140865328852, - "kl": 10.84375, - "learning_rate": 1.2450444351479196e-06, - "loss": 2.3458, - "reward": 2.4398276805877686, - "reward_std": 0.7756916880607605, - "rewards/accuracy_reward": 0.6614583507180214, - "rewards/reasoning_steps_reward": 0.9114583432674408, - "rewards/repetition_penalty_reward": -0.006786908605135977, - "rewards/tag_count_reward": 0.8736979365348816, + "completion_length": 204.18750381469727, + "epoch": 0.42744656917885265, + "grad_norm": 5.497622639890418, + "kl": 0.99462890625, + "learning_rate": 1.4145474109356008e-05, + "loss": 0.4354, + "reward": 2.717790961265564, + "reward_std": 0.28716015443205833, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.02656660182401538, + "rewards/tag_count_reward": 0.9804687649011612, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 184.40625762939453, - "epoch": 0.8575712143928036, - "grad_norm": 33.813378116083186, - "kl": 4.0390625, - "learning_rate": 1.1947560574130013e-06, - "loss": 1.2931, - "reward": 2.638872265815735, - "reward_std": 0.5787665322422981, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.9392361640930176, - "rewards/repetition_penalty_reward": -0.006093142030294985, - "rewards/tag_count_reward": 0.9088541865348816, + "completion_length": 246.9531364440918, + "epoch": 0.4289463817022872, + "grad_norm": 11.12837490424536, + "kl": 3.10546875, + "learning_rate": 1.4097688825873437e-05, + "loss": 0.8672, + "reward": 2.6280420422554016, + "reward_std": 0.39313384145498276, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.03905859449878335, + "rewards/tag_count_reward": 0.9570312798023224, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 267.8020935058594, - "epoch": 0.8605697151424287, - "grad_norm": 29.38596871781673, - "kl": 7.890625, - "learning_rate": 1.1454397434679022e-06, - "loss": 2.2911, - "reward": 2.4633301496505737, - "reward_std": 1.00086210668087, - "rewards/accuracy_reward": 0.755208358168602, - "rewards/reasoning_steps_reward": 0.8715277910232544, - "rewards/repetition_penalty_reward": -0.0019476997549645603, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 230.0572967529297, + "epoch": 0.4304461942257218, + "grad_norm": 9.306320853861074, + "kl": 2.4326171875, + "learning_rate": 1.4049790826777016e-05, + "loss": 0.8369, + "reward": 2.5040841102600098, + "reward_std": 0.3208254538476467, + "rewards/accuracy_reward": 0.5781250074505806, + "rewards/reasoning_steps_reward": 0.9965277910232544, + "rewards/repetition_penalty_reward": -0.031506254337728024, + "rewards/tag_count_reward": 0.9609375149011612, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 195.26562881469727, - "epoch": 0.863568215892054, - "grad_norm": 22.60777483816971, - "kl": 5.671875, - "learning_rate": 1.0971009376368614e-06, - "loss": 1.9524, - "reward": 2.4915042221546173, - "reward_std": 0.6768720299005508, - "rewards/accuracy_reward": 0.6718750149011612, - "rewards/reasoning_steps_reward": 0.923611119389534, - "rewards/repetition_penalty_reward": -0.00632570160087198, - "rewards/tag_count_reward": 0.9023437798023224, + "completion_length": 217.78125381469727, + "epoch": 0.43194600674915634, + "grad_norm": 5.585862488308667, + "kl": 1.44921875, + "learning_rate": 1.4001781429602704e-05, + "loss": 0.6499, + "reward": 2.556925117969513, + "reward_std": 0.42149753123521805, + "rewards/accuracy_reward": 0.6093750260770321, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.019897868391126394, + "rewards/tag_count_reward": 0.97265625, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 230.11458587646484, - "epoch": 0.8665667166416792, - "grad_norm": 203.49207627713312, - "kl": 12.6796875, - "learning_rate": 1.049744976331124e-06, - "loss": 3.1976, - "reward": 2.5564926862716675, - "reward_std": 0.9073340892791748, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.8958333432674408, - "rewards/repetition_penalty_reward": -0.0047051976434886456, - "rewards/tag_count_reward": 0.868489608168602, + "completion_length": 207.7552146911621, + "epoch": 0.4334458192725909, + "grad_norm": 5.688055347158159, + "kl": 1.3486328125, + "learning_rate": 1.3953661954950693e-05, + "loss": 0.5048, + "reward": 2.4292953610420227, + "reward_std": 0.3075702078640461, + "rewards/accuracy_reward": 0.5000000102445483, + "rewards/reasoning_steps_reward": 0.9913194477558136, + "rewards/repetition_penalty_reward": -0.03207629453390837, + "rewards/tag_count_reward": 0.970052108168602, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 198.89062881469727, - "epoch": 0.8695652173913043, - "grad_norm": 124.52484685260382, - "kl": 9.375, - "learning_rate": 1.0033770874598226e-06, - "loss": 2.8325, - "reward": 2.5556147694587708, - "reward_std": 0.8008880615234375, - "rewards/accuracy_reward": 0.7447916865348816, - "rewards/reasoning_steps_reward": 0.9201388955116272, - "rewards/repetition_penalty_reward": -0.003847024345304817, - "rewards/tag_count_reward": 0.8945312798023224, + "completion_length": 187.9166717529297, + "epoch": 0.4349456317960255, + "grad_norm": 1.8549641962399313, + "kl": 0.71044921875, + "learning_rate": 1.3905433726449102e-05, + "loss": 0.251, + "reward": 2.6722521781921387, + "reward_std": 0.3378063030540943, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.9965277910232544, + "rewards/repetition_penalty_reward": -0.03651529923081398, + "rewards/tag_count_reward": 0.9882812649011612, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 214.3177146911621, - "epoch": 0.8725637181409296, - "grad_norm": 58.07106802782898, - "kl": 4.95703125, - "learning_rate": 9.580023898528346e-07, - "loss": 2.0119, - "reward": 2.5350549817085266, - "reward_std": 0.7193208187818527, - "rewards/accuracy_reward": 0.729166679084301, - "rewards/reasoning_steps_reward": 0.923611119389534, - "rewards/repetition_penalty_reward": -0.012254099652636796, - "rewards/tag_count_reward": 0.8945312649011612, + "completion_length": 193.82813262939453, + "epoch": 0.4364454443194601, + "grad_norm": 2.2180974772052453, + "kl": 1.0986328125, + "learning_rate": 1.3857098070717543e-05, + "loss": 0.5164, + "reward": 2.7617053389549255, + "reward_std": 0.3312147632241249, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.022582892794162035, + "rewards/tag_count_reward": 0.9804687649011612, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 212.4583396911621, - "epoch": 0.8755622188905547, - "grad_norm": 56.388506410538476, - "kl": 5.796875, - "learning_rate": 9.136258926956887e-07, - "loss": 1.9009, - "reward": 2.553084135055542, - "reward_std": 0.7310795336961746, - "rewards/accuracy_reward": 0.7031250149011612, - "rewards/reasoning_steps_reward": 0.9531250298023224, - "rewards/repetition_penalty_reward": -0.006811692088376731, - "rewards/tag_count_reward": 0.9036458432674408, + "completion_length": 235.54688262939453, + "epoch": 0.43794525684289465, + "grad_norm": 12.18787699766214, + "kl": 2.185546875, + "learning_rate": 1.3808656317330646e-05, + "loss": 0.7891, + "reward": 2.4147286415100098, + "reward_std": 0.5803365781903267, + "rewards/accuracy_reward": 0.5572916828095913, + "rewards/reasoning_steps_reward": 0.9565972685813904, + "rewards/repetition_penalty_reward": -0.05619149189442396, + "rewards/tag_count_reward": 0.9570312649011612, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 241.4270896911621, - "epoch": 0.8785607196401799, - "grad_norm": 28.637264291998722, - "kl": 9.59375, - "learning_rate": 8.702524949765645e-07, - "loss": 2.5645, - "reward": 2.4492486715316772, - "reward_std": 0.9317169636487961, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.8854167014360428, - "rewards/repetition_penalty_reward": -0.003876364848110825, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 226.35937881469727, + "epoch": 0.4394450693663292, + "grad_norm": 14.298340510742703, + "kl": 1.6005859375, + "learning_rate": 1.3760109798781489e-05, + "loss": 0.6926, + "reward": 2.744252860546112, + "reward_std": 0.3814915865659714, + "rewards/accuracy_reward": 0.8229166716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.053924234583973885, + "rewards/tag_count_reward": 0.9752604365348816, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 176.4166717529297, - "epoch": 0.881559220389805, - "grad_norm": 62.774529626199765, - "kl": 5.64453125, - "learning_rate": 8.278869849454718e-07, - "loss": 2.3349, - "reward": 2.748142123222351, - "reward_std": 0.7123552411794662, - "rewards/accuracy_reward": 0.9010416865348816, - "rewards/reasoning_steps_reward": 0.9340278208255768, - "rewards/repetition_penalty_reward": -0.003594138892367482, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 206.0677146911621, + "epoch": 0.4409448818897638, + "grad_norm": 1.5380278951289539, + "kl": 0.818359375, + "learning_rate": 1.3711459850444923e-05, + "loss": 0.4797, + "reward": 2.565054178237915, + "reward_std": 0.43462975323200226, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.982638880610466, + "rewards/repetition_penalty_reward": -0.047793143428862095, + "rewards/tag_count_reward": 0.9739583432674408, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 257.51042556762695, - "epoch": 0.8845577211394303, - "grad_norm": 128.72898033686099, - "kl": 11.6328125, - "learning_rate": 7.865340395856325e-07, - "loss": 2.9018, - "reward": 2.4993616342544556, - "reward_std": 0.9201382249593735, - "rewards/accuracy_reward": 0.7656250149011612, - "rewards/reasoning_steps_reward": 0.8836805820465088, - "rewards/repetition_penalty_reward": -0.0041107177385129035, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 216.1822967529297, + "epoch": 0.44244469441319834, + "grad_norm": 2.6940921154273263, + "kl": 0.70263671875, + "learning_rate": 1.3662707810540867e-05, + "loss": 0.4852, + "reward": 2.7508795261383057, + "reward_std": 0.4059586077928543, + "rewards/accuracy_reward": 0.8281250298023224, + "rewards/reasoning_steps_reward": 0.9965277761220932, + "rewards/repetition_penalty_reward": -0.04642965644598007, + "rewards/tag_count_reward": 0.9726562649011612, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 208.9895896911621, - "epoch": 0.8875562218890555, - "grad_norm": 19.58108496310109, - "kl": 6.859375, - "learning_rate": 7.461982240971799e-07, - "loss": 2.4699, - "reward": 2.625687062740326, - "reward_std": 0.8259201794862747, - "rewards/accuracy_reward": 0.8072916865348816, - "rewards/reasoning_steps_reward": 0.9322916716337204, - "rewards/repetition_penalty_reward": -0.0032192860962823033, - "rewards/tag_count_reward": 0.8893229216337204, + "completion_length": 205.18230056762695, + "epoch": 0.4439445069366329, + "grad_norm": 3.103564524085595, + "kl": 0.60400390625, + "learning_rate": 1.3613855020097477e-05, + "loss": 0.4706, + "reward": 2.7546513080596924, + "reward_std": 0.3886885233223438, + "rewards/accuracy_reward": 0.8437500149011612, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.05133842211216688, + "rewards/tag_count_reward": 0.9726562649011612, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 212.17708587646484, - "epoch": 0.8905547226386806, - "grad_norm": 33.259888962364116, - "kl": 6.0, - "learning_rate": 7.068839913931646e-07, - "loss": 2.0135, - "reward": 2.681620717048645, - "reward_std": 0.7299316972494125, - "rewards/accuracy_reward": 0.8645833432674408, - "rewards/reasoning_steps_reward": 0.9322916865348816, - "rewards/repetition_penalty_reward": -0.004577249084832147, - "rewards/tag_count_reward": 0.8893229365348816, + "completion_length": 208.4427146911621, + "epoch": 0.4454443194600675, + "grad_norm": 2.4514035939782155, + "kl": 0.61279296875, + "learning_rate": 1.3564902822914274e-05, + "loss": 0.4279, + "reward": 2.7078776359558105, + "reward_std": 0.3973502516746521, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.05340719223022461, + "rewards/tag_count_reward": 0.9817708432674408, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 241.8229217529297, - "epoch": 0.8935532233883059, - "grad_norm": 47.82236374571516, - "kl": 7.6875, - "learning_rate": 6.685956816079753e-07, - "loss": 2.4039, - "reward": 2.5991902351379395, - "reward_std": 0.863490641117096, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9062500149011612, - "rewards/repetition_penalty_reward": -0.004976554249878973, - "rewards/tag_count_reward": 0.8645833730697632, + "completion_length": 203.4322967529297, + "epoch": 0.4469441319835021, + "grad_norm": 16.752659312125665, + "kl": 0.775390625, + "learning_rate": 1.3515852565525167e-05, + "loss": 0.3563, + "reward": 2.8165610432624817, + "reward_std": 0.28164857253432274, + "rewards/accuracy_reward": 0.880208358168602, + "rewards/reasoning_steps_reward": 0.9965277910232544, + "rewards/repetition_penalty_reward": -0.05106059880927205, + "rewards/tag_count_reward": 0.9908854365348816, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 199.1666717529297, - "epoch": 0.896551724137931, - "grad_norm": 21.908160424151905, - "kl": 6.2890625, - "learning_rate": 6.313375216182039e-07, - "loss": 2.1573, - "reward": 2.705993890762329, - "reward_std": 0.7558012455701828, - "rewards/accuracy_reward": 0.8750000149011612, - "rewards/reasoning_steps_reward": 0.9288194626569748, - "rewards/repetition_penalty_reward": -0.00277355604339391, - "rewards/tag_count_reward": 0.9049479365348816, + "completion_length": 196.20833587646484, + "epoch": 0.44844394450693664, + "grad_norm": 29.22658507606216, + "kl": 0.84423828125, + "learning_rate": 1.3466705597161416e-05, + "loss": 0.4067, + "reward": 2.764431357383728, + "reward_std": 0.3144514746963978, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9809028208255768, + "rewards/repetition_penalty_reward": -0.05240898672491312, + "rewards/tag_count_reward": 0.9817708432674408, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 204.30730056762695, - "epoch": 0.8995502248875562, - "grad_norm": 122.95452439926706, - "kl": 8.9375, - "learning_rate": 5.951136245760181e-07, - "loss": 2.666, - "reward": 2.5854055881500244, - "reward_std": 0.6643490642309189, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9496528059244156, - "rewards/repetition_penalty_reward": -0.0061743835103698075, - "rewards/tag_count_reward": 0.9127604365348816, + "completion_length": 188.73437881469727, + "epoch": 0.4499437570303712, + "grad_norm": 0.9955419431414791, + "kl": 0.42578125, + "learning_rate": 1.3417463269714525e-05, + "loss": 0.2115, + "reward": 2.706632077693939, + "reward_std": 0.2604191384743899, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.03555556107312441, + "rewards/tag_count_reward": 0.9869791865348816, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 276.95314025878906, - "epoch": 0.9025487256371814, - "grad_norm": 295.5147824374305, - "kl": 15.828125, - "learning_rate": 5.599279894550824e-07, - "loss": 3.1455, - "reward": 2.4131381511688232, - "reward_std": 0.8852957636117935, - "rewards/accuracy_reward": 0.7031250149011612, - "rewards/reasoning_steps_reward": 0.8819444477558136, - "rewards/repetition_penalty_reward": -0.003962728020269424, - "rewards/tag_count_reward": 0.8320312649011612, + "completion_length": 212.2395896911621, + "epoch": 0.45144356955380577, + "grad_norm": 1.461352147416173, + "kl": 0.5546875, + "learning_rate": 1.3368126937699055e-05, + "loss": 0.4384, + "reward": 2.648770272731781, + "reward_std": 0.4800826385617256, + "rewards/accuracy_reward": 0.760416679084301, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.06346932239830494, + "rewards/tag_count_reward": 0.9778645932674408, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 212.93750762939453, - "epoch": 0.9055472263868066, - "grad_norm": 70.53830308263373, - "kl": 8.5390625, - "learning_rate": 5.257845006090911e-07, - "loss": 2.4792, - "reward": 2.5973342061042786, - "reward_std": 0.7405329048633575, - "rewards/accuracy_reward": 0.786458358168602, - "rewards/reasoning_steps_reward": 0.9322916865348816, - "rewards/repetition_penalty_reward": -0.005530498514417559, - "rewards/tag_count_reward": 0.884114608168602, + "completion_length": 228.32813262939453, + "epoch": 0.45294338207724033, + "grad_norm": 1.0886524612409492, + "kl": 0.60986328125, + "learning_rate": 1.3318697958215358e-05, + "loss": 0.4636, + "reward": 2.561100661754608, + "reward_std": 0.5553513169288635, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.06823972798883915, + "rewards/tag_count_reward": 0.9609375149011612, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 257.29167556762695, - "epoch": 0.9085457271364318, - "grad_norm": 22.013195631274943, - "kl": 6.3125, - "learning_rate": 4.926869273429447e-07, - "loss": 1.8309, - "reward": 2.465520918369293, - "reward_std": 0.7833161950111389, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.9062500298023224, - "rewards/repetition_penalty_reward": -0.009739560075104237, - "rewards/tag_count_reward": 0.8554687798023224, + "completion_length": 191.9166717529297, + "epoch": 0.4544431946006749, + "grad_norm": 4.083408704872753, + "kl": 0.5341796875, + "learning_rate": 1.3269177690912244e-05, + "loss": 0.4274, + "reward": 2.4237236976623535, + "reward_std": 0.3847580924630165, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9826389253139496, + "rewards/repetition_penalty_reward": -0.040686123073101044, + "rewards/tag_count_reward": 0.9817708432674408, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 252.58854293823242, - "epoch": 0.9115442278860569, - "grad_norm": 88.95393283353653, - "kl": 5.8515625, - "learning_rate": 4.606389234966424e-07, - "loss": 1.9041, - "reward": 2.496855139732361, - "reward_std": 0.8814668357372284, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.88368059694767, - "rewards/repetition_penalty_reward": -0.00531507984851487, - "rewards/tag_count_reward": 0.8476562649011612, + "completion_length": 180.50000381469727, + "epoch": 0.4559430071241095, + "grad_norm": 5.326097299416955, + "kl": 0.65478515625, + "learning_rate": 1.3219567497949603e-05, + "loss": 0.2812, + "reward": 2.7030047178268433, + "reward_std": 0.3447668179869652, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.04786340892314911, + "rewards/tag_count_reward": 0.9869791865348816, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 211.03125381469727, - "epoch": 0.9145427286356822, - "grad_norm": 152.53165269997837, - "kl": 4.96484375, - "learning_rate": 4.2964402704190555e-07, - "loss": 2.153, - "reward": 2.586493730545044, - "reward_std": 0.8061617463827133, - "rewards/accuracy_reward": 0.786458358168602, - "rewards/reasoning_steps_reward": 0.920138880610466, - "rewards/repetition_penalty_reward": -0.0042182166944257915, - "rewards/tag_count_reward": 0.8841145932674408, + "completion_length": 197.6145896911621, + "epoch": 0.4574428196475441, + "grad_norm": 3.907793025870675, + "kl": 0.7373046875, + "learning_rate": 1.3169868743960904e-05, + "loss": 0.6404, + "reward": 2.686201572418213, + "reward_std": 0.4964783936738968, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.05251379404217005, + "rewards/tag_count_reward": 0.9661458432674408, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 216.83333587646484, - "epoch": 0.9175412293853074, - "grad_norm": 88.75542795978963, - "kl": 6.0703125, - "learning_rate": 3.997056596916038e-07, - "loss": 2.1542, - "reward": 2.656059443950653, - "reward_std": 0.79727503657341, - "rewards/accuracy_reward": 0.8645833432674408, - "rewards/reasoning_steps_reward": 0.91493059694767, - "rewards/repetition_penalty_reward": -0.0036628567904699594, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 227.15625381469727, + "epoch": 0.45894263217097864, + "grad_norm": 1.3757226995738794, + "kl": 0.8720703125, + "learning_rate": 1.3120082796015694e-05, + "loss": 0.5179, + "reward": 2.551712214946747, + "reward_std": 0.4617387279868126, + "rewards/accuracy_reward": 0.7031250223517418, + "rewards/reasoning_steps_reward": 0.963541716337204, + "rewards/repetition_penalty_reward": -0.06938168965280056, + "rewards/tag_count_reward": 0.954427108168602, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 208.17188262939453, - "epoch": 0.9205397301349325, - "grad_norm": 75.82177154852495, - "kl": 8.06640625, - "learning_rate": 3.708271265220087e-07, - "loss": 2.1738, - "reward": 2.6760451197624207, - "reward_std": 0.7257892712950706, - "rewards/accuracy_reward": 0.880208358168602, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.006246598088182509, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 191.1302146911621, + "epoch": 0.4604424446944132, + "grad_norm": 2.380244199934681, + "kl": 0.49951171875, + "learning_rate": 1.3070211023581959e-05, + "loss": 0.3394, + "reward": 2.5620276927948, + "reward_std": 0.2889845799654722, + "rewards/accuracy_reward": 0.635416679084301, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.047347418032586575, + "rewards/tag_count_reward": 0.9843750149011612, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 226.62500381469727, - "epoch": 0.9235382308845578, - "grad_norm": 167.48098467184784, - "kl": 10.15625, - "learning_rate": 3.430116156079277e-07, - "loss": 2.5838, - "reward": 2.539145290851593, - "reward_std": 0.8195622712373734, - "rewards/accuracy_reward": 0.7552083432674408, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.006427670712582767, - "rewards/tag_count_reward": 0.8736979216337204, + "completion_length": 195.4479217529297, + "epoch": 0.46194225721784776, + "grad_norm": 3.432394493893828, + "kl": 0.86669921875, + "learning_rate": 1.302025479848847e-05, + "loss": 0.6401, + "reward": 2.608788251876831, + "reward_std": 0.3985390290617943, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.05050008138641715, + "rewards/tag_count_reward": 0.970052108168602, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 250.46875381469727, - "epoch": 0.9265367316341829, - "grad_norm": 188.4999317880181, - "kl": 12.703125, - "learning_rate": 3.1626219767075584e-07, - "loss": 2.8945, - "reward": 2.4836193919181824, - "reward_std": 0.8600171506404877, - "rewards/accuracy_reward": 0.739583358168602, - "rewards/reasoning_steps_reward": 0.9079861342906952, - "rewards/repetition_penalty_reward": -0.01551259565167129, - "rewards/tag_count_reward": 0.8515625149011612, + "completion_length": 182.7604217529297, + "epoch": 0.4634420697412823, + "grad_norm": 1.720744202323982, + "kl": 0.62353515625, + "learning_rate": 1.2970215494887057e-05, + "loss": 0.496, + "reward": 2.7085583209991455, + "reward_std": 0.3612633068114519, + "rewards/accuracy_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.04144172929227352, + "rewards/tag_count_reward": 0.9791666865348816, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 245.3020896911621, - "epoch": 0.9295352323838081, - "grad_norm": 61.9794266708475, - "kl": 9.3046875, - "learning_rate": 2.905818257394799e-07, - "loss": 2.2241, - "reward": 2.4837332367897034, - "reward_std": 0.7677433341741562, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9079861491918564, - "rewards/repetition_penalty_reward": -0.004982162558007985, - "rewards/tag_count_reward": 0.8515625149011612, + "completion_length": 182.78646087646484, + "epoch": 0.4649418822647169, + "grad_norm": 3.1536070169922437, + "kl": 0.6181640625, + "learning_rate": 1.2920094489214794e-05, + "loss": 0.4513, + "reward": 2.6318033933639526, + "reward_std": 0.3417645953595638, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.049186245538294315, + "rewards/tag_count_reward": 0.9830729365348816, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 223.09375762939453, - "epoch": 0.9325337331334332, - "grad_norm": 38.9834806127014, - "kl": 6.96875, - "learning_rate": 2.659733348246685e-07, - "loss": 2.2927, - "reward": 2.5947685837745667, - "reward_std": 0.8158188462257385, - "rewards/accuracy_reward": 0.8072916716337204, - "rewards/reasoning_steps_reward": 0.9149305671453476, - "rewards/repetition_penalty_reward": -0.005057800153736025, - "rewards/tag_count_reward": 0.8776041716337204, + "completion_length": 172.5208396911621, + "epoch": 0.4664416947881515, + "grad_norm": 2.2535506878920915, + "kl": 0.5439453125, + "learning_rate": 1.2869893160156144e-05, + "loss": 0.2928, + "reward": 2.554309666156769, + "reward_std": 0.3680322840809822, + "rewards/accuracy_reward": 0.635416679084301, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.042912650387734175, + "rewards/tag_count_reward": 0.9791666716337204, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 237.89063262939453, - "epoch": 0.9355322338830585, - "grad_norm": 93.39130786178272, - "kl": 5.546875, - "learning_rate": 2.4243944160550757e-07, - "loss": 1.9383, - "reward": 2.5787097811698914, - "reward_std": 0.834644541144371, - "rewards/accuracy_reward": 0.817708358168602, - "rewards/reasoning_steps_reward": 0.9010417014360428, - "rewards/repetition_penalty_reward": -0.007227733498439193, - "rewards/tag_count_reward": 0.8671875149011612, + "completion_length": 169.9322967529297, + "epoch": 0.46794150731158607, + "grad_norm": 1.1774122557707811, + "kl": 0.40576171875, + "learning_rate": 1.2819612888605038e-05, + "loss": 0.3948, + "reward": 2.694904148578644, + "reward_std": 0.34949270635843277, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.989583358168602, + "rewards/repetition_penalty_reward": -0.03426253283396363, + "rewards/tag_count_reward": 0.9843750149011612, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 220.5572967529297, - "epoch": 0.9385307346326837, - "grad_norm": 75.42828643371548, - "kl": 5.5546875, - "learning_rate": 2.199827441298863e-07, - "loss": 1.8684, - "reward": 2.61844664812088, - "reward_std": 0.8282945156097412, - "rewards/accuracy_reward": 0.8437500149011612, - "rewards/reasoning_steps_reward": 0.9062500298023224, - "rewards/repetition_penalty_reward": -0.006553410203196108, - "rewards/tag_count_reward": 0.8750000149011612, + "completion_length": 163.50000381469727, + "epoch": 0.46944131983502063, + "grad_norm": 0.6398131246264415, + "kl": 0.33056640625, + "learning_rate": 1.2769255057626879e-05, + "loss": 0.1586, + "reward": 2.7207802534103394, + "reward_std": 0.12394981307443231, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.0240114142652601, + "rewards/tag_count_reward": 0.9947916865348816, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 254.90625762939453, - "epoch": 0.9415292353823088, - "grad_norm": 55.21000753237673, - "kl": 6.890625, - "learning_rate": 1.986057215275816e-07, - "loss": 2.2089, - "reward": 2.4655805230140686, - "reward_std": 0.8631928116083145, - "rewards/accuracy_reward": 0.7135416865348816, - "rewards/reasoning_steps_reward": 0.9010416716337204, - "rewards/repetition_penalty_reward": -0.004471502033993602, - "rewards/tag_count_reward": 0.8554687649011612, + "completion_length": 162.06250762939453, + "epoch": 0.4709411323584552, + "grad_norm": 1.0331986505694868, + "kl": 0.3564453125, + "learning_rate": 1.2718821052420518e-05, + "loss": 0.3133, + "reward": 2.7166225910186768, + "reward_std": 0.23217968584503978, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.029905199247878045, + "rewards/tag_count_reward": 0.9895833432674408, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 199.2916717529297, - "epoch": 0.9445277361319341, - "grad_norm": 13.351484586247325, - "kl": 4.96875, - "learning_rate": 1.7831073373657527e-07, - "loss": 1.2923, - "reward": 2.658263087272644, - "reward_std": 0.573756992816925, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9236111491918564, - "rewards/repetition_penalty_reward": -0.00753560135490261, - "rewards/tag_count_reward": 0.9088541716337204, + "completion_length": 159.98438262939453, + "epoch": 0.47244094488188976, + "grad_norm": 0.8428674192959495, + "kl": 0.3876953125, + "learning_rate": 1.2668312260280136e-05, + "loss": 0.1753, + "reward": 2.8209707736968994, + "reward_std": 0.2351000364869833, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.025383584201335907, + "rewards/tag_count_reward": 0.9869791865348816, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 211.89583587646484, - "epoch": 0.9475262368815592, - "grad_norm": 14.409679464629958, - "kl": 7.03125, - "learning_rate": 1.5910002124251979e-07, - "loss": 2.3063, - "reward": 2.6654553413391113, - "reward_std": 0.7150935083627701, - "rewards/accuracy_reward": 0.8281250298023224, - "rewards/reasoning_steps_reward": 0.9444444924592972, - "rewards/repetition_penalty_reward": -0.002947573288111016, - "rewards/tag_count_reward": 0.895833358168602, + "completion_length": 181.640625, + "epoch": 0.4739407574053243, + "grad_norm": 1.5853800434552652, + "kl": 0.482421875, + "learning_rate": 1.2617730070557079e-05, + "loss": 0.6332, + "reward": 2.6595771312713623, + "reward_std": 0.5877653062343597, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.0517944535240531, + "rewards/tag_count_reward": 0.9648437649011612, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 269.00521087646484, - "epoch": 0.9505247376311844, - "grad_norm": 157.41546141813902, - "kl": 12.15625, - "learning_rate": 1.4097570483140642e-07, - "loss": 2.5569, - "reward": 2.467340648174286, - "reward_std": 0.888208270072937, - "rewards/accuracy_reward": 0.7239583432674408, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.00878784217638895, - "rewards/tag_count_reward": 0.8424479365348816, + "completion_length": 196.3593864440918, + "epoch": 0.4754405699287589, + "grad_norm": 0.6434608924686329, + "kl": 0.37841796875, + "learning_rate": 1.2567075874621658e-05, + "loss": 0.3751, + "reward": 2.7092297673225403, + "reward_std": 0.3774634450674057, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.05769744282588363, + "rewards/tag_count_reward": 0.9700520932674408, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 203.60938262939453, - "epoch": 0.9535232383808095, - "grad_norm": 141.14810050857622, - "kl": 9.65625, - "learning_rate": 1.239397853554336e-07, - "loss": 3.0646, - "reward": 2.610135555267334, - "reward_std": 0.7995207458734512, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.9253472685813904, - "rewards/repetition_penalty_reward": -0.005315949267242104, - "rewards/tag_count_reward": 0.8932291716337204, + "completion_length": 250.70313262939453, + "epoch": 0.4769403824521935, + "grad_norm": 3.085162084216117, + "kl": 0.607421875, + "learning_rate": 1.2516351065824864e-05, + "loss": 0.9745, + "reward": 2.462671160697937, + "reward_std": 0.7659648507833481, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.12196437083184719, + "rewards/tag_count_reward": 0.8971354365348816, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 220.1302146911621, - "epoch": 0.9565217391304348, - "grad_norm": 104.39513312177709, - "kl": 9.4296875, - "learning_rate": 1.0799414351212234e-07, - "loss": 2.6495, - "reward": 2.5906622409820557, - "reward_std": 0.7361035495996475, - "rewards/accuracy_reward": 0.786458358168602, - "rewards/reasoning_steps_reward": 0.9218750596046448, - "rewards/repetition_penalty_reward": -0.006994032271904871, - "rewards/tag_count_reward": 0.8893229365348816, + "completion_length": 337.12500762939453, + "epoch": 0.47844019497562806, + "grad_norm": 2.5910813299992617, + "kl": 0.67919921875, + "learning_rate": 1.2465557039460048e-05, + "loss": 0.9239, + "reward": 2.1963090300559998, + "reward_std": 0.973323866724968, + "rewards/accuracy_reward": 0.598958358168602, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.21167714521288872, + "rewards/tag_count_reward": 0.848958358168602, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 260.72396087646484, - "epoch": 0.95952023988006, - "grad_norm": 172.54963273214497, - "kl": 12.796875, - "learning_rate": 9.314053963669245e-08, - "loss": 2.6928, - "reward": 2.501668691635132, - "reward_std": 0.9300008714199066, - "rewards/accuracy_reward": 0.7968750149011612, - "rewards/reasoning_steps_reward": 0.8750000596046448, - "rewards/repetition_penalty_reward": -0.006144002894870937, - "rewards/tag_count_reward": 0.8359375298023224, + "completion_length": 282.5729293823242, + "epoch": 0.4799400074990626, + "grad_norm": 0.6692834432378754, + "kl": 0.4453125, + "learning_rate": 1.241469519272453e-05, + "loss": 1.0469, + "reward": 2.4757498502731323, + "reward_std": 0.9635120928287506, + "rewards/accuracy_reward": 0.8072916716337204, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.1574968732893467, + "rewards/tag_count_reward": 0.860677108168602, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 222.2656364440918, - "epoch": 0.9625187406296851, - "grad_norm": 58.41749025051054, - "kl": 8.359375, - "learning_rate": 7.938061350773241e-08, - "loss": 2.5917, - "reward": 2.5556360483169556, - "reward_std": 0.8327264338731766, - "rewards/accuracy_reward": 0.7656250298023224, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.006864078342914581, - "rewards/tag_count_reward": 0.880208358168602, + "completion_length": 239.3645896911621, + "epoch": 0.4814398200224972, + "grad_norm": 1.2551570949769288, + "kl": 0.373046875, + "learning_rate": 1.2363766924681178e-05, + "loss": 0.9825, + "reward": 2.528091847896576, + "reward_std": 0.9040358066558838, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9635416716337204, + "rewards/repetition_penalty_reward": -0.10602277517318726, + "rewards/tag_count_reward": 0.8580729365348816, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 199.64583587646484, - "epoch": 0.9655172413793104, - "grad_norm": 21.274250522963346, - "kl": 6.21875, - "learning_rate": 6.671588416617081e-08, - "loss": 2.0353, - "reward": 2.5732468962669373, - "reward_std": 0.6762567013502121, - "rewards/accuracy_reward": 0.7343750149011612, - "rewards/reasoning_steps_reward": 0.934027761220932, - "rewards/repetition_penalty_reward": -0.00661420589312911, - "rewards/tag_count_reward": 0.911458358168602, + "completion_length": 236.21875381469727, + "epoch": 0.48293963254593175, + "grad_norm": 0.7609484269615335, + "kl": 0.39013671875, + "learning_rate": 1.2312773636219919e-05, + "loss": 0.6847, + "reward": 2.2356700897216797, + "reward_std": 0.7553437650203705, + "rewards/accuracy_reward": 0.5729166828095913, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.10764593631029129, + "rewards/tag_count_reward": 0.8190104365348816, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 248.5416717529297, - "epoch": 0.9685157421289355, - "grad_norm": 30.694601447554664, - "kl": 9.1796875, - "learning_rate": 5.5147749747582744e-08, - "loss": 2.2922, - "reward": 2.4823118448257446, - "reward_std": 0.845875695347786, - "rewards/accuracy_reward": 0.7343750149011612, - "rewards/reasoning_steps_reward": 0.8975694477558136, - "rewards/repetition_penalty_reward": -0.0037993842852301896, - "rewards/tag_count_reward": 0.8541667014360428, + "completion_length": 282.00521087646484, + "epoch": 0.4844394450693663, + "grad_norm": 1.332331758605412, + "kl": 0.44873046875, + "learning_rate": 1.2261716730019202e-05, + "loss": 0.9536, + "reward": 2.294684648513794, + "reward_std": 1.0781968086957932, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9131944626569748, + "rewards/repetition_penalty_reward": -0.14585364237427711, + "rewards/tag_count_reward": 0.8085937649011612, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 226.02605056762695, - "epoch": 0.9715142428785607, - "grad_norm": 37.717634935459415, - "kl": 6.8359375, - "learning_rate": 4.467748732783994e-08, - "loss": 2.167, - "reward": 2.6481770277023315, - "reward_std": 0.7722356021404266, - "rewards/accuracy_reward": 0.8385416716337204, - "rewards/reasoning_steps_reward": 0.9270833283662796, - "rewards/repetition_penalty_reward": -0.0015625922533217818, - "rewards/tag_count_reward": 0.884114608168602, + "completion_length": 238.16146850585938, + "epoch": 0.4859392575928009, + "grad_norm": 6163963.310883091, + "kl": 671744.279296875, + "learning_rate": 1.2210597610507418e-05, + "loss": 21419.6289, + "reward": 2.206323742866516, + "reward_std": 0.8206494450569153, + "rewards/accuracy_reward": 0.5312500298023224, + "rewards/reasoning_steps_reward": 0.9496528208255768, + "rewards/repetition_penalty_reward": -0.10661044530570507, + "rewards/tag_count_reward": 0.8320312649011612, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 195.2916717529297, - "epoch": 0.974512743628186, - "grad_norm": 116.25145619245001, - "kl": 4.23828125, - "learning_rate": 3.530625278212685e-08, - "loss": 1.9012, - "reward": 2.604085922241211, - "reward_std": 0.7135767489671707, + "completion_length": 267.7760543823242, + "epoch": 0.4874390701162355, + "grad_norm": 1.168918265059183, + "kl": 0.455078125, + "learning_rate": 1.2159417683824266e-05, + "loss": 0.9769, + "reward": 2.380450427532196, + "reward_std": 0.9683381170034409, "rewards/accuracy_reward": 0.7604166865348816, - "rewards/reasoning_steps_reward": 0.9392361491918564, - "rewards/repetition_penalty_reward": -0.004421164951054379, - "rewards/tag_count_reward": 0.9088541865348816, + "rewards/reasoning_steps_reward": 0.9357639104127884, + "rewards/repetition_penalty_reward": -0.13864684104919434, + "rewards/tag_count_reward": 0.8229166865348816, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 225.10937881469727, - "epoch": 0.9775112443778111, - "grad_norm": 47.023072554995835, - "kl": 6.8359375, - "learning_rate": 2.7035080657338287e-08, - "loss": 2.2076, - "reward": 2.577496290206909, - "reward_std": 0.8721825927495956, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9010417014360428, - "rewards/repetition_penalty_reward": -0.005837142816744745, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 225.47396087646484, + "epoch": 0.48893888263967006, + "grad_norm": 1.2436386439703448, + "kl": 0.51953125, + "learning_rate": 1.2108178357782079e-05, + "loss": 0.7213, + "reward": 2.281162142753601, + "reward_std": 0.5486664846539497, + "rewards/accuracy_reward": 0.5052083432674408, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.08342129364609718, + "rewards/tag_count_reward": 0.880208358168602, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 232.28125762939453, - "epoch": 0.9805097451274363, - "grad_norm": 91.66318973005008, - "kl": 6.0703125, - "learning_rate": 1.986488405786524e-08, - "loss": 2.2354, - "reward": 2.548638343811035, - "reward_std": 0.8752985298633575, - "rewards/accuracy_reward": 0.7812500149011612, - "rewards/reasoning_steps_reward": 0.9079861640930176, - "rewards/repetition_penalty_reward": -0.006483202683739364, - "rewards/tag_count_reward": 0.8658854365348816, + "completion_length": 214.96875762939453, + "epoch": 0.4904386951631046, + "grad_norm": 0.7171717418281628, + "kl": 0.3896484375, + "learning_rate": 1.205688104182709e-05, + "loss": 0.574, + "reward": 2.455682098865509, + "reward_std": 0.5591145232319832, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.07252970896661282, + "rewards/tag_count_reward": 0.8789062649011612, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 203.97396087646484, - "epoch": 0.9835082458770614, - "grad_norm": 93.30979532325884, - "kl": 5.15625, - "learning_rate": 1.3796454544796612e-08, - "loss": 2.018, - "reward": 2.5490445494651794, - "reward_std": 0.7066824734210968, - "rewards/accuracy_reward": 0.7343750149011612, - "rewards/reasoning_steps_reward": 0.921875, - "rewards/repetition_penalty_reward": -0.005643073527608067, - "rewards/tag_count_reward": 0.8984375149011612, + "completion_length": 203.1354217529297, + "epoch": 0.4919385076865392, + "grad_norm": 627.8308768620077, + "kl": 74.83154296875, + "learning_rate": 1.2005527147000663e-05, + "loss": 2.9222, + "reward": 2.3696823120117188, + "reward_std": 0.6213457994163036, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.06347744725644588, + "rewards/tag_count_reward": 0.903645858168602, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 209.05208587646484, - "epoch": 0.9865067466266867, - "grad_norm": 55.95983139614906, - "kl": 5.65625, - "learning_rate": 8.83046204853133e-09, - "loss": 2.0135, - "reward": 2.6783204674720764, - "reward_std": 0.7424749433994293, - "rewards/accuracy_reward": 0.8697916865348816, - "rewards/reasoning_steps_reward": 0.9201388955116272, - "rewards/repetition_penalty_reward": -0.0035372423008084297, - "rewards/tag_count_reward": 0.8919271230697632, + "completion_length": 213.8541717529297, + "epoch": 0.49343832020997375, + "grad_norm": 1.0700058348332315, + "kl": 0.490234375, + "learning_rate": 1.1954118085900503e-05, + "loss": 0.4632, + "reward": 2.4596543312072754, + "reward_std": 0.5267078801989555, + "rewards/accuracy_reward": 0.6354166865348816, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.07202969677746296, + "rewards/tag_count_reward": 0.9257812649011612, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 248.3541717529297, - "epoch": 0.9895052473763118, - "grad_norm": 45.74323780759467, - "kl": 6.8671875, - "learning_rate": 4.967454794823079e-09, - "loss": 2.1326, - "reward": 2.388830304145813, - "reward_std": 0.7917287796735764, - "rewards/accuracy_reward": 0.6354167014360428, - "rewards/reasoning_steps_reward": 0.8993055522441864, - "rewards/repetition_penalty_reward": -0.006569080462213606, - "rewards/tag_count_reward": 0.8606771230697632, + "completion_length": 257.5260467529297, + "epoch": 0.4949381327334083, + "grad_norm": 1.1628503464475846, + "kl": 0.36474609375, + "learning_rate": 1.1902655272641756e-05, + "loss": 0.6424, + "reward": 2.5043947100639343, + "reward_std": 0.7565517127513885, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.10411231219768524, + "rewards/tag_count_reward": 0.9140625149011612, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 226.32291793823242, - "epoch": 0.992503748125937, - "grad_norm": 18.948649605644253, - "kl": 6.3515625, - "learning_rate": 2.2078592442553725e-09, - "loss": 1.9433, - "reward": 2.566007614135742, - "reward_std": 0.7248884737491608, - "rewards/accuracy_reward": 0.7656250149011612, - "rewards/reasoning_steps_reward": 0.923611119389534, - "rewards/repetition_penalty_reward": -0.006041057640686631, - "rewards/tag_count_reward": 0.8828125149011612, + "completion_length": 261.04688262939453, + "epoch": 0.49643794525684287, + "grad_norm": 0.6255524381969977, + "kl": 0.37353515625, + "learning_rate": 1.1851140122818155e-05, + "loss": 0.5753, + "reward": 2.406662940979004, + "reward_std": 0.8139507248997688, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.11417047679424286, + "rewards/tag_count_reward": 0.9010416865348816, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 199.53646087646484, - "epoch": 0.9955022488755623, - "grad_norm": 54.47996371401269, - "kl": 5.13671875, - "learning_rate": 5.519800451625479e-10, - "loss": 1.9846, - "reward": 2.375428855419159, - "reward_std": 0.6744664013385773, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.937500074505806, - "rewards/repetition_penalty_reward": -0.00477961910655722, - "rewards/tag_count_reward": 0.9010416716337204, + "completion_length": 257.12500381469727, + "epoch": 0.4979377577802775, + "grad_norm": 0.7463997657803588, + "kl": 0.373046875, + "learning_rate": 1.1799574053463048e-05, + "loss": 0.6145, + "reward": 2.637652814388275, + "reward_std": 0.6368973329663277, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.9652777761220932, + "rewards/repetition_penalty_reward": -0.09976036194711924, + "rewards/tag_count_reward": 0.9492187649011612, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 230.6927146911621, - "epoch": 0.9985007496251874, - "grad_norm": 58.000699042461555, - "kl": 7.21875, - "learning_rate": 0.0, - "loss": 2.3714, - "reward": 2.6160709261894226, - "reward_std": 0.8584436923265457, - "rewards/accuracy_reward": 0.8437500149011612, - "rewards/reasoning_steps_reward": 0.9062500149011612, - "rewards/repetition_penalty_reward": -0.00632495793979615, - "rewards/tag_count_reward": 0.8723958432674408, + "completion_length": 238.8333396911621, + "epoch": 0.49943757030371205, + "grad_norm": 0.7834690979097384, + "kl": 0.35546875, + "learning_rate": 1.1747958483010438e-05, + "loss": 0.5165, + "reward": 2.6003527641296387, + "reward_std": 0.7082152366638184, + "rewards/accuracy_reward": 0.8020833432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.09148765727877617, + "rewards/tag_count_reward": 0.9244791865348816, "step": 333 }, { - "epoch": 0.9985007496251874, - "step": 333, + "clip_ratio": 0.0, + "completion_length": 236.93750381469727, + "epoch": 0.5009373828271466, + "grad_norm": 0.7440577499317964, + "kl": 0.36376953125, + "learning_rate": 1.1696294831255961e-05, + "loss": 0.5263, + "reward": 2.6781486868858337, + "reward_std": 0.6196286901831627, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.09008055832237005, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.8072967529297, + "epoch": 0.5024371953505812, + "grad_norm": 0.736069132286744, + "kl": 0.4169921875, + "learning_rate": 1.1644584519317828e-05, + "loss": 0.7123, + "reward": 2.399460792541504, + "reward_std": 0.8988363444805145, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9479167014360428, + "rewards/repetition_penalty_reward": -0.12918509356677532, + "rewards/tag_count_reward": 0.9140625149011612, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.28125762939453, + "epoch": 0.5039370078740157, + "grad_norm": 0.5872022790584697, + "kl": 0.36669921875, + "learning_rate": 1.159282896959774e-05, + "loss": 0.6407, + "reward": 2.4547627568244934, + "reward_std": 0.7832493036985397, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.9687500447034836, + "rewards/repetition_penalty_reward": -0.11945611983537674, + "rewards/tag_count_reward": 0.9231770932674408, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.8177146911621, + "epoch": 0.5054368203974503, + "grad_norm": 0.7563101618131821, + "kl": 0.3818359375, + "learning_rate": 1.1541029605741758e-05, + "loss": 0.68, + "reward": 2.5487101078033447, + "reward_std": 0.7277443781495094, + "rewards/accuracy_reward": 0.7395833432674408, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.10580387711524963, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.54166793823242, + "epoch": 0.5069366329208849, + "grad_norm": 0.7629172892618677, + "kl": 0.34228515625, + "learning_rate": 1.1489187852601147e-05, + "loss": 0.4758, + "reward": 2.738324999809265, + "reward_std": 0.49979202449321747, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.056813917588442564, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.77083587646484, + "epoch": 0.5084364454443194, + "grad_norm": 1.2875833340510152, + "kl": 0.4716796875, + "learning_rate": 1.143730513619317e-05, + "loss": 0.6866, + "reward": 2.457393527030945, + "reward_std": 0.8834079504013062, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9322916865348816, + "rewards/repetition_penalty_reward": -0.1311482023447752, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.95312881469727, + "epoch": 0.509936257967754, + "grad_norm": 1.9340766781030916, + "kl": 0.64697265625, + "learning_rate": 1.1385382883661881e-05, + "loss": 0.4332, + "reward": 2.5942657589912415, + "reward_std": 0.6046310998499393, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9600694626569748, + "rewards/repetition_penalty_reward": -0.11710584536194801, + "rewards/tag_count_reward": 0.9335937649011612, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.20313262939453, + "epoch": 0.5114360704911886, + "grad_norm": 0.836838721680916, + "kl": 0.41064453125, + "learning_rate": 1.1333422523238858e-05, + "loss": 0.5716, + "reward": 2.467832863330841, + "reward_std": 0.6241201423108578, + "rewards/accuracy_reward": 0.6302083507180214, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.08598665753379464, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.5885467529297, + "epoch": 0.5129358830146231, + "grad_norm": 0.6233336001065312, + "kl": 0.3583984375, + "learning_rate": 1.1281425484203908e-05, + "loss": 0.5216, + "reward": 2.375839054584503, + "reward_std": 0.570319652557373, + "rewards/accuracy_reward": 0.5781250149011612, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.12459501624107361, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.3385543823242, + "epoch": 0.5144356955380578, + "grad_norm": 0.781462095769395, + "kl": 0.4208984375, + "learning_rate": 1.122939319684577e-05, + "loss": 0.8224, + "reward": 2.3723965883255005, + "reward_std": 0.892381027340889, + "rewards/accuracy_reward": 0.6406250074505806, + "rewards/reasoning_steps_reward": 0.9409722536802292, + "rewards/repetition_penalty_reward": -0.1271694442257285, + "rewards/tag_count_reward": 0.9179687649011612, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.04167556762695, + "epoch": 0.5159355080614924, + "grad_norm": 0.5806453568608354, + "kl": 0.3466796875, + "learning_rate": 1.1177327092422761e-05, + "loss": 0.682, + "reward": 2.4070491790771484, + "reward_std": 0.619476206600666, + "rewards/accuracy_reward": 0.6145833432674408, + "rewards/reasoning_steps_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.1007634038105607, + "rewards/tag_count_reward": 0.9453125149011612, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.41146850585938, + "epoch": 0.5174353205849269, + "grad_norm": 2.1866070331433964, + "kl": 0.57275390625, + "learning_rate": 1.1125228603123408e-05, + "loss": 0.6429, + "reward": 2.6293810606002808, + "reward_std": 0.6588820517063141, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.09327533282339573, + "rewards/tag_count_reward": 0.9570312649011612, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.65105056762695, + "epoch": 0.5189351331083615, + "grad_norm": 0.6514816604889703, + "kl": 0.34228515625, + "learning_rate": 1.107309916202705e-05, + "loss": 0.3713, + "reward": 2.694413661956787, + "reward_std": 0.3578246496617794, + "rewards/accuracy_reward": 0.7864583432674408, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.051680129021406174, + "rewards/tag_count_reward": 0.9752604365348816, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.5416717529297, + "epoch": 0.520434945631796, + "grad_norm": 6.6032813165402935, + "kl": 0.46484375, + "learning_rate": 1.1020940203064425e-05, + "loss": 0.3975, + "reward": 2.6713653802871704, + "reward_std": 0.4076192807406187, + "rewards/accuracy_reward": 0.7604167014360428, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.05563134513795376, + "rewards/tag_count_reward": 0.9752604365348816, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.29688262939453, + "epoch": 0.5219347581552306, + "grad_norm": 2.4618917497796233, + "kl": 0.431640625, + "learning_rate": 1.096875316097822e-05, + "loss": 0.519, + "reward": 2.731937885284424, + "reward_std": 0.5047559663653374, + "rewards/accuracy_reward": 0.8593750298023224, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.07491973042488098, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.98959350585938, + "epoch": 0.5234345706786652, + "grad_norm": 0.6526608511425668, + "kl": 0.38037109375, + "learning_rate": 1.0916539471283607e-05, + "loss": 0.4489, + "reward": 2.611004889011383, + "reward_std": 0.6215125024318695, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.9531250298023224, + "rewards/repetition_penalty_reward": -0.10904721543192863, + "rewards/tag_count_reward": 0.9440104365348816, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.0572967529297, + "epoch": 0.5249343832020997, + "grad_norm": 1.3490617390001882, + "kl": 0.43798828125, + "learning_rate": 1.0864300570228757e-05, + "loss": 0.5916, + "reward": 2.349369764328003, + "reward_std": 0.591891311109066, + "rewards/accuracy_reward": 0.5625000074505806, + "rewards/reasoning_steps_reward": 0.9496527910232544, + "rewards/repetition_penalty_reward": -0.11200192756950855, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.79688262939453, + "epoch": 0.5264341957255343, + "grad_norm": 0.7377470109327692, + "kl": 0.35205078125, + "learning_rate": 1.0812037894755336e-05, + "loss": 0.4294, + "reward": 2.7148218750953674, + "reward_std": 0.4056655466556549, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.06121993251144886, + "rewards/tag_count_reward": 0.973958358168602, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.13021087646484, + "epoch": 0.5279340082489689, + "grad_norm": 0.9262254939525653, + "kl": 0.3154296875, + "learning_rate": 1.0759752882458972e-05, + "loss": 0.4753, + "reward": 2.716402769088745, + "reward_std": 0.4552469737827778, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.05963901709765196, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.2343864440918, + "epoch": 0.5294338207724034, + "grad_norm": 1.2036216767618395, + "kl": 0.32861328125, + "learning_rate": 1.0707446971549717e-05, + "loss": 0.6401, + "reward": 2.590088963508606, + "reward_std": 0.5779636353254318, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.09524106327444315, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.28646087646484, + "epoch": 0.530933633295838, + "grad_norm": 0.8356292683714005, + "kl": 0.3642578125, + "learning_rate": 1.0655121600812482e-05, + "loss": 0.6197, + "reward": 2.7302719950675964, + "reward_std": 0.5387823097407818, + "rewards/accuracy_reward": 0.8593750298023224, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.06920717097818851, + "rewards/tag_count_reward": 0.9609375, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.63542556762695, + "epoch": 0.5324334458192725, + "grad_norm": 0.9550640481800935, + "kl": 0.4052734375, + "learning_rate": 1.0602778209567462e-05, + "loss": 0.7535, + "reward": 2.541829466819763, + "reward_std": 0.7257425487041473, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9548610895872116, + "rewards/repetition_penalty_reward": -0.08751083724200726, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.01562881469727, + "epoch": 0.5339332583427071, + "grad_norm": 1.070677353710695, + "kl": 0.404296875, + "learning_rate": 1.0550418237630547e-05, + "loss": 1.0339, + "reward": 2.5512691140174866, + "reward_std": 0.7947003692388535, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9496527910232544, + "rewards/repetition_penalty_reward": -0.09499827027320862, + "rewards/tag_count_reward": 0.9466145932674408, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.71875762939453, + "epoch": 0.5354330708661418, + "grad_norm": 2.6963525226313068, + "kl": 0.4677734375, + "learning_rate": 1.0498043125273714e-05, + "loss": 0.5703, + "reward": 2.6821329593658447, + "reward_std": 0.52312882989645, + "rewards/accuracy_reward": 0.8020833432674408, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.05831848084926605, + "rewards/tag_count_reward": 0.966145858168602, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.8020896911621, + "epoch": 0.5369328833895763, + "grad_norm": 4.425571727643998, + "kl": 0.5517578125, + "learning_rate": 1.0445654313185402e-05, + "loss": 0.7903, + "reward": 2.470693051815033, + "reward_std": 0.6807428225874901, + "rewards/accuracy_reward": 0.6562500298023224, + "rewards/reasoning_steps_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.09701530169695616, + "rewards/tag_count_reward": 0.942708358168602, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.9947967529297, + "epoch": 0.5384326959130109, + "grad_norm": 2.2749469499741446, + "kl": 0.54541015625, + "learning_rate": 1.0393253242430898e-05, + "loss": 0.766, + "reward": 2.610697090625763, + "reward_std": 0.7646311447024345, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.954861119389534, + "rewards/repetition_penalty_reward": -0.11760163493454456, + "rewards/tag_count_reward": 0.9348958432674408, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.45833587646484, + "epoch": 0.5399325084364455, + "grad_norm": 0.9100444614802594, + "kl": 0.39501953125, + "learning_rate": 1.0340841354412688e-05, + "loss": 0.6344, + "reward": 2.6307865977287292, + "reward_std": 0.5916828811168671, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9513889104127884, + "rewards/repetition_penalty_reward": -0.08752944599837065, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.50521087646484, + "epoch": 0.54143232095988, + "grad_norm": 1.3542848450477425, + "kl": 0.458984375, + "learning_rate": 1.0288420090830803e-05, + "loss": 0.5782, + "reward": 2.5363489985466003, + "reward_std": 0.5847525298595428, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.07215808611363173, + "rewards/tag_count_reward": 0.950520858168602, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.29688262939453, + "epoch": 0.5429321334833146, + "grad_norm": 1.0691193581184522, + "kl": 0.54150390625, + "learning_rate": 1.0235990893643184e-05, + "loss": 0.614, + "reward": 2.446419894695282, + "reward_std": 0.5024325773119926, + "rewards/accuracy_reward": 0.6197916939854622, + "rewards/reasoning_steps_reward": 0.9878472536802292, + "rewards/repetition_penalty_reward": -0.10002116020768881, + "rewards/tag_count_reward": 0.9388020932674408, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.64063262939453, + "epoch": 0.5444319460067492, + "grad_norm": 1.0363445445035244, + "kl": 0.48095703125, + "learning_rate": 1.0183555205025986e-05, + "loss": 0.8301, + "reward": 2.533776819705963, + "reward_std": 0.8274639621376991, + "rewards/accuracy_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.11335871182382107, + "rewards/tag_count_reward": 0.9179687798023224, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.2395896911621, + "epoch": 0.5459317585301837, + "grad_norm": 1.4285370079745647, + "kl": 0.4580078125, + "learning_rate": 1.0131114467333935e-05, + "loss": 0.9421, + "reward": 2.5757275223731995, + "reward_std": 0.940077006816864, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.9409722685813904, + "rewards/repetition_penalty_reward": -0.1282655205577612, + "rewards/tag_count_reward": 0.9140625149011612, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.9322967529297, + "epoch": 0.5474315710536183, + "grad_norm": 2.976909353877276, + "kl": 0.41650390625, + "learning_rate": 1.0078670123060638e-05, + "loss": 0.5808, + "reward": 2.7071104049682617, + "reward_std": 0.5107903629541397, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9878472238779068, + "rewards/repetition_penalty_reward": -0.0724035445600748, + "rewards/tag_count_reward": 0.9531250149011612, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.50000381469727, + "epoch": 0.5489313835770528, + "grad_norm": 1.5290214581037804, + "kl": 0.45263671875, + "learning_rate": 1.002622361479891e-05, + "loss": 0.7503, + "reward": 2.2156243920326233, + "reward_std": 0.8056820183992386, + "rewards/accuracy_reward": 0.510416679084301, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.11901108920574188, + "rewards/tag_count_reward": 0.8658854216337204, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.9895935058594, + "epoch": 0.5504311961004874, + "grad_norm": 1.6300575912920117, + "kl": 0.43505859375, + "learning_rate": 9.973776385201093e-06, + "loss": 0.9551, + "reward": 2.446646273136139, + "reward_std": 0.9035588353872299, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.9756944924592972, + "rewards/repetition_penalty_reward": -0.16316283866763115, + "rewards/tag_count_reward": 0.8736979365348816, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completion_length": 249.50000762939453, + "epoch": 0.551931008623922, + "grad_norm": 2.432545958790851, + "kl": 0.43017578125, + "learning_rate": 9.921329876939365e-06, + "loss": 0.6609, + "reward": 2.449833869934082, + "reward_std": 0.7589901238679886, + "rewards/accuracy_reward": 0.6927083656191826, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.12221488542854786, + "rewards/tag_count_reward": 0.9088542014360428, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.2135543823242, + "epoch": 0.5534308211473565, + "grad_norm": 1.139092646573938, + "kl": 0.4443359375, + "learning_rate": 9.868885532666068e-06, + "loss": 0.9903, + "reward": 2.31951966881752, + "reward_std": 1.0058775544166565, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.9409722685813904, + "rewards/repetition_penalty_reward": -0.17223386466503143, + "rewards/tag_count_reward": 0.8893229514360428, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.46355056762695, + "epoch": 0.5549306336707911, + "grad_norm": 0.9950921053074119, + "kl": 0.3984375, + "learning_rate": 9.816444794974018e-06, + "loss": 0.6193, + "reward": 2.5471015572547913, + "reward_std": 0.6440750285983086, + "rewards/accuracy_reward": 0.7395833730697632, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.10133606940507889, + "rewards/tag_count_reward": 0.9401041716337204, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.4583396911621, + "epoch": 0.5564304461942258, + "grad_norm": 1.440820850213343, + "kl": 0.43701171875, + "learning_rate": 9.76400910635682e-06, + "loss": 0.8709, + "reward": 2.2166844606399536, + "reward_std": 0.7375850528478622, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.11100666876882315, + "rewards/tag_count_reward": 0.9231770932674408, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.1979217529297, + "epoch": 0.5579302587176603, + "grad_norm": 1.089340536225687, + "kl": 0.47412109375, + "learning_rate": 9.7115799091692e-06, + "loss": 0.4976, + "reward": 2.5988621711730957, + "reward_std": 0.6001707911491394, + "rewards/accuracy_reward": 0.7604166716337204, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07778726145625114, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.90625381469727, + "epoch": 0.5594300712410949, + "grad_norm": 1.0735796637227366, + "kl": 0.4130859375, + "learning_rate": 9.659158645587319e-06, + "loss": 0.6675, + "reward": 2.7782450914382935, + "reward_std": 0.5766544118523598, + "rewards/accuracy_reward": 0.9062500149011612, + "rewards/reasoning_steps_reward": 0.9756944328546524, + "rewards/repetition_penalty_reward": -0.06984523870050907, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.63021850585938, + "epoch": 0.5609298837645295, + "grad_norm": 0.6707545410048498, + "kl": 0.36767578125, + "learning_rate": 9.606746757569107e-06, + "loss": 0.6822, + "reward": 2.6383379101753235, + "reward_std": 0.7204165309667587, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9618055820465088, + "rewards/repetition_penalty_reward": -0.09560317918658257, + "rewards/tag_count_reward": 0.9440104216337204, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.52605056762695, + "epoch": 0.562429696287964, + "grad_norm": 0.8820593625713022, + "kl": 0.42138671875, + "learning_rate": 9.554345686814601e-06, + "loss": 0.4017, + "reward": 2.5131508708000183, + "reward_std": 0.4276548661291599, + "rewards/accuracy_reward": 0.6302083507180214, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06410606950521469, + "rewards/tag_count_reward": 0.9609375298023224, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.4479217529297, + "epoch": 0.5639295088113986, + "grad_norm": 0.6991131907883488, + "kl": 0.3740234375, + "learning_rate": 9.501956874726289e-06, + "loss": 0.4624, + "reward": 2.7490947246551514, + "reward_std": 0.4287390150129795, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.05255457712337375, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.77605056762695, + "epoch": 0.5654293213348331, + "grad_norm": 1.725949820619147, + "kl": 0.44384765625, + "learning_rate": 9.449581762369454e-06, + "loss": 0.5851, + "reward": 2.348751664161682, + "reward_std": 0.5454598441720009, + "rewards/accuracy_reward": 0.494791679084301, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.07051914185285568, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.84896850585938, + "epoch": 0.5669291338582677, + "grad_norm": 0.7636716518401006, + "kl": 0.38134765625, + "learning_rate": 9.39722179043254e-06, + "loss": 0.238, + "reward": 2.7479037046432495, + "reward_std": 0.232671735342592, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.04636712512001395, + "rewards/tag_count_reward": 0.9869791865348816, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.34375381469727, + "epoch": 0.5684289463817023, + "grad_norm": 0.832309114343419, + "kl": 0.37158203125, + "learning_rate": 9.344878399187521e-06, + "loss": 0.2999, + "reward": 2.6157588362693787, + "reward_std": 0.41164325177669525, + "rewards/accuracy_reward": 0.7187500223517418, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05872038099914789, + "rewards/tag_count_reward": 0.9765625149011612, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.17187881469727, + "epoch": 0.5699287589051368, + "grad_norm": 1.2211358696196093, + "kl": 0.47216796875, + "learning_rate": 9.292553028450286e-06, + "loss": 0.5705, + "reward": 2.6755369901657104, + "reward_std": 0.49481815844774246, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.09529644902795553, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.0364646911621, + "epoch": 0.5714285714285714, + "grad_norm": 2.076589624203147, + "kl": 0.41943359375, + "learning_rate": 9.240247117541031e-06, + "loss": 0.3464, + "reward": 2.6886950731277466, + "reward_std": 0.3029524376615882, + "rewards/accuracy_reward": 0.760416679084301, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.040471707470715046, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.8385467529297, + "epoch": 0.572928383952006, + "grad_norm": 0.7452755017756123, + "kl": 0.33544921875, + "learning_rate": 9.187962105244667e-06, + "loss": 0.4188, + "reward": 2.848473310470581, + "reward_std": 0.38677336275577545, + "rewards/accuracy_reward": 0.9427083432674408, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.05083235865458846, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.88021087646484, + "epoch": 0.5744281964754405, + "grad_norm": 0.8123392096930626, + "kl": 0.357421875, + "learning_rate": 9.135699429771245e-06, + "loss": 0.4282, + "reward": 2.7217193841934204, + "reward_std": 0.32535652443766594, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.0608327342197299, + "rewards/tag_count_reward": 0.970052108168602, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7864646911621, + "epoch": 0.5759280089988752, + "grad_norm": 0.7045340047560704, + "kl": 0.28662109375, + "learning_rate": 9.083460528716396e-06, + "loss": 0.3565, + "reward": 2.539238929748535, + "reward_std": 0.3681572899222374, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.9982638955116272, + "rewards/repetition_penalty_reward": -0.05537911970168352, + "rewards/tag_count_reward": 0.9869791865348816, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.43750381469727, + "epoch": 0.5774278215223098, + "grad_norm": 1.429073793360656, + "kl": 0.4013671875, + "learning_rate": 9.031246839021783e-06, + "loss": 0.5535, + "reward": 2.4284881353378296, + "reward_std": 0.5695413202047348, + "rewards/accuracy_reward": 0.557291679084301, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.07541824877262115, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.5416717529297, + "epoch": 0.5789276340457443, + "grad_norm": 0.7930980550389279, + "kl": 0.396484375, + "learning_rate": 8.979059796935578e-06, + "loss": 0.3136, + "reward": 2.754866659641266, + "reward_std": 0.33939819782972336, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.056331380270421505, + "rewards/tag_count_reward": 0.977864608168602, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.7135467529297, + "epoch": 0.5804274465691789, + "grad_norm": 2.4297195424195097, + "kl": 0.35107421875, + "learning_rate": 8.926900837972953e-06, + "loss": 0.5168, + "reward": 2.7247214913368225, + "reward_std": 0.5589673742651939, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.0643411623314023, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.0885467529297, + "epoch": 0.5819272590926134, + "grad_norm": 1.0129393824300377, + "kl": 0.34033203125, + "learning_rate": 8.874771396876597e-06, + "loss": 0.6594, + "reward": 2.5896897315979004, + "reward_std": 0.6142508238554001, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.07307065650820732, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.2187614440918, + "epoch": 0.583427071616048, + "grad_norm": 0.6967111778457452, + "kl": 0.333984375, + "learning_rate": 8.822672907577244e-06, + "loss": 0.7214, + "reward": 2.621681034564972, + "reward_std": 0.8044816702604294, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.9548611044883728, + "rewards/repetition_penalty_reward": -0.09620102681219578, + "rewards/tag_count_reward": 0.9401041716337204, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.38541793823242, + "epoch": 0.5849268841394826, + "grad_norm": 0.8231881380174958, + "kl": 0.31884765625, + "learning_rate": 8.770606803154235e-06, + "loss": 0.2053, + "reward": 2.5868722200393677, + "reward_std": 0.4082149714231491, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9878472238779068, + "rewards/repetition_penalty_reward": -0.03899595024995506, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.2395896911621, + "epoch": 0.5864266966629171, + "grad_norm": 0.7279318824033988, + "kl": 0.353515625, + "learning_rate": 8.718574515796099e-06, + "loss": 0.6324, + "reward": 2.612482249736786, + "reward_std": 0.6807690560817719, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.09237884357571602, + "rewards/tag_count_reward": 0.942708358168602, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.40625762939453, + "epoch": 0.5879265091863517, + "grad_norm": 1.6569386224229496, + "kl": 0.43408203125, + "learning_rate": 8.666577476761147e-06, + "loss": 0.9446, + "reward": 2.3660694360733032, + "reward_std": 1.0445173233747482, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9288194477558136, + "rewards/repetition_penalty_reward": -0.1317605171352625, + "rewards/tag_count_reward": 0.9023437649011612, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.8489685058594, + "epoch": 0.5894263217097863, + "grad_norm": 1.3890685202023243, + "kl": 0.42333984375, + "learning_rate": 8.61461711633812e-06, + "loss": 0.7734, + "reward": 2.335377037525177, + "reward_std": 1.022938460111618, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.925347238779068, + "rewards/repetition_penalty_reward": -0.16418905928730965, + "rewards/tag_count_reward": 0.8867187649011612, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.44792556762695, + "epoch": 0.5909261342332208, + "grad_norm": 1.758344512665889, + "kl": 0.47607421875, + "learning_rate": 8.562694863806833e-06, + "loss": 0.7599, + "reward": 2.0866143703460693, + "reward_std": 1.0357710719108582, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.19246553257107735, + "rewards/tag_count_reward": 0.8763020932674408, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.70314025878906, + "epoch": 0.5924259467566554, + "grad_norm": 0.7342927358013376, + "kl": 0.3798828125, + "learning_rate": 8.510812147398857e-06, + "loss": 0.6993, + "reward": 2.433952748775482, + "reward_std": 0.7940848171710968, + "rewards/accuracy_reward": 0.661458358168602, + "rewards/reasoning_steps_reward": 0.9461806118488312, + "rewards/repetition_penalty_reward": -0.10988406464457512, + "rewards/tag_count_reward": 0.9361979216337204, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.14063262939453, + "epoch": 0.59392575928009, + "grad_norm": 0.7272681746221773, + "kl": 0.40625, + "learning_rate": 8.458970394258244e-06, + "loss": 0.6893, + "reward": 2.5480750799179077, + "reward_std": 0.6472647786140442, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.09558827057480812, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.07813262939453, + "epoch": 0.5954255718035245, + "grad_norm": 1.774304333205819, + "kl": 0.43408203125, + "learning_rate": 8.407171030402263e-06, + "loss": 0.9403, + "reward": 2.3916409015655518, + "reward_std": 0.9467423260211945, + "rewards/accuracy_reward": 0.6979167014360428, + "rewards/reasoning_steps_reward": 0.9218750149011612, + "rewards/repetition_penalty_reward": -0.14221329241991043, + "rewards/tag_count_reward": 0.9140625298023224, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.25000762939453, + "epoch": 0.5969253843269592, + "grad_norm": 0.757311691683406, + "kl": 0.38232421875, + "learning_rate": 8.355415480682176e-06, + "loss": 0.5822, + "reward": 2.5999165177345276, + "reward_std": 0.6902973502874374, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9670139402151108, + "rewards/repetition_penalty_reward": -0.10017046704888344, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.0260467529297, + "epoch": 0.5984251968503937, + "grad_norm": 0.8003986775615329, + "kl": 0.32861328125, + "learning_rate": 8.303705168744042e-06, + "loss": 0.8742, + "reward": 2.4283345341682434, + "reward_std": 0.7666500955820084, + "rewards/accuracy_reward": 0.6302083432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.1033495794981718, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.21355056762695, + "epoch": 0.5999250093738283, + "grad_norm": 1.0199487663866538, + "kl": 0.4130859375, + "learning_rate": 8.252041516989565e-06, + "loss": 0.9242, + "reward": 2.414180636405945, + "reward_std": 0.9542115926742554, + "rewards/accuracy_reward": 0.703125, + "rewards/reasoning_steps_reward": 0.9444444626569748, + "rewards/repetition_penalty_reward": -0.14614922180771828, + "rewards/tag_count_reward": 0.9127604216337204, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.81771087646484, + "epoch": 0.6014248218972629, + "grad_norm": 1.0978785601748482, + "kl": 0.39208984375, + "learning_rate": 8.200425946536956e-06, + "loss": 0.8053, + "reward": 2.522300899028778, + "reward_std": 0.8708243519067764, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.11137974262237549, + "rewards/tag_count_reward": 0.927083358168602, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.26562881469727, + "epoch": 0.6029246344206974, + "grad_norm": 0.842070280135897, + "kl": 0.31103515625, + "learning_rate": 8.148859877181849e-06, + "loss": 0.7132, + "reward": 2.6524030566215515, + "reward_std": 0.7079126834869385, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.9635417014360428, + "rewards/repetition_penalty_reward": -0.10540947876870632, + "rewards/tag_count_reward": 0.9453125149011612, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.54687881469727, + "epoch": 0.604424446944132, + "grad_norm": 9.414923410713605, + "kl": 0.35693359375, + "learning_rate": 8.097344727358247e-06, + "loss": 0.6565, + "reward": 2.452816426753998, + "reward_std": 0.8280535340309143, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.10447533056139946, + "rewards/tag_count_reward": 0.942708358168602, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.87500381469727, + "epoch": 0.6059242594675666, + "grad_norm": 1.1724536798403964, + "kl": 0.390625, + "learning_rate": 8.045881914099503e-06, + "loss": 0.5066, + "reward": 2.6050479412078857, + "reward_std": 0.6001375466585159, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.06943127233535051, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.69792938232422, + "epoch": 0.6074240719910011, + "grad_norm": 0.8089367863487846, + "kl": 0.3916015625, + "learning_rate": 7.99447285299934e-06, + "loss": 0.8552, + "reward": 2.4925881028175354, + "reward_std": 0.8843167871236801, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.942708358168602, + "rewards/repetition_penalty_reward": -0.12069321796298027, + "rewards/tag_count_reward": 0.9257812649011612, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.58334732055664, + "epoch": 0.6089238845144357, + "grad_norm": 67.41761784039255, + "kl": 2.849609375, + "learning_rate": 7.943118958172917e-06, + "loss": 0.7802, + "reward": 2.5025484561920166, + "reward_std": 0.8601765781641006, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.9340278208255768, + "rewards/repetition_penalty_reward": -0.11377106420695782, + "rewards/tag_count_reward": 0.927083358168602, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.80209350585938, + "epoch": 0.6104236970378702, + "grad_norm": 50.58089501980667, + "kl": 3.66796875, + "learning_rate": 7.891821642217926e-06, + "loss": 0.8609, + "reward": 2.4164522886276245, + "reward_std": 0.693310096859932, + "rewards/accuracy_reward": 0.6354166716337204, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.1191379763185978, + "rewards/tag_count_reward": 0.9348958432674408, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.53125381469727, + "epoch": 0.6119235095613048, + "grad_norm": 1.7854042705861606, + "kl": 0.47021484375, + "learning_rate": 7.840582316175737e-06, + "loss": 0.6802, + "reward": 2.4898502826690674, + "reward_std": 0.7629896551370621, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.13645190559327602, + "rewards/tag_count_reward": 0.9231770932674408, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.7708435058594, + "epoch": 0.6134233220847394, + "grad_norm": 0.6181557051707668, + "kl": 0.3818359375, + "learning_rate": 7.789402389492582e-06, + "loss": 0.7549, + "reward": 2.4589642882347107, + "reward_std": 0.831140786409378, + "rewards/accuracy_reward": 0.6979166939854622, + "rewards/reasoning_steps_reward": 0.9548611342906952, + "rewards/repetition_penalty_reward": -0.13001148030161858, + "rewards/tag_count_reward": 0.9361979514360428, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.5104293823242, + "epoch": 0.6149231346081739, + "grad_norm": 0.9617852079175642, + "kl": 0.36083984375, + "learning_rate": 7.738283269980798e-06, + "loss": 0.7348, + "reward": 2.555156171321869, + "reward_std": 0.8534037470817566, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.11845514364540577, + "rewards/tag_count_reward": 0.9218750149011612, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.22916793823242, + "epoch": 0.6164229471316085, + "grad_norm": 0.7921173872233128, + "kl": 0.29833984375, + "learning_rate": 7.687226363780084e-06, + "loss": 0.5226, + "reward": 2.4889369010925293, + "reward_std": 0.5284189339727163, + "rewards/accuracy_reward": 0.6406250074505806, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.08094170223921537, + "rewards/tag_count_reward": 0.9570312649011612, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.95312881469727, + "epoch": 0.6179227596550432, + "grad_norm": 0.6969647943689223, + "kl": 0.3193359375, + "learning_rate": 7.636233075318824e-06, + "loss": 0.3959, + "reward": 2.670749843120575, + "reward_std": 0.4937814176082611, + "rewards/accuracy_reward": 0.7968750298023224, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.06666341191157699, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.8854217529297, + "epoch": 0.6194225721784777, + "grad_norm": 0.9557907221459236, + "kl": 0.310546875, + "learning_rate": 7.585304807275473e-06, + "loss": 0.571, + "reward": 2.531363546848297, + "reward_std": 0.6936883656308055, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.9427083283662796, + "rewards/repetition_penalty_reward": -0.0910323103889823, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.11458587646484, + "epoch": 0.6209223847019123, + "grad_norm": 0.8453338172094219, + "kl": 0.28857421875, + "learning_rate": 7.534442960539956e-06, + "loss": 0.3905, + "reward": 2.7355194687843323, + "reward_std": 0.49409135431051254, + "rewards/accuracy_reward": 0.8385416716337204, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.05527916317805648, + "rewards/tag_count_reward": 0.9713542014360428, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completion_length": 182.48437881469727, + "epoch": 0.6224221972253469, + "grad_norm": 0.7857191402885518, + "kl": 0.303955078125, + "learning_rate": 7.483648934175138e-06, + "loss": 0.1902, + "reward": 2.6866626739501953, + "reward_std": 0.28315746411681175, + "rewards/accuracy_reward": 0.7395833432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.042504156939685345, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 192.37500762939453, + "epoch": 0.6239220097487814, + "grad_norm": 1.0846986274613384, + "kl": 0.35107421875, + "learning_rate": 7.432924125378345e-06, + "loss": 0.3789, + "reward": 2.4783818125724792, + "reward_std": 0.5089812204241753, + "rewards/accuracy_reward": 0.5937500223517418, + "rewards/reasoning_steps_reward": 0.9809028208255768, + "rewards/repetition_penalty_reward": -0.0624168599024415, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.28125762939453, + "epoch": 0.625421822272216, + "grad_norm": 0.6289618055873664, + "kl": 0.32958984375, + "learning_rate": 7.382269929442925e-06, + "loss": 0.6525, + "reward": 2.729068160057068, + "reward_std": 0.6565304845571518, + "rewards/accuracy_reward": 0.9062500149011612, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.09384858049452305, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.7187614440918, + "epoch": 0.6269216347956506, + "grad_norm": 2.2511596836127628, + "kl": 0.37158203125, + "learning_rate": 7.331687739719868e-06, + "loss": 0.6599, + "reward": 2.739536941051483, + "reward_std": 0.7113066837191582, + "rewards/accuracy_reward": 0.9062500149011612, + "rewards/reasoning_steps_reward": 0.9652778059244156, + "rewards/repetition_penalty_reward": -0.08251171908341348, + "rewards/tag_count_reward": 0.950520858168602, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.0260467529297, + "epoch": 0.6284214473190851, + "grad_norm": 0.7481899237512065, + "kl": 0.3017578125, + "learning_rate": 7.281178947579484e-06, + "loss": 0.4911, + "reward": 2.677112579345703, + "reward_std": 0.500836968421936, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 0.9843750447034836, + "rewards/repetition_penalty_reward": -0.07549162488430738, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.0572967529297, + "epoch": 0.6299212598425197, + "grad_norm": 2.4166560990937582, + "kl": 0.6650390625, + "learning_rate": 7.230744942373125e-06, + "loss": 0.5609, + "reward": 2.6925852298736572, + "reward_std": 0.5387180671095848, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.08041842561215162, + "rewards/tag_count_reward": 0.9570312649011612, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.2083396911621, + "epoch": 0.6314210723659542, + "grad_norm": 3.5367701148683515, + "kl": 0.3935546875, + "learning_rate": 7.1803871113949675e-06, + "loss": 0.5299, + "reward": 2.5729278326034546, + "reward_std": 0.5818023979663849, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.07290551625192165, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.77083587646484, + "epoch": 0.6329208848893888, + "grad_norm": 0.81390385216287, + "kl": 0.33544921875, + "learning_rate": 7.13010683984386e-06, + "loss": 0.4888, + "reward": 2.656929612159729, + "reward_std": 0.4153623729944229, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04923371039330959, + "rewards/tag_count_reward": 0.9804687649011612, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.2291717529297, + "epoch": 0.6344206974128234, + "grad_norm": 0.7193436114614246, + "kl": 0.322998046875, + "learning_rate": 7.07990551078521e-06, + "loss": 0.5418, + "reward": 2.480729579925537, + "reward_std": 0.5471135228872299, + "rewards/accuracy_reward": 0.614583358168602, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.07048568688333035, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.1458396911621, + "epoch": 0.6359205099362579, + "grad_norm": 1.2355026811447927, + "kl": 0.39794921875, + "learning_rate": 7.029784505112948e-06, + "loss": 0.6063, + "reward": 2.551844298839569, + "reward_std": 0.5733724534511566, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.07836401462554932, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completion_length": 185.15625762939453, + "epoch": 0.6374203224596925, + "grad_norm": 0.7168826565129065, + "kl": 0.306640625, + "learning_rate": 6.979745201511531e-06, + "loss": 0.3983, + "reward": 2.7733737230300903, + "reward_std": 0.3679837482050061, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0534491918515414, + "rewards/tag_count_reward": 0.97265625, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 189.7760467529297, + "epoch": 0.6389201349831272, + "grad_norm": 1.2687322158065155, + "kl": 0.318359375, + "learning_rate": 6.929788976418044e-06, + "loss": 0.6424, + "reward": 2.7650052905082703, + "reward_std": 0.5291210561990738, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.053571161814033985, + "rewards/tag_count_reward": 0.966145858168602, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.81250762939453, + "epoch": 0.6404199475065617, + "grad_norm": 1.6352357141251739, + "kl": 0.33349609375, + "learning_rate": 6.879917203984306e-06, + "loss": 0.6536, + "reward": 2.448298752307892, + "reward_std": 0.5732715502381325, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.07210064399987459, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.97917556762695, + "epoch": 0.6419197600299963, + "grad_norm": 1.886886621974555, + "kl": 0.58642578125, + "learning_rate": 6.830131256039094e-06, + "loss": 0.5201, + "reward": 2.543819785118103, + "reward_std": 0.5805186182260513, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07814204692840576, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.07813262939453, + "epoch": 0.6434195725534309, + "grad_norm": 1.0666842338328122, + "kl": 0.36376953125, + "learning_rate": 6.7804325020504e-06, + "loss": 0.6236, + "reward": 2.523444414138794, + "reward_std": 0.4915116261690855, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.07421195041388273, + "rewards/tag_count_reward": 0.962239608168602, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.85937881469727, + "epoch": 0.6449193850768654, + "grad_norm": 1.060449887686782, + "kl": 0.2978515625, + "learning_rate": 6.730822309087756e-06, + "loss": 0.5033, + "reward": 2.6772631406784058, + "reward_std": 0.48493412137031555, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.06666056625545025, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.1614646911621, + "epoch": 0.6464191976003, + "grad_norm": 1.3061723100608225, + "kl": 0.361328125, + "learning_rate": 6.6813020417846456e-06, + "loss": 0.7871, + "reward": 2.4965052604675293, + "reward_std": 0.7866370305418968, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9618055522441864, + "rewards/repetition_penalty_reward": -0.12024837546050549, + "rewards/tag_count_reward": 0.930989608168602, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.40105056762695, + "epoch": 0.6479190101237345, + "grad_norm": 0.7888584050372874, + "kl": 0.3642578125, + "learning_rate": 6.6318730623009465e-06, + "loss": 0.8798, + "reward": 2.5130550861358643, + "reward_std": 1.0070786774158478, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.942708358168602, + "rewards/repetition_penalty_reward": -0.14840331301093102, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.69272232055664, + "epoch": 0.6494188226471691, + "grad_norm": 2.6122932218321706, + "kl": 0.51123046875, + "learning_rate": 6.582536730285476e-06, + "loss": 0.847, + "reward": 2.5346211194992065, + "reward_std": 0.9135303720831871, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.1350838476791978, + "rewards/tag_count_reward": 0.8971354365348816, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.45313262939453, + "epoch": 0.6509186351706037, + "grad_norm": 0.8518186687660756, + "kl": 0.46337890625, + "learning_rate": 6.5332944028385885e-06, + "loss": 0.5872, + "reward": 2.459271728992462, + "reward_std": 0.7705230340361595, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9479167014360428, + "rewards/repetition_penalty_reward": -0.14489501249045134, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.73438262939453, + "epoch": 0.6524184476940382, + "grad_norm": 0.9613160246102254, + "kl": 0.443359375, + "learning_rate": 6.484147434474837e-06, + "loss": 0.9578, + "reward": 2.159851849079132, + "reward_std": 0.9833470582962036, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.942708358168602, + "rewards/repetition_penalty_reward": -0.16957524791359901, + "rewards/tag_count_reward": 0.8658854365348816, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.9010467529297, + "epoch": 0.6539182602174728, + "grad_norm": 0.8755106034657409, + "kl": 0.4609375, + "learning_rate": 6.435097177085728e-06, + "loss": 0.8985, + "reward": 2.2428812980651855, + "reward_std": 0.9718329310417175, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.1755215786397457, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.41667556762695, + "epoch": 0.6554180727409074, + "grad_norm": 0.7422509830131178, + "kl": 0.373046875, + "learning_rate": 6.386144979902527e-06, + "loss": 0.703, + "reward": 2.3507115840911865, + "reward_std": 0.7452640384435654, + "rewards/accuracy_reward": 0.598958358168602, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.11196202971041203, + "rewards/tag_count_reward": 0.9088541865348816, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.5208435058594, + "epoch": 0.6569178852643419, + "grad_norm": 0.7991022183328756, + "kl": 0.361328125, + "learning_rate": 6.337292189459139e-06, + "loss": 0.7431, + "reward": 2.456838309764862, + "reward_std": 0.9040014296770096, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.12041868269443512, + "rewards/tag_count_reward": 0.9036458432674408, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.63542938232422, + "epoch": 0.6584176977877765, + "grad_norm": 1.3902018588463414, + "kl": 0.35986328125, + "learning_rate": 6.2885401495550826e-06, + "loss": 0.7122, + "reward": 2.5194268226623535, + "reward_std": 0.7179795950651169, + "rewards/accuracy_reward": 0.729166679084301, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.09732666984200478, + "rewards/tag_count_reward": 0.9205729365348816, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.578125, + "epoch": 0.6599175103112112, + "grad_norm": 0.9152954380082454, + "kl": 0.36083984375, + "learning_rate": 6.239890201218517e-06, + "loss": 0.492, + "reward": 2.469900608062744, + "reward_std": 0.6492541283369064, + "rewards/accuracy_reward": 0.651041679084301, + "rewards/reasoning_steps_reward": 0.9600694477558136, + "rewards/repetition_penalty_reward": -0.07220023218542337, + "rewards/tag_count_reward": 0.9309895932674408, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.1458396911621, + "epoch": 0.6614173228346457, + "grad_norm": 1.5238075521377912, + "kl": 0.3486328125, + "learning_rate": 6.191343682669357e-06, + "loss": 0.6152, + "reward": 2.59203839302063, + "reward_std": 0.5786980837583542, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.08374295756220818, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.8854217529297, + "epoch": 0.6629171353580803, + "grad_norm": 1.014568496944409, + "kl": 0.3203125, + "learning_rate": 6.142901929282459e-06, + "loss": 0.5185, + "reward": 2.4907559752464294, + "reward_std": 0.527790479362011, + "rewards/accuracy_reward": 0.6093750074505806, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.06783778499811888, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.52083587646484, + "epoch": 0.6644169478815148, + "grad_norm": 0.8930401982873789, + "kl": 0.3203125, + "learning_rate": 6.094566273550899e-06, + "loss": 0.554, + "reward": 2.7469860911369324, + "reward_std": 0.5041925981640816, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555671453476, + "rewards/repetition_penalty_reward": -0.06507986783981323, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completion_length": 223.73438262939453, + "epoch": 0.6659167604049494, + "grad_norm": 1.588063897389527, + "kl": 0.3759765625, + "learning_rate": 6.046338045049307e-06, + "loss": 0.3918, + "reward": 2.5898342728614807, + "reward_std": 0.5453440099954605, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.07075604610145092, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.0677146911621, + "epoch": 0.667416572928384, + "grad_norm": 0.7683663208645458, + "kl": 0.33544921875, + "learning_rate": 5.998218570397298e-06, + "loss": 0.5318, + "reward": 2.6227740049362183, + "reward_std": 0.56145179271698, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.06646222807466984, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.59375381469727, + "epoch": 0.6689163854518185, + "grad_norm": 2.006829340741143, + "kl": 0.36865234375, + "learning_rate": 5.950209173222985e-06, + "loss": 0.4647, + "reward": 2.7126694917678833, + "reward_std": 0.5051566585898399, + "rewards/accuracy_reward": 0.8281250298023224, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.07031677477061749, + "rewards/tag_count_reward": 0.9687500149011612, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.4062614440918, + "epoch": 0.6704161979752531, + "grad_norm": 0.8834971529763043, + "kl": 0.36572265625, + "learning_rate": 5.902311174126565e-06, + "loss": 0.4519, + "reward": 2.7030810117721558, + "reward_std": 0.49621870182454586, + "rewards/accuracy_reward": 0.8645833432674408, + "rewards/reasoning_steps_reward": 0.967013880610466, + "rewards/repetition_penalty_reward": -0.08033910719677806, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.4427146911621, + "epoch": 0.6719160104986877, + "grad_norm": 0.98109027161852, + "kl": 0.359375, + "learning_rate": 5.854525890643996e-06, + "loss": 0.3661, + "reward": 2.567684829235077, + "reward_std": 0.5093120224773884, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.060353430919349194, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.7916717529297, + "epoch": 0.6734158230221222, + "grad_norm": 0.8311384203525323, + "kl": 0.34130859375, + "learning_rate": 5.806854637210752e-06, + "loss": 0.4174, + "reward": 2.7751079201698303, + "reward_std": 0.3471484985202551, + "rewards/accuracy_reward": 0.8750000149011612, + "rewards/reasoning_steps_reward": 0.9826389253139496, + "rewards/repetition_penalty_reward": -0.0551872905343771, + "rewards/tag_count_reward": 0.9726562798023224, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.1354217529297, + "epoch": 0.6749156355455568, + "grad_norm": 0.8629385750603632, + "kl": 0.341796875, + "learning_rate": 5.759298725125671e-06, + "loss": 0.2358, + "reward": 2.6736323833465576, + "reward_std": 0.344327449798584, + "rewards/accuracy_reward": 0.755208358168602, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.04251353442668915, + "rewards/tag_count_reward": 0.9765625298023224, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.3958396911621, + "epoch": 0.6764154480689913, + "grad_norm": 0.9794837217183727, + "kl": 0.3740234375, + "learning_rate": 5.711859462514883e-06, + "loss": 0.4027, + "reward": 2.6471253037452698, + "reward_std": 0.46060725301504135, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.049923318438231945, + "rewards/tag_count_reward": 0.9609375298023224, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.30730056762695, + "epoch": 0.6779152605924259, + "grad_norm": 1.191297396076842, + "kl": 0.4599609375, + "learning_rate": 5.664538154295827e-06, + "loss": 0.7305, + "reward": 2.573324501514435, + "reward_std": 0.6858862191438675, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.0768492091447115, + "rewards/tag_count_reward": 0.950520858168602, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.9322967529297, + "epoch": 0.6794150731158605, + "grad_norm": 1.4404111173702936, + "kl": 0.43310546875, + "learning_rate": 5.617336102141356e-06, + "loss": 0.5628, + "reward": 2.5157384872436523, + "reward_std": 0.43997257202863693, + "rewards/accuracy_reward": 0.630208358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06542493868619204, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.62500381469727, + "epoch": 0.6809148856392951, + "grad_norm": 1.959555679611255, + "kl": 0.46826171875, + "learning_rate": 5.570254604443929e-06, + "loss": 0.7598, + "reward": 2.588327646255493, + "reward_std": 0.7168947905302048, + "rewards/accuracy_reward": 0.765625, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.08484961278736591, + "rewards/tag_count_reward": 0.9440104365348816, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.7395896911621, + "epoch": 0.6824146981627297, + "grad_norm": 148.69573978924745, + "kl": 3.85595703125, + "learning_rate": 5.5232949562799055e-06, + "loss": 1.3481, + "reward": 2.4738574028015137, + "reward_std": 0.779233306646347, + "rewards/accuracy_reward": 0.6770833535119891, + "rewards/reasoning_steps_reward": 0.9618055522441864, + "rewards/repetition_penalty_reward": -0.09862528461962938, + "rewards/tag_count_reward": 0.9335937798023224, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.4583396911621, + "epoch": 0.6839145106861643, + "grad_norm": 1.9450934585599418, + "kl": 0.41796875, + "learning_rate": 5.4764584493739095e-06, + "loss": 0.7255, + "reward": 2.364755541086197, + "reward_std": 0.6270653009414673, + "rewards/accuracy_reward": 0.5312500149011612, + "rewards/reasoning_steps_reward": 0.9809027761220932, + "rewards/repetition_penalty_reward": -0.09010571241378784, + "rewards/tag_count_reward": 0.942708358168602, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.8697967529297, + "epoch": 0.6854143232095988, + "grad_norm": 1.9127619691878879, + "kl": 0.4794921875, + "learning_rate": 5.429746372063309e-06, + "loss": 0.8757, + "reward": 2.4884825348854065, + "reward_std": 0.784279853105545, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.963541716337204, + "rewards/repetition_penalty_reward": -0.09224662370979786, + "rewards/tag_count_reward": 0.9401041865348816, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.8489646911621, + "epoch": 0.6869141357330334, + "grad_norm": 1.2094457444337063, + "kl": 0.4833984375, + "learning_rate": 5.3831600092627704e-06, + "loss": 0.714, + "reward": 2.73754620552063, + "reward_std": 0.6269456818699837, + "rewards/accuracy_reward": 0.895833358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.08667258732020855, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.3385467529297, + "epoch": 0.688413948256468, + "grad_norm": 1.970689168479106, + "kl": 0.4453125, + "learning_rate": 5.336700642428913e-06, + "loss": 0.5537, + "reward": 2.5139536261558533, + "reward_std": 0.5696927979588509, + "rewards/accuracy_reward": 0.6458333358168602, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.06634163623675704, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.3125114440918, + "epoch": 0.6899137607799025, + "grad_norm": 0.7625934263204471, + "kl": 0.36962890625, + "learning_rate": 5.290369549525066e-06, + "loss": 0.6621, + "reward": 2.52715528011322, + "reward_std": 0.7988520860671997, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.12345251068472862, + "rewards/tag_count_reward": 0.9283854365348816, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.43750762939453, + "epoch": 0.6914135733033371, + "grad_norm": 0.989857600799216, + "kl": 0.39501953125, + "learning_rate": 5.2441680049861125e-06, + "loss": 0.6605, + "reward": 2.4125567078590393, + "reward_std": 0.6474236696958542, + "rewards/accuracy_reward": 0.5677083432674408, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.0713843759149313, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.6875114440918, + "epoch": 0.6929133858267716, + "grad_norm": 0.8986450074706076, + "kl": 0.3359375, + "learning_rate": 5.198097279683434e-06, + "loss": 0.604, + "reward": 2.4094382524490356, + "reward_std": 0.6442625038325787, + "rewards/accuracy_reward": 0.614583358168602, + "rewards/reasoning_steps_reward": 0.9513889104127884, + "rewards/repetition_penalty_reward": -0.09273206302896142, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.44271087646484, + "epoch": 0.6944131983502062, + "grad_norm": 606.4249596937021, + "kl": 24.65576171875, + "learning_rate": 5.152158640889947e-06, + "loss": 1.5527, + "reward": 2.6447129249572754, + "reward_std": 0.4999554455280304, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.06882883794605732, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.21875762939453, + "epoch": 0.6959130108736408, + "grad_norm": 1.5220309197508348, + "kl": 0.36181640625, + "learning_rate": 5.106353352245254e-06, + "loss": 0.5723, + "reward": 2.6008208990097046, + "reward_std": 0.5897117927670479, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.08667923882603645, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.75521087646484, + "epoch": 0.6974128233970753, + "grad_norm": 1.3974522149419788, + "kl": 0.3447265625, + "learning_rate": 5.060682673720878e-06, + "loss": 0.5287, + "reward": 2.4272356927394867, + "reward_std": 0.3737209364771843, + "rewards/accuracy_reward": 0.5156250251457095, + "rewards/reasoning_steps_reward": 0.9878472536802292, + "rewards/repetition_penalty_reward": -0.05019487999379635, + "rewards/tag_count_reward": 0.973958358168602, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.8541717529297, + "epoch": 0.6989126359205099, + "grad_norm": 1.6330449051409097, + "kl": 0.28955078125, + "learning_rate": 5.015147861585603e-06, + "loss": 0.5821, + "reward": 2.5756843090057373, + "reward_std": 0.48887188360095024, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.05495820567011833, + "rewards/tag_count_reward": 0.9674479216337204, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.62500381469727, + "epoch": 0.7004124484439445, + "grad_norm": 1.0551997003189855, + "kl": 0.2900390625, + "learning_rate": 4.969750168370924e-06, + "loss": 0.6587, + "reward": 2.587502121925354, + "reward_std": 0.6842982918024063, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9565972238779068, + "rewards/repetition_penalty_reward": -0.09175141900777817, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.6979217529297, + "epoch": 0.7019122609673791, + "grad_norm": 1.172638919017115, + "kl": 0.3447265625, + "learning_rate": 4.924490842836584e-06, + "loss": 0.6767, + "reward": 2.4782765209674835, + "reward_std": 0.6802399158477783, + "rewards/accuracy_reward": 0.6510416939854622, + "rewards/reasoning_steps_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.08552560955286026, + "rewards/tag_count_reward": 0.9440104365348816, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.8020896911621, + "epoch": 0.7034120734908137, + "grad_norm": 1.208509544801629, + "kl": 0.30029296875, + "learning_rate": 4.879371129936233e-06, + "loss": 0.7641, + "reward": 2.485842287540436, + "reward_std": 0.7981886714696884, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.11224810406565666, + "rewards/tag_count_reward": 0.9348958432674408, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.69271087646484, + "epoch": 0.7049118860142483, + "grad_norm": 1.2784121913501234, + "kl": 0.337890625, + "learning_rate": 4.834392270783183e-06, + "loss": 0.8845, + "reward": 2.6402639150619507, + "reward_std": 0.8152274489402771, + "rewards/accuracy_reward": 0.8437500298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.09411128051578999, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.55209350585938, + "epoch": 0.7064116985376828, + "grad_norm": 0.9623091877361314, + "kl": 0.330078125, + "learning_rate": 4.789555502616258e-06, + "loss": 0.7554, + "reward": 2.4469590187072754, + "reward_std": 0.7040076702833176, + "rewards/accuracy_reward": 0.6458333507180214, + "rewards/reasoning_steps_reward": 0.9565972238779068, + "rewards/repetition_penalty_reward": -0.09427369153127074, + "rewards/tag_count_reward": 0.9388020932674408, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completion_length": 178.8333396911621, + "epoch": 0.7079115110611174, + "grad_norm": 1.0260244805761438, + "kl": 0.296875, + "learning_rate": 4.744862058765776e-06, + "loss": 0.2163, + "reward": 2.727869689464569, + "reward_std": 0.36412858217954636, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.036019228398799896, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.10937881469727, + "epoch": 0.709411323584552, + "grad_norm": 0.9517703362898857, + "kl": 0.3935546875, + "learning_rate": 4.700313168619608e-06, + "loss": 0.8129, + "reward": 2.543964922428131, + "reward_std": 0.6564487293362617, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.1053406372666359, + "rewards/tag_count_reward": 0.9322916716337204, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.50521087646484, + "epoch": 0.7109111361079865, + "grad_norm": 1.006357822901466, + "kl": 0.4013671875, + "learning_rate": 4.655910057589377e-06, + "loss": 0.9248, + "reward": 2.5268329977989197, + "reward_std": 0.9078202545642853, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.9374999850988388, + "rewards/repetition_penalty_reward": -0.10207330994307995, + "rewards/tag_count_reward": 0.9257812649011612, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.26562881469727, + "epoch": 0.7124109486314211, + "grad_norm": 1.4034228060868457, + "kl": 0.416015625, + "learning_rate": 4.611653947076732e-06, + "loss": 0.7093, + "reward": 2.54375296831131, + "reward_std": 0.6568827331066132, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9687500447034836, + "rewards/repetition_penalty_reward": -0.09166385605931282, + "rewards/tag_count_reward": 0.942708358168602, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.40625762939453, + "epoch": 0.7139107611548556, + "grad_norm": 2.3545088822080387, + "kl": 0.49462890625, + "learning_rate": 4.567546054439777e-06, + "loss": 0.7007, + "reward": 2.5734556913375854, + "reward_std": 0.699681967496872, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9670138657093048, + "rewards/repetition_penalty_reward": -0.08626659773290157, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.00000762939453, + "epoch": 0.7154105736782902, + "grad_norm": 1.5468969267610364, + "kl": 0.47802734375, + "learning_rate": 4.523587592959557e-06, + "loss": 0.7472, + "reward": 2.545016586780548, + "reward_std": 0.7152352035045624, + "rewards/accuracy_reward": 0.7135416865348816, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.09126834943890572, + "rewards/tag_count_reward": 0.950520858168602, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.4739646911621, + "epoch": 0.7169103862017248, + "grad_norm": 2.851573423423965, + "kl": 0.419921875, + "learning_rate": 4.479779771806699e-06, + "loss": 0.4624, + "reward": 2.572330951690674, + "reward_std": 0.5128215774893761, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.06265182606875896, + "rewards/tag_count_reward": 0.970052108168602, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completion_length": 204.9322967529297, + "epoch": 0.7184101987251593, + "grad_norm": 0.9372143757391351, + "kl": 0.325927734375, + "learning_rate": 4.436123796008149e-06, + "loss": 0.555, + "reward": 2.526001214981079, + "reward_std": 0.5274576209485531, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9809028059244156, + "rewards/repetition_penalty_reward": -0.07208916172385216, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.66146087646484, + "epoch": 0.7199100112485939, + "grad_norm": 1.4178034846859082, + "kl": 0.3603515625, + "learning_rate": 4.392620866414026e-06, + "loss": 0.385, + "reward": 2.7349607348442078, + "reward_std": 0.3730153478682041, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.04194910801015794, + "rewards/tag_count_reward": 0.9817708432674408, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.9270896911621, + "epoch": 0.7214098237720284, + "grad_norm": 0.7783089792228611, + "kl": 0.318359375, + "learning_rate": 4.349272179664586e-06, + "loss": 0.4811, + "reward": 2.5647249817848206, + "reward_std": 0.525773711502552, + "rewards/accuracy_reward": 0.6822916716337204, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.06808762066066265, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.82812881469727, + "epoch": 0.7229096362954631, + "grad_norm": 0.9640643886143824, + "kl": 0.37255859375, + "learning_rate": 4.3060789281573135e-06, + "loss": 0.5915, + "reward": 2.5671426653862, + "reward_std": 0.6002469211816788, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07955875806510448, + "rewards/tag_count_reward": 0.950520858168602, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.14583587646484, + "epoch": 0.7244094488188977, + "grad_norm": 2.303908104951783, + "kl": 0.44384765625, + "learning_rate": 4.263042300014112e-06, + "loss": 0.7262, + "reward": 2.679473400115967, + "reward_std": 0.6004434674978256, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07530095893889666, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.8177146911621, + "epoch": 0.7259092613423322, + "grad_norm": 1.4472721026550845, + "kl": 0.38330078125, + "learning_rate": 4.220163479048632e-06, + "loss": 0.5802, + "reward": 2.5944811701774597, + "reward_std": 0.6224361211061478, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.08564047096297145, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.98437881469727, + "epoch": 0.7274090738657668, + "grad_norm": 1.352373192683157, + "kl": 0.4716796875, + "learning_rate": 4.177443644733699e-06, + "loss": 0.8147, + "reward": 2.6568827629089355, + "reward_std": 0.7592978328466415, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.09875956550240517, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.1354217529297, + "epoch": 0.7289088863892014, + "grad_norm": 1.4809254865523336, + "kl": 0.44677734375, + "learning_rate": 4.134883972168877e-06, + "loss": 0.7938, + "reward": 2.593150496482849, + "reward_std": 0.7460722476243973, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.10954048298299313, + "rewards/tag_count_reward": 0.9283854365348816, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.30729293823242, + "epoch": 0.7304086989126359, + "grad_norm": 0.9850557511849055, + "kl": 0.349609375, + "learning_rate": 4.092485632048142e-06, + "loss": 0.6449, + "reward": 2.609771966934204, + "reward_std": 0.45559458062052727, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.06774546951055527, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.54688262939453, + "epoch": 0.7319085114360705, + "grad_norm": 0.7287131874968185, + "kl": 0.322265625, + "learning_rate": 4.050249790627675e-06, + "loss": 0.582, + "reward": 2.571848511695862, + "reward_std": 0.6445990055799484, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9618055671453476, + "rewards/repetition_penalty_reward": -0.0800613546743989, + "rewards/tag_count_reward": 0.950520858168602, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.48437881469727, + "epoch": 0.7334083239595051, + "grad_norm": 0.8704961241862007, + "kl": 0.37255859375, + "learning_rate": 4.008177609693791e-06, + "loss": 0.6146, + "reward": 2.626339375972748, + "reward_std": 0.568084180355072, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.07201146148145199, + "rewards/tag_count_reward": 0.9570312798023224, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.3593864440918, + "epoch": 0.7349081364829396, + "grad_norm": 0.9183454896836771, + "kl": 0.42041015625, + "learning_rate": 3.966270246530975e-06, + "loss": 0.5819, + "reward": 2.5615740418434143, + "reward_std": 0.6961846798658371, + "rewards/accuracy_reward": 0.7447916716337204, + "rewards/reasoning_steps_reward": 0.9496527910232544, + "rewards/repetition_penalty_reward": -0.08208927698433399, + "rewards/tag_count_reward": 0.9492187798023224, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.77084350585938, + "epoch": 0.7364079490063742, + "grad_norm": 0.7687763462508892, + "kl": 0.3623046875, + "learning_rate": 3.924528853890046e-06, + "loss": 0.5661, + "reward": 2.5703277587890625, + "reward_std": 0.6913676261901855, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9600694477558136, + "rewards/repetition_penalty_reward": -0.08375214599072933, + "rewards/tag_count_reward": 0.9544270932674408, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.3229217529297, + "epoch": 0.7379077615298087, + "grad_norm": 0.751985130971036, + "kl": 0.44140625, + "learning_rate": 3.882954579956455e-06, + "loss": 0.3035, + "reward": 2.8052607774734497, + "reward_std": 0.39791389554739, + "rewards/accuracy_reward": 0.8906250298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04239548835903406, + "rewards/tag_count_reward": 0.9778645932674408, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.6979217529297, + "epoch": 0.7394075740532433, + "grad_norm": 0.7659539513605523, + "kl": 0.29736328125, + "learning_rate": 3.841548568318706e-06, + "loss": 0.4617, + "reward": 2.7041468620300293, + "reward_std": 0.5456740781664848, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.060610027983784676, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.7395896911621, + "epoch": 0.7409073865766779, + "grad_norm": 0.8837591427962717, + "kl": 0.31787109375, + "learning_rate": 3.8003119579368806e-06, + "loss": 0.838, + "reward": 2.5719869136810303, + "reward_std": 0.6782498955726624, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07471455447375774, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.1666717529297, + "epoch": 0.7424071991001124, + "grad_norm": 1.4368157135686637, + "kl": 0.29443359375, + "learning_rate": 3.7592458831113256e-06, + "loss": 0.4529, + "reward": 2.4826937317848206, + "reward_std": 0.47983065992593765, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.05810481309890747, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.92187881469727, + "epoch": 0.7439070116235471, + "grad_norm": 1.119059585365707, + "kl": 0.35302734375, + "learning_rate": 3.718351473451448e-06, + "loss": 0.6506, + "reward": 2.689814329147339, + "reward_std": 0.5917665362358093, + "rewards/accuracy_reward": 0.8385417014360428, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.06799825746566057, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.27083587646484, + "epoch": 0.7454068241469817, + "grad_norm": 10.12905143841689, + "kl": 0.67431640625, + "learning_rate": 3.6776298538446307e-06, + "loss": 0.7213, + "reward": 2.456771969795227, + "reward_std": 0.6756933778524399, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.07751636672765017, + "rewards/tag_count_reward": 0.9544270932674408, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.1510467529297, + "epoch": 0.7469066366704162, + "grad_norm": 0.9738907873033192, + "kl": 0.33203125, + "learning_rate": 3.6370821444253112e-06, + "loss": 0.3742, + "reward": 2.6159667372703552, + "reward_std": 0.45941271260380745, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.05938061675988138, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.4427146911621, + "epoch": 0.7484064491938508, + "grad_norm": 2.4109683634611785, + "kl": 0.46923828125, + "learning_rate": 3.5967094605441545e-06, + "loss": 0.5533, + "reward": 2.717509090900421, + "reward_std": 0.5818516314029694, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.986111119389534, + "rewards/repetition_penalty_reward": -0.06026873830705881, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.06250762939453, + "epoch": 0.7499062617172854, + "grad_norm": 0.7913869745911991, + "kl": 0.34765625, + "learning_rate": 3.5565129127373765e-06, + "loss": 0.6764, + "reward": 2.615268588066101, + "reward_std": 0.6076765581965446, + "rewards/accuracy_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.08134600473567843, + "rewards/tag_count_reward": 0.946614608168602, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.53646087646484, + "epoch": 0.7514060742407199, + "grad_norm": 1.5573709839742271, + "kl": 0.47705078125, + "learning_rate": 3.5164936066961984e-06, + "loss": 0.3202, + "reward": 2.780966341495514, + "reward_std": 0.427102904766798, + "rewards/accuracy_reward": 0.895833358168602, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.057141443248838186, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.50521087646484, + "epoch": 0.7529058867641545, + "grad_norm": 0.7528789099140956, + "kl": 0.30615234375, + "learning_rate": 3.476652643236431e-06, + "loss": 0.5839, + "reward": 2.4664222598075867, + "reward_std": 0.6568407695740461, + "rewards/accuracy_reward": 0.6302083507180214, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.07394242100417614, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.70833587646484, + "epoch": 0.754405699287589, + "grad_norm": 1.1949507672459776, + "kl": 0.37646484375, + "learning_rate": 3.436991118268195e-06, + "loss": 0.6479, + "reward": 2.507534086704254, + "reward_std": 0.5748837292194366, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.07536534033715725, + "rewards/tag_count_reward": 0.9544270932674408, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.4375114440918, + "epoch": 0.7559055118110236, + "grad_norm": 0.79103427108142, + "kl": 0.3798828125, + "learning_rate": 3.3975101227657726e-06, + "loss": 0.7612, + "reward": 2.440228283405304, + "reward_std": 0.8982982710003853, + "rewards/accuracy_reward": 0.677083358168602, + "rewards/reasoning_steps_reward": 0.9409722536802292, + "rewards/repetition_penalty_reward": -0.10230656852945685, + "rewards/tag_count_reward": 0.9244791716337204, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.0208396911621, + "epoch": 0.7574053243344582, + "grad_norm": 1.382653144301306, + "kl": 0.349609375, + "learning_rate": 3.3582107427376044e-06, + "loss": 0.724, + "reward": 2.6514702439308167, + "reward_std": 0.7024101763963699, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.08333875052630901, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.0677146911621, + "epoch": 0.7589051368578927, + "grad_norm": 1.0534398553331488, + "kl": 0.42431640625, + "learning_rate": 3.3190940591964094e-06, + "loss": 0.7011, + "reward": 2.410764992237091, + "reward_std": 0.7002580761909485, + "rewards/accuracy_reward": 0.5989583507180214, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.08793305046856403, + "rewards/tag_count_reward": 0.946614608168602, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.32813262939453, + "epoch": 0.7604049493813273, + "grad_norm": 2.499315939505216, + "kl": 0.5712890625, + "learning_rate": 3.2801611481294538e-06, + "loss": 0.6833, + "reward": 2.68420547246933, + "reward_std": 0.7678481340408325, + "rewards/accuracy_reward": 0.895833358168602, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.10268685221672058, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.5833396911621, + "epoch": 0.7619047619047619, + "grad_norm": 1.6168052609374044, + "kl": 0.4296875, + "learning_rate": 3.2414130804689492e-06, + "loss": 0.7108, + "reward": 2.642342746257782, + "reward_std": 0.765022836625576, + "rewards/accuracy_reward": 0.8229167014360428, + "rewards/reasoning_steps_reward": 0.9548611342906952, + "rewards/repetition_penalty_reward": -0.08725807629525661, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.42188262939453, + "epoch": 0.7634045744281964, + "grad_norm": 1.6205129541399188, + "kl": 0.41552734375, + "learning_rate": 3.202850922062607e-06, + "loss": 0.7307, + "reward": 2.597358763217926, + "reward_std": 0.8237985223531723, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.949652835726738, + "rewards/repetition_penalty_reward": -0.08797122351825237, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.37501525878906, + "epoch": 0.7649043869516311, + "grad_norm": 1.0752711179781738, + "kl": 0.4208984375, + "learning_rate": 3.1644757336443023e-06, + "loss": 0.748, + "reward": 2.258354067802429, + "reward_std": 0.7389141619205475, + "rewards/accuracy_reward": 0.5260416865348816, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.14312171563506126, + "rewards/tag_count_reward": 0.915364608168602, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.68230056762695, + "epoch": 0.7664041994750657, + "grad_norm": 1.0295291505864663, + "kl": 0.3603515625, + "learning_rate": 3.126288570804906e-06, + "loss": 0.813, + "reward": 2.466791570186615, + "reward_std": 0.8091250509023666, + "rewards/accuracy_reward": 0.677083358168602, + "rewards/reasoning_steps_reward": 0.9479167014360428, + "rewards/repetition_penalty_reward": -0.09440644644200802, + "rewards/tag_count_reward": 0.9361979365348816, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.09375762939453, + "epoch": 0.7679040119985002, + "grad_norm": 1.1762814130428787, + "kl": 0.37109375, + "learning_rate": 3.0882904839632476e-06, + "loss": 0.7208, + "reward": 2.3307202458381653, + "reward_std": 0.7969870269298553, + "rewards/accuracy_reward": 0.5781250298023224, + "rewards/reasoning_steps_reward": 0.9496527761220932, + "rewards/repetition_penalty_reward": -0.12023470550775528, + "rewards/tag_count_reward": 0.9231771230697632, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.29688262939453, + "epoch": 0.7694038245219348, + "grad_norm": 6.576307135246261, + "kl": 0.46337890625, + "learning_rate": 3.050482518337221e-06, + "loss": 0.5806, + "reward": 2.5199908018112183, + "reward_std": 0.5410640314221382, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9670138955116272, + "rewards/repetition_penalty_reward": -0.08113771304488182, + "rewards/tag_count_reward": 0.9414062798023224, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.30208587646484, + "epoch": 0.7709036370453693, + "grad_norm": 0.8493543704484349, + "kl": 0.4072265625, + "learning_rate": 3.012865713915033e-06, + "loss": 0.8002, + "reward": 2.496680438518524, + "reward_std": 0.8196369558572769, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.11226061917841434, + "rewards/tag_count_reward": 0.923177108168602, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.02605056762695, + "epoch": 0.7724034495688039, + "grad_norm": 0.782983798376921, + "kl": 0.50830078125, + "learning_rate": 2.9754411054265966e-06, + "loss": 0.7341, + "reward": 2.4324201941490173, + "reward_std": 0.795376256108284, + "rewards/accuracy_reward": 0.6614583507180214, + "rewards/reasoning_steps_reward": 0.9340277910232544, + "rewards/repetition_penalty_reward": -0.09665969014167786, + "rewards/tag_count_reward": 0.9335937798023224, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.140625, + "epoch": 0.7739032620922385, + "grad_norm": 1.2353929587709405, + "kl": 0.3515625, + "learning_rate": 2.9382097223150675e-06, + "loss": 0.6533, + "reward": 2.5715506076812744, + "reward_std": 0.6366243287920952, + "rewards/accuracy_reward": 0.7239583432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.07341482397168875, + "rewards/tag_count_reward": 0.9557291716337204, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.67188262939453, + "epoch": 0.775403074615673, + "grad_norm": 0.8839020653845002, + "kl": 0.35693359375, + "learning_rate": 2.9011725887085286e-06, + "loss": 0.768, + "reward": 2.4018173813819885, + "reward_std": 0.8271952420473099, + "rewards/accuracy_reward": 0.6406250223517418, + "rewards/reasoning_steps_reward": 0.9513889104127884, + "rewards/repetition_penalty_reward": -0.1159778069704771, + "rewards/tag_count_reward": 0.92578125, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.9947967529297, + "epoch": 0.7769028871391076, + "grad_norm": 0.7917880481551695, + "kl": 0.3603515625, + "learning_rate": 2.8643307233918192e-06, + "loss": 0.3036, + "reward": 2.690393328666687, + "reward_std": 0.3900110796093941, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.982638880610466, + "rewards/repetition_penalty_reward": -0.05136016756296158, + "rewards/tag_count_reward": 0.9726562649011612, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.57812881469727, + "epoch": 0.7784026996625422, + "grad_norm": 0.8157613306440419, + "kl": 0.3388671875, + "learning_rate": 2.827685139778511e-06, + "loss": 0.6223, + "reward": 2.477262020111084, + "reward_std": 0.7161538153886795, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.9618055671453476, + "rewards/repetition_penalty_reward": -0.09782484546303749, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.87500381469727, + "epoch": 0.7799025121859767, + "grad_norm": 0.9727421187621869, + "kl": 0.35498046875, + "learning_rate": 2.7912368458830295e-06, + "loss": 0.6298, + "reward": 2.616613507270813, + "reward_std": 0.6122787222266197, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.08390727639198303, + "rewards/tag_count_reward": 0.950520858168602, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.30208587646484, + "epoch": 0.7814023247094113, + "grad_norm": 0.8429352541214471, + "kl": 0.327392578125, + "learning_rate": 2.7549868442929286e-06, + "loss": 0.4077, + "reward": 2.774833917617798, + "reward_std": 0.5252001956105232, + "rewards/accuracy_reward": 0.911458358168602, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.06848222017288208, + "rewards/tag_count_reward": 0.9596354514360428, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.09375381469727, + "epoch": 0.7829021372328459, + "grad_norm": 1.0326110093541125, + "kl": 0.31884765625, + "learning_rate": 2.7189361321413144e-06, + "loss": 0.421, + "reward": 2.417696237564087, + "reward_std": 0.3683694452047348, + "rewards/accuracy_reward": 0.5260416865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06190443877130747, + "rewards/tag_count_reward": 0.9674479216337204, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.7604217529297, + "epoch": 0.7844019497562804, + "grad_norm": 2.2355283965994865, + "kl": 0.39013671875, + "learning_rate": 2.683085701079412e-06, + "loss": 0.5656, + "reward": 2.6360539197921753, + "reward_std": 0.5976300239562988, + "rewards/accuracy_reward": 0.7864583507180214, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.06924132350832224, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completion_length": 202.9322967529297, + "epoch": 0.7859017622797151, + "grad_norm": 0.9756200081911167, + "kl": 0.3701171875, + "learning_rate": 2.647436537249294e-06, + "loss": 0.545, + "reward": 2.322978913784027, + "reward_std": 0.5455097928643227, + "rewards/accuracy_reward": 0.4531250223517418, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.0633058724924922, + "rewards/tag_count_reward": 0.9557292014360428, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.1145896911621, + "epoch": 0.7874015748031497, + "grad_norm": 0.7876826353251758, + "kl": 0.40576171875, + "learning_rate": 2.611989621256745e-06, + "loss": 0.5895, + "reward": 2.726710319519043, + "reward_std": 0.5035083070397377, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.06669262330979109, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.5364646911621, + "epoch": 0.7889013873265842, + "grad_norm": 1.2271941862677709, + "kl": 0.29833984375, + "learning_rate": 2.5767459281443064e-06, + "loss": 0.6082, + "reward": 2.505469024181366, + "reward_std": 0.6449078023433685, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07352408953011036, + "rewards/tag_count_reward": 0.9557291716337204, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.5677146911621, + "epoch": 0.7904011998500188, + "grad_norm": 0.9183905328986752, + "kl": 0.4296875, + "learning_rate": 2.541706427364431e-06, + "loss": 0.8342, + "reward": 2.675832152366638, + "reward_std": 0.8275108933448792, + "rewards/accuracy_reward": 0.864583358168602, + "rewards/reasoning_steps_reward": 0.954861119389534, + "rewards/repetition_penalty_reward": -0.09283116087317467, + "rewards/tag_count_reward": 0.9492187649011612, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.96875762939453, + "epoch": 0.7919010123734533, + "grad_norm": 0.7482995574885967, + "kl": 0.32275390625, + "learning_rate": 2.506872082752834e-06, + "loss": 0.6547, + "reward": 2.569875657558441, + "reward_std": 0.5808727741241455, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0724856061860919, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.06250381469727, + "epoch": 0.7934008248968879, + "grad_norm": 1.2950704438743703, + "kl": 0.35400390625, + "learning_rate": 2.4722438525019764e-06, + "loss": 0.3873, + "reward": 2.7978073358535767, + "reward_std": 0.486385278403759, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9809027910232544, + "rewards/repetition_penalty_reward": -0.052887264639139175, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.96875381469727, + "epoch": 0.7949006374203225, + "grad_norm": 0.9168395385162036, + "kl": 0.33203125, + "learning_rate": 2.4378226891347056e-06, + "loss": 0.5429, + "reward": 2.631447494029999, + "reward_std": 0.530893087387085, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.07167758233845234, + "rewards/tag_count_reward": 0.9687500149011612, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.88021087646484, + "epoch": 0.796400449943757, + "grad_norm": 2.5084733671661934, + "kl": 0.4423828125, + "learning_rate": 2.403609539478056e-06, + "loss": 0.5644, + "reward": 2.736763119697571, + "reward_std": 0.614450603723526, + "rewards/accuracy_reward": 0.890625, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.08224734663963318, + "rewards/tag_count_reward": 0.9596354514360428, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.6822967529297, + "epoch": 0.7979002624671916, + "grad_norm": 1.2103141736384226, + "kl": 0.32958984375, + "learning_rate": 2.3696053446372026e-06, + "loss": 0.2212, + "reward": 2.7792577147483826, + "reward_std": 0.2872903672978282, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.047565262764692307, + "rewards/tag_count_reward": 0.9778645932674408, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.01562881469727, + "epoch": 0.7994000749906262, + "grad_norm": 0.9409501748390533, + "kl": 0.33740234375, + "learning_rate": 2.3358110399695788e-06, + "loss": 0.5166, + "reward": 2.526670515537262, + "reward_std": 0.5588072910904884, + "rewards/accuracy_reward": 0.6510416865348816, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.057530895806849, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.4322967529297, + "epoch": 0.8008998875140607, + "grad_norm": 0.9487060971483162, + "kl": 0.3642578125, + "learning_rate": 2.302227555059141e-06, + "loss": 0.5154, + "reward": 2.5382038950920105, + "reward_std": 0.591885045170784, + "rewards/accuracy_reward": 0.6614583432674408, + "rewards/reasoning_steps_reward": 0.970486119389534, + "rewards/repetition_penalty_reward": -0.06769899744540453, + "rewards/tag_count_reward": 0.973958358168602, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.07292556762695, + "epoch": 0.8023997000374953, + "grad_norm": 0.929609225864085, + "kl": 0.396484375, + "learning_rate": 2.2688558136908025e-06, + "loss": 0.5148, + "reward": 2.7448110580444336, + "reward_std": 0.46144695580005646, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.05336613114923239, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.8802146911621, + "epoch": 0.8038995125609298, + "grad_norm": 1.584357940413058, + "kl": 0.30126953125, + "learning_rate": 2.2356967338250223e-06, + "loss": 0.431, + "reward": 2.501081109046936, + "reward_std": 0.4514058753848076, + "rewards/accuracy_reward": 0.6250000074505806, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.07140162866562605, + "rewards/tag_count_reward": 0.9648437798023224, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.20834350585938, + "epoch": 0.8053993250843644, + "grad_norm": 64.06543251128925, + "kl": 1.220703125, + "learning_rate": 2.202751227572556e-06, + "loss": 0.7764, + "reward": 2.6655821800231934, + "reward_std": 0.7584892809391022, + "rewards/accuracy_reward": 0.848958358168602, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.08441784046590328, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.40625762939453, + "epoch": 0.8068991376077991, + "grad_norm": 0.7719266837120854, + "kl": 0.37646484375, + "learning_rate": 2.1700202011693573e-06, + "loss": 0.5498, + "reward": 2.6957992911338806, + "reward_std": 0.6702239066362381, + "rewards/accuracy_reward": 0.8854166865348816, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.0819785725325346, + "rewards/tag_count_reward": 0.942708358168602, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.98958587646484, + "epoch": 0.8083989501312336, + "grad_norm": 2.038897171924456, + "kl": 0.32861328125, + "learning_rate": 2.1375045549516636e-06, + "loss": 0.5421, + "reward": 2.608824074268341, + "reward_std": 0.5336829051375389, + "rewards/accuracy_reward": 0.7239583730697632, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.07867603516206145, + "rewards/tag_count_reward": 0.973958358168602, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.40105056762695, + "epoch": 0.8098987626546682, + "grad_norm": 1.2689341920662447, + "kl": 0.39599609375, + "learning_rate": 2.105205183331224e-06, + "loss": 0.6115, + "reward": 2.6301316022872925, + "reward_std": 0.6800608858466148, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.9461806118488312, + "rewards/repetition_penalty_reward": -0.08427836652845144, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.3697967529297, + "epoch": 0.8113985751781028, + "grad_norm": 0.6531166991811557, + "kl": 0.2939453125, + "learning_rate": 2.0731229747706926e-06, + "loss": 0.5031, + "reward": 2.5187647342681885, + "reward_std": 0.5751441568136215, + "rewards/accuracy_reward": 0.6562500149011612, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.06630483735352755, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.78125381469727, + "epoch": 0.8128983877015373, + "grad_norm": 0.6612265161899963, + "kl": 0.31640625, + "learning_rate": 2.041258811759195e-06, + "loss": 0.5724, + "reward": 2.5856213569641113, + "reward_std": 0.6729920580983162, + "rewards/accuracy_reward": 0.7395833507180214, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.08842381555587053, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.7239646911621, + "epoch": 0.8143982002249719, + "grad_norm": 1.1508949391019638, + "kl": 0.290283203125, + "learning_rate": 2.009613570788057e-06, + "loss": 0.3922, + "reward": 2.7094662189483643, + "reward_std": 0.4391992464661598, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.042704006657004356, + "rewards/tag_count_reward": 0.9778645932674408, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.62500762939453, + "epoch": 0.8158980127484065, + "grad_norm": 0.7726809000433558, + "kl": 0.29736328125, + "learning_rate": 1.978188122326683e-06, + "loss": 0.4558, + "reward": 2.690703511238098, + "reward_std": 0.49602875113487244, + "rewards/accuracy_reward": 0.8125000149011612, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.06537291780114174, + "rewards/tag_count_reward": 0.9713541716337204, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.43750762939453, + "epoch": 0.817397825271841, + "grad_norm": 1.4290665690326807, + "kl": 0.3125, + "learning_rate": 1.946983330798621e-06, + "loss": 0.5241, + "reward": 2.562969446182251, + "reward_std": 0.5197947286069393, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.06203062180429697, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.3645896911621, + "epoch": 0.8188976377952756, + "grad_norm": 1.0265157040309887, + "kl": 0.2939453125, + "learning_rate": 1.916000054557783e-06, + "loss": 0.6364, + "reward": 2.629048526287079, + "reward_std": 0.6721891462802887, + "rewards/accuracy_reward": 0.770833358168602, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.0693023600615561, + "rewards/tag_count_reward": 0.9570312798023224, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.62500762939453, + "epoch": 0.8203974503187101, + "grad_norm": 1.2230109106160663, + "kl": 0.31201171875, + "learning_rate": 1.8852391458648323e-06, + "loss": 0.4479, + "reward": 2.6805137395858765, + "reward_std": 0.4807371646165848, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9843750149011612, + "rewards/repetition_penalty_reward": -0.06297590211033821, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.45312881469727, + "epoch": 0.8218972628421447, + "grad_norm": 0.8191568879278863, + "kl": 0.3046875, + "learning_rate": 1.854701450863744e-06, + "loss": 0.4753, + "reward": 2.549809992313385, + "reward_std": 0.5221479944884777, + "rewards/accuracy_reward": 0.6562500298023224, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.053922670893371105, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.0989646911621, + "epoch": 0.8233970753655793, + "grad_norm": 0.9675577366532915, + "kl": 0.36083984375, + "learning_rate": 1.8243878095585244e-06, + "loss": 0.6371, + "reward": 2.734562575817108, + "reward_std": 0.630949005484581, + "rewards/accuracy_reward": 0.8906250298023224, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.08314584195613861, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.06250762939453, + "epoch": 0.8248968878890138, + "grad_norm": 0.9898045914966018, + "kl": 0.30859375, + "learning_rate": 1.7942990557901119e-06, + "loss": 0.3167, + "reward": 2.5159996151924133, + "reward_std": 0.4262428246438503, + "rewards/accuracy_reward": 0.5937500149011612, + "rewards/reasoning_steps_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.04780261078849435, + "rewards/tag_count_reward": 0.9804687649011612, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.83855056762695, + "epoch": 0.8263967004124484, + "grad_norm": 1.3960120493498462, + "kl": 0.44775390625, + "learning_rate": 1.7644360172134323e-06, + "loss": 0.7773, + "reward": 2.514442801475525, + "reward_std": 0.7914524525403976, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.07974134013056755, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.77605056762695, + "epoch": 0.8278965129358831, + "grad_norm": 0.7179910870567698, + "kl": 0.3623046875, + "learning_rate": 1.734799515274641e-06, + "loss": 0.507, + "reward": 2.6635890007019043, + "reward_std": 0.5695291832089424, + "rewards/accuracy_reward": 0.8072916716337204, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.07078609801828861, + "rewards/tag_count_reward": 0.9635416716337204, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.89583587646484, + "epoch": 0.8293963254593176, + "grad_norm": 1.660541115002357, + "kl": 0.3603515625, + "learning_rate": 1.7053903651885217e-06, + "loss": 0.3674, + "reward": 2.609849989414215, + "reward_std": 0.3290810212492943, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9791667014360428, + "rewards/repetition_penalty_reward": -0.0516084156697616, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.2968864440918, + "epoch": 0.8308961379827522, + "grad_norm": 1.1421644219209315, + "kl": 0.3388671875, + "learning_rate": 1.6762093759160614e-06, + "loss": 0.6045, + "reward": 2.6770655512809753, + "reward_std": 0.5518500655889511, + "rewards/accuracy_reward": 0.8177083432674408, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.08161513973027468, + "rewards/tag_count_reward": 0.958333358168602, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.15625762939453, + "epoch": 0.8323959505061868, + "grad_norm": 2.2156251851166755, + "kl": 0.462890625, + "learning_rate": 1.647257350142204e-06, + "loss": 0.8727, + "reward": 2.427699863910675, + "reward_std": 0.8616139888763428, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.954861119389534, + "rewards/repetition_penalty_reward": -0.12221339344978333, + "rewards/tag_count_reward": 0.9283854514360428, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.6354217529297, + "epoch": 0.8338957630296213, + "grad_norm": 4.90298950509675, + "kl": 0.46337890625, + "learning_rate": 1.618535084253765e-06, + "loss": 0.6818, + "reward": 2.632472336292267, + "reward_std": 0.5403143912553787, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.06457635015249252, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.3125114440918, + "epoch": 0.8353955755530559, + "grad_norm": 2.3574073221661074, + "kl": 0.6494140625, + "learning_rate": 1.5900433683175277e-06, + "loss": 0.7594, + "reward": 2.3048887848854065, + "reward_std": 0.8010806366801262, + "rewards/accuracy_reward": 0.572916679084301, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.1365174800157547, + "rewards/tag_count_reward": 0.9153645932674408, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.7135543823242, + "epoch": 0.8368953880764904, + "grad_norm": 1.9632445237865204, + "kl": 0.466796875, + "learning_rate": 1.5617829860585087e-06, + "loss": 0.6438, + "reward": 2.505094826221466, + "reward_std": 0.7284489870071411, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.10992250964045525, + "rewards/tag_count_reward": 0.930989608168602, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.82292556762695, + "epoch": 0.838395200599925, + "grad_norm": 1.3529276790590763, + "kl": 0.50244140625, + "learning_rate": 1.533754714838408e-06, + "loss": 0.7146, + "reward": 2.5260643362998962, + "reward_std": 0.6827712506055832, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.953125, + "rewards/repetition_penalty_reward": -0.07810238003730774, + "rewards/tag_count_reward": 0.942708358168602, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.43750762939453, + "epoch": 0.8398950131233596, + "grad_norm": 1.3595681175219, + "kl": 0.5419921875, + "learning_rate": 1.5059593256342142e-06, + "loss": 0.5809, + "reward": 2.471269369125366, + "reward_std": 0.7434787154197693, + "rewards/accuracy_reward": 0.7239583507180214, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.14852236676961184, + "rewards/tag_count_reward": 0.9218750149011612, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.3073043823242, + "epoch": 0.8413948256467941, + "grad_norm": 3.39049441324729, + "kl": 0.57568359375, + "learning_rate": 1.4783975830170028e-06, + "loss": 0.7564, + "reward": 2.414111316204071, + "reward_std": 0.9404040277004242, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9392361491918564, + "rewards/repetition_penalty_reward": -0.1253854325041175, + "rewards/tag_count_reward": 0.9127604365348816, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3125114440918, + "epoch": 0.8428946381702287, + "grad_norm": 2.8979024715584325, + "kl": 0.689453125, + "learning_rate": 1.4510702451309055e-06, + "loss": 0.8145, + "reward": 2.4051551818847656, + "reward_std": 0.7071145176887512, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.12175463326275349, + "rewards/tag_count_reward": 0.9296875149011612, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.7708435058594, + "epoch": 0.8443944506936633, + "grad_norm": 1.408501605320147, + "kl": 0.505859375, + "learning_rate": 1.4239780636722555e-06, + "loss": 0.7652, + "reward": 2.3463268280029297, + "reward_std": 0.7379028648138046, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.11374275013804436, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.82813262939453, + "epoch": 0.8458942632170978, + "grad_norm": 2.6676821866666742, + "kl": 0.42431640625, + "learning_rate": 1.39712178386891e-06, + "loss": 0.8024, + "reward": 2.43689888715744, + "reward_std": 0.8974379524588585, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.928819477558136, + "rewards/repetition_penalty_reward": -0.13905610889196396, + "rewards/tag_count_reward": 0.907552108168602, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.0989685058594, + "epoch": 0.8473940757405324, + "grad_norm": 0.8627370825958761, + "kl": 0.43115234375, + "learning_rate": 1.3705021444597521e-06, + "loss": 0.7752, + "reward": 2.607667088508606, + "reward_std": 0.782805323600769, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.11672540940344334, + "rewards/tag_count_reward": 0.9309895932674408, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.8802185058594, + "epoch": 0.8488938882639671, + "grad_norm": 0.9159232949782135, + "kl": 0.4345703125, + "learning_rate": 1.344119877674368e-06, + "loss": 0.9667, + "reward": 2.3450130224227905, + "reward_std": 0.9461807906627655, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9375000149011612, + "rewards/repetition_penalty_reward": -0.1341537069529295, + "rewards/tag_count_reward": 0.9166667014360428, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.5364685058594, + "epoch": 0.8503937007874016, + "grad_norm": 0.9183785293807163, + "kl": 0.412109375, + "learning_rate": 1.3179757092129087e-06, + "loss": 0.7443, + "reward": 2.350478619337082, + "reward_std": 0.8575991243124008, + "rewards/accuracy_reward": 0.6250000223517418, + "rewards/reasoning_steps_reward": 0.9322917014360428, + "rewards/repetition_penalty_reward": -0.13129226304590702, + "rewards/tag_count_reward": 0.9244791865348816, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.76563262939453, + "epoch": 0.8518935133108362, + "grad_norm": 0.7672423282948194, + "kl": 0.32177734375, + "learning_rate": 1.292070358226124e-06, + "loss": 0.6237, + "reward": 2.4249655306339264, + "reward_std": 0.6657977253198624, + "rewards/accuracy_reward": 0.6302083507180214, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.11583315394818783, + "rewards/tag_count_reward": 0.9453125149011612, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.81250762939453, + "epoch": 0.8533933258342707, + "grad_norm": 1.510802559149807, + "kl": 0.3408203125, + "learning_rate": 1.2664045372955858e-06, + "loss": 0.8092, + "reward": 2.3671931624412537, + "reward_std": 0.747007891535759, + "rewards/accuracy_reward": 0.5677083507180214, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.10155703499913216, + "rewards/tag_count_reward": 0.942708358168602, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.81250762939453, + "epoch": 0.8548931383577053, + "grad_norm": 1.2507925070262236, + "kl": 0.45947265625, + "learning_rate": 1.2409789524140813e-06, + "loss": 0.7475, + "reward": 2.5338348150253296, + "reward_std": 0.7423350065946579, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.10375208128243685, + "rewards/tag_count_reward": 0.9414062798023224, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.5000114440918, + "epoch": 0.8563929508811399, + "grad_norm": 0.898004858587946, + "kl": 0.33984375, + "learning_rate": 1.2157943029661977e-06, + "loss": 0.7782, + "reward": 2.514720618724823, + "reward_std": 0.7610578685998917, + "rewards/accuracy_reward": 0.7343750298023224, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.1081093717366457, + "rewards/tag_count_reward": 0.9335937649011612, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.18750762939453, + "epoch": 0.8578927634045744, + "grad_norm": 1.539413571101912, + "kl": 0.33935546875, + "learning_rate": 1.1908512817090833e-06, + "loss": 0.8174, + "reward": 2.4172632098197937, + "reward_std": 0.6794392615556717, + "rewards/accuracy_reward": 0.5520833432674408, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.06580978166311979, + "rewards/tag_count_reward": 0.962239608168602, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.3072967529297, + "epoch": 0.859392575928009, + "grad_norm": 0.8770707592548431, + "kl": 0.335205078125, + "learning_rate": 1.1661505747533897e-06, + "loss": 0.5468, + "reward": 2.615067780017853, + "reward_std": 0.6282743141055107, + "rewards/accuracy_reward": 0.78125, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.09066134784370661, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.15625762939453, + "epoch": 0.8608923884514436, + "grad_norm": 0.952206754417322, + "kl": 0.29248046875, + "learning_rate": 1.1416928615444013e-06, + "loss": 0.4348, + "reward": 2.639313220977783, + "reward_std": 0.514240987598896, + "rewards/accuracy_reward": 0.7447916865348816, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.054697235114872456, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.3072967529297, + "epoch": 0.8623922009748781, + "grad_norm": 0.9603023631517865, + "kl": 0.35107421875, + "learning_rate": 1.1174788148433423e-06, + "loss": 0.4322, + "reward": 2.698141098022461, + "reward_std": 0.5075501780956984, + "rewards/accuracy_reward": 0.8541667014360428, + "rewards/reasoning_steps_reward": 0.9704861342906952, + "rewards/repetition_penalty_reward": -0.08093887567520142, + "rewards/tag_count_reward": 0.9544270932674408, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.0677146911621, + "epoch": 0.8638920134983127, + "grad_norm": 1.2989707455803579, + "kl": 0.3115234375, + "learning_rate": 1.0935091007088761e-06, + "loss": 0.7183, + "reward": 2.5909855365753174, + "reward_std": 0.5980718731880188, + "rewards/accuracy_reward": 0.7447916716337204, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.08566383551806211, + "rewards/tag_count_reward": 0.954427108168602, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.7135467529297, + "epoch": 0.8653918260217472, + "grad_norm": 1.2239167370700748, + "kl": 0.29150390625, + "learning_rate": 1.069784378478781e-06, + "loss": 0.6738, + "reward": 2.5854490995407104, + "reward_std": 0.7286294102668762, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.07123497780412436, + "rewards/tag_count_reward": 0.9570312649011612, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.0989646911621, + "epoch": 0.8668916385451818, + "grad_norm": 2.135809055954815, + "kl": 0.3212890625, + "learning_rate": 1.046305300751811e-06, + "loss": 0.679, + "reward": 2.635705530643463, + "reward_std": 0.49747517332434654, + "rewards/accuracy_reward": 0.7395833507180214, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.05787099711596966, + "rewards/tag_count_reward": 0.9713542014360428, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.46875381469727, + "epoch": 0.8683914510686164, + "grad_norm": 1.1578027094404484, + "kl": 0.3466796875, + "learning_rate": 1.0230725133697495e-06, + "loss": 0.7601, + "reward": 2.575222373008728, + "reward_std": 0.6852890402078629, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9652777761220932, + "rewards/repetition_penalty_reward": -0.0905762929469347, + "rewards/tag_count_reward": 0.950520858168602, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.43750762939453, + "epoch": 0.869891263592051, + "grad_norm": 0.7213394754759388, + "kl": 0.404296875, + "learning_rate": 1.0000866553996436e-06, + "loss": 0.5595, + "reward": 2.5885783433914185, + "reward_std": 0.6372941508889198, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.06767181493341923, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 193.3697967529297, + "epoch": 0.8713910761154856, + "grad_norm": 352.68134893034215, + "kl": 4.25341796875, + "learning_rate": 9.773483591162203e-07, + "loss": 1.1519, + "reward": 2.7451828718185425, + "reward_std": 0.4861754924058914, + "rewards/accuracy_reward": 0.8593750149011612, + "rewards/reasoning_steps_reward": 0.9722222238779068, + "rewards/repetition_penalty_reward": -0.05646645650267601, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.60938262939453, + "epoch": 0.8728908886389202, + "grad_norm": 1.0240611001356141, + "kl": 0.33935546875, + "learning_rate": 9.548582499845015e-07, + "loss": 0.4393, + "reward": 2.83210825920105, + "reward_std": 0.46040425822138786, + "rewards/accuracy_reward": 0.9479167014360428, + "rewards/reasoning_steps_reward": 0.9774305522441864, + "rewards/repetition_penalty_reward": -0.06329114036634564, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.89062881469727, + "epoch": 0.8743907011623547, + "grad_norm": 0.898626007525376, + "kl": 0.32861328125, + "learning_rate": 9.326169466425916e-07, + "loss": 0.7951, + "reward": 2.764460861682892, + "reward_std": 0.6820637285709381, + "rewards/accuracy_reward": 0.9062500149011612, + "rewards/reasoning_steps_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.07408086117357016, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completion_length": 203.23437881469727, + "epoch": 0.8758905136857893, + "grad_norm": 1.0744117383136889, + "kl": 0.33251953125, + "learning_rate": 9.106250608846679e-07, + "loss": 0.4911, + "reward": 2.625205934047699, + "reward_std": 0.5258033722639084, + "rewards/accuracy_reward": 0.739583358168602, + "rewards/reasoning_steps_reward": 0.9774305671453476, + "rewards/repetition_penalty_reward": -0.05795381683856249, + "rewards/tag_count_reward": 0.966145858168602, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.5260467529297, + "epoch": 0.8773903262092239, + "grad_norm": 1.325096473638462, + "kl": 0.3818359375, + "learning_rate": 8.888831976441481e-07, + "loss": 0.5365, + "reward": 2.3645836114883423, + "reward_std": 0.6718242466449738, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9531250149011612, + "rewards/repetition_penalty_reward": -0.10026027075946331, + "rewards/tag_count_reward": 0.9283854365348816, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.04166793823242, + "epoch": 0.8788901387326584, + "grad_norm": 0.7704092563337279, + "kl": 0.35693359375, + "learning_rate": 8.673919549770483e-07, + "loss": 0.6, + "reward": 2.743525743484497, + "reward_std": 0.6834591180086136, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.96180559694767, + "rewards/repetition_penalty_reward": -0.07505077961832285, + "rewards/tag_count_reward": 0.9557292014360428, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.9010467529297, + "epoch": 0.880389951256093, + "grad_norm": 5.725122628830429, + "kl": 0.403564453125, + "learning_rate": 8.461519240455362e-07, + "loss": 0.6512, + "reward": 2.6582990884780884, + "reward_std": 0.676504597067833, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.08388850279152393, + "rewards/tag_count_reward": 0.950520858168602, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.5572967529297, + "epoch": 0.8818897637795275, + "grad_norm": 0.6438408724607655, + "kl": 0.33056640625, + "learning_rate": 8.251636891016702e-07, + "loss": 0.6033, + "reward": 2.701736092567444, + "reward_std": 0.5977243855595589, + "rewards/accuracy_reward": 0.8489583432674408, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.074305709451437, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.6458396911621, + "epoch": 0.8833895763029621, + "grad_norm": 0.6460605343575523, + "kl": 0.360595703125, + "learning_rate": 8.044278274713246e-07, + "loss": 0.6139, + "reward": 2.7414156198501587, + "reward_std": 0.5733753256499767, + "rewards/accuracy_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9739583730697632, + "rewards/repetition_penalty_reward": -0.06717826426029205, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.70313262939453, + "epoch": 0.8848893888263967, + "grad_norm": 0.7977081741142966, + "kl": 0.2939453125, + "learning_rate": 7.839449095383111e-07, + "loss": 0.6594, + "reward": 2.5514962673187256, + "reward_std": 0.6692121252417564, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9635417014360428, + "rewards/repetition_penalty_reward": -0.07741007022559643, + "rewards/tag_count_reward": 0.95703125, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.4739646911621, + "epoch": 0.8863892013498312, + "grad_norm": 0.6568761050630232, + "kl": 0.31201171875, + "learning_rate": 7.637154987286888e-07, + "loss": 0.4408, + "reward": 2.6473821997642517, + "reward_std": 0.49707260727882385, + "rewards/accuracy_reward": 0.7656250149011612, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.0652914484962821, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.76562881469727, + "epoch": 0.8878890138732658, + "grad_norm": 1.190854319926877, + "kl": 0.37451171875, + "learning_rate": 7.437401514952646e-07, + "loss": 0.8083, + "reward": 2.5822657346725464, + "reward_std": 0.691101536154747, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.0974218100309372, + "rewards/tag_count_reward": 0.9505208432674408, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.5312614440918, + "epoch": 0.8893888263967004, + "grad_norm": 0.9514705424643457, + "kl": 0.33349609375, + "learning_rate": 7.240194173022941e-07, + "loss": 0.5884, + "reward": 2.755159914493561, + "reward_std": 0.5876848474144936, + "rewards/accuracy_reward": 0.9218750149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.08945825602859259, + "rewards/tag_count_reward": 0.9557292014360428, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completion_length": 208.0208396911621, + "epoch": 0.890888638920135, + "grad_norm": 3.0869314018149523, + "kl": 0.38623046875, + "learning_rate": 7.045538386103579e-07, + "loss": 0.4621, + "reward": 2.473863959312439, + "reward_std": 0.5747430324554443, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.06650061067193747, + "rewards/tag_count_reward": 0.962239608168602, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.0052146911621, + "epoch": 0.8923884514435696, + "grad_norm": 0.7131069872089879, + "kl": 0.281494140625, + "learning_rate": 6.853439508614412e-07, + "loss": 0.4492, + "reward": 2.5537226796150208, + "reward_std": 0.550928995013237, + "rewards/accuracy_reward": 0.6927083432674408, + "rewards/reasoning_steps_reward": 0.9704861491918564, + "rewards/repetition_penalty_reward": -0.07040940225124359, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.3333396911621, + "epoch": 0.8938882639670042, + "grad_norm": 1.0361329256509342, + "kl": 0.32666015625, + "learning_rate": 6.663902824642132e-07, + "loss": 0.7393, + "reward": 2.6442973017692566, + "reward_std": 0.7471431717276573, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9635416716337204, + "rewards/repetition_penalty_reward": -0.08226529462262988, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.64583587646484, + "epoch": 0.8953880764904387, + "grad_norm": 0.8179894329191135, + "kl": 0.2880859375, + "learning_rate": 6.47693354779484e-07, + "loss": 0.3874, + "reward": 2.6881872415542603, + "reward_std": 0.4334303140640259, + "rewards/accuracy_reward": 0.7864583432674408, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.053132264874875546, + "rewards/tag_count_reward": 0.9791667014360428, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.76562881469727, + "epoch": 0.8968878890138733, + "grad_norm": 1.1693577356428269, + "kl": 0.33154296875, + "learning_rate": 6.29253682105866e-07, + "loss": 0.4538, + "reward": 2.6256433725357056, + "reward_std": 0.4847990833222866, + "rewards/accuracy_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.08789830794557929, + "rewards/tag_count_reward": 0.9531250149011612, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.71354293823242, + "epoch": 0.8983877015373078, + "grad_norm": 53.704269130642075, + "kl": 0.4990234375, + "learning_rate": 6.110717716656289e-07, + "loss": 0.4649, + "reward": 2.6057077050209045, + "reward_std": 0.5076233521103859, + "rewards/accuracy_reward": 0.7187500223517418, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.058354973793029785, + "rewards/tag_count_reward": 0.9713541716337204, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.81250381469727, + "epoch": 0.8998875140607424, + "grad_norm": 1.8065228297910518, + "kl": 0.36083984375, + "learning_rate": 5.931481235907466e-07, + "loss": 0.5947, + "reward": 2.6084994673728943, + "reward_std": 0.6163917481899261, + "rewards/accuracy_reward": 0.7395833432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0698860315605998, + "rewards/tag_count_reward": 0.9596354216337204, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.0989646911621, + "epoch": 0.901387326584177, + "grad_norm": 0.7642591170420632, + "kl": 0.32080078125, + "learning_rate": 5.754832309091362e-07, + "loss": 0.7586, + "reward": 2.654149353504181, + "reward_std": 0.6473118215799332, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.10366316325962543, + "rewards/tag_count_reward": 0.9505208730697632, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.6666717529297, + "epoch": 0.9028871391076115, + "grad_norm": 0.782148037165739, + "kl": 0.28662109375, + "learning_rate": 5.580775795311033e-07, + "loss": 0.541, + "reward": 2.641181170940399, + "reward_std": 0.4843425452709198, + "rewards/accuracy_reward": 0.7656250298023224, + "rewards/reasoning_steps_reward": 0.9687500149011612, + "rewards/repetition_penalty_reward": -0.05543356016278267, + "rewards/tag_count_reward": 0.9622395932674408, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.57812881469727, + "epoch": 0.9043869516310461, + "grad_norm": 0.6409249324968989, + "kl": 0.29345703125, + "learning_rate": 5.409316482359694e-07, + "loss": 0.7317, + "reward": 2.541344404220581, + "reward_std": 0.7429018467664719, + "rewards/accuracy_reward": 0.723958358168602, + "rewards/reasoning_steps_reward": 0.9600694924592972, + "rewards/repetition_penalty_reward": -0.09320436045527458, + "rewards/tag_count_reward": 0.950520858168602, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completion_length": 207.79688262939453, + "epoch": 0.9058867641544807, + "grad_norm": 0.8747881283239337, + "kl": 0.3603515625, + "learning_rate": 5.240459086589056e-07, + "loss": 0.7424, + "reward": 2.6282320618629456, + "reward_std": 0.7034382522106171, + "rewards/accuracy_reward": 0.7968750298023224, + "rewards/reasoning_steps_reward": 0.9548611342906952, + "rewards/repetition_penalty_reward": -0.07662920840084553, + "rewards/tag_count_reward": 0.9531250149011612, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.0885467529297, + "epoch": 0.9073865766779152, + "grad_norm": 1.25088846306247, + "kl": 0.345703125, + "learning_rate": 5.074208252779589e-07, + "loss": 0.6663, + "reward": 2.4092161655426025, + "reward_std": 0.7467872053384781, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.08427347056567669, + "rewards/tag_count_reward": 0.946614608168602, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.01562881469727, + "epoch": 0.9088863892013498, + "grad_norm": 0.9993410900248464, + "kl": 0.33935546875, + "learning_rate": 4.910568554012751e-07, + "loss": 0.6047, + "reward": 2.4471018314361572, + "reward_std": 0.4864268973469734, + "rewards/accuracy_reward": 0.5572917014360428, + "rewards/reasoning_steps_reward": 0.987847238779068, + "rewards/repetition_penalty_reward": -0.06808935943990946, + "rewards/tag_count_reward": 0.970052108168602, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.16146850585938, + "epoch": 0.9103862017247843, + "grad_norm": 1.6025896501489152, + "kl": 0.474609375, + "learning_rate": 4.749544491545199e-07, + "loss": 0.651, + "reward": 2.516863226890564, + "reward_std": 0.6264981552958488, + "rewards/accuracy_reward": 0.6770833507180214, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.08166120201349258, + "rewards/tag_count_reward": 0.954427108168602, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completion_length": 213.46875762939453, + "epoch": 0.911886014248219, + "grad_norm": 0.8390719889566893, + "kl": 0.30908203125, + "learning_rate": 4.591140494684965e-07, + "loss": 0.3055, + "reward": 2.4858875274658203, + "reward_std": 0.4557424336671829, + "rewards/accuracy_reward": 0.5989583432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.060987500473856926, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.48958587646484, + "epoch": 0.9133858267716536, + "grad_norm": 0.603093011230838, + "kl": 0.2841796875, + "learning_rate": 4.435360920669618e-07, + "loss": 0.5458, + "reward": 2.7762425541877747, + "reward_std": 0.5177741958759725, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.9774305522441864, + "rewards/repetition_penalty_reward": -0.06577137997373939, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.37500381469727, + "epoch": 0.9148856392950881, + "grad_norm": 1.0751560656614039, + "kl": 0.35693359375, + "learning_rate": 4.282210054546454e-07, + "loss": 0.6732, + "reward": 2.5695890188217163, + "reward_std": 0.6582420766353607, + "rewards/accuracy_reward": 0.7187500149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.08535904064774513, + "rewards/tag_count_reward": 0.9570312798023224, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 231.01563262939453, + "epoch": 0.9163854518185227, + "grad_norm": 0.7771230124140907, + "kl": 0.32470703125, + "learning_rate": 4.1316921090545305e-07, + "loss": 0.5413, + "reward": 2.4763482809066772, + "reward_std": 0.6009985208511353, + "rewards/accuracy_reward": 0.6354166865348816, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.09136011637747288, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.92708587646484, + "epoch": 0.9178852643419573, + "grad_norm": 1.1584104652860914, + "kl": 0.359375, + "learning_rate": 3.9838112245088934e-07, + "loss": 0.5215, + "reward": 2.5999078154563904, + "reward_std": 0.6818754225969315, + "rewards/accuracy_reward": 0.7500000149011612, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.07283534575253725, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.7604217529297, + "epoch": 0.9193850768653918, + "grad_norm": 0.7260908756916117, + "kl": 0.314453125, + "learning_rate": 3.8385714686866137e-07, + "loss": 0.5458, + "reward": 2.6635963916778564, + "reward_std": 0.5627726316452026, + "rewards/accuracy_reward": 0.802083358168602, + "rewards/reasoning_steps_reward": 0.9756944477558136, + "rewards/repetition_penalty_reward": -0.07381692994385958, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.6979217529297, + "epoch": 0.9208848893888264, + "grad_norm": 0.9565075434583165, + "kl": 0.298828125, + "learning_rate": 3.695976836714932e-07, + "loss": 0.503, + "reward": 2.370998799800873, + "reward_std": 0.6577398786321282, + "rewards/accuracy_reward": 0.5572916939854622, + "rewards/reasoning_steps_reward": 0.9600694626569748, + "rewards/repetition_penalty_reward": -0.09297691145911813, + "rewards/tag_count_reward": 0.946614608168602, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.15625381469727, + "epoch": 0.922384701912261, + "grad_norm": 1.135581810486635, + "kl": 0.34228515625, + "learning_rate": 3.556031250961356e-07, + "loss": 0.5926, + "reward": 2.516046404838562, + "reward_std": 0.5974976867437363, + "rewards/accuracy_reward": 0.6770833432674408, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.08377996645867825, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.8489646911621, + "epoch": 0.9238845144356955, + "grad_norm": 0.8735579754836578, + "kl": 0.34765625, + "learning_rate": 3.4187385609257275e-07, + "loss": 0.7876, + "reward": 2.537445366382599, + "reward_std": 0.8377581238746643, + "rewards/accuracy_reward": 0.7604166865348816, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.11533251963555813, + "rewards/tag_count_reward": 0.9322917014360428, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.47396087646484, + "epoch": 0.9253843269591301, + "grad_norm": 0.7011606608162222, + "kl": 0.40087890625, + "learning_rate": 3.284102543134426e-07, + "loss": 0.4159, + "reward": 2.5848045349121094, + "reward_std": 0.5568958222866058, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.06189696677029133, + "rewards/tag_count_reward": 0.9713541865348816, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.4947967529297, + "epoch": 0.9268841394825647, + "grad_norm": 0.779792570829681, + "kl": 0.267578125, + "learning_rate": 3.152126901036401e-07, + "loss": 0.3907, + "reward": 2.7681113481521606, + "reward_std": 0.4018867686390877, + "rewards/accuracy_reward": 0.8385416865348816, + "rewards/reasoning_steps_reward": 0.9947916716337204, + "rewards/repetition_penalty_reward": -0.04438872542232275, + "rewards/tag_count_reward": 0.9791666716337204, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.7916717529297, + "epoch": 0.9283839520059992, + "grad_norm": 0.9727936242036457, + "kl": 0.268310546875, + "learning_rate": 3.0228152649013133e-07, + "loss": 0.5183, + "reward": 2.6839698553085327, + "reward_std": 0.5690836161375046, + "rewards/accuracy_reward": 0.8229166865348816, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.0716724512167275, + "rewards/tag_count_reward": 0.9570312798023224, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.92708587646484, + "epoch": 0.9298837645294338, + "grad_norm": 950.8548852246184, + "kl": 19.261474609375, + "learning_rate": 2.896171191719743e-07, + "loss": 2.7638, + "reward": 2.5930131673812866, + "reward_std": 0.6507796198129654, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.06931330915540457, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 172.92708587646484, + "epoch": 0.9313835770528683, + "grad_norm": 0.977711243590189, + "kl": 0.29345703125, + "learning_rate": 2.772198165105267e-07, + "loss": 0.2329, + "reward": 2.6393333673477173, + "reward_std": 0.36761191859841347, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9913194626569748, + "rewards/repetition_penalty_reward": -0.0316736598033458, + "rewards/tag_count_reward": 0.9869791716337204, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.26562881469727, + "epoch": 0.932883389576303, + "grad_norm": 1.0412352430260128, + "kl": 0.42578125, + "learning_rate": 2.6508995951986526e-07, + "loss": 0.6586, + "reward": 2.772718071937561, + "reward_std": 0.5336792543530464, + "rewards/accuracy_reward": 0.8906250149011612, + "rewards/reasoning_steps_reward": 0.9809027761220932, + "rewards/repetition_penalty_reward": -0.06495565082877874, + "rewards/tag_count_reward": 0.966145858168602, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.16667556762695, + "epoch": 0.9343832020997376, + "grad_norm": 0.7362485021348766, + "kl": 0.3427734375, + "learning_rate": 2.532278818574108e-07, + "loss": 0.7523, + "reward": 2.6175056099891663, + "reward_std": 0.7539031505584717, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9548611491918564, + "rewards/repetition_penalty_reward": -0.09647011943161488, + "rewards/tag_count_reward": 0.9414062649011612, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completion_length": 201.53646087646484, + "epoch": 0.9358830146231721, + "grad_norm": 1.5000993664801836, + "kl": 0.28955078125, + "learning_rate": 2.4163390981474354e-07, + "loss": 0.4566, + "reward": 2.5779688954353333, + "reward_std": 0.4579782895743847, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.975694477558136, + "rewards/repetition_penalty_reward": -0.048767429776489735, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.2552146911621, + "epoch": 0.9373828271466067, + "grad_norm": 1.0373325356696304, + "kl": 0.3564453125, + "learning_rate": 2.3030836230863108e-07, + "loss": 0.5518, + "reward": 2.4835113286972046, + "reward_std": 0.6010262817144394, + "rewards/accuracy_reward": 0.6354166939854622, + "rewards/reasoning_steps_reward": 0.9618056118488312, + "rewards/repetition_penalty_reward": -0.06813805643469095, + "rewards/tag_count_reward": 0.954427108168602, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.14062881469727, + "epoch": 0.9388826396700413, + "grad_norm": 0.8742390265642193, + "kl": 0.3603515625, + "learning_rate": 2.192515508722559e-07, + "loss": 0.5906, + "reward": 2.5894395112991333, + "reward_std": 0.679242342710495, + "rewards/accuracy_reward": 0.7760416865348816, + "rewards/reasoning_steps_reward": 0.9461805820465088, + "rewards/repetition_penalty_reward": -0.07809533644467592, + "rewards/tag_count_reward": 0.9453125149011612, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.52084350585938, + "epoch": 0.9403824521934758, + "grad_norm": 1.1468331051712704, + "kl": 0.361328125, + "learning_rate": 2.08463779646646e-07, + "loss": 0.4132, + "reward": 2.633046269416809, + "reward_std": 0.46498178830370307, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.06400250736624002, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.15625381469727, + "epoch": 0.9418822647169104, + "grad_norm": 1.4986712256290156, + "kl": 0.365234375, + "learning_rate": 1.979453453723057e-07, + "loss": 0.6512, + "reward": 2.6833993792533875, + "reward_std": 0.7203980311751366, + "rewards/accuracy_reward": 0.833333358168602, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.06529862061142921, + "rewards/tag_count_reward": 0.9570312649011612, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.2447967529297, + "epoch": 0.943382077240345, + "grad_norm": 0.7753884291535859, + "kl": 0.2939453125, + "learning_rate": 1.8769653738105797e-07, + "loss": 0.499, + "reward": 2.6547706723213196, + "reward_std": 0.4733446016907692, + "rewards/accuracy_reward": 0.786458358168602, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.077868377789855, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.28125381469727, + "epoch": 0.9448818897637795, + "grad_norm": 1.3985799794968885, + "kl": 0.2900390625, + "learning_rate": 1.7771763758808403e-07, + "loss": 0.6038, + "reward": 2.550901174545288, + "reward_std": 0.5799007415771484, + "rewards/accuracy_reward": 0.677083358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.07236280757933855, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 209.70313262939453, + "epoch": 0.9463817022872141, + "grad_norm": 1.4263168938028885, + "kl": 0.3466796875, + "learning_rate": 1.6800892048416618e-07, + "loss": 0.7189, + "reward": 2.665283203125, + "reward_std": 0.6292329207062721, + "rewards/accuracy_reward": 0.8072917014360428, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0756023209542036, + "rewards/tag_count_reward": 0.9544271230697632, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.7864646911621, + "epoch": 0.9478815148106486, + "grad_norm": 0.7415274923254211, + "kl": 0.3037109375, + "learning_rate": 1.5857065312814058e-07, + "loss": 0.5161, + "reward": 2.5291183590888977, + "reward_std": 0.5103181153535843, + "rewards/accuracy_reward": 0.651041679084301, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.06332961097359657, + "rewards/tag_count_reward": 0.9674479365348816, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completion_length": 211.75000381469727, + "epoch": 0.9493813273340832, + "grad_norm": 1.3628095814309313, + "kl": 0.322265625, + "learning_rate": 1.4940309513955088e-07, + "loss": 0.6802, + "reward": 2.667730212211609, + "reward_std": 0.6961818635463715, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.958333358168602, + "rewards/repetition_penalty_reward": -0.07315511163324118, + "rewards/tag_count_reward": 0.954427108168602, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.2291717529297, + "epoch": 0.9508811398575178, + "grad_norm": 0.9795351526659545, + "kl": 0.369140625, + "learning_rate": 1.405064986915028e-07, + "loss": 0.6991, + "reward": 2.5635343194007874, + "reward_std": 0.6203908771276474, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9826389104127884, + "rewards/repetition_penalty_reward": -0.08056299947202206, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.6927146911621, + "epoch": 0.9523809523809523, + "grad_norm": 0.8730687949726732, + "kl": 0.32666015625, + "learning_rate": 1.3188110850373527e-07, + "loss": 0.4296, + "reward": 2.716920852661133, + "reward_std": 0.47606247290968895, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.967013880610466, + "rewards/repetition_penalty_reward": -0.050874427892267704, + "rewards/tag_count_reward": 0.9726562649011612, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.78125381469727, + "epoch": 0.953880764904387, + "grad_norm": 1.1050385339520583, + "kl": 0.4296875, + "learning_rate": 1.2352716183588022e-07, + "loss": 0.7716, + "reward": 2.4985339045524597, + "reward_std": 0.7328899428248405, + "rewards/accuracy_reward": 0.6822916865348816, + "rewards/reasoning_steps_reward": 0.954861119389534, + "rewards/repetition_penalty_reward": -0.08393146842718124, + "rewards/tag_count_reward": 0.9453125298023224, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completion_length": 220.67188262939453, + "epoch": 0.9553805774278216, + "grad_norm": 1.7375333895919391, + "kl": 0.349609375, + "learning_rate": 1.1544488848094338e-07, + "loss": 0.5867, + "reward": 2.678426444530487, + "reward_std": 0.5324868559837341, + "rewards/accuracy_reward": 0.7968750298023224, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.07287566550076008, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completion_length": 210.47916793823242, + "epoch": 0.9568803899512561, + "grad_norm": 1.4004750786860491, + "kl": 0.3212890625, + "learning_rate": 1.0763451075897713e-07, + "loss": 0.8042, + "reward": 2.6482399106025696, + "reward_std": 0.6535500586032867, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9756944626569748, + "rewards/repetition_penalty_reward": -0.07485033478587866, + "rewards/tag_count_reward": 0.966145858168602, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completion_length": 190.1354217529297, + "epoch": 0.9583802024746907, + "grad_norm": 1.0307017049735754, + "kl": 0.316650390625, + "learning_rate": 1.0009624351097313e-07, + "loss": 0.4842, + "reward": 2.681637763977051, + "reward_std": 0.5066114142537117, + "rewards/accuracy_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.0462268297560513, + "rewards/tag_count_reward": 0.9726562798023224, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completion_length": 196.38021087646484, + "epoch": 0.9598800149981253, + "grad_norm": 0.7851546590808128, + "kl": 0.29345703125, + "learning_rate": 9.283029409294263e-08, + "loss": 0.5919, + "reward": 2.8382840156555176, + "reward_std": 0.507376492023468, + "rewards/accuracy_reward": 0.942708358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05624724645167589, + "rewards/tag_count_reward": 0.9726562649011612, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 206.6770896911621, + "epoch": 0.9613798275215598, + "grad_norm": 0.8675967780931345, + "kl": 0.32421875, + "learning_rate": 8.583686237022105e-08, + "loss": 0.5485, + "reward": 2.5225971341133118, + "reward_std": 0.504936508834362, + "rewards/accuracy_reward": 0.6354166716337204, + "rewards/reasoning_steps_reward": 0.982638880610466, + "rewards/repetition_penalty_reward": -0.06551066134124994, + "rewards/tag_count_reward": 0.9700520932674408, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.56250762939453, + "epoch": 0.9628796400449944, + "grad_norm": 1.4191184069887932, + "kl": 0.40283203125, + "learning_rate": 7.911614071196671e-08, + "loss": 0.6117, + "reward": 2.552360415458679, + "reward_std": 0.6927186399698257, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.953125, + "rewards/repetition_penalty_reward": -0.09477510303258896, + "rewards/tag_count_reward": 0.9440104514360428, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.17188262939453, + "epoch": 0.9643794525684289, + "grad_norm": 0.9682440226139685, + "kl": 0.35498046875, + "learning_rate": 7.266831398587082e-08, + "loss": 0.5619, + "reward": 2.6066702604293823, + "reward_std": 0.5623406581580639, + "rewards/accuracy_reward": 0.7552083432674408, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.07171526318416, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.9270896911621, + "epoch": 0.9658792650918635, + "grad_norm": 108.30769155391553, + "kl": 1.31689453125, + "learning_rate": 6.649355955306802e-08, + "loss": 0.7916, + "reward": 2.512334644794464, + "reward_std": 0.77546027302742, + "rewards/accuracy_reward": 0.7135417014360428, + "rewards/reasoning_steps_reward": 0.9670139402151108, + "rewards/repetition_penalty_reward": -0.10962733440101147, + "rewards/tag_count_reward": 0.9414062798023224, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.09375762939453, + "epoch": 0.9673790776152981, + "grad_norm": 0.9073762922270306, + "kl": 0.27783203125, + "learning_rate": 6.059204726326373e-08, + "loss": 0.5087, + "reward": 2.4138490557670593, + "reward_std": 0.6195433586835861, + "rewards/accuracy_reward": 0.5364583507180214, + "rewards/reasoning_steps_reward": 0.9895833283662796, + "rewards/repetition_penalty_reward": -0.0731301549822092, + "rewards/tag_count_reward": 0.9609375149011612, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 205.7291717529297, + "epoch": 0.9688788901387326, + "grad_norm": 0.6693528675529495, + "kl": 0.2998046875, + "learning_rate": 5.4963939450057846e-08, + "loss": 0.4844, + "reward": 2.7782857418060303, + "reward_std": 0.5295575931668282, + "rewards/accuracy_reward": 0.9010416865348816, + "rewards/reasoning_steps_reward": 0.973958358168602, + "rewards/repetition_penalty_reward": -0.06025600666180253, + "rewards/tag_count_reward": 0.9635416716337204, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completion_length": 200.9947967529297, + "epoch": 0.9703787026621672, + "grad_norm": 60.998327736404455, + "kl": 48.24267578125, + "learning_rate": 4.960939092648165e-08, + "loss": 0.6613, + "reward": 2.77788108587265, + "reward_std": 0.5485429763793945, + "rewards/accuracy_reward": 0.9062500298023224, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.06022655125707388, + "rewards/tag_count_reward": 0.9648437649011612, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.3489646911621, + "epoch": 0.9718785151856018, + "grad_norm": 1.0839111689107002, + "kl": 0.3359375, + "learning_rate": 4.452854898073788e-08, + "loss": 0.6235, + "reward": 2.5908501744270325, + "reward_std": 0.5783855766057968, + "rewards/accuracy_reward": 0.7395833432674408, + "rewards/reasoning_steps_reward": 0.9826388955116272, + "rewards/repetition_penalty_reward": -0.08189292438328266, + "rewards/tag_count_reward": 0.950520858168602, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.47917938232422, + "epoch": 0.9733783277090363, + "grad_norm": 1.1002454367127348, + "kl": 0.41015625, + "learning_rate": 3.9721553372150665e-08, + "loss": 0.6111, + "reward": 2.5614100098609924, + "reward_std": 0.7080376967787743, + "rewards/accuracy_reward": 0.7343750223517418, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.08702756371349096, + "rewards/tag_count_reward": 0.950520858168602, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completion_length": 241.73438262939453, + "epoch": 0.974878140232471, + "grad_norm": 4.156144128333803, + "kl": 2.935546875, + "learning_rate": 3.5188536327318554e-08, + "loss": 0.533, + "reward": 2.4512782096862793, + "reward_std": 0.528814010322094, + "rewards/accuracy_reward": 0.6354166716337204, + "rewards/reasoning_steps_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.09169065579771996, + "rewards/tag_count_reward": 0.9440104365348816, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 176.15625381469727, + "epoch": 0.9763779527559056, + "grad_norm": 0.9986867176141776, + "kl": 0.37939453125, + "learning_rate": 3.092962253648302e-08, + "loss": 0.352, + "reward": 2.7320080399513245, + "reward_std": 0.40247857104986906, + "rewards/accuracy_reward": 0.8072916865348816, + "rewards/reasoning_steps_reward": 0.9791666716337204, + "rewards/repetition_penalty_reward": -0.03752335952594876, + "rewards/tag_count_reward": 0.9830729365348816, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.25000381469727, + "epoch": 0.9778777652793401, + "grad_norm": 0.7209747458961953, + "kl": 0.27294921875, + "learning_rate": 2.694492915009006e-08, + "loss": 0.3393, + "reward": 2.59340101480484, + "reward_std": 0.36479785293340683, + "rewards/accuracy_reward": 0.6979166865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05764078535139561, + "rewards/tag_count_reward": 0.973958358168602, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completion_length": 216.52083587646484, + "epoch": 0.9793775778027747, + "grad_norm": 0.8162473177671608, + "kl": 0.30859375, + "learning_rate": 2.3234565775575034e-08, + "loss": 0.5575, + "reward": 2.596774399280548, + "reward_std": 0.6674105823040009, + "rewards/accuracy_reward": 0.7447917014360428, + "rewards/reasoning_steps_reward": 0.9670139253139496, + "rewards/repetition_penalty_reward": -0.07466658856719732, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.06771087646484, + "epoch": 0.9808773903262092, + "grad_norm": 1.3998005104320381, + "kl": 0.35595703125, + "learning_rate": 1.9798634474345048e-08, + "loss": 0.6469, + "reward": 2.451699197292328, + "reward_std": 0.6794094815850258, + "rewards/accuracy_reward": 0.6093750149011612, + "rewards/reasoning_steps_reward": 0.9670139104127884, + "rewards/repetition_penalty_reward": -0.07651268597692251, + "rewards/tag_count_reward": 0.9518229216337204, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.5833396911621, + "epoch": 0.9823772028496438, + "grad_norm": 0.8512758770399678, + "kl": 0.31396484375, + "learning_rate": 1.6637229758970087e-08, + "loss": 0.4539, + "reward": 2.7271097898483276, + "reward_std": 0.4929804429411888, + "rewards/accuracy_reward": 0.817708358168602, + "rewards/reasoning_steps_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.05283822864294052, + "rewards/tag_count_reward": 0.9778645932674408, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 221.85417556762695, + "epoch": 0.9838770153730784, + "grad_norm": 1.2498346639772517, + "kl": 0.3740234375, + "learning_rate": 1.3750438590586223e-08, + "loss": 0.5368, + "reward": 2.681597590446472, + "reward_std": 0.568534217774868, + "rewards/accuracy_reward": 0.833333358168602, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.07404482085257769, + "rewards/tag_count_reward": 0.962239608168602, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completion_length": 215.24480056762695, + "epoch": 0.9853768278965129, + "grad_norm": 0.8636210438416793, + "kl": 0.30517578125, + "learning_rate": 1.1138340376501966e-08, + "loss": 0.5454, + "reward": 2.747868835926056, + "reward_std": 0.6248507276177406, + "rewards/accuracy_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9722222536802292, + "rewards/repetition_penalty_reward": -0.0602909866720438, + "rewards/tag_count_reward": 0.9661458432674408, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.9270896911621, + "epoch": 0.9868766404199475, + "grad_norm": 1.3618095357818232, + "kl": 0.3759765625, + "learning_rate": 8.801006968012227e-09, + "loss": 0.6015, + "reward": 2.544634699821472, + "reward_std": 0.524773295968771, + "rewards/accuracy_reward": 0.6614583544433117, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.06300420686602592, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.6614646911621, + "epoch": 0.9883764529433821, + "grad_norm": 0.8680279006911963, + "kl": 0.3662109375, + "learning_rate": 6.738502658426571e-09, + "loss": 0.6921, + "reward": 2.585418939590454, + "reward_std": 0.7059072330594063, + "rewards/accuracy_reward": 0.7812500149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.09860894456505775, + "rewards/tag_count_reward": 0.9375, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.03125381469727, + "epoch": 0.9898762654668166, + "grad_norm": 1.0096929149450597, + "kl": 0.33642578125, + "learning_rate": 4.950884181295079e-09, + "loss": 0.6266, + "reward": 2.6765416860580444, + "reward_std": 0.5609493404626846, + "rewards/accuracy_reward": 0.8281250149011612, + "rewards/reasoning_steps_reward": 0.9774305820465088, + "rewards/repetition_penalty_reward": -0.0886493306607008, + "rewards/tag_count_reward": 0.9596354365348816, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 217.0104217529297, + "epoch": 0.9913760779902512, + "grad_norm": 0.858588179176331, + "kl": 0.3212890625, + "learning_rate": 3.4382007088518134e-09, + "loss": 0.7794, + "reward": 2.381414532661438, + "reward_std": 0.6443270593881607, + "rewards/accuracy_reward": 0.5468750149011612, + "rewards/reasoning_steps_reward": 0.9618055671453476, + "rewards/repetition_penalty_reward": -0.0777870174497366, + "rewards/tag_count_reward": 0.950520858168602, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.0833396911621, + "epoch": 0.9928758905136857, + "grad_norm": 2.317140366000632, + "kl": 0.34619140625, + "learning_rate": 2.200493850662566e-09, + "loss": 0.7031, + "reward": 2.485570192337036, + "reward_std": 0.7142708599567413, + "rewards/accuracy_reward": 0.692708358168602, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.09863131493330002, + "rewards/tag_count_reward": 0.9401041865348816, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completion_length": 219.2239646911621, + "epoch": 0.9943757030371203, + "grad_norm": 0.8136792892151121, + "kl": 0.34765625, + "learning_rate": 1.2377976524746705e-09, + "loss": 0.6723, + "reward": 2.587967872619629, + "reward_std": 0.66152124106884, + "rewards/accuracy_reward": 0.7343750149011612, + "rewards/reasoning_steps_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.07609467767179012, + "rewards/tag_count_reward": 0.9557291865348816, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completion_length": 214.48958587646484, + "epoch": 0.995875515560555, + "grad_norm": 1.3047927636843426, + "kl": 0.3359375, + "learning_rate": 5.501385952888516e-10, + "loss": 0.751, + "reward": 2.50458562374115, + "reward_std": 0.6785130500793457, + "rewards/accuracy_reward": 0.6718750149011612, + "rewards/reasoning_steps_reward": 0.960069477558136, + "rewards/repetition_penalty_reward": -0.07397346664220095, + "rewards/tag_count_reward": 0.9466146230697632, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.18750762939453, + "epoch": 0.9973753280839895, + "grad_norm": 0.6420501828977786, + "kl": 0.3134765625, + "learning_rate": 1.375355946242607e-10, + "loss": 0.6873, + "reward": 2.3991820216178894, + "reward_std": 0.6460757553577423, + "rewards/accuracy_reward": 0.5677083432674408, + "rewards/reasoning_steps_reward": 0.9600694626569748, + "rewards/repetition_penalty_reward": -0.08041872084140778, + "rewards/tag_count_reward": 0.9518229365348816, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.2552146911621, + "epoch": 0.9988751406074241, + "grad_norm": 1.2779320053093417, + "kl": 0.5048828125, + "learning_rate": 0.0, + "loss": 0.8081, + "reward": 2.5781980752944946, + "reward_std": 0.8304520845413208, + "rewards/accuracy_reward": 0.7968750149011612, + "rewards/reasoning_steps_reward": 0.9496528059244156, + "rewards/repetition_penalty_reward": -0.10322570707648993, + "rewards/tag_count_reward": 0.934895858168602, + "step": 666 + }, + { + "epoch": 0.9988751406074241, + "step": 666, "total_flos": 0.0, - "train_loss": 0.6876773679259796, - "train_runtime": 4819.3131, - "train_samples_per_second": 0.83, - "train_steps_per_second": 0.069 + "train_loss": 566.2065416834479, + "train_runtime": 15651.159, + "train_samples_per_second": 0.511, + "train_steps_per_second": 0.043 } ], "logging_steps": 1, - "max_steps": 333, + "max_steps": 666, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200,