diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -3,16023 +3,32023 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 1000, + "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, - "completion_length": 503.75, - "epoch": 0.001, - "grad_norm": 2.9832106604609847, + "completion_length": 328.70833587646484, + "epoch": 0.0005, + "grad_norm": 6.251723469874294, "kl": 0.0, - "learning_rate": 1e-08, - "loss": -0.0412, - "reward": 1.165066421031952, - "reward_std": 0.4164372831583023, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.6388888955116272, - "rewards/repetition_penalty_reward": -0.03632250800728798, - "rewards/tag_count_reward": 0.5208333730697632, + "learning_rate": 5e-09, + "loss": -0.0315, + "reward": 0.6384468078613281, + "reward_std": 0.2296978384256363, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1597222313284874, + "rewards/repetition_penalty_reward": -0.05252542719244957, + "rewards/tag_count_reward": 0.5104166865348816, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 601.9375305175781, - "epoch": 0.002, - "grad_norm": 2.6050906286301534, + "completion_length": 333.6666717529297, + "epoch": 0.001, + "grad_norm": 4.439731592620521, "kl": 0.0, - "learning_rate": 2e-08, - "loss": -0.0073, - "reward": 1.1193318367004395, - "reward_std": 0.2575419098138809, + "learning_rate": 1e-08, + "loss": -0.0813, + "reward": 0.6252729892730713, + "reward_std": 0.29075783491134644, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.0508070383220911, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.194444440305233, + "rewards/repetition_penalty_reward": -0.06917147152125835, + "rewards/tag_count_reward": 0.5, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 624.4583740234375, - "epoch": 0.003, - "grad_norm": 2.4392990354195887, - "kl": 0.00021076202392578125, - "learning_rate": 3e-08, - "loss": 0.0166, - "reward": 1.1063403487205505, - "reward_std": 0.29120244085788727, + "completion_length": 315.2291717529297, + "epoch": 0.0015, + "grad_norm": 4.648660461286372, + "kl": 0.00014925003051757812, + "learning_rate": 1.5e-08, + "loss": -0.0918, + "reward": 0.578189492225647, + "reward_std": 0.19131075590848923, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.6597222089767456, - "rewards/repetition_penalty_reward": -0.0533819030970335, + "rewards/reasoning_steps_reward": 0.1319444552063942, + "rewards/repetition_penalty_reward": -0.05375497601926327, "rewards/tag_count_reward": 0.5, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 505.97918701171875, - "epoch": 0.004, - "grad_norm": 2.6745634307308377, - "kl": 0.000171661376953125, - "learning_rate": 4e-08, - "loss": -0.0265, - "reward": 1.1932607293128967, - "reward_std": 0.3999044597148895, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.6527777910232544, - "rewards/repetition_penalty_reward": -0.04805881343781948, - "rewards/tag_count_reward": 0.5260416865348816, + "completion_length": 391.8333435058594, + "epoch": 0.002, + "grad_norm": 3.069933909424734, + "kl": 0.00018310546875, + "learning_rate": 2e-08, + "loss": -0.0881, + "reward": 0.6503687500953674, + "reward_std": 0.2582377791404724, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.180555559694767, + "rewards/repetition_penalty_reward": -0.061436835676431656, + "rewards/tag_count_reward": 0.5104166865348816, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 604.6458435058594, - "epoch": 0.005, - "grad_norm": 2.7835442273591835, - "kl": 0.00020742416381835938, - "learning_rate": 5e-08, - "loss": -0.0375, - "reward": 1.0637533068656921, - "reward_std": 0.3286152780056, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.569444477558136, - "rewards/repetition_penalty_reward": -0.04735783860087395, - "rewards/tag_count_reward": 0.5208333432674408, + "completion_length": 415.8333435058594, + "epoch": 0.0025, + "grad_norm": 2.784498697916323, + "kl": 0.00012993812561035156, + "learning_rate": 2.5e-08, + "loss": -0.066, + "reward": 0.5441770553588867, + "reward_std": 0.21253511309623718, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1527777835726738, + "rewards/repetition_penalty_reward": -0.10860074311494827, + "rewards/tag_count_reward": 0.5, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 487.22918701171875, - "epoch": 0.006, - "grad_norm": 3.0412926302974874, - "kl": 0.00020647048950195312, - "learning_rate": 6e-08, - "loss": -0.0389, - "reward": 1.0956225991249084, - "reward_std": 0.38825175166130066, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.604166716337204, - "rewards/repetition_penalty_reward": -0.039794087409973145, - "rewards/tag_count_reward": 0.5104166865348816, + "completion_length": 432.1041717529297, + "epoch": 0.003, + "grad_norm": 3.090569400029858, + "kl": 0.00017309188842773438, + "learning_rate": 3e-08, + "loss": -0.0504, + "reward": 0.7367339134216309, + "reward_std": 0.3796353191137314, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2500000149011612, + "rewards/repetition_penalty_reward": -0.07576615735888481, + "rewards/tag_count_reward": 0.5208333432674408, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 613.5833435058594, - "epoch": 0.007, - "grad_norm": 2.444811156842584, - "kl": 0.00019121170043945312, - "learning_rate": 7e-08, - "loss": -0.013, - "reward": 1.1998452544212341, - "reward_std": 0.3531786799430847, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.6736111640930176, - "rewards/repetition_penalty_reward": -0.05189092084765434, - "rewards/tag_count_reward": 0.5364583730697632, + "completion_length": 390.5833435058594, + "epoch": 0.0035, + "grad_norm": 2.8425497031556843, + "kl": 0.00014519691467285156, + "learning_rate": 3.5e-08, + "loss": -0.0414, + "reward": 0.6315869688987732, + "reward_std": 0.1821143701672554, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1875, + "rewards/repetition_penalty_reward": -0.06632974371314049, + "rewards/tag_count_reward": 0.5104166865348816, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 526.9166870117188, - "epoch": 0.008, - "grad_norm": 2.577759663416605, - "kl": 0.00021648406982421875, - "learning_rate": 8e-08, - "loss": -0.1157, - "reward": 1.1657525897026062, - "reward_std": 0.40865209698677063, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.597222238779068, - "rewards/repetition_penalty_reward": -0.040844724513590336, - "rewards/tag_count_reward": 0.5468750298023224, + "completion_length": 406.2083435058594, + "epoch": 0.004, + "grad_norm": 2.877982305967783, + "kl": 0.0001347064971923828, + "learning_rate": 4e-08, + "loss": -0.0406, + "reward": 0.6656659245491028, + "reward_std": 0.26930323243141174, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2638889104127884, + "rewards/repetition_penalty_reward": -0.0982230119407177, + "rewards/tag_count_reward": 0.5, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 527.5208435058594, - "epoch": 0.009, - "grad_norm": 2.8811327535844984, - "kl": 0.00019359588623046875, - "learning_rate": 9e-08, - "loss": -0.0489, - "reward": 1.1031688451766968, - "reward_std": 0.41312260925769806, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.506944477558136, - "rewards/repetition_penalty_reward": -0.0444006510078907, - "rewards/tag_count_reward": 0.5572916865348816, + "completion_length": 402.97918701171875, + "epoch": 0.0045, + "grad_norm": 2.796921444089777, + "kl": 0.00016069412231445312, + "learning_rate": 4.5e-08, + "loss": 0.0211, + "reward": 0.619064211845398, + "reward_std": 0.21635551750659943, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1597222313284874, + "rewards/repetition_penalty_reward": -0.061491381376981735, + "rewards/tag_count_reward": 0.5, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 556.5208435058594, - "epoch": 0.01, - "grad_norm": 2.4760963797873985, - "kl": 0.00020551681518554688, - "learning_rate": 1e-07, - "loss": 0.1082, - "reward": 1.057469666004181, - "reward_std": 0.2826480269432068, + "completion_length": 428.52085876464844, + "epoch": 0.005, + "grad_norm": 2.732127974984369, + "kl": 0.00013709068298339844, + "learning_rate": 5e-08, + "loss": -0.0253, + "reward": 0.5867983400821686, + "reward_std": 0.2477174624800682, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.604166716337204, - "rewards/repetition_penalty_reward": -0.057113731279969215, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.1875000149011612, + "rewards/repetition_penalty_reward": -0.10070168599486351, + "rewards/tag_count_reward": 0.5, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 505.10418701171875, - "epoch": 0.011, - "grad_norm": 2.7426745766245304, - "kl": 0.00023126602172851562, - "learning_rate": 1.0999999999999999e-07, - "loss": -0.0164, - "reward": 1.1937065124511719, - "reward_std": 0.39100518822669983, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.6736111342906952, - "rewards/repetition_penalty_reward": -0.042404673993587494, - "rewards/tag_count_reward": 0.5208333730697632, + "completion_length": 483.6041717529297, + "epoch": 0.0055, + "grad_norm": 2.556791644892623, + "kl": 0.00016498565673828125, + "learning_rate": 5.4999999999999996e-08, + "loss": -0.046, + "reward": 0.5901573300361633, + "reward_std": 0.25573817640542984, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.180555559694767, + "rewards/repetition_penalty_reward": -0.09039826691150665, + "rewards/tag_count_reward": 0.5, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 606.3333435058594, - "epoch": 0.012, - "grad_norm": 2.547656089396865, - "kl": 0.000209808349609375, - "learning_rate": 1.2e-07, - "loss": -0.1084, - "reward": 1.0744403004646301, - "reward_std": 0.3415149748325348, + "completion_length": 389.81251525878906, + "epoch": 0.006, + "grad_norm": 2.970269557370756, + "kl": 0.00014209747314453125, + "learning_rate": 6e-08, + "loss": -0.0726, + "reward": 0.5915650725364685, + "reward_std": 0.227043054997921, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.6250000298023224, - "rewards/repetition_penalty_reward": -0.06097651459276676, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.1458333432674408, + "rewards/repetition_penalty_reward": -0.0594765804708004, + "rewards/tag_count_reward": 0.5052083432674408, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 624.8958740234375, - "epoch": 0.013, - "grad_norm": 2.5452173789968726, - "kl": 0.00021314620971679688, - "learning_rate": 1.3e-07, - "loss": -0.0677, - "reward": 1.0435009598731995, - "reward_std": 0.2849584221839905, + "completion_length": 425.5833435058594, + "epoch": 0.0065, + "grad_norm": 3.55225958110278, + "kl": 0.00015544891357421875, + "learning_rate": 6.5e-08, + "loss": -0.1838, + "reward": 0.5981267392635345, + "reward_std": 0.2597131133079529, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.5902778208255768, - "rewards/repetition_penalty_reward": -0.057193491607904434, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.1527777835726738, + "rewards/repetition_penalty_reward": -0.05985940620303154, + "rewards/tag_count_reward": 0.5052083432674408, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 534.3125152587891, - "epoch": 0.014, - "grad_norm": 2.5568403376430475, - "kl": 0.000244140625, - "learning_rate": 1.4e-07, - "loss": -0.0267, - "reward": 1.033350259065628, - "reward_std": 0.36252032220363617, + "completion_length": 383.5, + "epoch": 0.007, + "grad_norm": 2.8654839696962204, + "kl": 0.00014352798461914062, + "learning_rate": 7e-08, + "loss": -0.0181, + "reward": 0.6581193804740906, + "reward_std": 0.33123770356178284, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.5069444626569748, - "rewards/repetition_penalty_reward": -0.03609427623450756, - "rewards/tag_count_reward": 0.5208333730697632, + "rewards/reasoning_steps_reward": 0.1875, + "rewards/repetition_penalty_reward": -0.08146397396922112, + "rewards/tag_count_reward": 0.5104166865348816, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 633.8541870117188, - "epoch": 0.015, - "grad_norm": 2.4845137409121465, - "kl": 0.00019216537475585938, - "learning_rate": 1.5e-07, - "loss": -0.0353, - "reward": 1.1506143808364868, - "reward_std": 0.3063247799873352, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.045566244050860405, - "rewards/tag_count_reward": 0.5156250298023224, + "completion_length": 452.35418701171875, + "epoch": 0.0075, + "grad_norm": 2.735812818011658, + "kl": 0.0001621246337890625, + "learning_rate": 7.5e-08, + "loss": 0.0199, + "reward": 0.7007659077644348, + "reward_std": 0.2738788276910782, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.2500000149011612, + "rewards/repetition_penalty_reward": -0.08048411272466183, + "rewards/tag_count_reward": 0.5104166865348816, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 570.9375152587891, - "epoch": 0.016, - "grad_norm": 2.7140477200825273, - "kl": 0.0002346038818359375, - "learning_rate": 1.6e-07, - "loss": -0.0089, - "reward": 1.0886216163635254, - "reward_std": 0.3162507861852646, + "completion_length": 538.1666717529297, + "epoch": 0.008, + "grad_norm": 2.4984072149490264, + "kl": 0.00014781951904296875, + "learning_rate": 8e-08, + "loss": 0.0239, + "reward": 0.5570363700389862, + "reward_std": 0.23343774676322937, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.6388888955116272, - "rewards/repetition_penalty_reward": -0.06068398430943489, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.1527777835726738, + "rewards/repetition_penalty_reward": -0.09574145823717117, + "rewards/tag_count_reward": 0.5, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 585.2708740234375, - "epoch": 0.017, - "grad_norm": 2.4713266581431625, - "kl": 0.0001678466796875, - "learning_rate": 1.7000000000000001e-07, - "loss": -0.0658, - "reward": 1.1053802967071533, - "reward_std": 0.3452056348323822, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.631944477558136, - "rewards/repetition_penalty_reward": -0.06823080591857433, - "rewards/tag_count_reward": 0.5208333432674408, + "completion_length": 386.2916717529297, + "epoch": 0.0085, + "grad_norm": 3.2191965365951885, + "kl": 0.0001621246337890625, + "learning_rate": 8.500000000000001e-08, + "loss": -0.0334, + "reward": 0.5502374768257141, + "reward_std": 0.200188048183918, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1250000074505806, + "rewards/repetition_penalty_reward": -0.07997088506817818, + "rewards/tag_count_reward": 0.5052083432674408, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 513.6458587646484, - "epoch": 0.018, - "grad_norm": 2.751825490594234, - "kl": 0.00019788742065429688, - "learning_rate": 1.8e-07, - "loss": -0.0599, - "reward": 1.0880020260810852, - "reward_std": 0.318176731467247, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.61111119389534, - "rewards/repetition_penalty_reward": -0.04394245892763138, + "completion_length": 387.35418701171875, + "epoch": 0.009, + "grad_norm": 3.501374136164966, + "kl": 0.00019168853759765625, + "learning_rate": 9e-08, + "loss": -0.0635, + "reward": 0.5605765283107758, + "reward_std": 0.15551594644784927, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1319444514811039, + "rewards/repetition_penalty_reward": -0.07136795669794083, "rewards/tag_count_reward": 0.5, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 586.6250305175781, - "epoch": 0.019, - "grad_norm": 2.39448557038407, - "kl": 0.0001811981201171875, - "learning_rate": 1.8999999999999998e-07, - "loss": 0.0141, - "reward": 1.0959751605987549, - "reward_std": 0.43034467101097107, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.5486111640930176, - "rewards/repetition_penalty_reward": -0.05159439332783222, - "rewards/tag_count_reward": 0.5364583432674408, + "completion_length": 461.7083435058594, + "epoch": 0.0095, + "grad_norm": 2.8703850021404986, + "kl": 0.0001609325408935547, + "learning_rate": 9.499999999999999e-08, + "loss": 0.0089, + "reward": 0.6698747277259827, + "reward_std": 0.21975713968276978, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2638889104127884, + "rewards/repetition_penalty_reward": -0.09401418641209602, + "rewards/tag_count_reward": 0.5, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 652.1250305175781, - "epoch": 0.02, - "grad_norm": 2.0210344240592066, - "kl": 0.00016498565673828125, - "learning_rate": 2e-07, - "loss": -0.0102, - "reward": 1.2613900899887085, - "reward_std": 0.5556788444519043, - "rewards/accuracy_reward": 0.1250000037252903, - "rewards/reasoning_steps_reward": 0.6527777910232544, - "rewards/repetition_penalty_reward": -0.06847099214792252, - "rewards/tag_count_reward": 0.5520833432674408, + "completion_length": 434.0625, + "epoch": 0.01, + "grad_norm": 2.857781731829004, + "kl": 0.00016641616821289062, + "learning_rate": 1e-07, + "loss": -0.0404, + "reward": 0.6144264340400696, + "reward_std": 0.23488393425941467, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2013889029622078, + "rewards/repetition_penalty_reward": -0.08696247264742851, + "rewards/tag_count_reward": 0.5, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 534.3333587646484, - "epoch": 0.021, - "grad_norm": 2.723236250331092, - "kl": 0.000263214111328125, - "learning_rate": 2.0999999999999997e-07, - "loss": -0.0982, - "reward": 1.113393783569336, - "reward_std": 0.3082204759120941, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.6319444477558136, - "rewards/repetition_penalty_reward": -0.03938402608036995, + "completion_length": 453.7291717529297, + "epoch": 0.0105, + "grad_norm": 2.945964454449734, + "kl": 0.0001544952392578125, + "learning_rate": 1.0499999999999999e-07, + "loss": -0.028, + "reward": 0.6437437832355499, + "reward_std": 0.2618062347173691, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2222222313284874, + "rewards/repetition_penalty_reward": -0.07847847789525986, "rewards/tag_count_reward": 0.5, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 593.5208435058594, - "epoch": 0.022, - "grad_norm": 2.802437133759923, - "kl": 0.00021219253540039062, - "learning_rate": 2.1999999999999998e-07, - "loss": 0.025, - "reward": 1.141821563243866, - "reward_std": 0.30135248601436615, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.6527777910232544, - "rewards/repetition_penalty_reward": -0.042206283658742905, - "rewards/tag_count_reward": 0.5104166865348816, + "completion_length": 412.50001525878906, + "epoch": 0.011, + "grad_norm": 2.5290188531303004, + "kl": 0.00010466575622558594, + "learning_rate": 1.0999999999999999e-07, + "loss": -0.0099, + "reward": 0.6274481117725372, + "reward_std": 0.2595982700586319, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2361111268401146, + "rewards/repetition_penalty_reward": -0.10866303369402885, + "rewards/tag_count_reward": 0.5, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 569.9166870117188, - "epoch": 0.023, - "grad_norm": 2.513753582519348, - "kl": 0.00023412704467773438, - "learning_rate": 2.3e-07, - "loss": -0.0394, - "reward": 1.033397912979126, - "reward_std": 0.3224620223045349, + "completion_length": 435.1666717529297, + "epoch": 0.0115, + "grad_norm": 2.757687382488654, + "kl": 0.00016546249389648438, + "learning_rate": 1.15e-07, + "loss": 0.0177, + "reward": 0.6414158344268799, + "reward_std": 0.22706189006567, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.5833333730697632, - "rewards/repetition_penalty_reward": -0.04993540979921818, + "rewards/reasoning_steps_reward": 0.2291666939854622, + "rewards/repetition_penalty_reward": -0.08775084465742111, "rewards/tag_count_reward": 0.5, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 538.8125305175781, - "epoch": 0.024, - "grad_norm": 2.5472422948610527, - "kl": 0.0001964569091796875, - "learning_rate": 2.4e-07, - "loss": -0.0033, - "reward": 1.2361397743225098, - "reward_std": 0.4402560144662857, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.6736111640930176, - "rewards/repetition_penalty_reward": -0.052054738625884056, - "rewards/tag_count_reward": 0.5312500298023224, + "completion_length": 385.7708435058594, + "epoch": 0.012, + "grad_norm": 3.020715752986674, + "kl": 0.0001327991485595703, + "learning_rate": 1.2e-07, + "loss": 0.0064, + "reward": 0.5981995165348053, + "reward_std": 0.22115938365459442, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.173611119389534, + "rewards/repetition_penalty_reward": -0.075411606580019, + "rewards/tag_count_reward": 0.5, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 537.1041870117188, - "epoch": 0.025, - "grad_norm": 2.734296760351383, - "kl": 0.00023365020751953125, - "learning_rate": 2.5e-07, - "loss": -0.0501, - "reward": 1.1702337265014648, - "reward_std": 0.4403166174888611, - "rewards/accuracy_reward": 0.08333333395421505, - "rewards/reasoning_steps_reward": 0.6041666865348816, - "rewards/repetition_penalty_reward": -0.053724685683846474, - "rewards/tag_count_reward": 0.5364583432674408, + "completion_length": 410.3333435058594, + "epoch": 0.0125, + "grad_norm": 2.6737799977246337, + "kl": 0.00013828277587890625, + "learning_rate": 1.25e-07, + "loss": 0.0381, + "reward": 0.7896432876586914, + "reward_std": 0.43896663188934326, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2916666865348816, + "rewards/repetition_penalty_reward": -0.06452339142560959, + "rewards/tag_count_reward": 0.5208333730697632, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 561.1041870117188, - "epoch": 0.026, - "grad_norm": 2.8294150280268386, - "kl": 0.0002675056457519531, - "learning_rate": 2.6e-07, - "loss": -0.007, - "reward": 1.1117247343063354, - "reward_std": 0.32203447818756104, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.6319444477558136, - "rewards/repetition_penalty_reward": -0.05146980658173561, + "completion_length": 455.2708435058594, + "epoch": 0.013, + "grad_norm": 2.7398842512902424, + "kl": 0.0001811981201171875, + "learning_rate": 1.3e-07, + "loss": -0.0147, + "reward": 0.6110469698905945, + "reward_std": 0.21338298171758652, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2083333358168602, + "rewards/repetition_penalty_reward": -0.1077030710875988, "rewards/tag_count_reward": 0.5104166865348816, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 527.0, - "epoch": 0.027, - "grad_norm": 2.762760874256859, - "kl": 0.00025463104248046875, - "learning_rate": 2.7e-07, - "loss": -0.0483, - "reward": 1.1202597618103027, - "reward_std": 0.36672815680503845, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.03946254029870033, - "rewards/tag_count_reward": 0.5208333432674408, + "completion_length": 414.1041717529297, + "epoch": 0.0135, + "grad_norm": 2.8122894279350015, + "kl": 0.00015592575073242188, + "learning_rate": 1.35e-07, + "loss": -0.0376, + "reward": 0.6806438565254211, + "reward_std": 0.22845971584320068, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2430555671453476, + "rewards/repetition_penalty_reward": -0.06762006506323814, + "rewards/tag_count_reward": 0.5052083432674408, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 532.3125, - "epoch": 0.028, - "grad_norm": 2.5927248535988077, - "kl": 0.00019502639770507812, - "learning_rate": 2.8e-07, - "loss": -0.0126, - "reward": 1.1283650398254395, - "reward_std": 0.4593771994113922, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.5833333730697632, - "rewards/repetition_penalty_reward": -0.04871835932135582, - "rewards/tag_count_reward": 0.5312500298023224, + "completion_length": 442.8958435058594, + "epoch": 0.014, + "grad_norm": 2.792569424252216, + "kl": 0.00018310546875, + "learning_rate": 1.4e-07, + "loss": -0.0449, + "reward": 0.6144725680351257, + "reward_std": 0.2537280172109604, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.1944444552063942, + "rewards/repetition_penalty_reward": -0.09038857370615005, + "rewards/tag_count_reward": 0.5104166865348816, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 587.5416870117188, - "epoch": 0.029, - "grad_norm": 2.379176256693628, - "kl": 0.00020503997802734375, - "learning_rate": 2.9e-07, - "loss": -0.0035, - "reward": 0.9848364293575287, - "reward_std": 0.31292901933193207, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.541666716337204, - "rewards/repetition_penalty_reward": -0.06203855946660042, - "rewards/tag_count_reward": 0.5052083432674408, + "completion_length": 469.0833435058594, + "epoch": 0.0145, + "grad_norm": 2.573870828705473, + "kl": 0.00014352798461914062, + "learning_rate": 1.45e-07, + "loss": -0.0278, + "reward": 0.6352183520793915, + "reward_std": 0.31960034370422363, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.1736111268401146, + "rewards/repetition_penalty_reward": -0.11130945011973381, + "rewards/tag_count_reward": 0.5312500298023224, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 521.4791870117188, - "epoch": 0.03, - "grad_norm": 2.711197411130166, - "kl": 0.0002446174621582031, - "learning_rate": 3e-07, - "loss": 0.0671, - "reward": 1.1310882568359375, - "reward_std": 0.3909059464931488, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.6180555820465088, - "rewards/repetition_penalty_reward": -0.049467260017991066, - "rewards/tag_count_reward": 0.5208333432674408, + "completion_length": 391.125, + "epoch": 0.015, + "grad_norm": 2.895308039349145, + "kl": 0.00012111663818359375, + "learning_rate": 1.5e-07, + "loss": -0.0451, + "reward": 0.6583640277385712, + "reward_std": 0.37743693590164185, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.173611119389534, + "rewards/repetition_penalty_reward": -0.09337210655212402, + "rewards/tag_count_reward": 0.515625, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 650.3125, - "epoch": 0.031, - "grad_norm": 2.094653949008266, - "kl": 0.00021648406982421875, - "learning_rate": 3.1e-07, - "loss": -0.0303, - "reward": 1.3080359101295471, - "reward_std": 0.36785976588726044, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.784722238779068, - "rewards/repetition_penalty_reward": -0.05481144040822983, - "rewards/tag_count_reward": 0.515625, + "completion_length": 417.41668701171875, + "epoch": 0.0155, + "grad_norm": 3.3217228750439722, + "kl": 0.00017309188842773438, + "learning_rate": 1.55e-07, + "loss": -0.0388, + "reward": 0.6766975820064545, + "reward_std": 0.3049939051270485, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.173611119389534, + "rewards/repetition_penalty_reward": -0.06983024999499321, + "rewards/tag_count_reward": 0.5312500298023224, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 492.2708435058594, - "epoch": 0.032, - "grad_norm": 2.6034268800973854, - "kl": 0.0003032684326171875, - "learning_rate": 3.2e-07, - "loss": -0.0383, - "reward": 1.2631294131278992, - "reward_std": 0.42681366205215454, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.701388955116272, - "rewards/repetition_penalty_reward": -0.03200950939208269, - "rewards/tag_count_reward": 0.5312500298023224, + "completion_length": 467.0625, + "epoch": 0.016, + "grad_norm": 2.592972499186617, + "kl": 0.00017213821411132812, + "learning_rate": 1.6e-07, + "loss": 0.002, + "reward": 0.6280147135257721, + "reward_std": 0.2920844256877899, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.1666666716337204, + "rewards/repetition_penalty_reward": -0.09073532372713089, + "rewards/tag_count_reward": 0.5104166865348816, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 517.9375305175781, - "epoch": 0.033, - "grad_norm": 2.6134583516084144, - "kl": 0.0002884864807128906, - "learning_rate": 3.3e-07, - "loss": 0.085, - "reward": 1.3313005566596985, - "reward_std": 0.5551705211400986, - "rewards/accuracy_reward": 0.10416666977107525, - "rewards/reasoning_steps_reward": 0.7152777910232544, - "rewards/repetition_penalty_reward": -0.04022728279232979, - "rewards/tag_count_reward": 0.5520833730697632, + "completion_length": 368.2708435058594, + "epoch": 0.0165, + "grad_norm": 2.885134740538339, + "kl": 0.00016164779663085938, + "learning_rate": 1.65e-07, + "loss": -0.0322, + "reward": 0.5996805429458618, + "reward_std": 0.2433435320854187, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1458333358168602, + "rewards/repetition_penalty_reward": -0.07219448685646057, + "rewards/tag_count_reward": 0.5052083432674408, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 575.625, - "epoch": 0.034, - "grad_norm": 2.785389250577513, - "kl": 0.00032901763916015625, - "learning_rate": 3.4000000000000003e-07, - "loss": -0.0602, - "reward": 1.2880616784095764, - "reward_std": 0.4542257487773895, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.04179951548576355, - "rewards/tag_count_reward": 0.5520833730697632, + "completion_length": 412.5208435058594, + "epoch": 0.017, + "grad_norm": 3.0967481185727936, + "kl": 0.00016927719116210938, + "learning_rate": 1.7000000000000001e-07, + "loss": -0.0958, + "reward": 0.6694463491439819, + "reward_std": 0.224090114235878, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2222222462296486, + "rewards/repetition_penalty_reward": -0.06319255381822586, + "rewards/tag_count_reward": 0.5104166865348816, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 521.4791870117188, - "epoch": 0.035, - "grad_norm": 2.5702386275125564, - "kl": 0.00037384033203125, - "learning_rate": 3.5e-07, - "loss": -0.0291, - "reward": 1.2401779294013977, - "reward_std": 0.3075665980577469, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.7500000596046448, - "rewards/repetition_penalty_reward": -0.04628048092126846, - "rewards/tag_count_reward": 0.5156250298023224, + "completion_length": 456.2291717529297, + "epoch": 0.0175, + "grad_norm": 2.5514196985566495, + "kl": 0.00011515617370605469, + "learning_rate": 1.75e-07, + "loss": -0.0634, + "reward": 0.7589674890041351, + "reward_std": 0.3829844295978546, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.2638889104127884, + "rewards/repetition_penalty_reward": -0.08825474977493286, + "rewards/tag_count_reward": 0.5208333730697632, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 583.4166870117188, - "epoch": 0.036, - "grad_norm": 2.4872780913924584, - "kl": 0.000377655029296875, - "learning_rate": 3.6e-07, - "loss": -0.0549, - "reward": 1.0867574214935303, - "reward_std": 0.3429463729262352, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.052131447941064835, - "rewards/tag_count_reward": 0.5208333730697632, + "completion_length": 428.6458435058594, + "epoch": 0.018, + "grad_norm": 2.596517737983663, + "kl": 0.00011324882507324219, + "learning_rate": 1.8e-07, + "loss": -0.0258, + "reward": 0.6948606371879578, + "reward_std": 0.2351381480693817, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.284722238779068, + "rewards/repetition_penalty_reward": -0.08986162021756172, + "rewards/tag_count_reward": 0.5, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 576.4583435058594, - "epoch": 0.037, - "grad_norm": 2.5982342131854668, - "kl": 0.00039768218994140625, - "learning_rate": 3.7e-07, - "loss": -0.0891, - "reward": 1.198279321193695, - "reward_std": 0.26803672313690186, + "completion_length": 434.5416717529297, + "epoch": 0.0185, + "grad_norm": 2.6997865552883082, + "kl": 0.00014734268188476562, + "learning_rate": 1.85e-07, + "loss": 0.0079, + "reward": 0.6786134541034698, + "reward_std": 0.34496983885765076, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.7152778208255768, - "rewards/repetition_penalty_reward": -0.0638735331594944, - "rewards/tag_count_reward": 0.5052083432674408, + "rewards/reasoning_steps_reward": 0.1944444626569748, + "rewards/repetition_penalty_reward": -0.07833104580640793, + "rewards/tag_count_reward": 0.5208333730697632, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 573.3541870117188, - "epoch": 0.038, - "grad_norm": 2.416596419208738, - "kl": 0.0005702972412109375, - "learning_rate": 3.7999999999999996e-07, - "loss": 0.021, - "reward": 1.1571301221847534, - "reward_std": 0.37196002900600433, + "completion_length": 420.7916717529297, + "epoch": 0.019, + "grad_norm": 2.8889113372764026, + "kl": 0.00016307830810546875, + "learning_rate": 1.8999999999999998e-07, + "loss": -0.0861, + "reward": 0.6670835018157959, + "reward_std": 0.297781839966774, "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.6527777910232544, - "rewards/repetition_penalty_reward": -0.04773101769387722, - "rewards/tag_count_reward": 0.5312500298023224, + "rewards/reasoning_steps_reward": 0.1944444477558136, + "rewards/repetition_penalty_reward": -0.0690276212990284, + "rewards/tag_count_reward": 0.5208333432674408, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 489.2500305175781, - "epoch": 0.039, - "grad_norm": 2.7311163851954614, - "kl": 0.0006694793701171875, - "learning_rate": 3.8999999999999997e-07, - "loss": 0.0014, - "reward": 1.1946178078651428, - "reward_std": 0.42158831655979156, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.6319445073604584, - "rewards/repetition_penalty_reward": -0.03628497198224068, - "rewards/tag_count_reward": 0.5364583730697632, + "completion_length": 391.50001525878906, + "epoch": 0.0195, + "grad_norm": 3.0222077809885683, + "kl": 0.00017881393432617188, + "learning_rate": 1.9499999999999999e-07, + "loss": -0.0421, + "reward": 0.6102947890758514, + "reward_std": 0.2821989879012108, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1527777835726738, + "rewards/repetition_penalty_reward": -0.07373302057385445, + "rewards/tag_count_reward": 0.5104166865348816, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 557.4791870117188, - "epoch": 0.04, - "grad_norm": 2.5731368960029664, - "kl": 0.000751495361328125, - "learning_rate": 4e-07, - "loss": 0.0449, - "reward": 1.1931411623954773, - "reward_std": 0.4069296419620514, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.05338665284216404, - "rewards/tag_count_reward": 0.5104166865348816, + "completion_length": 438.60418701171875, + "epoch": 0.02, + "grad_norm": 2.8665067076530604, + "kl": 0.00017309188842773438, + "learning_rate": 2e-07, + "loss": -0.066, + "reward": 0.6099069118499756, + "reward_std": 0.23168403655290604, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2083333432674408, + "rewards/repetition_penalty_reward": -0.0984264425933361, + "rewards/tag_count_reward": 0.5, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 630.5625, - "epoch": 0.041, - "grad_norm": 2.394979741576189, - "kl": 0.0008411407470703125, - "learning_rate": 4.0999999999999994e-07, - "loss": -0.0326, - "reward": 1.2281219959259033, - "reward_std": 0.4181075543165207, - "rewards/accuracy_reward": 0.08333333395421505, - "rewards/reasoning_steps_reward": 0.6458333432674408, - "rewards/repetition_penalty_reward": -0.05312805995345116, - "rewards/tag_count_reward": 0.5520833730697632, + "completion_length": 552.5416717529297, + "epoch": 0.0205, + "grad_norm": 2.7988133120372214, + "kl": 0.00020265579223632812, + "learning_rate": 2.0499999999999997e-07, + "loss": -0.0244, + "reward": 0.712624192237854, + "reward_std": 0.2695208936929703, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2986111417412758, + "rewards/repetition_penalty_reward": -0.08598695322871208, + "rewards/tag_count_reward": 0.5, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 608.2083740234375, - "epoch": 0.042, - "grad_norm": 2.3967041495309647, - "kl": 0.0009288787841796875, - "learning_rate": 4.1999999999999995e-07, - "loss": 0.0268, - "reward": 1.1586747765541077, - "reward_std": 0.34190115332603455, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.04445029981434345, - "rewards/tag_count_reward": 0.5156250298023224, + "completion_length": 429.68751525878906, + "epoch": 0.021, + "grad_norm": 3.1003673488903027, + "kl": 0.0001983642578125, + "learning_rate": 2.0999999999999997e-07, + "loss": -0.0657, + "reward": 0.6647423803806305, + "reward_std": 0.3343476206064224, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.1944444589316845, + "rewards/repetition_penalty_reward": -0.08178546652197838, + "rewards/tag_count_reward": 0.5104166865348816, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 545.5208435058594, - "epoch": 0.043, - "grad_norm": 2.7138542170454003, - "kl": 0.00109100341796875, - "learning_rate": 4.2999999999999996e-07, - "loss": -0.0442, - "reward": 1.2215397357940674, - "reward_std": 0.330724373459816, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.729166716337204, - "rewards/repetition_penalty_reward": -0.04929363913834095, - "rewards/tag_count_reward": 0.5208333730697632, + "completion_length": 388.29168701171875, + "epoch": 0.0215, + "grad_norm": 3.1597548965345204, + "kl": 0.000186920166015625, + "learning_rate": 2.1499999999999998e-07, + "loss": 0.0158, + "reward": 0.6541115939617157, + "reward_std": 0.2999124675989151, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.1527777872979641, + "rewards/repetition_penalty_reward": -0.05595788359642029, + "rewards/tag_count_reward": 0.515625, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 520.8125, - "epoch": 0.044, - "grad_norm": 2.9707533290622923, - "kl": 0.001163482666015625, - "learning_rate": 4.3999999999999997e-07, - "loss": -0.0372, - "reward": 1.193300485610962, - "reward_std": 0.3503916561603546, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.6527778208255768, - "rewards/repetition_penalty_reward": -0.0428106477484107, - "rewards/tag_count_reward": 0.5416666865348816, + "completion_length": 447.18751525878906, + "epoch": 0.022, + "grad_norm": 2.558177785664247, + "kl": 0.00014591217041015625, + "learning_rate": 2.1999999999999998e-07, + "loss": -0.0045, + "reward": 0.6448527276515961, + "reward_std": 0.25322096794843674, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2430555745959282, + "rewards/repetition_penalty_reward": -0.0982028879225254, + "rewards/tag_count_reward": 0.5, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 592.2291870117188, - "epoch": 0.045, - "grad_norm": 2.8301357058974603, - "kl": 0.001316070556640625, - "learning_rate": 4.5e-07, - "loss": -0.0149, - "reward": 1.5306417346000671, - "reward_std": 0.6089198887348175, - "rewards/accuracy_reward": 0.2291666716337204, - "rewards/reasoning_steps_reward": 0.7430555522441864, - "rewards/repetition_penalty_reward": -0.04574713110923767, - "rewards/tag_count_reward": 0.6041666865348816, + "completion_length": 430.3125, + "epoch": 0.0225, + "grad_norm": 2.8810601257270263, + "kl": 0.00017499923706054688, + "learning_rate": 2.25e-07, + "loss": -0.0455, + "reward": 0.6615873873233795, + "reward_std": 0.2432490661740303, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.222222238779068, + "rewards/repetition_penalty_reward": -0.06063482351601124, + "rewards/tag_count_reward": 0.5, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 560.0833435058594, - "epoch": 0.046, - "grad_norm": 2.9830534003583997, - "kl": 0.001560211181640625, - "learning_rate": 4.6e-07, - "loss": -0.0654, - "reward": 1.2372803688049316, - "reward_std": 0.3439289480447769, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.7013889253139496, - "rewards/repetition_penalty_reward": -0.04744199104607105, - "rewards/tag_count_reward": 0.5416666865348816, + "completion_length": 420.18751525878906, + "epoch": 0.023, + "grad_norm": 2.727537687409915, + "kl": 0.00017786026000976562, + "learning_rate": 2.3e-07, + "loss": -0.0767, + "reward": 0.7279665172100067, + "reward_std": 0.3034803867340088, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.2708333507180214, + "rewards/repetition_penalty_reward": -0.07411682605743408, + "rewards/tag_count_reward": 0.5104166865348816, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 595.5416870117188, - "epoch": 0.047, - "grad_norm": 2.62124053942929, - "kl": 0.001735687255859375, - "learning_rate": 4.6999999999999995e-07, - "loss": 0.0004, - "reward": 1.289455771446228, - "reward_std": 0.41812919080257416, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.7222222685813904, - "rewards/repetition_penalty_reward": -0.04734986647963524, - "rewards/tag_count_reward": 0.5520833730697632, + "completion_length": 348.72918701171875, + "epoch": 0.0235, + "grad_norm": 3.1997159072003227, + "kl": 0.00018548965454101562, + "learning_rate": 2.3499999999999997e-07, + "loss": 0.0492, + "reward": 0.598628580570221, + "reward_std": 0.2303108423948288, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.11111111752688885, + "rewards/repetition_penalty_reward": -0.043732548132538795, + "rewards/tag_count_reward": 0.5104166865348816, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 603.2500305175781, - "epoch": 0.048, - "grad_norm": 2.313447720284229, - "kl": 0.001773834228515625, - "learning_rate": 4.8e-07, - "loss": 0.0089, - "reward": 1.3387857675552368, - "reward_std": 0.3809618651866913, - "rewards/accuracy_reward": 0.12500000558793545, - "rewards/reasoning_steps_reward": 0.7083333432674408, - "rewards/repetition_penalty_reward": -0.05183934420347214, - "rewards/tag_count_reward": 0.5572916865348816, + "completion_length": 383.6458435058594, + "epoch": 0.024, + "grad_norm": 2.6573394355964783, + "kl": 0.00020503997802734375, + "learning_rate": 2.4e-07, + "loss": 0.033, + "reward": 0.6143307387828827, + "reward_std": 0.28714819252491, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.166666679084301, + "rewards/repetition_penalty_reward": -0.08358598873019218, + "rewards/tag_count_reward": 0.5104166865348816, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 523.3541717529297, - "epoch": 0.049, - "grad_norm": 2.8891626678471063, - "kl": 0.00232696533203125, - "learning_rate": 4.9e-07, - "loss": -0.089, - "reward": 1.2625147104263306, - "reward_std": 0.4176745116710663, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.7083334028720856, - "rewards/repetition_penalty_reward": -0.039568664506077766, - "rewards/tag_count_reward": 0.5520833730697632, + "completion_length": 436.62501525878906, + "epoch": 0.0245, + "grad_norm": 2.7867155541781488, + "kl": 0.00023317337036132812, + "learning_rate": 2.45e-07, + "loss": -0.0491, + "reward": 0.6386054158210754, + "reward_std": 0.24428075551986694, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1875000149011612, + "rewards/repetition_penalty_reward": -0.06972793489694595, + "rewards/tag_count_reward": 0.5, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 527.0625, - "epoch": 0.05, - "grad_norm": 2.6847464499943308, - "kl": 0.0029754638671875, - "learning_rate": 5e-07, - "loss": 0.0743, - "reward": 1.615307629108429, - "reward_std": 0.6159101724624634, - "rewards/accuracy_reward": 0.3125000149011612, - "rewards/reasoning_steps_reward": 0.7083333730697632, - "rewards/repetition_penalty_reward": -0.046150704845786095, - "rewards/tag_count_reward": 0.6406250298023224, + "completion_length": 449.9166717529297, + "epoch": 0.025, + "grad_norm": 3.006240341565843, + "kl": 0.00022745132446289062, + "learning_rate": 2.5e-07, + "loss": -0.1239, + "reward": 0.7317107021808624, + "reward_std": 0.27121981978416443, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3125000149011612, + "rewards/repetition_penalty_reward": -0.08078934252262115, + "rewards/tag_count_reward": 0.5, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 558.1875305175781, - "epoch": 0.051, - "grad_norm": 2.4123895595368547, - "kl": 0.0031280517578125, - "learning_rate": 5.1e-07, - "loss": 0.0147, - "reward": 1.4028043746948242, - "reward_std": 0.43062111735343933, - "rewards/accuracy_reward": 0.1041666679084301, - "rewards/reasoning_steps_reward": 0.7430555522441864, - "rewards/repetition_penalty_reward": -0.04858442768454552, - "rewards/tag_count_reward": 0.6041666865348816, + "completion_length": 472.2291717529297, + "epoch": 0.0255, + "grad_norm": 2.6548371678838856, + "kl": 0.00025463104248046875, + "learning_rate": 2.55e-07, + "loss": -0.0904, + "reward": 0.6300550401210785, + "reward_std": 0.31470321863889694, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.1666666716337204, + "rewards/repetition_penalty_reward": -0.1043199859559536, + "rewards/tag_count_reward": 0.5260416865348816, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 575.7291870117188, - "epoch": 0.052, - "grad_norm": 2.8249120385470197, - "kl": 0.00350189208984375, - "learning_rate": 5.2e-07, - "loss": -0.0312, - "reward": 1.5281450748443604, - "reward_std": 0.6317126750946045, - "rewards/accuracy_reward": 0.2708333358168602, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.058660563081502914, - "rewards/tag_count_reward": 0.6354166865348816, + "completion_length": 410.37501525878906, + "epoch": 0.026, + "grad_norm": 2.891679932666362, + "kl": 0.00026416778564453125, + "learning_rate": 2.6e-07, + "loss": -0.0272, + "reward": 0.5869153141975403, + "reward_std": 0.2003287822008133, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.173611119389534, + "rewards/repetition_penalty_reward": -0.0866958275437355, + "rewards/tag_count_reward": 0.5, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 519.4166870117188, - "epoch": 0.053, - "grad_norm": 2.8646116438456453, - "kl": 0.00429534912109375, - "learning_rate": 5.3e-07, - "loss": 0.0331, - "reward": 1.6907492280006409, - "reward_std": 0.6883328557014465, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.6875000298023224, - "rewards/repetition_penalty_reward": -0.03841756656765938, - "rewards/tag_count_reward": 0.6875, + "completion_length": 434.125, + "epoch": 0.0265, + "grad_norm": 2.87497343902247, + "kl": 0.0002574920654296875, + "learning_rate": 2.65e-07, + "loss": 0.0957, + "reward": 0.6155928373336792, + "reward_std": 0.31732793152332306, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.1875000149011612, + "rewards/repetition_penalty_reward": -0.0979488454759121, + "rewards/tag_count_reward": 0.5052083432674408, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 578.1250305175781, - "epoch": 0.054, - "grad_norm": 2.532736796747683, - "kl": 0.0043792724609375, - "learning_rate": 5.4e-07, - "loss": 0.0327, - "reward": 1.4663925766944885, - "reward_std": 0.4452537000179291, - "rewards/accuracy_reward": 0.12500000558793545, - "rewards/reasoning_steps_reward": 0.7986111342906952, - "rewards/repetition_penalty_reward": -0.06138528883457184, - "rewards/tag_count_reward": 0.6041666865348816, + "completion_length": 388.5833435058594, + "epoch": 0.027, + "grad_norm": 3.1249467609329815, + "kl": 0.00028514862060546875, + "learning_rate": 2.7e-07, + "loss": -0.0707, + "reward": 0.6530064940452576, + "reward_std": 0.26522205770015717, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2361111268401146, + "rewards/repetition_penalty_reward": -0.08831300958991051, + "rewards/tag_count_reward": 0.5052083432674408, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 556.3750305175781, - "epoch": 0.055, - "grad_norm": 2.626064949357054, - "kl": 0.0063018798828125, - "learning_rate": 5.5e-07, - "loss": 0.0412, - "reward": 1.5763724446296692, - "reward_std": 0.5313678234815598, - "rewards/accuracy_reward": 0.2500000149011612, - "rewards/reasoning_steps_reward": 0.7083333730697632, - "rewards/repetition_penalty_reward": -0.05904424749314785, - "rewards/tag_count_reward": 0.6770833432674408, + "completion_length": 391.7916717529297, + "epoch": 0.0275, + "grad_norm": 3.7182582598169582, + "kl": 0.00043487548828125, + "learning_rate": 2.75e-07, + "loss": -0.1307, + "reward": 0.6811047792434692, + "reward_std": 0.3341375142335892, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2083333432674408, + "rewards/repetition_penalty_reward": -0.08972860500216484, + "rewards/tag_count_reward": 0.5208333730697632, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 558.9166870117188, - "epoch": 0.056, - "grad_norm": 2.6301403493903748, - "kl": 0.0066680908203125, - "learning_rate": 5.6e-07, - "loss": 0.0203, - "reward": 1.794304072856903, - "reward_std": 0.6143919825553894, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.7361111640930176, - "rewards/repetition_penalty_reward": -0.04597374238073826, - "rewards/tag_count_reward": 0.7291666865348816, + "completion_length": 431.72918701171875, + "epoch": 0.028, + "grad_norm": 2.9876795440276056, + "kl": 0.00030040740966796875, + "learning_rate": 2.8e-07, + "loss": -0.0276, + "reward": 0.6388545334339142, + "reward_std": 0.1873469203710556, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2152777835726738, + "rewards/repetition_penalty_reward": -0.0764232836663723, + "rewards/tag_count_reward": 0.5, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 581.75, - "epoch": 0.057, - "grad_norm": 2.377807933864504, - "kl": 0.0074462890625, - "learning_rate": 5.699999999999999e-07, - "loss": 0.0249, - "reward": 1.681954801082611, - "reward_std": 0.5288814753293991, - "rewards/accuracy_reward": 0.291666679084301, - "rewards/reasoning_steps_reward": 0.7430556118488312, - "rewards/repetition_penalty_reward": -0.05589243024587631, - "rewards/tag_count_reward": 0.703125, + "completion_length": 446.7916717529297, + "epoch": 0.0285, + "grad_norm": 2.5753158131037117, + "kl": 0.00035953521728515625, + "learning_rate": 2.8499999999999997e-07, + "loss": -0.0241, + "reward": 0.740548849105835, + "reward_std": 0.40591859817504883, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2500000074505806, + "rewards/repetition_penalty_reward": -0.09278450906276703, + "rewards/tag_count_reward": 0.5416666865348816, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 564.7916870117188, - "epoch": 0.058, - "grad_norm": 2.9351818937282297, - "kl": 0.0069580078125, - "learning_rate": 5.8e-07, - "loss": 0.0467, - "reward": 2.0512834787368774, - "reward_std": 0.659260630607605, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.7708333730697632, - "rewards/repetition_penalty_reward": -0.04767502471804619, - "rewards/tag_count_reward": 0.7864583432674408, + "completion_length": 292.2291793823242, + "epoch": 0.029, + "grad_norm": 5.591928815640096, + "kl": 0.0007524490356445312, + "learning_rate": 2.9e-07, + "loss": 0.0557, + "reward": 0.6187903136014938, + "reward_std": 0.20580651611089706, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.19444445706903934, + "rewards/repetition_penalty_reward": -0.07565413787961006, + "rewards/tag_count_reward": 0.5, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 611.2916870117188, - "epoch": 0.059, - "grad_norm": 2.5837982836551467, - "kl": 0.00921630859375, - "learning_rate": 5.9e-07, - "loss": -0.0409, - "reward": 1.758675456047058, - "reward_std": 0.5324381589889526, - "rewards/accuracy_reward": 0.2500000111758709, - "rewards/reasoning_steps_reward": 0.7222222089767456, - "rewards/repetition_penalty_reward": -0.05729677900671959, - "rewards/tag_count_reward": 0.84375, + "completion_length": 421.5416717529297, + "epoch": 0.0295, + "grad_norm": 3.1936915993711903, + "kl": 0.00032329559326171875, + "learning_rate": 2.95e-07, + "loss": -0.1173, + "reward": 0.7119120061397552, + "reward_std": 0.47539106011390686, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/reasoning_steps_reward": 0.1875000111758709, + "rewards/repetition_penalty_reward": -0.07975470274686813, + "rewards/tag_count_reward": 0.5208333730697632, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 520.6666870117188, - "epoch": 0.06, - "grad_norm": 3.05141171791299, - "kl": 0.011322021484375, - "learning_rate": 6e-07, - "loss": 0.0112, - "reward": 2.136493146419525, - "reward_std": 0.587219700217247, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 0.756944477558136, - "rewards/repetition_penalty_reward": -0.0475346464663744, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 506.2708435058594, + "epoch": 0.03, + "grad_norm": 3.162339001956615, + "kl": 0.0004940032958984375, + "learning_rate": 3e-07, + "loss": 0.0028, + "reward": 0.7199562191963196, + "reward_std": 0.25821222364902496, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3125000149011612, + "rewards/repetition_penalty_reward": -0.09254380315542221, + "rewards/tag_count_reward": 0.5, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 547.7916870117188, - "epoch": 0.061, - "grad_norm": 2.7073330792983072, - "kl": 0.011810302734375, - "learning_rate": 6.1e-07, - "loss": 0.0477, - "reward": 2.2359567284584045, - "reward_std": 0.5013462156057358, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.6458333134651184, - "rewards/repetition_penalty_reward": -0.04008489940315485, - "rewards/tag_count_reward": 0.921875, + "completion_length": 439.12501525878906, + "epoch": 0.0305, + "grad_norm": 2.6085011916809364, + "kl": 0.0004892349243164062, + "learning_rate": 3.05e-07, + "loss": -0.0387, + "reward": 0.6562853455543518, + "reward_std": 0.2614954710006714, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2291666865348816, + "rewards/repetition_penalty_reward": -0.09371470659971237, + "rewards/tag_count_reward": 0.5208333432674408, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 449.5208435058594, - "epoch": 0.062, - "grad_norm": 3.257750731526075, - "kl": 0.019744873046875, - "learning_rate": 6.2e-07, - "loss": 0.0639, - "reward": 2.321463108062744, - "reward_std": 0.4783380925655365, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.7013889253139496, - "rewards/repetition_penalty_reward": -0.04138421919196844, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 438.9791717529297, + "epoch": 0.031, + "grad_norm": 2.933119213416396, + "kl": 0.00045871734619140625, + "learning_rate": 3.1e-07, + "loss": 0.0025, + "reward": 0.7185687720775604, + "reward_std": 0.2700326144695282, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3125000298023224, + "rewards/repetition_penalty_reward": -0.09393121674656868, + "rewards/tag_count_reward": 0.5, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 547.2500305175781, - "epoch": 0.063, - "grad_norm": 2.7432912908420732, - "kl": 0.011871337890625, - "learning_rate": 6.3e-07, - "loss": 0.0445, - "reward": 2.019491195678711, - "reward_std": 0.5333467572927475, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 0.6736111640930176, - "rewards/repetition_penalty_reward": -0.04995330423116684, - "rewards/tag_count_reward": 0.9375, + "completion_length": 403.1458435058594, + "epoch": 0.0315, + "grad_norm": 2.897102185777002, + "kl": 0.000621795654296875, + "learning_rate": 3.15e-07, + "loss": -0.0791, + "reward": 0.6670123040676117, + "reward_std": 0.237733893096447, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2569444477558136, + "rewards/repetition_penalty_reward": -0.08993213623762131, + "rewards/tag_count_reward": 0.5, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 533.8125305175781, - "epoch": 0.064, - "grad_norm": 2.775011190038876, - "kl": 0.01220703125, - "learning_rate": 6.4e-07, - "loss": -0.0156, - "reward": 2.413369059562683, - "reward_std": 0.517508327960968, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.79861119389534, - "rewards/repetition_penalty_reward": -0.041492147371172905, - "rewards/tag_count_reward": 0.96875, + "completion_length": 433.6041717529297, + "epoch": 0.032, + "grad_norm": 2.446714560125192, + "kl": 0.0006542205810546875, + "learning_rate": 3.2e-07, + "loss": 0.0159, + "reward": 0.7158828973770142, + "reward_std": 0.33434779942035675, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2638888955116272, + "rewards/repetition_penalty_reward": -0.10529769212007523, + "rewards/tag_count_reward": 0.5156250298023224, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 511.16668701171875, - "epoch": 0.065, - "grad_norm": 2.745662806864412, - "kl": 0.014892578125, - "learning_rate": 6.5e-07, - "loss": -0.0064, - "reward": 2.327776312828064, - "reward_std": 0.5232683420181274, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.038543105125427246, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 401.54168701171875, + "epoch": 0.0325, + "grad_norm": 2.7732151443967163, + "kl": 0.000652313232421875, + "learning_rate": 3.25e-07, + "loss": -0.0358, + "reward": 0.8164660930633545, + "reward_std": 0.3192940354347229, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.326388917863369, + "rewards/repetition_penalty_reward": -0.07242286205291748, + "rewards/tag_count_reward": 0.5208333730697632, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 538.875, - "epoch": 0.066, - "grad_norm": 2.5297084422395555, - "kl": 0.0113525390625, - "learning_rate": 6.6e-07, - "loss": -0.0286, - "reward": 2.1816118955612183, - "reward_std": 0.38229691982269287, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.6805555820465088, - "rewards/repetition_penalty_reward": -0.03540213964879513, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 320.4583435058594, + "epoch": 0.033, + "grad_norm": 6.165543609500131, + "kl": 0.0016994476318359375, + "learning_rate": 3.3e-07, + "loss": -0.0995, + "reward": 0.7236150503158569, + "reward_std": 0.33078788220882416, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.2361111268401146, + "rewards/repetition_penalty_reward": -0.0645794328302145, + "rewards/tag_count_reward": 0.5104166865348816, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 559.8958435058594, - "epoch": 0.067, - "grad_norm": 2.672746029201746, - "kl": 0.012908935546875, - "learning_rate": 6.7e-07, - "loss": -0.035, - "reward": 2.2214502096176147, - "reward_std": 0.5396545231342316, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.7152777910232544, - "rewards/repetition_penalty_reward": -0.04591094329953194, - "rewards/tag_count_reward": 0.9895833730697632, + "completion_length": 391.9791717529297, + "epoch": 0.0335, + "grad_norm": 2.658500757905026, + "kl": 0.001007080078125, + "learning_rate": 3.35e-07, + "loss": -0.0184, + "reward": 0.6389293074607849, + "reward_std": 0.27491873502731323, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.229166679084301, + "rewards/repetition_penalty_reward": -0.12148737907409668, + "rewards/tag_count_reward": 0.5104166865348816, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 537.3125, - "epoch": 0.068, - "grad_norm": 2.920361106100061, - "kl": 0.019287109375, - "learning_rate": 6.800000000000001e-07, - "loss": 0.0507, - "reward": 2.291020631790161, - "reward_std": 0.49400143325328827, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.7361111342906952, - "rewards/repetition_penalty_reward": -0.049257127568125725, - "rewards/tag_count_reward": 0.9583333730697632, + "completion_length": 444.3333435058594, + "epoch": 0.034, + "grad_norm": 2.6477800642599996, + "kl": 0.0009365081787109375, + "learning_rate": 3.4000000000000003e-07, + "loss": -0.0267, + "reward": 0.7566591203212738, + "reward_std": 0.3201362192630768, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.270833358168602, + "rewards/repetition_penalty_reward": -0.08709090948104858, + "rewards/tag_count_reward": 0.5520833432674408, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 469.2708435058594, - "epoch": 0.069, - "grad_norm": 2.901120243600892, - "kl": 0.017242431640625, - "learning_rate": 6.9e-07, - "loss": 0.0654, - "reward": 2.4167280197143555, - "reward_std": 0.5367273986339569, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.7638889849185944, - "rewards/repetition_penalty_reward": -0.03466090187430382, - "rewards/tag_count_reward": 1.0, + "completion_length": 457.5833435058594, + "epoch": 0.0345, + "grad_norm": 2.633615712345152, + "kl": 0.001323699951171875, + "learning_rate": 3.45e-07, + "loss": -0.0703, + "reward": 0.7424019575119019, + "reward_std": 0.3558398336172104, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.3194444477558136, + "rewards/repetition_penalty_reward": -0.11350089311599731, + "rewards/tag_count_reward": 0.5156250298023224, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 504.6666717529297, - "epoch": 0.07, - "grad_norm": 3.137395976127796, - "kl": 0.018524169921875, - "learning_rate": 7e-07, - "loss": 0.0147, - "reward": 2.4454265832901, - "reward_std": 0.5369938015937805, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.7083333730697632, - "rewards/repetition_penalty_reward": -0.04415685310959816, - "rewards/tag_count_reward": 0.9895833432674408, - "step": 70 - }, + "completion_length": 436.72918701171875, + "epoch": 0.035, + "grad_norm": 2.949344418221538, + "kl": 0.0012359619140625, + "learning_rate": 3.5e-07, + "loss": -0.0637, + "reward": 0.7435359060764313, + "reward_std": 0.20930662006139755, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3194444626569748, + "rewards/repetition_penalty_reward": -0.0759085863828659, + "rewards/tag_count_reward": 0.5, + "step": 70 + }, { "clip_ratio": 0.0, - "completion_length": 535.5000305175781, - "epoch": 0.071, - "grad_norm": 2.6854335457859206, - "kl": 0.01251220703125, - "learning_rate": 7.1e-07, - "loss": 0.0458, - "reward": 2.375571846961975, - "reward_std": 0.4342511296272278, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.7222222685813904, - "rewards/repetition_penalty_reward": -0.028942234814167023, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 423.5833435058594, + "epoch": 0.0355, + "grad_norm": 2.869137528454137, + "kl": 0.001552581787109375, + "learning_rate": 3.55e-07, + "loss": -0.0516, + "reward": 0.7146015167236328, + "reward_std": 0.29403047263622284, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.298611119389534, + "rewards/repetition_penalty_reward": -0.08400958776473999, + "rewards/tag_count_reward": 0.5, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 527.6250305175781, - "epoch": 0.072, - "grad_norm": 2.736209803149782, - "kl": 0.01373291015625, - "learning_rate": 7.2e-07, - "loss": 0.0066, - "reward": 2.4244872331619263, - "reward_std": 0.4428471177816391, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.7361111640930176, - "rewards/repetition_penalty_reward": -0.04079079441726208, - "rewards/tag_count_reward": 1.0, + "completion_length": 490.0208435058594, + "epoch": 0.036, + "grad_norm": 2.479062073282576, + "kl": 0.001407623291015625, + "learning_rate": 3.6e-07, + "loss": 0.0264, + "reward": 0.766609251499176, + "reward_std": 0.3444522023200989, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.3263889104127884, + "rewards/repetition_penalty_reward": -0.09102966263890266, + "rewards/tag_count_reward": 0.5104166865348816, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 540.7708740234375, - "epoch": 0.073, - "grad_norm": 3.26658788883033, - "kl": 0.0230712890625, - "learning_rate": 7.3e-07, - "loss": 0.05, - "reward": 2.218494176864624, - "reward_std": 0.44005706906318665, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.7291666865348816, - "rewards/repetition_penalty_reward": -0.036714269779622555, - "rewards/tag_count_reward": 0.984375, + "completion_length": 403.35418701171875, + "epoch": 0.0365, + "grad_norm": 2.55373473630631, + "kl": 0.00188446044921875, + "learning_rate": 3.65e-07, + "loss": 0.0128, + "reward": 0.8594351708889008, + "reward_std": 0.38698044419288635, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.361111119389534, + "rewards/repetition_penalty_reward": -0.10063427314162254, + "rewards/tag_count_reward": 0.5364583730697632, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 477.5833435058594, - "epoch": 0.074, - "grad_norm": 2.85387562733274, - "kl": 0.015350341796875, - "learning_rate": 7.4e-07, - "loss": -0.0614, - "reward": 2.5582029819488525, - "reward_std": 0.46813952922821045, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.784722238779068, - "rewards/repetition_penalty_reward": -0.03901927825063467, - "rewards/tag_count_reward": 1.0, + "completion_length": 441.31251525878906, + "epoch": 0.037, + "grad_norm": 2.763489892021762, + "kl": 0.00213623046875, + "learning_rate": 3.7e-07, + "loss": -0.0037, + "reward": 0.8403445482254028, + "reward_std": 0.22306300699710846, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.409722238779068, + "rewards/repetition_penalty_reward": -0.10062775760889053, + "rewards/tag_count_reward": 0.53125, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 530.1041870117188, - "epoch": 0.075, - "grad_norm": 2.963663960294806, - "kl": 0.017822265625, - "learning_rate": 7.5e-07, - "loss": 0.0242, - "reward": 2.356380820274353, - "reward_std": 0.5158264935016632, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.7083334028720856, - "rewards/repetition_penalty_reward": -0.03945251181721687, - "rewards/tag_count_reward": 1.0, + "completion_length": 430.0416717529297, + "epoch": 0.0375, + "grad_norm": 2.9294440220568005, + "kl": 0.00238800048828125, + "learning_rate": 3.75e-07, + "loss": -0.0807, + "reward": 0.8580312430858612, + "reward_std": 0.39887402951717377, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.3680555820465088, + "rewards/repetition_penalty_reward": -0.08294104412198067, + "rewards/tag_count_reward": 0.53125, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 542.7500305175781, - "epoch": 0.076, - "grad_norm": 8.251127258415144, - "kl": 0.02813720703125, - "learning_rate": 7.599999999999999e-07, - "loss": 0.0219, - "reward": 2.4178924560546875, - "reward_std": 0.44688859581947327, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.7222222089767456, - "rewards/repetition_penalty_reward": -0.043913234025239944, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 356.8541717529297, + "epoch": 0.038, + "grad_norm": 2.1979386745725438, + "kl": 0.00208282470703125, + "learning_rate": 3.7999999999999996e-07, + "loss": -0.0118, + "reward": 0.8219205439090729, + "reward_std": 0.33333858847618103, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.3055555745959282, + "rewards/repetition_penalty_reward": -0.0669684112071991, + "rewards/tag_count_reward": 0.5416666865348816, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 547.0625, - "epoch": 0.077, - "grad_norm": 3.4452973677390144, - "kl": 0.021881103515625, - "learning_rate": 7.699999999999999e-07, - "loss": 0.0425, - "reward": 2.3173056840896606, - "reward_std": 0.49892735481262207, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.75, - "rewards/repetition_penalty_reward": -0.04727764055132866, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 499.41668701171875, + "epoch": 0.0385, + "grad_norm": 2.4412843501817862, + "kl": 0.00276947021484375, + "learning_rate": 3.8499999999999997e-07, + "loss": 0.0243, + "reward": 0.9450303316116333, + "reward_std": 0.35558322072029114, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.472222238779068, + "rewards/repetition_penalty_reward": -0.10531692206859589, + "rewards/tag_count_reward": 0.5364583432674408, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 554.3958435058594, - "epoch": 0.078, - "grad_norm": 2.5762620979216746, - "kl": 0.017181396484375, - "learning_rate": 7.799999999999999e-07, - "loss": -0.0855, - "reward": 2.4648728370666504, - "reward_std": 0.5124194473028183, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.8472222685813904, - "rewards/repetition_penalty_reward": -0.04380778409540653, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 395.62501525878906, + "epoch": 0.039, + "grad_norm": 2.8263728812238895, + "kl": 0.00323486328125, + "learning_rate": 3.8999999999999997e-07, + "loss": -0.0533, + "reward": 0.7922153770923615, + "reward_std": 0.2505815550684929, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3750000298023224, + "rewards/repetition_penalty_reward": -0.08278463035821915, + "rewards/tag_count_reward": 0.5, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 490.83335876464844, - "epoch": 0.079, - "grad_norm": 2.8936310930522597, - "kl": 0.0186767578125, - "learning_rate": 7.9e-07, - "loss": 0.0224, - "reward": 2.5715928077697754, - "reward_std": 0.4545498937368393, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.7708334028720856, - "rewards/repetition_penalty_reward": -0.03257405199110508, - "rewards/tag_count_reward": 1.0, + "completion_length": 408.9791717529297, + "epoch": 0.0395, + "grad_norm": 2.623388298754155, + "kl": 0.00296783447265625, + "learning_rate": 3.95e-07, + "loss": -0.0619, + "reward": 0.8954714834690094, + "reward_std": 0.3817988187074661, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.3819444477558136, + "rewards/repetition_penalty_reward": -0.07501471415162086, + "rewards/tag_count_reward": 0.5260416865348816, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 505.16668701171875, - "epoch": 0.08, - "grad_norm": 3.386283681350372, - "kl": 0.018951416015625, - "learning_rate": 8e-07, - "loss": 0.0251, - "reward": 2.380759596824646, - "reward_std": 0.40306712687015533, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.7708333432674408, - "rewards/repetition_penalty_reward": -0.03590711485594511, - "rewards/tag_count_reward": 1.0, + "completion_length": 440.35418701171875, + "epoch": 0.04, + "grad_norm": 3.2358128766367216, + "kl": 0.0038299560546875, + "learning_rate": 4e-07, + "loss": -0.0908, + "reward": 1.0643112063407898, + "reward_std": 0.43053002655506134, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/reasoning_steps_reward": 0.4166666567325592, + "rewards/repetition_penalty_reward": -0.07110553234815598, + "rewards/tag_count_reward": 0.5729166865348816, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 518.7708435058594, - "epoch": 0.081, - "grad_norm": 2.774591505446817, - "kl": 0.016845703125, - "learning_rate": 8.1e-07, - "loss": 0.1045, - "reward": 2.4150757789611816, - "reward_std": 0.34570978581905365, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.7638889253139496, - "rewards/repetition_penalty_reward": -0.03631327673792839, - "rewards/tag_count_reward": 1.0, + "completion_length": 464.41668701171875, + "epoch": 0.0405, + "grad_norm": 2.469210493769818, + "kl": 0.0035552978515625, + "learning_rate": 4.05e-07, + "loss": 0.0004, + "reward": 0.8626311421394348, + "reward_std": 0.3268212229013443, + "rewards/accuracy_reward": 0.0625, + "rewards/reasoning_steps_reward": 0.3680555820465088, + "rewards/repetition_penalty_reward": -0.09917446598410606, + "rewards/tag_count_reward": 0.53125, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 542.4791717529297, - "epoch": 0.082, - "grad_norm": 2.9728145387844758, - "kl": 0.01812744140625, - "learning_rate": 8.199999999999999e-07, - "loss": -0.0635, - "reward": 2.5557461977005005, - "reward_std": 0.4178060442209244, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.8680555820465088, - "rewards/repetition_penalty_reward": -0.04147614166140556, - "rewards/tag_count_reward": 1.0, + "completion_length": 439.0833435058594, + "epoch": 0.041, + "grad_norm": 3.0805939483605678, + "kl": 0.0042877197265625, + "learning_rate": 4.0999999999999994e-07, + "loss": -0.019, + "reward": 0.9204289317131042, + "reward_std": 0.3671407848596573, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/reasoning_steps_reward": 0.4027777910232544, + "rewards/repetition_penalty_reward": -0.10214058682322502, + "rewards/tag_count_reward": 0.5364583432674408, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 540.0208435058594, - "epoch": 0.083, - "grad_norm": 2.624113800462916, - "kl": 0.02069091796875, - "learning_rate": 8.299999999999999e-07, - "loss": -0.0095, - "reward": 2.4058659076690674, - "reward_std": 0.49703338742256165, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.7708334028720856, - "rewards/repetition_penalty_reward": -0.04205096513032913, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 521.9166870117188, + "epoch": 0.0415, + "grad_norm": 2.597320274415619, + "kl": 0.0041046142578125, + "learning_rate": 4.1499999999999994e-07, + "loss": 0.0645, + "reward": 1.1805400252342224, + "reward_std": 0.43789660930633545, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/reasoning_steps_reward": 0.5277778059244156, + "rewards/repetition_penalty_reward": -0.08682116121053696, + "rewards/tag_count_reward": 0.5729166865348816, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 538.7291870117188, - "epoch": 0.084, - "grad_norm": 2.654590798347164, - "kl": 0.0184326171875, - "learning_rate": 8.399999999999999e-07, - "loss": -0.0225, - "reward": 2.4977999925613403, - "reward_std": 0.5065296292304993, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.847222238779068, - "rewards/repetition_penalty_reward": -0.03692223224788904, - "rewards/tag_count_reward": 1.0, + "completion_length": 475.2708435058594, + "epoch": 0.042, + "grad_norm": 2.652905559142146, + "kl": 0.004302978515625, + "learning_rate": 4.1999999999999995e-07, + "loss": -0.0013, + "reward": 1.1051982939243317, + "reward_std": 0.4141012579202652, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/reasoning_steps_reward": 0.5138888657093048, + "rewards/repetition_penalty_reward": -0.09619061276316643, + "rewards/tag_count_reward": 0.5625000298023224, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 558.7916870117188, - "epoch": 0.085, - "grad_norm": 2.5236300531262033, - "kl": 0.02142333984375, - "learning_rate": 8.499999999999999e-07, - "loss": 0.0286, - "reward": 2.3374500274658203, - "reward_std": 0.4774511754512787, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.854166716337204, - "rewards/repetition_penalty_reward": -0.03755011223256588, - "rewards/tag_count_reward": 1.0, + "completion_length": 401.0833435058594, + "epoch": 0.0425, + "grad_norm": 2.7752719784771798, + "kl": 0.00518798828125, + "learning_rate": 4.2499999999999995e-07, + "loss": -0.0391, + "reward": 1.0055087208747864, + "reward_std": 0.2745797038078308, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.5555556118488312, + "rewards/repetition_penalty_reward": -0.09171349555253983, + "rewards/tag_count_reward": 0.5208333730697632, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 557.6458435058594, - "epoch": 0.086, - "grad_norm": 2.4520679561302434, - "kl": 0.01904296875, - "learning_rate": 8.599999999999999e-07, - "loss": -0.0126, - "reward": 2.469316601753235, - "reward_std": 0.3855314701795578, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.8958333432674408, - "rewards/repetition_penalty_reward": -0.046308472752571106, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 419.4791717529297, + "epoch": 0.043, + "grad_norm": 2.640366127433102, + "kl": 0.0047607421875, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.0125, + "reward": 1.0041911602020264, + "reward_std": 0.25805340707302094, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.5486111342906952, + "rewards/repetition_penalty_reward": -0.12254500389099121, + "rewards/tag_count_reward": 0.5364583432674408, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 537.1458435058594, - "epoch": 0.087, - "grad_norm": 2.8696069886274858, - "kl": 0.02032470703125, - "learning_rate": 8.699999999999999e-07, - "loss": 0.0288, - "reward": 2.775725483894348, - "reward_std": 0.3478439748287201, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.04545513913035393, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 428.2083435058594, + "epoch": 0.0435, + "grad_norm": 2.780462703093036, + "kl": 0.0068511962890625, + "learning_rate": 4.3499999999999996e-07, + "loss": -0.0254, + "reward": 0.9412573575973511, + "reward_std": 0.3015138581395149, + "rewards/accuracy_reward": 0.0625, + "rewards/reasoning_steps_reward": 0.4722222536802292, + "rewards/repetition_penalty_reward": -0.1247149184346199, + "rewards/tag_count_reward": 0.53125, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 552.0625152587891, - "epoch": 0.088, - "grad_norm": 2.7328359228843984, - "kl": 0.0185546875, - "learning_rate": 8.799999999999999e-07, - "loss": 0.1113, - "reward": 2.5936840772628784, - "reward_std": 0.2873719036579132, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.881944477558136, - "rewards/repetition_penalty_reward": -0.03826039098203182, - "rewards/tag_count_reward": 1.0, + "completion_length": 452.4583435058594, + "epoch": 0.044, + "grad_norm": 2.9854860146208924, + "kl": 0.0058135986328125, + "learning_rate": 4.3999999999999997e-07, + "loss": -0.0644, + "reward": 1.1855429410934448, + "reward_std": 0.4236704409122467, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/reasoning_steps_reward": 0.604166716337204, + "rewards/repetition_penalty_reward": -0.10091547667980194, + "rewards/tag_count_reward": 0.5572916865348816, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 540.8125305175781, - "epoch": 0.089, - "grad_norm": 2.6685590803479307, - "kl": 0.02239990234375, - "learning_rate": 8.9e-07, - "loss": 0.0224, - "reward": 2.3886401653289795, - "reward_std": 0.48963838815689087, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9305555522441864, - "rewards/repetition_penalty_reward": -0.03670708276331425, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 460.37501525878906, + "epoch": 0.0445, + "grad_norm": 2.6660126680917506, + "kl": 0.0048370361328125, + "learning_rate": 4.45e-07, + "loss": 0.0538, + "reward": 1.1993393301963806, + "reward_std": 0.45567750930786133, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/reasoning_steps_reward": 0.4583333432674408, + "rewards/repetition_penalty_reward": -0.11316069215536118, + "rewards/tag_count_reward": 0.6458333432674408, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 490.06251525878906, - "epoch": 0.09, - "grad_norm": 3.839427427471072, - "kl": 0.024169921875, - "learning_rate": 9e-07, - "loss": 0.0817, - "reward": 2.394170045852661, - "reward_std": 0.5097104758024216, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.8680555820465088, - "rewards/repetition_penalty_reward": -0.03638560324907303, - "rewards/tag_count_reward": 1.0, + "completion_length": 503.18751525878906, + "epoch": 0.045, + "grad_norm": 2.6170875678464642, + "kl": 0.0070648193359375, + "learning_rate": 4.5e-07, + "loss": -0.0717, + "reward": 1.1190320253372192, + "reward_std": 0.36526739597320557, + "rewards/accuracy_reward": 0.0625, + "rewards/reasoning_steps_reward": 0.6111111640930176, + "rewards/repetition_penalty_reward": -0.08582913875579834, + "rewards/tag_count_reward": 0.53125, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 524.9791870117188, - "epoch": 0.091, - "grad_norm": 2.553614340168228, - "kl": 0.02398681640625, - "learning_rate": 9.1e-07, - "loss": 0.0725, - "reward": 2.6662930250167847, - "reward_std": 0.20492929220199585, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.03509608097374439, - "rewards/tag_count_reward": 1.0, + "completion_length": 437.7083435058594, + "epoch": 0.0455, + "grad_norm": 2.6475065375904956, + "kl": 0.00689697265625, + "learning_rate": 4.55e-07, + "loss": 0.0389, + "reward": 1.283356487751007, + "reward_std": 0.5063433349132538, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/reasoning_steps_reward": 0.5833333730697632, + "rewards/repetition_penalty_reward": -0.08122700080275536, + "rewards/tag_count_reward": 0.6145833432674408, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 515.0625305175781, - "epoch": 0.092, - "grad_norm": 2.79663742057304, - "kl": 0.02203369140625, - "learning_rate": 9.2e-07, - "loss": 0.0525, - "reward": 2.7029329538345337, - "reward_std": 0.3332049995660782, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.04185891151428223, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 419.4583435058594, + "epoch": 0.046, + "grad_norm": 2.893081594976591, + "kl": 0.0062255859375, + "learning_rate": 4.6e-07, + "loss": 0.0274, + "reward": 1.465721845626831, + "reward_std": 0.415210023522377, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.5277778208255768, + "rewards/repetition_penalty_reward": -0.08288927376270294, + "rewards/tag_count_reward": 0.6875000298023224, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 586.6875, - "epoch": 0.093, - "grad_norm": 2.51813341735105, - "kl": 0.02301025390625, - "learning_rate": 9.3e-07, - "loss": 0.0461, - "reward": 2.7553551197052, - "reward_std": 0.41791823506355286, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.04499218240380287, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 496.0833435058594, + "epoch": 0.0465, + "grad_norm": 3.0157369307824924, + "kl": 0.0055694580078125, + "learning_rate": 4.65e-07, + "loss": -0.0453, + "reward": 1.174148678779602, + "reward_std": 0.43805110454559326, + "rewards/accuracy_reward": 0.14583333395421505, + "rewards/reasoning_steps_reward": 0.5069444626569748, + "rewards/repetition_penalty_reward": -0.07758747413754463, + "rewards/tag_count_reward": 0.5989583432674408, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 589.2708435058594, - "epoch": 0.094, - "grad_norm": 2.5842043324661863, - "kl": 0.0238037109375, - "learning_rate": 9.399999999999999e-07, - "loss": 0.0528, - "reward": 2.252669334411621, - "reward_std": 0.3811512589454651, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.04073341749608517, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 400.6458435058594, + "epoch": 0.047, + "grad_norm": 3.021710643706005, + "kl": 0.00909423828125, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.0763, + "reward": 1.587534487247467, + "reward_std": 0.6594474613666534, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.479166716337204, + "rewards/repetition_penalty_reward": -0.07392388582229614, + "rewards/tag_count_reward": 0.7447916865348816, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 564.7708435058594, - "epoch": 0.095, - "grad_norm": 2.5488718443741667, - "kl": 0.02752685546875, - "learning_rate": 9.499999999999999e-07, - "loss": 0.1107, - "reward": 2.6501858234405518, - "reward_std": 0.27840781956911087, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.902777761220932, - "rewards/repetition_penalty_reward": -0.04425876401364803, - "rewards/tag_count_reward": 1.0, + "completion_length": 419.3958435058594, + "epoch": 0.0475, + "grad_norm": 2.8068774316465617, + "kl": 0.0103759765625, + "learning_rate": 4.7499999999999995e-07, + "loss": -0.0172, + "reward": 1.508695363998413, + "reward_std": 0.5535295158624649, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.5694444477558136, + "rewards/repetition_penalty_reward": -0.0763741172850132, + "rewards/tag_count_reward": 0.6822916865348816, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 559.2708740234375, - "epoch": 0.096, - "grad_norm": 2.631139494749351, - "kl": 0.02752685546875, - "learning_rate": 9.6e-07, - "loss": 0.017, - "reward": 2.412258505821228, - "reward_std": 0.4537012577056885, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.04607492312788963, - "rewards/tag_count_reward": 1.0, + "completion_length": 470.35418701171875, + "epoch": 0.048, + "grad_norm": 2.8744733124062676, + "kl": 0.0072174072265625, + "learning_rate": 4.8e-07, + "loss": -0.0036, + "reward": 1.1895955204963684, + "reward_std": 0.4534749835729599, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/reasoning_steps_reward": 0.3888889104127884, + "rewards/repetition_penalty_reward": -0.11075177043676376, + "rewards/tag_count_reward": 0.7239583432674408, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 606.6666870117188, - "epoch": 0.097, - "grad_norm": 2.8652564620163288, - "kl": 0.02764892578125, - "learning_rate": 9.7e-07, - "loss": 0.0402, - "reward": 2.6432669162750244, - "reward_std": 0.47565995156764984, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9027778208255768, - "rewards/repetition_penalty_reward": -0.04596925526857376, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 427.00001525878906, + "epoch": 0.0485, + "grad_norm": 2.820798846547903, + "kl": 0.010498046875, + "learning_rate": 4.85e-07, + "loss": 0.0096, + "reward": 1.331734836101532, + "reward_std": 0.6030288934707642, + "rewards/accuracy_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.4027778059244156, + "rewards/repetition_penalty_reward": -0.10750139504671097, + "rewards/tag_count_reward": 0.7239583432674408, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 506.4583435058594, - "epoch": 0.098, - "grad_norm": 2.701632712968642, - "kl": 0.032470703125, - "learning_rate": 9.8e-07, - "loss": -0.0494, - "reward": 2.5521618127822876, - "reward_std": 0.4033654034137726, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.8888889253139496, - "rewards/repetition_penalty_reward": -0.0450606532394886, - "rewards/tag_count_reward": 1.0, + "completion_length": 486.1666717529297, + "epoch": 0.049, + "grad_norm": 2.607464524310001, + "kl": 0.010772705078125, + "learning_rate": 4.9e-07, + "loss": 0.0265, + "reward": 1.5102072954177856, + "reward_std": 0.5378637164831161, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.298611119389534, + "rewards/repetition_penalty_reward": -0.12173716351389885, + "rewards/tag_count_reward": 0.8750000298023224, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 570.2916870117188, - "epoch": 0.099, - "grad_norm": 2.578693808208252, - "kl": 0.03271484375, - "learning_rate": 9.9e-07, - "loss": -0.0427, - "reward": 2.4133933782577515, - "reward_std": 0.30207425355911255, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03973175957798958, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 504.04168701171875, + "epoch": 0.0495, + "grad_norm": 2.6750653025151925, + "kl": 0.013946533203125, + "learning_rate": 4.95e-07, + "loss": 0.0658, + "reward": 1.610666811466217, + "reward_std": 0.6144271492958069, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.3541667014360428, + "rewards/repetition_penalty_reward": -0.10287501662969589, + "rewards/tag_count_reward": 0.8593750298023224, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 614.3333740234375, - "epoch": 0.1, - "grad_norm": 2.67358470309149, - "kl": 0.028564453125, - "learning_rate": 1e-06, - "loss": -0.0598, - "reward": 2.3288190364837646, - "reward_std": 0.3720841705799103, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.8819445073604584, - "rewards/repetition_penalty_reward": -0.05312554910778999, - "rewards/tag_count_reward": 1.0, + "completion_length": 510.72918701171875, + "epoch": 0.05, + "grad_norm": 2.7156863679632885, + "kl": 0.009613037109375, + "learning_rate": 5e-07, + "loss": 0.2283, + "reward": 1.464966893196106, + "reward_std": 0.5712087154388428, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.3402777910232544, + "rewards/repetition_penalty_reward": -0.1096859984099865, + "rewards/tag_count_reward": 0.8177083432674408, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 558.4375, - "epoch": 0.101, - "grad_norm": 2.557702270091156, - "kl": 0.0347900390625, - "learning_rate": 9.999972584460056e-07, - "loss": 0.0339, - "reward": 2.7004672288894653, - "reward_std": 0.3648904711008072, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.05126911960542202, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 423.0416717529297, + "epoch": 0.0505, + "grad_norm": 2.919566854448967, + "kl": 0.0150146484375, + "learning_rate": 5.049999999999999e-07, + "loss": -0.0017, + "reward": 1.971518337726593, + "reward_std": 0.5628243684768677, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.375, + "rewards/repetition_penalty_reward": -0.08056501299142838, + "rewards/tag_count_reward": 0.9270833432674408, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 596.1458435058594, - "epoch": 0.102, - "grad_norm": 2.7419447112432644, - "kl": 0.033935546875, - "learning_rate": 9.999890338174275e-07, - "loss": 0.0276, - "reward": 2.4844894409179688, - "reward_std": 0.4377788305282593, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.043288541957736015, - "rewards/tag_count_reward": 1.0, + "completion_length": 370.5416717529297, + "epoch": 0.051, + "grad_norm": 2.864551722431305, + "kl": 0.017913818359375, + "learning_rate": 5.1e-07, + "loss": 0.037, + "reward": 1.7677536010742188, + "reward_std": 0.4400963932275772, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.3194444626569748, + "rewards/repetition_penalty_reward": -0.07252424210309982, + "rewards/tag_count_reward": 0.9375000298023224, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 549.3125305175781, - "epoch": 0.103, - "grad_norm": 2.5449102277843556, - "kl": 0.0384521484375, - "learning_rate": 9.999753262144804e-07, - "loss": 0.091, - "reward": 2.214192271232605, - "reward_std": 0.41757629811763763, - "rewards/accuracy_reward": 0.3333333358168602, - "rewards/reasoning_steps_reward": 0.930555522441864, - "rewards/repetition_penalty_reward": -0.04969660937786102, - "rewards/tag_count_reward": 1.0, + "completion_length": 413.66668701171875, + "epoch": 0.0515, + "grad_norm": 2.962558584851364, + "kl": 0.012969970703125, + "learning_rate": 5.149999999999999e-07, + "loss": -0.0576, + "reward": 1.84916353225708, + "reward_std": 0.4907376766204834, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.3263888955116272, + "rewards/repetition_penalty_reward": -0.07097543030977249, + "rewards/tag_count_reward": 0.9687500298023224, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 503.6458435058594, - "epoch": 0.104, - "grad_norm": 2.5988919390766134, - "kl": 0.038330078125, - "learning_rate": 9.999561358041868e-07, - "loss": -0.0306, - "reward": 2.473710536956787, - "reward_std": 0.36740100383758545, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.04712275043129921, - "rewards/tag_count_reward": 1.0, + "completion_length": 383.3958435058594, + "epoch": 0.052, + "grad_norm": 3.174614828168193, + "kl": 0.015167236328125, + "learning_rate": 5.2e-07, + "loss": -0.0749, + "reward": 1.8075725436210632, + "reward_std": 0.28401315957307816, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.2916666865348816, + "rewards/repetition_penalty_reward": -0.057010795921087265, + "rewards/tag_count_reward": 0.9895833730697632, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 584.5833435058594, - "epoch": 0.105, - "grad_norm": 2.2253173220096847, - "kl": 0.03759765625, - "learning_rate": 9.99931462820376e-07, - "loss": -0.0418, - "reward": 2.6660202741622925, - "reward_std": 0.39214441180229187, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.04231319762766361, - "rewards/tag_count_reward": 1.0, + "completion_length": 413.75, + "epoch": 0.0525, + "grad_norm": 2.9070150258769845, + "kl": 0.0205078125, + "learning_rate": 5.25e-07, + "loss": -0.0433, + "reward": 1.7146551609039307, + "reward_std": 0.5036576092243195, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.36805559694767, + "rewards/repetition_penalty_reward": -0.07527554780244827, + "rewards/tag_count_reward": 0.9427083730697632, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 580.5833435058594, - "epoch": 0.106, - "grad_norm": 2.740740338946049, - "kl": 0.03857421875, - "learning_rate": 9.999013075636804e-07, - "loss": -0.0301, - "reward": 2.671258807182312, - "reward_std": 0.35463356226682663, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.044019101187586784, - "rewards/tag_count_reward": 1.0, + "completion_length": 506.22918701171875, + "epoch": 0.053, + "grad_norm": 2.7140962159705353, + "kl": 0.01116943359375, + "learning_rate": 5.3e-07, + "loss": -0.0411, + "reward": 1.7457041144371033, + "reward_std": 0.43025586009025574, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.236111119389534, + "rewards/repetition_penalty_reward": -0.0997820794582367, + "rewards/tag_count_reward": 0.9218750298023224, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 552.8958740234375, - "epoch": 0.107, - "grad_norm": 2.6710939233020636, - "kl": 0.0406494140625, - "learning_rate": 9.998656704015323e-07, - "loss": 0.0481, - "reward": 2.4425666332244873, - "reward_std": 0.36045634746551514, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.04528068192303181, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 390.7708435058594, + "epoch": 0.0535, + "grad_norm": 2.5890826142014376, + "kl": 0.016693115234375, + "learning_rate": 5.35e-07, + "loss": 0.048, + "reward": 1.9064030051231384, + "reward_std": 0.3108719140291214, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.2916666865348816, + "rewards/repetition_penalty_reward": -0.08838870003819466, + "rewards/tag_count_reward": 0.9739583432674408, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 521.7500305175781, - "epoch": 0.108, - "grad_norm": 2.9062446352100086, - "kl": 0.0419921875, - "learning_rate": 9.998245517681593e-07, - "loss": 0.0707, - "reward": 2.725650668144226, - "reward_std": 0.3536549210548401, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.05212709680199623, - "rewards/tag_count_reward": 1.0, + "completion_length": 457.6458435058594, + "epoch": 0.054, + "grad_norm": 2.4716329573423796, + "kl": 0.01641845703125, + "learning_rate": 5.4e-07, + "loss": -0.0065, + "reward": 1.9147586822509766, + "reward_std": 0.30019962787628174, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.2986111342906952, + "rewards/repetition_penalty_reward": -0.08697742223739624, + "rewards/tag_count_reward": 0.9947916865348816, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 473.10418701171875, - "epoch": 0.109, - "grad_norm": 2.840139368335502, - "kl": 0.048583984375, - "learning_rate": 9.997779521645791e-07, - "loss": 0.0222, - "reward": 2.534466028213501, - "reward_std": 0.2846931293606758, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.028034010902047157, - "rewards/tag_count_reward": 1.0, + "completion_length": 413.6458435058594, + "epoch": 0.0545, + "grad_norm": 2.954116545096682, + "kl": 0.0186767578125, + "learning_rate": 5.45e-07, + "loss": 0.0025, + "reward": 2.002891957759857, + "reward_std": 0.39341901242733, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.2430555671453476, + "rewards/repetition_penalty_reward": -0.06828875839710236, + "rewards/tag_count_reward": 0.9947916865348816, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 565.9375305175781, - "epoch": 0.11, - "grad_norm": 2.2512653144943884, - "kl": 0.0401611328125, - "learning_rate": 9.997258721585931e-07, - "loss": 0.0075, - "reward": 2.7475154399871826, - "reward_std": 0.3540599048137665, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.04935969039797783, - "rewards/tag_count_reward": 0.9843750298023224, + "completion_length": 431.93751525878906, + "epoch": 0.055, + "grad_norm": 2.796630595163655, + "kl": 0.013763427734375, + "learning_rate": 5.5e-07, + "loss": -0.0342, + "reward": 2.094782531261444, + "reward_std": 0.32828887552022934, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.395833358168602, + "rewards/repetition_penalty_reward": -0.10834262520074844, + "rewards/tag_count_reward": 0.9947916865348816, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 553.3125, - "epoch": 0.111, - "grad_norm": 2.527203584806038, - "kl": 0.04296875, - "learning_rate": 9.996683123847795e-07, - "loss": 0.0512, - "reward": 2.7313212156295776, - "reward_std": 0.2547169327735901, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04124833457171917, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 455.0, + "epoch": 0.0555, + "grad_norm": 3.3329482247244755, + "kl": 0.013427734375, + "learning_rate": 5.55e-07, + "loss": 0.1221, + "reward": 2.00520521402359, + "reward_std": 0.48712170124053955, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.3541666865348816, + "rewards/repetition_penalty_reward": -0.07812817022204399, + "rewards/tag_count_reward": 0.9791666865348816, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 536.8958740234375, - "epoch": 0.112, - "grad_norm": 2.7442985147195307, - "kl": 0.044189453125, - "learning_rate": 9.996052735444862e-07, - "loss": 0.0349, - "reward": 2.7351317405700684, - "reward_std": 0.31766992807388306, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.03917396813631058, - "rewards/tag_count_reward": 0.9895833730697632, + "completion_length": 410.37501525878906, + "epoch": 0.056, + "grad_norm": 2.9159306101645845, + "kl": 0.01458740234375, + "learning_rate": 5.6e-07, + "loss": 0.0302, + "reward": 1.945899486541748, + "reward_std": 0.5274121165275574, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.3680555522441864, + "rewards/repetition_penalty_reward": -0.09923946484923363, + "rewards/tag_count_reward": 0.9895833432674408, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 633.9375, - "epoch": 0.113, - "grad_norm": 2.206627822892764, - "kl": 0.0399169921875, - "learning_rate": 9.995367564058216e-07, - "loss": 0.0002, - "reward": 2.4217275381088257, - "reward_std": 0.37440885603427887, - "rewards/accuracy_reward": 0.5000000223517418, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.050494687631726265, - "rewards/tag_count_reward": 1.0, + "completion_length": 430.8958435058594, + "epoch": 0.0565, + "grad_norm": 3.0211550455327836, + "kl": 0.02276611328125, + "learning_rate": 5.649999999999999e-07, + "loss": 0.0349, + "reward": 2.0766223669052124, + "reward_std": 0.3339201509952545, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.3263888955116272, + "rewards/repetition_penalty_reward": -0.07789156958460808, + "rewards/tag_count_reward": 0.9739583432674408, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 559.5000305175781, - "epoch": 0.114, - "grad_norm": 2.9323115322073567, - "kl": 0.048583984375, - "learning_rate": 9.994627618036452e-07, - "loss": 0.0838, - "reward": 2.7629553079605103, - "reward_std": 0.3546500727534294, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.04260038584470749, - "rewards/tag_count_reward": 1.0, + "completion_length": 421.3541717529297, + "epoch": 0.057, + "grad_norm": 2.946457972762416, + "kl": 0.0147705078125, + "learning_rate": 5.699999999999999e-07, + "loss": 0.012, + "reward": 1.9260443449020386, + "reward_std": 0.4190318286418915, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.402777761220932, + "rewards/repetition_penalty_reward": -0.07569188624620438, + "rewards/tag_count_reward": 0.9947916865348816, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 600.7291870117188, - "epoch": 0.115, - "grad_norm": 2.281029530481033, - "kl": 0.0433349609375, - "learning_rate": 9.993832906395582e-07, - "loss": 0.0435, - "reward": 2.6696189641952515, - "reward_std": 0.3239249736070633, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.05260329693555832, + "completion_length": 401.47918701171875, + "epoch": 0.0575, + "grad_norm": 2.776628513370855, + "kl": 0.01708984375, + "learning_rate": 5.749999999999999e-07, + "loss": -0.0217, + "reward": 2.1737382411956787, + "reward_std": 0.416288822889328, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.3680555820465088, + "rewards/repetition_penalty_reward": -0.09015080332756042, "rewards/tag_count_reward": 1.0, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 571.4791870117188, - "epoch": 0.116, - "grad_norm": 2.339412143248174, - "kl": 0.0447998046875, - "learning_rate": 9.992983438818915e-07, - "loss": -0.0459, - "reward": 2.695494771003723, - "reward_std": 0.3602631092071533, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.05276927165687084, - "rewards/tag_count_reward": 0.984375, + "completion_length": 462.1458435058594, + "epoch": 0.058, + "grad_norm": 2.5888150833378383, + "kl": 0.01837158203125, + "learning_rate": 5.8e-07, + "loss": -0.0648, + "reward": 2.0337727069854736, + "reward_std": 0.3629211187362671, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.291666679084301, + "rewards/repetition_penalty_reward": -0.09122735634446144, + "rewards/tag_count_reward": 1.0, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 569.2291870117188, - "epoch": 0.117, - "grad_norm": 2.6819516393809444, - "kl": 0.0435791015625, - "learning_rate": 9.992079225656944e-07, - "loss": 0.0445, - "reward": 2.677255153656006, - "reward_std": 0.2550307661294937, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.038022806867957115, + "completion_length": 395.1041717529297, + "epoch": 0.0585, + "grad_norm": 3.1651475575486065, + "kl": 0.02288818359375, + "learning_rate": 5.849999999999999e-07, + "loss": 0.0276, + "reward": 2.0250861048698425, + "reward_std": 0.385862335562706, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.284722238779068, + "rewards/repetition_penalty_reward": -0.07213617861270905, "rewards/tag_count_reward": 1.0, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 606.5, - "epoch": 0.118, - "grad_norm": 2.7553746000465016, - "kl": 0.0418701171875, - "learning_rate": 9.991120277927223e-07, - "loss": 0.0683, - "reward": 2.683447241783142, - "reward_std": 0.4267844557762146, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.045719537883996964, + "completion_length": 405.50001525878906, + "epoch": 0.059, + "grad_norm": 2.995948787512775, + "kl": 0.01568603515625, + "learning_rate": 5.9e-07, + "loss": 0.0545, + "reward": 2.079026162624359, + "reward_std": 0.4058743417263031, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.3680555745959282, + "rewards/repetition_penalty_reward": -0.08069606870412827, "rewards/tag_count_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 589.4583740234375, - "epoch": 0.119, - "grad_norm": 2.5283730612634168, - "kl": 0.0426025390625, - "learning_rate": 9.990106607314225e-07, - "loss": -0.0626, - "reward": 2.5398285388946533, - "reward_std": 0.37042136490345, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.05044914782047272, + "completion_length": 427.1458435058594, + "epoch": 0.0595, + "grad_norm": 3.2921667824300074, + "kl": 0.02056884765625, + "learning_rate": 5.949999999999999e-07, + "loss": 0.0068, + "reward": 2.110425293445587, + "reward_std": 0.4844990372657776, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.4305555671453476, + "rewards/repetition_penalty_reward": -0.07013046741485596, "rewards/tag_count_reward": 1.0, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 581.7500305175781, - "epoch": 0.12, - "grad_norm": 2.649956713864194, - "kl": 0.04931640625, - "learning_rate": 9.989038226169207e-07, - "loss": 0.0635, - "reward": 2.569565773010254, - "reward_std": 0.4911084771156311, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.04154558852314949, + "completion_length": 394.79168701171875, + "epoch": 0.06, + "grad_norm": 3.0710450999617622, + "kl": 0.01605224609375, + "learning_rate": 6e-07, + "loss": 0.0034, + "reward": 1.7129506468772888, + "reward_std": 0.3703618347644806, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.3472222238779068, + "rewards/repetition_penalty_reward": -0.07177170366048813, "rewards/tag_count_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 636.6666870117188, - "epoch": 0.121, - "grad_norm": 2.7429948870181233, - "kl": 0.0418701171875, - "learning_rate": 9.98791514751006e-07, - "loss": 0.13, - "reward": 2.663628339767456, - "reward_std": 0.396682009100914, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.060329992324113846, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 441.18751525878906, + "epoch": 0.0605, + "grad_norm": 2.7064381875532835, + "kl": 0.02081298828125, + "learning_rate": 6.049999999999999e-07, + "loss": 0.0009, + "reward": 1.8438260555267334, + "reward_std": 0.46181726455688477, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.3055555522441864, + "rewards/repetition_penalty_reward": -0.08672957122325897, + "rewards/tag_count_reward": 1.0, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 596.2083435058594, - "epoch": 0.122, - "grad_norm": 2.4221170538253816, - "kl": 0.045166015625, - "learning_rate": 9.98673738502114e-07, - "loss": 0.0342, - "reward": 2.6282721757888794, - "reward_std": 0.46036189794540405, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.04707522317767143, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 453.5833435058594, + "epoch": 0.061, + "grad_norm": 2.9942145930271735, + "kl": 0.01763916015625, + "learning_rate": 6.1e-07, + "loss": -0.0401, + "reward": 1.858453392982483, + "reward_std": 0.4963291585445404, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.361111119389534, + "rewards/repetition_penalty_reward": -0.07036612182855606, + "rewards/tag_count_reward": 0.9843750298023224, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 610.0416870117188, - "epoch": 0.123, - "grad_norm": 2.268992560964926, - "kl": 0.045166015625, - "learning_rate": 9.985504953053113e-07, - "loss": -0.0075, - "reward": 2.5075987577438354, - "reward_std": 0.4839261472225189, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.047956960275769234, - "rewards/tag_count_reward": 1.0, + "completion_length": 440.72918701171875, + "epoch": 0.0615, + "grad_norm": 2.591891405826925, + "kl": 0.0142822265625, + "learning_rate": 6.149999999999999e-07, + "loss": 0.0201, + "reward": 2.2280489206314087, + "reward_std": 0.35533128678798676, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.4583333730697632, + "rewards/repetition_penalty_reward": -0.08445119112730026, + "rewards/tag_count_reward": 0.9791666865348816, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 637.8333435058594, - "epoch": 0.124, - "grad_norm": 2.3192412067695276, - "kl": 0.040283203125, - "learning_rate": 9.98421786662277e-07, - "loss": 0.0276, - "reward": 2.580567240715027, - "reward_std": 0.40886104106903076, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.060057852417230606, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 507.8958435058594, + "epoch": 0.062, + "grad_norm": 2.5201772155206448, + "kl": 0.01611328125, + "learning_rate": 6.2e-07, + "loss": 0.0014, + "reward": 2.040831744670868, + "reward_std": 0.43686023354530334, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.4375, + "rewards/repetition_penalty_reward": -0.10500159859657288, + "rewards/tag_count_reward": 1.0, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 550.6875305175781, - "epoch": 0.125, - "grad_norm": 2.6428669262747935, - "kl": 0.0528564453125, - "learning_rate": 9.982876141412855e-07, - "loss": -0.021, - "reward": 2.6972975730895996, - "reward_std": 0.30313703417778015, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.0353414136916399, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 403.1041717529297, + "epoch": 0.0625, + "grad_norm": 2.930044284566675, + "kl": 0.0194091796875, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0418, + "reward": 1.9074211716651917, + "reward_std": 0.5366443991661072, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.3611111342906952, + "rewards/repetition_penalty_reward": -0.08389833942055702, + "rewards/tag_count_reward": 0.9843750298023224, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 680.6041870117188, - "epoch": 0.126, - "grad_norm": 2.3227570325460047, - "kl": 0.0430908203125, - "learning_rate": 9.981479793771866e-07, - "loss": 0.0752, - "reward": 2.591596007347107, - "reward_std": 0.49062085151672363, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.04902906343340874, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 401.6458435058594, + "epoch": 0.063, + "grad_norm": 3.0687595839670165, + "kl": 0.02783203125, + "learning_rate": 6.3e-07, + "loss": -0.1223, + "reward": 2.101858079433441, + "reward_std": 0.38748544454574585, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.3958333879709244, + "rewards/repetition_penalty_reward": -0.09085042029619217, + "rewards/tag_count_reward": 0.984375, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 567.7291870117188, - "epoch": 0.127, - "grad_norm": 2.5341618125591894, - "kl": 0.0494384765625, - "learning_rate": 9.98002884071386e-07, - "loss": 0.0338, - "reward": 2.735344886779785, - "reward_std": 0.3433926999568939, + "completion_length": 404.7291717529297, + "epoch": 0.0635, + "grad_norm": 2.876756319174346, + "kl": 0.019561767578125, + "learning_rate": 6.35e-07, + "loss": 0.0507, + "reward": 2.075514793395996, + "reward_std": 0.3558611124753952, "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.04243295639753342, + "rewards/reasoning_steps_reward": 0.3541666716337204, + "rewards/repetition_penalty_reward": -0.0703187882900238, "rewards/tag_count_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 611.7291870117188, - "epoch": 0.128, - "grad_norm": 84.62793212916526, - "kl": 0.50634765625, - "learning_rate": 9.97852329991824e-07, - "loss": 0.0161, - "reward": 2.649895191192627, - "reward_std": 0.4249277561903, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.04628550261259079, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 483.1875, + "epoch": 0.064, + "grad_norm": 2.722649342936597, + "kl": 0.0159912109375, + "learning_rate": 6.4e-07, + "loss": 0.0157, + "reward": 1.942257821559906, + "reward_std": 0.45219163596630096, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.4375000149011612, + "rewards/repetition_penalty_reward": -0.09940891712903976, + "rewards/tag_count_reward": 1.0, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 587.2083740234375, - "epoch": 0.129, - "grad_norm": 8.417728255751756, - "kl": 0.0889892578125, - "learning_rate": 9.976963189729547e-07, - "loss": 0.0735, - "reward": 2.7577908039093018, - "reward_std": 0.3766750693321228, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.04082038067281246, - "rewards/tag_count_reward": 1.0, + "completion_length": 428.5416717529297, + "epoch": 0.0645, + "grad_norm": 2.721100537467751, + "kl": 0.02197265625, + "learning_rate": 6.45e-07, + "loss": -0.0335, + "reward": 1.7678640484809875, + "reward_std": 0.4747384488582611, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.3125000149011612, + "rewards/repetition_penalty_reward": -0.0706777349114418, + "rewards/tag_count_reward": 0.984375, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 592.8958435058594, - "epoch": 0.13, - "grad_norm": 2.422421343225394, - "kl": 0.047607421875, - "learning_rate": 9.975348529157229e-07, - "loss": 0.0476, - "reward": 2.71326220035553, - "reward_std": 0.3403441533446312, + "completion_length": 435.2083435058594, + "epoch": 0.065, + "grad_norm": 2.617452029309224, + "kl": 0.0174560546875, + "learning_rate": 6.5e-07, + "loss": -0.0038, + "reward": 2.130853533744812, + "reward_std": 0.4503345489501953, "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.05236278846859932, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/reasoning_steps_reward": 0.4166666865348816, + "rewards/repetition_penalty_reward": -0.07747986912727356, + "rewards/tag_count_reward": 1.0, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 592.1250305175781, - "epoch": 0.131, - "grad_norm": 3.1437399948807276, - "kl": 0.0548095703125, - "learning_rate": 9.973679337875418e-07, - "loss": 0.0466, - "reward": 2.6631596088409424, - "reward_std": 0.37047071754932404, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04170159809291363, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 397.7083435058594, + "epoch": 0.0655, + "grad_norm": 2.808732238238703, + "kl": 0.02032470703125, + "learning_rate": 6.55e-07, + "loss": -0.011, + "reward": 2.1196643710136414, + "reward_std": 0.40009158849716187, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.4027778059244156, + "rewards/repetition_penalty_reward": -0.07478011026978493, + "rewards/tag_count_reward": 1.0, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 531.9791870117188, - "epoch": 0.132, - "grad_norm": 2.5320955473297437, - "kl": 0.052490234375, - "learning_rate": 9.971955636222684e-07, - "loss": 0.0346, - "reward": 2.73369300365448, - "reward_std": 0.3214843422174454, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.04408488981425762, + "completion_length": 428.5625, + "epoch": 0.066, + "grad_norm": 2.6151338968076194, + "kl": 0.0184326171875, + "learning_rate": 6.6e-07, + "loss": 0.0189, + "reward": 1.877784252166748, + "reward_std": 0.3097234219312668, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.2916666865348816, + "rewards/repetition_penalty_reward": -0.08054918050765991, "rewards/tag_count_reward": 1.0, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 654.1875305175781, - "epoch": 0.133, - "grad_norm": 2.3742871546515545, - "kl": 0.047607421875, - "learning_rate": 9.970177445201783e-07, - "loss": 0.0042, - "reward": 2.523491144180298, - "reward_std": 0.49983178079128265, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.045953478664159775, + "completion_length": 434.8333435058594, + "epoch": 0.0665, + "grad_norm": 3.154952772902957, + "kl": 0.02117919921875, + "learning_rate": 6.65e-07, + "loss": 0.0818, + "reward": 2.2022292613983154, + "reward_std": 0.3770638406276703, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.4375000149011612, + "rewards/repetition_penalty_reward": -0.06860406324267387, "rewards/tag_count_reward": 1.0, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 567.6458740234375, - "epoch": 0.134, - "grad_norm": 2.5745090813340257, - "kl": 0.052734375, - "learning_rate": 9.968344786479415e-07, - "loss": 0.0497, - "reward": 2.654099702835083, - "reward_std": 0.32977308332920074, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.05770585313439369, - "rewards/tag_count_reward": 0.9895833730697632, + "completion_length": 401.87501525878906, + "epoch": 0.067, + "grad_norm": 2.764724446368728, + "kl": 0.01763916015625, + "learning_rate": 6.7e-07, + "loss": -0.0843, + "reward": 1.9817876815795898, + "reward_std": 0.39721402525901794, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.409722238779068, + "rewards/repetition_penalty_reward": -0.09460122510790825, + "rewards/tag_count_reward": 1.0, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 591.4375305175781, - "epoch": 0.135, - "grad_norm": 2.613972316646787, - "kl": 0.0504150390625, - "learning_rate": 9.96645768238595e-07, - "loss": 0.0264, - "reward": 2.715443730354309, - "reward_std": 0.4562966376543045, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.04323691129684448, + "completion_length": 465.7708435058594, + "epoch": 0.0675, + "grad_norm": 2.3989523347162534, + "kl": 0.01953125, + "learning_rate": 6.75e-07, + "loss": -0.0177, + "reward": 2.228469967842102, + "reward_std": 0.3507627248764038, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.534722238779068, + "rewards/repetition_penalty_reward": -0.0927107036113739, "rewards/tag_count_reward": 0.9947916865348816, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 608.5208435058594, - "epoch": 0.136, - "grad_norm": 2.405934822243009, - "kl": 0.0482177734375, - "learning_rate": 9.964516155915151e-07, - "loss": 0.0707, - "reward": 2.7309588193893433, - "reward_std": 0.42438623309135437, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.06244398467242718, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 391.06251525878906, + "epoch": 0.068, + "grad_norm": 3.0855932954768965, + "kl": 0.01800537109375, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0166, + "reward": 2.255845546722412, + "reward_std": 0.4071827381849289, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.4166666716337204, + "rewards/repetition_penalty_reward": -0.07748781517148018, + "rewards/tag_count_reward": 1.0, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 698.9375305175781, - "epoch": 0.137, - "grad_norm": 2.4898466567094255, - "kl": 0.041259765625, - "learning_rate": 9.962520230723906e-07, - "loss": -0.0047, - "reward": 2.7930511236190796, - "reward_std": 0.3140558898448944, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.06458798050880432, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 446.2708435058594, + "epoch": 0.0685, + "grad_norm": 2.9929865598477066, + "kl": 0.019287109375, + "learning_rate": 6.85e-07, + "loss": -0.0368, + "reward": 2.1877795457839966, + "reward_std": 0.5341714322566986, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.4513889104127884, + "rewards/repetition_penalty_reward": -0.07610936462879181, + "rewards/tag_count_reward": 1.0, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 529.0208435058594, - "epoch": 0.138, - "grad_norm": 2.8868557758723306, - "kl": 0.0540771484375, - "learning_rate": 9.960469931131936e-07, - "loss": 0.0636, - "reward": 2.6102973222732544, - "reward_std": 0.42000116407871246, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.0372721990570426, + "completion_length": 397.7083435058594, + "epoch": 0.069, + "grad_norm": 2.954697616837276, + "kl": 0.02471923828125, + "learning_rate": 6.9e-07, + "loss": 0.009, + "reward": 1.824935793876648, + "reward_std": 0.49531859159469604, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.5000000596046448, + "rewards/repetition_penalty_reward": -0.08652260527014732, "rewards/tag_count_reward": 0.9947916865348816, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 525.6875305175781, - "epoch": 0.139, - "grad_norm": 2.8464042704145114, - "kl": 0.058349609375, - "learning_rate": 9.958365282121496e-07, - "loss": 0.0587, - "reward": 2.6924026012420654, - "reward_std": 0.3652355223894119, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.04370862618088722, - "rewards/tag_count_reward": 1.0, + "completion_length": 548.0625152587891, + "epoch": 0.0695, + "grad_norm": 2.506876897622573, + "kl": 0.014892578125, + "learning_rate": 6.949999999999999e-07, + "loss": -0.054, + "reward": 1.6296937465667725, + "reward_std": 0.4967469274997711, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.451388880610466, + "rewards/repetition_penalty_reward": -0.08732018992304802, + "rewards/tag_count_reward": 0.9947916865348816, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 623.0416870117188, - "epoch": 0.14, - "grad_norm": 2.5535766431944555, - "kl": 0.05078125, - "learning_rate": 9.956206309337066e-07, - "loss": -0.0369, - "reward": 2.5890008211135864, - "reward_std": 0.43424585461616516, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.049888189882040024, - "rewards/tag_count_reward": 1.0, + "completion_length": 435.0833435058594, + "epoch": 0.07, + "grad_norm": 2.7570820251386037, + "kl": 0.02325439453125, + "learning_rate": 7e-07, + "loss": 0.0036, + "reward": 2.1854411363601685, + "reward_std": 0.4105776250362396, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.5138888657093048, + "rewards/repetition_penalty_reward": -0.0732395276427269, + "rewards/tag_count_reward": 0.9947916865348816, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 527.7291870117188, - "epoch": 0.141, - "grad_norm": 2.5331968568084804, - "kl": 0.055419921875, - "learning_rate": 9.953993039085048e-07, - "loss": 0.0256, - "reward": 2.5839684009552, - "reward_std": 0.30562296509742737, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.0427678357809782, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 388.2291717529297, + "epoch": 0.0705, + "grad_norm": 2.6297348721129463, + "kl": 0.0208740234375, + "learning_rate": 7.049999999999999e-07, + "loss": -0.0319, + "reward": 1.8632826209068298, + "reward_std": 0.35653068125247955, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.4652777910232544, + "rewards/repetition_penalty_reward": -0.10199519246816635, + "rewards/tag_count_reward": 1.0, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 631.0625152587891, - "epoch": 0.142, - "grad_norm": 3.9726646160122283, - "kl": 0.0572509765625, - "learning_rate": 9.951725498333448e-07, - "loss": 0.0009, - "reward": 2.697226047515869, - "reward_std": 0.40920713543891907, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.03888525255024433, + "completion_length": 405.81251525878906, + "epoch": 0.071, + "grad_norm": 3.135261611976366, + "kl": 0.02117919921875, + "learning_rate": 7.1e-07, + "loss": -0.0326, + "reward": 2.321570873260498, + "reward_std": 0.41436049342155457, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.5277777910232544, + "rewards/repetition_penalty_reward": -0.06037369184195995, "rewards/tag_count_reward": 1.0, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 608.5416870117188, - "epoch": 0.143, - "grad_norm": 2.415561560430353, - "kl": 0.052734375, - "learning_rate": 9.949403714711526e-07, - "loss": 0.0716, - "reward": 2.7793259620666504, - "reward_std": 0.3527311235666275, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.045327022671699524, - "rewards/tag_count_reward": 0.9843750298023224, + "completion_length": 367.7083435058594, + "epoch": 0.0715, + "grad_norm": 2.8965793866155183, + "kl": 0.02557373046875, + "learning_rate": 7.149999999999999e-07, + "loss": 0.003, + "reward": 2.2131210565567017, + "reward_std": 0.3542313575744629, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.541666716337204, + "rewards/repetition_penalty_reward": -0.05771246552467346, + "rewards/tag_count_reward": 1.0, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 589.875, - "epoch": 0.144, - "grad_norm": 2.9100333446880744, - "kl": 0.063232421875, - "learning_rate": 9.947027716509488e-07, - "loss": 0.0006, - "reward": 2.563896417617798, - "reward_std": 0.4373708665370941, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.043742526322603226, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 401.12501525878906, + "epoch": 0.072, + "grad_norm": 3.04746917441956, + "kl": 0.02337646484375, + "learning_rate": 7.2e-07, + "loss": 0.0068, + "reward": 2.1876049041748047, + "reward_std": 0.4434930086135864, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.43055559694767, + "rewards/repetition_penalty_reward": -0.0762840211391449, + "rewards/tag_count_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 566.2916870117188, - "epoch": 0.145, - "grad_norm": 2.4477067800919987, - "kl": 0.0556640625, - "learning_rate": 9.944597532678119e-07, - "loss": -0.0456, - "reward": 2.466762065887451, - "reward_std": 0.48124293982982635, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.040182387456297874, + "completion_length": 423.3333435058594, + "epoch": 0.0725, + "grad_norm": 2.7345052559782155, + "kl": 0.02398681640625, + "learning_rate": 7.249999999999999e-07, + "loss": -0.0068, + "reward": 2.092087507247925, + "reward_std": 0.38775935769081116, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.548611119389534, + "rewards/repetition_penalty_reward": -0.06069031357765198, "rewards/tag_count_reward": 1.0, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 527.2708740234375, - "epoch": 0.146, - "grad_norm": 2.735483077486635, - "kl": 0.06005859375, - "learning_rate": 9.942113192828444e-07, - "loss": 0.0496, - "reward": 2.605388879776001, - "reward_std": 0.47325506806373596, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0335000604391098, - "rewards/tag_count_reward": 1.0, + "completion_length": 423.06251525878906, + "epoch": 0.073, + "grad_norm": 2.9633251544566503, + "kl": 0.02386474609375, + "learning_rate": 7.3e-07, + "loss": 0.0508, + "reward": 1.913641095161438, + "reward_std": 0.47340986132621765, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.5138888955116272, + "rewards/repetition_penalty_reward": -0.07420631498098373, + "rewards/tag_count_reward": 0.9947916865348816, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 508.66668701171875, - "epoch": 0.147, - "grad_norm": 2.5279842832232133, - "kl": 0.0618896484375, - "learning_rate": 9.939574727231362e-07, - "loss": -0.0023, - "reward": 2.484343409538269, - "reward_std": 0.5071098208427429, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04343440383672714, - "rewards/tag_count_reward": 1.0, + "completion_length": 466.04168701171875, + "epoch": 0.0735, + "grad_norm": 2.73697900758412, + "kl": 0.0244140625, + "learning_rate": 7.35e-07, + "loss": -0.0118, + "reward": 2.186763048171997, + "reward_std": 0.4524783492088318, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.5625000596046448, + "rewards/repetition_penalty_reward": -0.09969542920589447, + "rewards/tag_count_reward": 0.9947916865348816, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 525.5000305175781, - "epoch": 0.148, - "grad_norm": 2.2840152072866364, - "kl": 0.057373046875, - "learning_rate": 9.93698216681727e-07, - "loss": 0.0172, - "reward": 2.810342311859131, - "reward_std": 0.24958577752113342, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.029935498721897602, + "completion_length": 450.1041717529297, + "epoch": 0.074, + "grad_norm": 2.7745954642868256, + "kl": 0.02392578125, + "learning_rate": 7.4e-07, + "loss": 0.0112, + "reward": 1.895469844341278, + "reward_std": 0.42417220771312714, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.5069444626569748, + "rewards/repetition_penalty_reward": -0.09064128994941711, "rewards/tag_count_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 587.7083435058594, - "epoch": 0.149, - "grad_norm": 2.495321625160956, - "kl": 0.055908203125, - "learning_rate": 9.934335543175705e-07, - "loss": 0.0162, - "reward": 2.811280369758606, - "reward_std": 0.40344125032424927, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.03941434063017368, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 445.29168701171875, + "epoch": 0.0745, + "grad_norm": 2.6014655366522357, + "kl": 0.02288818359375, + "learning_rate": 7.45e-07, + "loss": 0.015, + "reward": 2.204192638397217, + "reward_std": 0.38898123800754547, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.451388880610466, + "rewards/repetition_penalty_reward": -0.08052962273359299, + "rewards/tag_count_reward": 1.0, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 634.9166870117188, - "epoch": 0.15, - "grad_norm": 2.2429641351216834, - "kl": 0.0501708984375, - "learning_rate": 9.931634888554935e-07, - "loss": 0.0582, - "reward": 2.6869924068450928, - "reward_std": 0.39581531286239624, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.056063249707221985, + "completion_length": 455.0208435058594, + "epoch": 0.075, + "grad_norm": 2.5737888201247574, + "kl": 0.02252197265625, + "learning_rate": 7.5e-07, + "loss": -0.0142, + "reward": 2.2747018337249756, + "reward_std": 0.37661734223365784, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.5277777910232544, + "rewards/repetition_penalty_reward": -0.08640927076339722, "rewards/tag_count_reward": 1.0, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 500.54168701171875, - "epoch": 0.151, - "grad_norm": 2.468490484200265, - "kl": 0.0621337890625, - "learning_rate": 9.928880235861588e-07, - "loss": 0.1059, - "reward": 2.7397245168685913, - "reward_std": 0.32457810640335083, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.03805328160524368, + "completion_length": 380.35418701171875, + "epoch": 0.0755, + "grad_norm": 3.109877896725919, + "kl": 0.030029296875, + "learning_rate": 7.55e-07, + "loss": 0.0968, + "reward": 2.1878127455711365, + "reward_std": 0.4361240267753601, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.5208334028720856, + "rewards/repetition_penalty_reward": -0.0830206349492073, "rewards/tag_count_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 566.6875, - "epoch": 0.152, - "grad_norm": 2.5723071707401126, - "kl": 0.05517578125, - "learning_rate": 9.926071618660237e-07, - "loss": 0.0205, - "reward": 2.7304086685180664, - "reward_std": 0.34727388620376587, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.04042468871921301, - "rewards/tag_count_reward": 1.0, + "completion_length": 405.3541717529297, + "epoch": 0.076, + "grad_norm": 2.883829245260866, + "kl": 0.02972412109375, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0348, + "reward": 2.174973964691162, + "reward_std": 0.47633519768714905, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.5416667461395264, + "rewards/repetition_penalty_reward": -0.06460951268672943, + "rewards/tag_count_reward": 0.9895833432674408, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 591.1875, - "epoch": 0.153, - "grad_norm": 2.2620129683268595, - "kl": 0.058349609375, - "learning_rate": 9.923209071172994e-07, - "loss": 0.0377, - "reward": 2.5389167070388794, - "reward_std": 0.3387536555528641, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.044416630640625954, - "rewards/tag_count_reward": 1.0, + "completion_length": 400.0833435058594, + "epoch": 0.0765, + "grad_norm": 11.818146755937764, + "kl": 0.3184814453125, + "learning_rate": 7.65e-07, + "loss": -0.0353, + "reward": 1.938881516456604, + "reward_std": 0.4211771637201309, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.4027778059244156, + "rewards/repetition_penalty_reward": -0.11493790149688721, + "rewards/tag_count_reward": 0.9427083432674408, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 565.8958435058594, - "epoch": 0.154, - "grad_norm": 2.7014878630289547, - "kl": 0.064453125, - "learning_rate": 9.9202926282791e-07, - "loss": 0.0875, - "reward": 2.7602481842041016, - "reward_std": 0.4075208753347397, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.04357131011784077, - "rewards/tag_count_reward": 0.984375, + "completion_length": 350.4791793823242, + "epoch": 0.077, + "grad_norm": 9.783552011785373, + "kl": 0.1412353515625, + "learning_rate": 7.699999999999999e-07, + "loss": 0.0569, + "reward": 1.8291372656822205, + "reward_std": 0.3712882995605469, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.4722222685813904, + "rewards/repetition_penalty_reward": -0.08579343557357788, + "rewards/tag_count_reward": 0.8593750298023224, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 558.3541870117188, - "epoch": 0.155, - "grad_norm": 2.6542156680058486, - "kl": 0.060791015625, - "learning_rate": 9.917322325514487e-07, - "loss": 0.0558, - "reward": 2.7091113328933716, - "reward_std": 0.3156071752309799, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03568043000996113, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 389.3333435058594, + "epoch": 0.0775, + "grad_norm": 2.8193374221463054, + "kl": 0.027099609375, + "learning_rate": 7.75e-07, + "loss": 0.0898, + "reward": 2.1005048751831055, + "reward_std": 0.42864808440208435, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.4930555820465088, + "rewards/repetition_penalty_reward": -0.0696341022849083, + "rewards/tag_count_reward": 0.9895833432674408, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 547.8333435058594, - "epoch": 0.156, - "grad_norm": 2.6130648886630397, - "kl": 0.0601806640625, - "learning_rate": 9.91429819907136e-07, - "loss": -0.002, - "reward": 2.6609745025634766, - "reward_std": 0.2819754481315613, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04041440971195698, - "rewards/tag_count_reward": 1.0, + "completion_length": 389.79168701171875, + "epoch": 0.078, + "grad_norm": 3.00650124802611, + "kl": 0.0260009765625, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0294, + "reward": 2.472437024116516, + "reward_std": 0.3745059221982956, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.6319445073604584, + "rewards/repetition_penalty_reward": -0.0501323863863945, + "rewards/tag_count_reward": 0.9947916865348816, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 535.5208435058594, - "epoch": 0.157, - "grad_norm": 2.4079001126290325, - "kl": 0.064208984375, - "learning_rate": 9.911220285797748e-07, - "loss": 0.0085, - "reward": 2.513691782951355, - "reward_std": 0.3074871450662613, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04880848899483681, - "rewards/tag_count_reward": 1.0, + "completion_length": 443.6458435058594, + "epoch": 0.0785, + "grad_norm": 2.698570132718695, + "kl": 0.02716064453125, + "learning_rate": 7.85e-07, + "loss": -0.0863, + "reward": 2.4511717557907104, + "reward_std": 0.2767828330397606, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.5763888955116272, + "rewards/repetition_penalty_reward": -0.08875882998108864, + "rewards/tag_count_reward": 0.9843750298023224, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 539.7500305175781, - "epoch": 0.158, - "grad_norm": 2.772614283171138, - "kl": 0.0655517578125, - "learning_rate": 9.908088623197048e-07, - "loss": -0.0064, - "reward": 2.664715528488159, - "reward_std": 0.265642948448658, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.029728862456977367, - "rewards/tag_count_reward": 1.0, + "completion_length": 453.8333435058594, + "epoch": 0.079, + "grad_norm": 2.9560567179248194, + "kl": 0.0284423828125, + "learning_rate": 7.9e-07, + "loss": 0.0109, + "reward": 2.1405357122421265, + "reward_std": 0.46409404277801514, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.4444444626569748, + "rewards/repetition_penalty_reward": -0.06432541459798813, + "rewards/tag_count_reward": 0.9895833432674408, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 541.2291870117188, - "epoch": 0.159, - "grad_norm": 2.4163026682291426, - "kl": 0.0654296875, - "learning_rate": 9.904903249427582e-07, - "loss": -0.0569, - "reward": 2.6886651515960693, - "reward_std": 0.3855375796556473, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03355725575238466, - "rewards/tag_count_reward": 1.0, + "completion_length": 391.3958435058594, + "epoch": 0.0795, + "grad_norm": 4.542085974407862, + "kl": 0.0609130859375, + "learning_rate": 7.95e-07, + "loss": -0.1684, + "reward": 2.112655282020569, + "reward_std": 0.5330419987440109, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.5000000298023224, + "rewards/repetition_penalty_reward": -0.0800531879067421, + "rewards/tag_count_reward": 0.984375, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 576.7916870117188, - "epoch": 0.16, - "grad_norm": 2.4138081235417306, - "kl": 0.06298828125, - "learning_rate": 9.901664203302124e-07, - "loss": 0.0256, - "reward": 2.8020124435424805, - "reward_std": 0.28147071599960327, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03305710572749376, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 450.1875, + "epoch": 0.08, + "grad_norm": 3.8700369269616512, + "kl": 0.0513916015625, + "learning_rate": 8e-07, + "loss": -0.088, + "reward": 1.9016221165657043, + "reward_std": 0.26658795773983, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.3819444626569748, + "rewards/repetition_penalty_reward": -0.13657239079475403, + "rewards/tag_count_reward": 0.9479166865348816, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 538.8125, - "epoch": 0.161, - "grad_norm": 2.3732077198637116, - "kl": 0.0616455078125, - "learning_rate": 9.89837152428743e-07, - "loss": -0.0005, - "reward": 2.4948108196258545, - "reward_std": 0.3342844396829605, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04338358715176582, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 388.31251525878906, + "epoch": 0.0805, + "grad_norm": 3.247547106584298, + "kl": 0.0306396484375, + "learning_rate": 8.05e-07, + "loss": -0.0041, + "reward": 1.9749692678451538, + "reward_std": 0.468127503991127, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.631944477558136, + "rewards/repetition_penalty_reward": -0.08926697075366974, + "rewards/tag_count_reward": 0.9947916865348816, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 593.4166870117188, - "epoch": 0.162, - "grad_norm": 2.1912046412336528, - "kl": 0.06591796875, - "learning_rate": 9.895025252503755e-07, - "loss": -0.0436, - "reward": 2.7147765159606934, - "reward_std": 0.40104810893535614, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.05084844306111336, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 434.5416717529297, + "epoch": 0.081, + "grad_norm": 3.0024455965281724, + "kl": 0.0406494140625, + "learning_rate": 8.1e-07, + "loss": -0.1179, + "reward": 2.2072755098342896, + "reward_std": 0.5785205662250519, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.5694444626569748, + "rewards/repetition_penalty_reward": -0.07571066915988922, + "rewards/tag_count_reward": 0.984375, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 604.3750305175781, - "epoch": 0.163, - "grad_norm": 2.276675908100595, - "kl": 0.06494140625, - "learning_rate": 9.891625428724364e-07, - "loss": -0.0477, - "reward": 2.567950963973999, - "reward_std": 0.3306031674146652, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.036215804517269135, - "rewards/tag_count_reward": 1.0, + "completion_length": 257.0833435058594, + "epoch": 0.0815, + "grad_norm": 5.717018782517202, + "kl": 0.0693359375, + "learning_rate": 8.149999999999999e-07, + "loss": -0.1785, + "reward": 1.6311047077178955, + "reward_std": 0.5291544497013092, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.3750000223517418, + "rewards/repetition_penalty_reward": -0.061603715643286705, + "rewards/tag_count_reward": 0.859375, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 522.5416870117188, - "epoch": 0.164, - "grad_norm": 3.181028204076106, - "kl": 0.071044921875, - "learning_rate": 9.888172094375033e-07, - "loss": 0.1641, - "reward": 2.7617541551589966, - "reward_std": 0.19565748795866966, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02991252951323986, - "rewards/tag_count_reward": 1.0, + "completion_length": 402.7708435058594, + "epoch": 0.082, + "grad_norm": 2.9816895384452073, + "kl": 0.03515625, + "learning_rate": 8.199999999999999e-07, + "loss": -0.0196, + "reward": 2.12343430519104, + "reward_std": 0.3986600935459137, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.3888889253139496, + "rewards/repetition_penalty_reward": -0.08837117999792099, + "rewards/tag_count_reward": 0.9895833730697632, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 604.3333435058594, - "epoch": 0.165, - "grad_norm": 2.3335259823634105, - "kl": 0.06494140625, - "learning_rate": 9.88466529153356e-07, - "loss": -0.1023, - "reward": 2.599223256111145, - "reward_std": 0.4700702428817749, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.055290715768933296, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 459.5, + "epoch": 0.0825, + "grad_norm": 2.7857777262413346, + "kl": 0.0350341796875, + "learning_rate": 8.249999999999999e-07, + "loss": -0.0107, + "reward": 2.1470755338668823, + "reward_std": 0.4360540807247162, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.5000000149011612, + "rewards/repetition_penalty_reward": -0.08209127560257912, + "rewards/tag_count_reward": 1.0, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 583.4791870117188, - "epoch": 0.166, - "grad_norm": 2.4208829670581853, - "kl": 0.070556640625, - "learning_rate": 9.881105062929221e-07, - "loss": 0.0434, - "reward": 2.332135558128357, - "reward_std": 0.36515676975250244, - "rewards/accuracy_reward": 0.3958333544433117, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.039392316713929176, - "rewards/tag_count_reward": 0.9895833730697632, + "completion_length": 453.5208435058594, + "epoch": 0.083, + "grad_norm": 2.904120641790527, + "kl": 0.03466796875, + "learning_rate": 8.299999999999999e-07, + "loss": 0.0829, + "reward": 2.2425050735473633, + "reward_std": 0.48431138694286346, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.6111111640930176, + "rewards/repetition_penalty_reward": -0.0769394002854824, + "rewards/tag_count_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 630.8125, - "epoch": 0.167, - "grad_norm": 2.4360050121887475, - "kl": 0.06396484375, - "learning_rate": 9.877491451942284e-07, - "loss": -0.0026, - "reward": 2.669437289237976, - "reward_std": 0.23492664098739624, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.03889598324894905, + "completion_length": 420.43751525878906, + "epoch": 0.0835, + "grad_norm": 2.7716468429377215, + "kl": 0.0323486328125, + "learning_rate": 8.349999999999999e-07, + "loss": -0.0462, + "reward": 2.455272912979126, + "reward_std": 0.31839539110660553, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.652777761220932, + "rewards/repetition_penalty_reward": -0.07250505313277245, "rewards/tag_count_reward": 1.0, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 653.8750305175781, - "epoch": 0.168, - "grad_norm": 2.299477713264774, - "kl": 0.066162109375, - "learning_rate": 9.873824502603459e-07, - "loss": 0.0313, - "reward": 2.6158487796783447, - "reward_std": 0.346173420548439, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.04387335851788521, + "completion_length": 398.2291717529297, + "epoch": 0.084, + "grad_norm": 2.8774923504646184, + "kl": 0.032958984375, + "learning_rate": 8.399999999999999e-07, + "loss": -0.0118, + "reward": 2.3479169607162476, + "reward_std": 0.284304603934288, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.5069444477558136, + "rewards/repetition_penalty_reward": -0.07569434866309166, "rewards/tag_count_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 625.8541870117188, - "epoch": 0.169, - "grad_norm": 2.4011318956925503, - "kl": 0.0640869140625, - "learning_rate": 9.870104259593362e-07, - "loss": 0.0797, - "reward": 2.6212562322616577, - "reward_std": 0.1528831347823143, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.034993914887309074, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 376.43751525878906, + "epoch": 0.0845, + "grad_norm": 3.0955160270611013, + "kl": 0.0306396484375, + "learning_rate": 8.45e-07, + "loss": 0.0031, + "reward": 2.163801431655884, + "reward_std": 0.33351075649261475, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.7083333432674408, + "rewards/repetition_penalty_reward": -0.04453178122639656, + "rewards/tag_count_reward": 1.0, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 579.7083435058594, - "epoch": 0.17, - "grad_norm": 2.5134518850907948, - "kl": 0.069580078125, - "learning_rate": 9.866330768241983e-07, - "loss": 0.0129, - "reward": 2.715871572494507, - "reward_std": 0.325920432806015, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03760070167481899, - "rewards/tag_count_reward": 0.9895833730697632, + "completion_length": 439.2708435058594, + "epoch": 0.085, + "grad_norm": 2.8041778591178126, + "kl": 0.0338134765625, + "learning_rate": 8.499999999999999e-07, + "loss": 0.0152, + "reward": 2.1758224964141846, + "reward_std": 0.48637865483760834, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.6180555820465088, + "rewards/repetition_penalty_reward": -0.08285807259380817, + "rewards/tag_count_reward": 0.9739583432674408, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 633.0625305175781, - "epoch": 0.171, - "grad_norm": 2.4224007332086424, - "kl": 0.0693359375, - "learning_rate": 9.862504074528126e-07, - "loss": 0.0557, - "reward": 2.5395129919052124, - "reward_std": 0.4677208960056305, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04382044076919556, + "completion_length": 368.125, + "epoch": 0.0855, + "grad_norm": 3.047194462178732, + "kl": 0.0386962890625, + "learning_rate": 8.55e-07, + "loss": -0.1328, + "reward": 2.137127637863159, + "reward_std": 0.3549790009856224, + "rewards/accuracy_reward": 0.5833333358168602, + "rewards/reasoning_steps_reward": 0.6250000447034836, + "rewards/repetition_penalty_reward": -0.07120569795370102, "rewards/tag_count_reward": 1.0, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 621.0416870117188, - "epoch": 0.172, - "grad_norm": 2.1277905450282595, - "kl": 0.0667724609375, - "learning_rate": 9.85862422507884e-07, - "loss": 0.0193, - "reward": 2.8295018672943115, - "reward_std": 0.13751935493201017, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04549824073910713, + "completion_length": 424.5833435058594, + "epoch": 0.086, + "grad_norm": 2.8616005778229785, + "kl": 0.03955078125, + "learning_rate": 8.599999999999999e-07, + "loss": -0.0308, + "reward": 2.2606923580169678, + "reward_std": 0.5883138179779053, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.659722238779068, + "rewards/repetition_penalty_reward": -0.08653007447719574, "rewards/tag_count_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 648.7916870117188, - "epoch": 0.173, - "grad_norm": 2.324532047104547, - "kl": 0.069091796875, - "learning_rate": 9.854691267168871e-07, - "loss": 0.1049, - "reward": 2.722783327102661, - "reward_std": 0.41735316812992096, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0376333836466074, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 448.9166717529297, + "epoch": 0.0865, + "grad_norm": 2.649883532273222, + "kl": 0.0318603515625, + "learning_rate": 8.65e-07, + "loss": -0.0174, + "reward": 2.2526204586029053, + "reward_std": 0.45084331929683685, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.5625000596046448, + "rewards/repetition_penalty_reward": -0.08071288466453552, + "rewards/tag_count_reward": 1.0, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 671.7083435058594, - "epoch": 0.174, - "grad_norm": 1.9577431816288606, - "kl": 0.066650390625, - "learning_rate": 9.850705248720068e-07, - "loss": 0.0282, - "reward": 2.7458845376968384, - "reward_std": 0.3250259757041931, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04578226990997791, - "rewards/tag_count_reward": 1.0, + "completion_length": 366.625, + "epoch": 0.087, + "grad_norm": 3.025435169465401, + "kl": 0.0360107421875, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0487, + "reward": 2.397473096847534, + "reward_std": 0.47506286203861237, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.7222222685813904, + "rewards/repetition_penalty_reward": -0.04349912703037262, + "rewards/tag_count_reward": 0.9895833432674408, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 700.7916870117188, - "epoch": 0.175, - "grad_norm": 1.8987257817841927, - "kl": 0.0645751953125, - "learning_rate": 9.846666218300807e-07, - "loss": 0.0104, - "reward": 2.6111772060394287, - "reward_std": 0.39906148612499237, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.06243372894823551, + "completion_length": 431.31251525878906, + "epoch": 0.0875, + "grad_norm": 2.9618305547728543, + "kl": 0.0430908203125, + "learning_rate": 8.75e-07, + "loss": -0.0493, + "reward": 2.137214183807373, + "reward_std": 0.5185305774211884, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.6458333730697632, + "rewards/repetition_penalty_reward": -0.09195243567228317, "rewards/tag_count_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 628.1041870117188, - "epoch": 0.176, - "grad_norm": 2.404386166811554, - "kl": 0.071533203125, - "learning_rate": 9.8425742251254e-07, - "loss": 0.0773, - "reward": 2.6240488290786743, - "reward_std": 0.37613917887210846, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.06692333333194256, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 326.56251525878906, + "epoch": 0.088, + "grad_norm": 2.9727047540243623, + "kl": 0.0399169921875, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0354, + "reward": 2.605048894882202, + "reward_std": 0.31301962584257126, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.6875, + "rewards/repetition_penalty_reward": -0.04078466631472111, + "rewards/tag_count_reward": 1.0, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 587.6458435058594, - "epoch": 0.177, - "grad_norm": 2.4043287387592125, - "kl": 0.075439453125, - "learning_rate": 9.838429319053495e-07, - "loss": 0.0096, - "reward": 2.5300588607788086, - "reward_std": 0.3319057375192642, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04112182557582855, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 473.66668701171875, + "epoch": 0.0885, + "grad_norm": 2.522294184113134, + "kl": 0.0426025390625, + "learning_rate": 8.85e-07, + "loss": -0.0891, + "reward": 2.436373710632324, + "reward_std": 0.4591614753007889, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.8125000298023224, + "rewards/repetition_penalty_reward": -0.06362627819180489, + "rewards/tag_count_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 698.8125305175781, - "epoch": 0.178, - "grad_norm": 2.028048720044857, - "kl": 0.072265625, - "learning_rate": 9.83423155058946e-07, - "loss": -0.0443, - "reward": 2.4296261072158813, - "reward_std": 0.34993553161621094, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04954059235751629, - "rewards/tag_count_reward": 1.0, + "completion_length": 398.0, + "epoch": 0.089, + "grad_norm": 2.952883589913921, + "kl": 0.0462646484375, + "learning_rate": 8.9e-07, + "loss": 0.007, + "reward": 2.2954567670822144, + "reward_std": 0.47059088945388794, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.6944444477558136, + "rewards/repetition_penalty_reward": -0.06044617295265198, + "rewards/tag_count_reward": 0.9947916865348816, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 643.4166870117188, - "epoch": 0.179, - "grad_norm": 2.5526900934607544, - "kl": 0.07763671875, - "learning_rate": 9.829980970881784e-07, - "loss": 0.1619, - "reward": 2.847709059715271, - "reward_std": 0.24474234879016876, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04291607812047005, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 377.7916717529297, + "epoch": 0.0895, + "grad_norm": 2.8614701370754245, + "kl": 0.04736328125, + "learning_rate": 8.95e-07, + "loss": -0.0243, + "reward": 2.436295747756958, + "reward_std": 0.4291905164718628, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.701388955116272, + "rewards/repetition_penalty_reward": -0.06717650964856148, + "rewards/tag_count_reward": 0.9895833432674408, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 617.2291870117188, - "epoch": 0.18, - "grad_norm": 1.9749460293522159, - "kl": 0.082275390625, - "learning_rate": 9.825677631722435e-07, - "loss": 0.0669, - "reward": 2.691727042198181, - "reward_std": 0.3086921200156212, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03743956796824932, - "rewards/tag_count_reward": 1.0, + "completion_length": 485.9583435058594, + "epoch": 0.09, + "grad_norm": 3.026265114082148, + "kl": 0.0382080078125, + "learning_rate": 9e-07, + "loss": 0.0719, + "reward": 2.1689553260803223, + "reward_std": 0.30075494945049286, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.7361111640930176, + "rewards/repetition_penalty_reward": -0.06194741278886795, + "rewards/tag_count_reward": 0.9947916865348816, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 619.3541870117188, - "epoch": 0.181, - "grad_norm": 2.115751260793685, - "kl": 0.0771484375, - "learning_rate": 9.821321585546243e-07, - "loss": 0.0106, - "reward": 2.788450598716736, - "reward_std": 0.2660830020904541, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.05529944226145744, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 448.3125, + "epoch": 0.0905, + "grad_norm": 2.810163992973923, + "kl": 0.04541015625, + "learning_rate": 9.05e-07, + "loss": 0.0161, + "reward": 2.3727446794509888, + "reward_std": 0.5085368752479553, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.7083333730697632, + "rewards/repetition_penalty_reward": -0.07517208904027939, + "rewards/tag_count_reward": 0.9895833730697632, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 631.7500305175781, - "epoch": 0.182, - "grad_norm": 2.195356667493195, - "kl": 0.084716796875, - "learning_rate": 9.816912885430258e-07, - "loss": 0.0269, - "reward": 2.6126253604888916, - "reward_std": 0.32306814193725586, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.054041286930441856, - "rewards/tag_count_reward": 1.0, + "completion_length": 569.9583435058594, + "epoch": 0.091, + "grad_norm": 2.6212790656780056, + "kl": 0.0643310546875, + "learning_rate": 9.1e-07, + "loss": -0.0303, + "reward": 2.1286094188690186, + "reward_std": 0.5249549150466919, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.79861119389534, + "rewards/repetition_penalty_reward": -0.09187676757574081, + "rewards/tag_count_reward": 0.984375, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 634.8125305175781, - "epoch": 0.183, - "grad_norm": 2.1090909584715587, - "kl": 0.0791015625, - "learning_rate": 9.812451585093098e-07, - "loss": 0.0252, - "reward": 2.7393864393234253, - "reward_std": 0.3596974164247513, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.052280182018876076, - "rewards/tag_count_reward": 1.0, + "completion_length": 571.0208435058594, + "epoch": 0.0915, + "grad_norm": 2.7275256259539025, + "kl": 0.045166015625, + "learning_rate": 9.15e-07, + "loss": 0.0061, + "reward": 2.457062244415283, + "reward_std": 0.44156837463378906, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.8611111640930176, + "rewards/repetition_penalty_reward": -0.09154891595244408, + "rewards/tag_count_reward": 0.9791666865348816, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 661.7083435058594, - "epoch": 0.184, - "grad_norm": 2.077351011146557, - "kl": 0.0771484375, - "learning_rate": 9.807937738894303e-07, - "loss": 0.0566, - "reward": 2.869002103805542, - "reward_std": 0.24853621423244476, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04766450449824333, - "rewards/tag_count_reward": 1.0, + "completion_length": 568.0416870117188, + "epoch": 0.092, + "grad_norm": 2.5794652768185697, + "kl": 0.0457763671875, + "learning_rate": 9.2e-07, + "loss": -0.0542, + "reward": 2.1548627614974976, + "reward_std": 0.41270676255226135, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.784722238779068, + "rewards/repetition_penalty_reward": -0.10381785407662392, + "rewards/tag_count_reward": 0.9947916865348816, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 603.0208435058594, - "epoch": 0.185, - "grad_norm": 2.1637455121074223, - "kl": 0.087158203125, - "learning_rate": 9.80337140183366e-07, - "loss": 0.0752, - "reward": 2.8117637634277344, - "reward_std": 0.24315628595650196, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.035458519123494625, - "rewards/tag_count_reward": 1.0, + "completion_length": 550.5833435058594, + "epoch": 0.0925, + "grad_norm": 2.6922728125371544, + "kl": 0.0513916015625, + "learning_rate": 9.25e-07, + "loss": -0.0506, + "reward": 2.1982321739196777, + "reward_std": 0.47882315516471863, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.909722238779068, + "rewards/repetition_penalty_reward": -0.09690695255994797, + "rewards/tag_count_reward": 0.9895833432674408, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 583.1458435058594, - "epoch": 0.186, - "grad_norm": 2.275135225584201, - "kl": 0.09326171875, - "learning_rate": 9.798752629550546e-07, - "loss": 0.0655, - "reward": 2.8143444061279297, - "reward_std": 0.21083202213048935, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.039822228252887726, - "rewards/tag_count_reward": 1.0, + "completion_length": 552.75, + "epoch": 0.093, + "grad_norm": 2.34743025577209, + "kl": 0.0548095703125, + "learning_rate": 9.3e-07, + "loss": -0.017, + "reward": 2.2561983466148376, + "reward_std": 0.37413595616817474, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.0944962427020073, + "rewards/tag_count_reward": 0.9895833730697632, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 583.2916870117188, - "epoch": 0.187, - "grad_norm": 2.233756675170646, - "kl": 0.089599609375, - "learning_rate": 9.794081478323245e-07, - "loss": 0.0115, - "reward": 2.55490505695343, - "reward_std": 0.20559479296207428, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04231713339686394, + "completion_length": 445.5208435058594, + "epoch": 0.0935, + "grad_norm": 3.017831161763003, + "kl": 0.061767578125, + "learning_rate": 9.35e-07, + "loss": 0.0359, + "reward": 2.617498517036438, + "reward_std": 0.2746337354183197, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.86111119389534, + "rewards/repetition_penalty_reward": -0.07694609090685844, "rewards/tag_count_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 577.3958435058594, - "epoch": 0.188, - "grad_norm": 2.312702195384097, - "kl": 0.09033203125, - "learning_rate": 9.78935800506826e-07, - "loss": 0.0262, - "reward": 2.8108856678009033, - "reward_std": 0.29880291223526, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04328102432191372, - "rewards/tag_count_reward": 1.0, + "completion_length": 516.4166870117188, + "epoch": 0.094, + "grad_norm": 10.120993359942497, + "kl": 0.0875244140625, + "learning_rate": 9.399999999999999e-07, + "loss": -0.0016, + "reward": 2.3637200593948364, + "reward_std": 0.35263994336128235, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.09114107862114906, + "rewards/tag_count_reward": 0.9895833432674408, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 621.6458435058594, - "epoch": 0.189, - "grad_norm": 2.450467067468715, - "kl": 0.095458984375, - "learning_rate": 9.784582267339622e-07, - "loss": 0.1076, - "reward": 2.691537618637085, - "reward_std": 0.46115170419216156, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03762921877205372, - "rewards/tag_count_reward": 1.0, + "completion_length": 578.9166717529297, + "epoch": 0.0945, + "grad_norm": 2.4810269448429314, + "kl": 0.060546875, + "learning_rate": 9.45e-07, + "loss": 0.135, + "reward": 2.5320863723754883, + "reward_std": 0.4612206518650055, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.12763602659106255, + "rewards/tag_count_reward": 0.9791666865348816, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 552.2291870117188, - "epoch": 0.19, - "grad_norm": 2.33663470812737, - "kl": 0.096923828125, - "learning_rate": 9.779754323328192e-07, - "loss": 0.0829, - "reward": 2.556964159011841, - "reward_std": 0.2390810027718544, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02636928204447031, + "completion_length": 536.8333435058594, + "epoch": 0.095, + "grad_norm": 2.42276363492381, + "kl": 0.0567626953125, + "learning_rate": 9.499999999999999e-07, + "loss": -0.0277, + "reward": 2.275315523147583, + "reward_std": 0.42472073435783386, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.09274015948176384, "rewards/tag_count_reward": 1.0, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 589.0416870117188, - "epoch": 0.191, - "grad_norm": 2.145945974879737, - "kl": 0.094482421875, - "learning_rate": 9.774874231860935e-07, - "loss": -0.0278, - "reward": 2.621474266052246, - "reward_std": 0.42711225152015686, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04519243072718382, - "rewards/tag_count_reward": 1.0, + "completion_length": 515.4166870117188, + "epoch": 0.0955, + "grad_norm": 3.0986857990959504, + "kl": 0.102783203125, + "learning_rate": 9.55e-07, + "loss": -0.069, + "reward": 2.445477247238159, + "reward_std": 0.40403106808662415, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.10487010702490807, + "rewards/tag_count_reward": 0.9947916865348816, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 539.6875305175781, - "epoch": 0.192, - "grad_norm": 2.147974112089912, - "kl": 0.090087890625, - "learning_rate": 9.769942052400235e-07, - "loss": -0.0237, - "reward": 2.76838481426239, - "reward_std": 0.310782328248024, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04411514103412628, - "rewards/tag_count_reward": 1.0, + "completion_length": 576.6041717529297, + "epoch": 0.096, + "grad_norm": 2.527746035134317, + "kl": 0.0672607421875, + "learning_rate": 9.6e-07, + "loss": 0.0669, + "reward": 2.4357703924179077, + "reward_std": 0.40775516629219055, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.10763245820999146, + "rewards/tag_count_reward": 0.9739583432674408, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 525.2916870117188, - "epoch": 0.193, - "grad_norm": 2.1483746204852827, - "kl": 0.096923828125, - "learning_rate": 9.764957845043135e-07, - "loss": 0.0031, - "reward": 2.842332124710083, - "reward_std": 0.13549592159688473, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.032667966559529305, - "rewards/tag_count_reward": 1.0, + "completion_length": 561.1250152587891, + "epoch": 0.0965, + "grad_norm": 2.0944240845048028, + "kl": 0.0623779296875, + "learning_rate": 9.649999999999999e-07, + "loss": -0.012, + "reward": 2.684821605682373, + "reward_std": 0.2746804505586624, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.11726166307926178, + "rewards/tag_count_reward": 0.9895833432674408, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 595.1875305175781, - "epoch": 0.194, - "grad_norm": 1.981195616724217, - "kl": 0.09228515625, - "learning_rate": 9.759921670520634e-07, - "loss": -0.0784, - "reward": 2.706938862800598, - "reward_std": 0.286833293735981, + "completion_length": 545.1875305175781, + "epoch": 0.097, + "grad_norm": 2.1965210412413896, + "kl": 0.059814453125, + "learning_rate": 9.7e-07, + "loss": -0.0042, + "reward": 2.5165964365005493, + "reward_std": 0.41372165083885193, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.03611667454242706, - "rewards/tag_count_reward": 1.0, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.09277866780757904, + "rewards/tag_count_reward": 0.9427083432674408, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 576.2708587646484, - "epoch": 0.195, - "grad_norm": 2.648586034553415, - "kl": 0.09375, - "learning_rate": 9.754833590196926e-07, - "loss": 0.198, - "reward": 2.8884334564208984, - "reward_std": 0.24114538729190826, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.035177784971892834, - "rewards/tag_count_reward": 1.0, + "completion_length": 519.5416870117188, + "epoch": 0.0975, + "grad_norm": 2.4964745244416346, + "kl": 0.052978515625, + "learning_rate": 9.75e-07, + "loss": 0.065, + "reward": 2.522639751434326, + "reward_std": 0.43553659319877625, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.11277706176042557, + "rewards/tag_count_reward": 0.9895833432674408, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 593.3750305175781, - "epoch": 0.196, - "grad_norm": 2.403156236141252, - "kl": 0.10107421875, - "learning_rate": 9.749693666068663e-07, - "loss": -0.0295, - "reward": 2.310402512550354, - "reward_std": 0.2717669606208801, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.03681975603103638, - "rewards/tag_count_reward": 1.0, + "completion_length": 564.7708435058594, + "epoch": 0.098, + "grad_norm": 2.2610161792561527, + "kl": 0.061767578125, + "learning_rate": 9.8e-07, + "loss": -0.0127, + "reward": 2.3144911527633667, + "reward_std": 0.4604138135910034, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.10912004858255386, + "rewards/tag_count_reward": 0.9791666865348816, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 466.60418701171875, - "epoch": 0.197, - "grad_norm": 5.530819309602341, - "kl": 0.12060546875, - "learning_rate": 9.744501960764203e-07, - "loss": 0.0515, - "reward": 2.7814574241638184, - "reward_std": 0.3644670993089676, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02062598057091236, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 471.6666717529297, + "epoch": 0.0985, + "grad_norm": 2.720385116064672, + "kl": 0.061767578125, + "learning_rate": 9.849999999999999e-07, + "loss": 0.0132, + "reward": 2.4773154258728027, + "reward_std": 0.4094041436910629, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.06955961138010025, + "rewards/tag_count_reward": 0.984375, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 528.1666870117188, - "epoch": 0.198, - "grad_norm": 2.484071909086178, - "kl": 0.1015625, - "learning_rate": 9.739258537542835e-07, - "loss": 0.0689, - "reward": 2.845056176185608, - "reward_std": 0.29101284593343735, + "completion_length": 537.2500305175781, + "epoch": 0.099, + "grad_norm": 2.7986049436758194, + "kl": 0.06494140625, + "learning_rate": 9.9e-07, + "loss": 0.0464, + "reward": 2.6958858966827393, + "reward_std": 0.3497622162103653, "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0229995958507061, - "rewards/tag_count_reward": 1.0, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.11140592023730278, + "rewards/tag_count_reward": 0.9739583432674408, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 472.9583435058594, - "epoch": 0.199, - "grad_norm": 2.111890817265467, - "kl": 0.10791015625, - "learning_rate": 9.733963460294015e-07, - "loss": -0.0139, - "reward": 2.8303216695785522, - "reward_std": 0.34869830310344696, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.034261688590049744, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 525.125, + "epoch": 0.0995, + "grad_norm": 2.5376762252956166, + "kl": 0.057861328125, + "learning_rate": 9.95e-07, + "loss": 0.0873, + "reward": 2.4644105434417725, + "reward_std": 0.5261338353157043, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.09635350480675697, + "rewards/tag_count_reward": 0.9843750298023224, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 515.3333587646484, - "epoch": 0.2, - "grad_norm": 2.557342178020337, - "kl": 0.1005859375, - "learning_rate": 9.728616793536587e-07, - "loss": -0.0301, - "reward": 2.5907087326049805, - "reward_std": 0.3476664125919342, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.027346878312528133, - "rewards/tag_count_reward": 1.0, + "completion_length": 479.56251525878906, + "epoch": 0.1, + "grad_norm": 2.51025292795749, + "kl": 0.0599365234375, + "learning_rate": 1e-06, + "loss": -0.0162, + "reward": 2.544384002685547, + "reward_std": 0.447608157992363, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.10144944489002228, + "rewards/tag_count_reward": 0.9791666865348816, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 531.8333435058594, - "epoch": 0.201, - "grad_norm": 2.699850984290719, - "kl": 0.107177734375, - "learning_rate": 9.723218602418e-07, - "loss": 0.026, - "reward": 2.548583984375, - "reward_std": 0.4529786705970764, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.027805176563560963, - "rewards/tag_count_reward": 1.0, + "completion_length": 569.0625, + "epoch": 0.1005, + "grad_norm": 2.1126300771805564, + "kl": 0.0582275390625, + "learning_rate": 9.999993146109795e-07, + "loss": 0.0202, + "reward": 2.5578041076660156, + "reward_std": 0.334363654255867, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.10539035871624947, + "rewards/tag_count_reward": 0.9895833432674408, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 506.7708435058594, - "epoch": 0.202, - "grad_norm": 2.3216160000213297, - "kl": 0.088623046875, - "learning_rate": 9.717768952713511e-07, - "loss": -0.0301, - "reward": 2.6107916831970215, - "reward_std": 0.3744069039821625, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.03851393796503544, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 487.8333435058594, + "epoch": 0.101, + "grad_norm": 2.537230361987624, + "kl": 0.068603515625, + "learning_rate": 9.999972584460056e-07, + "loss": -0.0015, + "reward": 2.552291989326477, + "reward_std": 0.5273123234510422, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.8750000596046448, + "rewards/repetition_penalty_reward": -0.09874986857175827, + "rewards/tag_count_reward": 0.984375, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 530.9375152587891, - "epoch": 0.203, - "grad_norm": 3.0552289765907665, - "kl": 0.105712890625, - "learning_rate": 9.71226791082538e-07, - "loss": 0.0276, - "reward": 2.899757981300354, - "reward_std": 0.19229509681463242, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.032533735036849976, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 460.3333435058594, + "epoch": 0.1015, + "grad_norm": 2.663772208431935, + "kl": 0.0540771484375, + "learning_rate": 9.99993831511342e-07, + "loss": 0.0486, + "reward": 2.662340521812439, + "reward_std": 0.3466527909040451, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.8263889253139496, + "rewards/repetition_penalty_reward": -0.08071524277329445, + "rewards/tag_count_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 613.8333435058594, - "epoch": 0.204, - "grad_norm": 2.25614862930677, - "kl": 0.094482421875, - "learning_rate": 9.706715543782064e-07, - "loss": -0.0024, - "reward": 2.8068939447402954, - "reward_std": 0.3218880593776703, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.0438005393370986, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 551.3750305175781, + "epoch": 0.102, + "grad_norm": 2.3037155314735895, + "kl": 0.0634765625, + "learning_rate": 9.999890338174275e-07, + "loss": 0.116, + "reward": 2.5760300159454346, + "reward_std": 0.3985503613948822, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.07674792408943176, + "rewards/tag_count_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 548.3333740234375, - "epoch": 0.205, - "grad_norm": 2.1739212203171405, - "kl": 0.100830078125, - "learning_rate": 9.701111919237408e-07, - "loss": -0.0353, - "reward": 2.628522038459778, - "reward_std": 0.37829773128032684, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.03120046854019165, - "rewards/tag_count_reward": 1.0, + "completion_length": 460.6458435058594, + "epoch": 0.1025, + "grad_norm": 2.659875652962376, + "kl": 0.064453125, + "learning_rate": 9.99982865378877e-07, + "loss": -0.0381, + "reward": 2.600967049598694, + "reward_std": 0.34831008315086365, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.0691719576716423, + "rewards/tag_count_reward": 0.9895833432674408, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 575.6666870117188, - "epoch": 0.206, - "grad_norm": 2.142090873135409, - "kl": 0.081298828125, - "learning_rate": 9.695457105469804e-07, - "loss": 0.0283, - "reward": 2.6884310245513916, - "reward_std": 0.1302720569074154, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.042471904307603836, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 473.8333435058594, + "epoch": 0.103, + "grad_norm": 2.672719916018964, + "kl": 0.0626220703125, + "learning_rate": 9.999753262144804e-07, + "loss": 0.1019, + "reward": 2.5358208417892456, + "reward_std": 0.42981427907943726, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.061401575803756714, + "rewards/tag_count_reward": 1.0, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 537.5833587646484, - "epoch": 0.207, - "grad_norm": 2.5678922668456563, - "kl": 0.092529296875, - "learning_rate": 9.689751171381377e-07, - "loss": 0.0451, - "reward": 2.6688740253448486, - "reward_std": 0.40736155211925507, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.025570510886609554, - "rewards/tag_count_reward": 1.0, + "completion_length": 469.6458435058594, + "epoch": 0.1035, + "grad_norm": 2.5741947297183385, + "kl": 0.0634765625, + "learning_rate": 9.999664163472034e-07, + "loss": -0.0236, + "reward": 2.332731008529663, + "reward_std": 0.4647013247013092, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8611111342906952, + "rewards/repetition_penalty_reward": -0.08567183464765549, + "rewards/tag_count_reward": 0.9947916865348816, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 585.875, - "epoch": 0.208, - "grad_norm": 2.5924187225102355, - "kl": 0.098388671875, - "learning_rate": 9.683994186497132e-07, - "loss": 0.169, - "reward": 2.3460363149642944, - "reward_std": 0.4289597123861313, - "rewards/accuracy_reward": 0.4375000111758709, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.034172071143984795, - "rewards/tag_count_reward": 0.984375, + "completion_length": 546.7916870117188, + "epoch": 0.104, + "grad_norm": 2.7033266921870123, + "kl": 0.0594482421875, + "learning_rate": 9.999561358041868e-07, + "loss": -0.0888, + "reward": 2.297361373901367, + "reward_std": 0.5272791683673859, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.8750001192092896, + "rewards/repetition_penalty_reward": -0.07763872668147087, + "rewards/tag_count_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 488.35418701171875, - "epoch": 0.209, - "grad_norm": 2.6979608618101913, - "kl": 0.099609375, - "learning_rate": 9.67818622096411e-07, - "loss": 0.0553, - "reward": 2.830100178718567, - "reward_std": 0.3162241727113724, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.017122075892984867, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 443.62501525878906, + "epoch": 0.1045, + "grad_norm": 3.90101302775015, + "kl": 0.072265625, + "learning_rate": 9.99944484616747e-07, + "loss": 0.0252, + "reward": 2.6521321535110474, + "reward_std": 0.33744025230407715, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.8541667461395264, + "rewards/repetition_penalty_reward": -0.07703462615609169, + "rewards/tag_count_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 535.6041870117188, - "epoch": 0.21, - "grad_norm": 2.6398456904077494, - "kl": 0.09814453125, - "learning_rate": 9.672327345550543e-07, - "loss": 0.0618, - "reward": 2.509260654449463, - "reward_std": 0.28831499069929123, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.023725682869553566, - "rewards/tag_count_reward": 0.984375, + "completion_length": 540.4375, + "epoch": 0.105, + "grad_norm": 2.31473784045809, + "kl": 0.0623779296875, + "learning_rate": 9.99931462820376e-07, + "loss": 0.0167, + "reward": 2.5439374446868896, + "reward_std": 0.2827417850494385, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.10884056985378265, + "rewards/tag_count_reward": 1.0, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 599.6041870117188, - "epoch": 0.211, - "grad_norm": 4.517806643959562, - "kl": 0.109130859375, - "learning_rate": 9.666417631644976e-07, - "loss": 0.0023, - "reward": 2.5767656564712524, - "reward_std": 0.3708747327327728, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03608160652220249, + "completion_length": 518.5625, + "epoch": 0.1055, + "grad_norm": 2.4382433122696763, + "kl": 0.074462890625, + "learning_rate": 9.999170704547398e-07, + "loss": 0.0511, + "reward": 2.5632206201553345, + "reward_std": 0.31003791093826294, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.0912933386862278, "rewards/tag_count_reward": 0.9947916865348816, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 589.4166870117188, - "epoch": 0.212, - "grad_norm": 2.261259945602269, - "kl": 0.10205078125, - "learning_rate": 9.66045715125541e-07, - "loss": 0.097, - "reward": 2.5739282369613647, - "reward_std": 0.3682664930820465, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.040655218064785004, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 503.8125305175781, + "epoch": 0.106, + "grad_norm": 2.370275119306627, + "kl": 0.0654296875, + "learning_rate": 9.999013075636804e-07, + "loss": -0.008, + "reward": 2.4429709911346436, + "reward_std": 0.4311106353998184, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.09175140410661697, + "rewards/tag_count_reward": 1.0, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 515.7291870117188, - "epoch": 0.213, - "grad_norm": 2.397926932727844, - "kl": 0.1044921875, - "learning_rate": 9.654445977008414e-07, - "loss": 0.0394, - "reward": 2.682453751564026, - "reward_std": 0.4673650562763214, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024143685586750507, - "rewards/tag_count_reward": 0.984375, + "completion_length": 588.3958740234375, + "epoch": 0.1065, + "grad_norm": 2.5342323653428696, + "kl": 0.0633544921875, + "learning_rate": 9.998841741952141e-07, + "loss": 0.1784, + "reward": 2.6273250579833984, + "reward_std": 0.28149472177028656, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.09489719569683075, + "rewards/tag_count_reward": 1.0, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 691.2916870117188, - "epoch": 0.214, - "grad_norm": 8.712060550440611, - "kl": 0.124755859375, - "learning_rate": 9.648384182148252e-07, - "loss": 0.0044, - "reward": 2.719285011291504, - "reward_std": 0.3266633450984955, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.04807615838944912, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 525.9375305175781, + "epoch": 0.107, + "grad_norm": 2.1916917773556763, + "kl": 0.06689453125, + "learning_rate": 9.998656704015323e-07, + "loss": -0.0292, + "reward": 2.3184261322021484, + "reward_std": 0.42756618559360504, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.09303238615393639, + "rewards/tag_count_reward": 0.9947916865348816, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 609.5416870117188, - "epoch": 0.215, - "grad_norm": 2.215014758114719, - "kl": 0.09326171875, - "learning_rate": 9.64227184053598e-07, - "loss": 0.0437, - "reward": 2.835343837738037, - "reward_std": 0.26147839426994324, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.04312858823686838, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 622.7708435058594, + "epoch": 0.1075, + "grad_norm": 2.279834707710596, + "kl": 0.065673828125, + "learning_rate": 9.998457962390008e-07, + "loss": 0.0573, + "reward": 2.3788540363311768, + "reward_std": 0.44249793887138367, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.09510444849729538, + "rewards/tag_count_reward": 0.9947916865348816, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 600.7291870117188, - "epoch": 0.216, - "grad_norm": 2.0090732793867403, - "kl": 0.124267578125, - "learning_rate": 9.636109026648554e-07, - "loss": -0.091, - "reward": 2.805617928504944, - "reward_std": 0.33824414014816284, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.04160444252192974, - "rewards/tag_count_reward": 1.0, + "completion_length": 561.1458435058594, + "epoch": 0.108, + "grad_norm": 2.4104772332041033, + "kl": 0.068603515625, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0665, + "reward": 2.5858339071273804, + "reward_std": 0.3917628526687622, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07562451809644699, + "rewards/tag_count_reward": 0.9739583432674408, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 645.2083740234375, - "epoch": 0.217, - "grad_norm": 13.509335743253168, - "kl": 0.146484375, - "learning_rate": 9.629895815577915e-07, - "loss": -0.0366, - "reward": 2.7113317251205444, - "reward_std": 0.34692464768886566, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.045612769201397896, + "completion_length": 494.2291717529297, + "epoch": 0.1085, + "grad_norm": 2.3453403680398917, + "kl": 0.07568359375, + "learning_rate": 9.998019370537227e-07, + "loss": -0.0702, + "reward": 2.553897261619568, + "reward_std": 0.29422348737716675, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06415844522416592, "rewards/tag_count_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 644.5208435058594, - "epoch": 0.218, - "grad_norm": 2.1194540845072596, - "kl": 0.09033203125, - "learning_rate": 9.623632283030077e-07, - "loss": 0.0715, - "reward": 2.900136113166809, - "reward_std": 0.12626729905605316, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.04430842399597168, - "rewards/tag_count_reward": 1.0, - "step": 218 - }, + "completion_length": 535.3541870117188, + "epoch": 0.109, + "grad_norm": 2.203319463752206, + "kl": 0.0732421875, + "learning_rate": 9.997779521645791e-07, + "loss": -0.0537, + "reward": 2.600339412689209, + "reward_std": 0.34052593261003494, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.07500774413347244, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 218 + }, { "clip_ratio": 0.0, - "completion_length": 621.1041870117188, - "epoch": 0.219, - "grad_norm": 2.2781852426998395, - "kl": 0.1044921875, - "learning_rate": 9.617318505324212e-07, - "loss": -0.0177, - "reward": 2.647742986679077, - "reward_std": 0.42986422777175903, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0328126884996891, - "rewards/tag_count_reward": 1.0, + "completion_length": 511.16668701171875, + "epoch": 0.1095, + "grad_norm": 2.6145352347525184, + "kl": 0.075927734375, + "learning_rate": 9.997525971737909e-07, + "loss": 0.1401, + "reward": 2.736908793449402, + "reward_std": 0.2718254253268242, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.09121626242995262, + "rewards/tag_count_reward": 0.9947916865348816, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 605.4791870117188, - "epoch": 0.22, - "grad_norm": 2.061772640741915, - "kl": 0.096435546875, - "learning_rate": 9.610954559391704e-07, - "loss": 0.041, - "reward": 2.9348647594451904, - "reward_std": 0.0986992521211505, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03735771402716637, - "rewards/tag_count_reward": 1.0, + "completion_length": 524.2500305175781, + "epoch": 0.11, + "grad_norm": 2.5498578386752473, + "kl": 0.07666015625, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0166, + "reward": 2.643193244934082, + "reward_std": 0.3977600634098053, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.08770973607897758, + "rewards/tag_count_reward": 0.9947916865348816, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 548.5416870117188, - "epoch": 0.221, - "grad_norm": 2.174295218510334, - "kl": 0.10888671875, - "learning_rate": 9.604540522775227e-07, - "loss": 0.0495, - "reward": 2.896275758743286, - "reward_std": 0.1799733191728592, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02386310324072838, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 534.625, + "epoch": 0.1105, + "grad_norm": 2.4145494510921766, + "kl": 0.08544921875, + "learning_rate": 9.99697777200395e-07, + "loss": 0.0173, + "reward": 2.4541492462158203, + "reward_std": 0.46001937985420227, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.09446194767951965, + "rewards/tag_count_reward": 1.0, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 544.0, - "epoch": 0.222, - "grad_norm": 2.1466512297504834, - "kl": 0.10986328125, - "learning_rate": 9.598076473627796e-07, - "loss": 0.0578, - "reward": 2.740749478340149, - "reward_std": 0.31613367795944214, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.03182023763656616, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 586.8541870117188, + "epoch": 0.111, + "grad_norm": 2.465480916047294, + "kl": 0.069580078125, + "learning_rate": 9.996683123847795e-07, + "loss": 0.0974, + "reward": 2.7027587890625, + "reward_std": 0.3243703097105026, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.09238022565841675, + "rewards/tag_count_reward": 0.9895833432674408, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 535.7708435058594, - "epoch": 0.223, - "grad_norm": 2.280033428835585, - "kl": 0.10107421875, - "learning_rate": 9.59156249071181e-07, - "loss": 0.0484, - "reward": 2.7902063131332397, - "reward_std": 0.30508676171302795, + "completion_length": 557.25, + "epoch": 0.1115, + "grad_norm": 2.363152779089099, + "kl": 0.077880859375, + "learning_rate": 9.996374778015007e-07, + "loss": 0.075, + "reward": 2.7129660844802856, + "reward_std": 0.39234504103660583, "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03618272300809622, - "rewards/tag_count_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.07522866874933243, + "rewards/tag_count_reward": 0.9895833432674408, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 611.3958435058594, - "epoch": 0.224, - "grad_norm": 2.78636229997098, - "kl": 0.103515625, - "learning_rate": 9.58499865339809e-07, - "loss": -0.0115, - "reward": 2.8599945306777954, - "reward_std": 0.2055322751402855, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03583900257945061, - "rewards/tag_count_reward": 1.0, + "completion_length": 606.8958435058594, + "epoch": 0.112, + "grad_norm": 2.153335337523184, + "kl": 0.073486328125, + "learning_rate": 9.996052735444862e-07, + "loss": 0.088, + "reward": 2.5669835805892944, + "reward_std": 0.45203205943107605, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.08579418063163757, + "rewards/tag_count_reward": 0.9791666865348816, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 539.375, - "epoch": 0.225, - "grad_norm": 2.3136617005859286, - "kl": 0.100341796875, - "learning_rate": 9.578385041664925e-07, - "loss": 0.0279, - "reward": 2.5980384349823, - "reward_std": 0.35135801136493683, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.026961631141602993, - "rewards/tag_count_reward": 1.0, + "completion_length": 568.1875305175781, + "epoch": 0.1125, + "grad_norm": 2.3030578231246324, + "kl": 0.069580078125, + "learning_rate": 9.99571699711836e-07, + "loss": 0.0667, + "reward": 2.5682613849639893, + "reward_std": 0.37949907779693604, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.09319691359996796, + "rewards/tag_count_reward": 0.9947916865348816, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 599.7291870117188, - "epoch": 0.226, - "grad_norm": 2.7312479125410287, - "kl": 0.11376953125, - "learning_rate": 9.571721736097088e-07, - "loss": -0.0362, - "reward": 2.461398482322693, - "reward_std": 0.46244145929813385, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.024712713435292244, - "rewards/tag_count_reward": 1.0, + "completion_length": 600.0833435058594, + "epoch": 0.113, + "grad_norm": 2.6766438894868294, + "kl": 0.070556640625, + "learning_rate": 9.995367564058216e-07, + "loss": 0.2762, + "reward": 2.503145933151245, + "reward_std": 0.5531609952449799, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.12011810764670372, + "rewards/tag_count_reward": 0.9843750298023224, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 503.6458435058594, - "epoch": 0.227, - "grad_norm": 2.9850488724293416, - "kl": 0.11083984375, - "learning_rate": 9.565008817884854e-07, - "loss": 0.1147, - "reward": 2.619432210922241, - "reward_std": 0.47723488509655, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.028137334622442722, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 579.0833740234375, + "epoch": 0.1135, + "grad_norm": 2.2069984250119963, + "kl": 0.074462890625, + "learning_rate": 9.995004437328865e-07, + "loss": 0.0125, + "reward": 2.4455236196517944, + "reward_std": 0.4911371320486069, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.08919859677553177, + "rewards/tag_count_reward": 1.0, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 523.9583435058594, - "epoch": 0.228, - "grad_norm": 2.171102325634445, - "kl": 0.107421875, - "learning_rate": 9.55824636882301e-07, - "loss": -0.0038, - "reward": 2.581455111503601, - "reward_std": 0.357488214969635, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.036600593477487564, - "rewards/tag_count_reward": 1.0, + "completion_length": 562.5000305175781, + "epoch": 0.114, + "grad_norm": 2.219775988144525, + "kl": 0.07666015625, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0123, + "reward": 2.5935659408569336, + "reward_std": 0.34948965907096863, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.08351738005876541, + "rewards/tag_count_reward": 0.9895833432674408, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 552.9375152587891, - "epoch": 0.229, - "grad_norm": 2.3604731804980457, - "kl": 0.115234375, - "learning_rate": 9.55143447130987e-07, - "loss": 0.044, - "reward": 2.7417712211608887, - "reward_std": 0.29566267877817154, + "completion_length": 591.1666870117188, + "epoch": 0.1145, + "grad_norm": 2.952466021730348, + "kl": 0.080078125, + "learning_rate": 9.994237107328838e-07, + "loss": 0.0722, + "reward": 2.6423157453536987, + "reward_std": 0.2888629548251629, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02906225249171257, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.08685098960995674, "rewards/tag_count_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 599.6666870117188, - "epoch": 0.23, - "grad_norm": 2.247209523128909, - "kl": 0.1123046875, - "learning_rate": 9.54457320834625e-07, - "loss": 0.0325, - "reward": 2.77982234954834, - "reward_std": 0.38850660622119904, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03267781715840101, + "completion_length": 591.8958435058594, + "epoch": 0.115, + "grad_norm": 2.1182653417835424, + "kl": 0.07373046875, + "learning_rate": 9.993832906395582e-07, + "loss": 0.0002, + "reward": 2.5170371532440186, + "reward_std": 0.468481108546257, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.08712967112660408, "rewards/tag_count_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 498.91668701171875, - "epoch": 0.231, - "grad_norm": 2.1665114909176872, - "kl": 0.103515625, - "learning_rate": 9.537662663534477e-07, - "loss": 0.01, - "reward": 2.9319541454315186, - "reward_std": 0.12954077869653702, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.03332359157502651, + "completion_length": 546.125, + "epoch": 0.1155, + "grad_norm": 2.0941520101521065, + "kl": 0.072509765625, + "learning_rate": 9.993415016467952e-07, + "loss": -0.01, + "reward": 2.8371798992156982, + "reward_std": 0.18849333748221397, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.07254217192530632, "rewards/tag_count_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 565.1458435058594, - "epoch": 0.232, - "grad_norm": 2.397445136474842, - "kl": 0.1044921875, - "learning_rate": 9.530702921077358e-07, - "loss": 0.0176, - "reward": 2.7926957607269287, - "reward_std": 0.3122672885656357, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.04063776135444641, + "completion_length": 600.3333587646484, + "epoch": 0.116, + "grad_norm": 2.070220926978154, + "kl": 0.0732421875, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0136, + "reward": 2.483844041824341, + "reward_std": 0.35293829441070557, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.09254487603902817, "rewards/tag_count_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 515.5625152587891, - "epoch": 0.233, - "grad_norm": 2.3670675203551528, - "kl": 0.113525390625, - "learning_rate": 9.523694065777156e-07, - "loss": 0.0494, - "reward": 2.6802080869674683, - "reward_std": 0.28352178633213043, + "completion_length": 499.60418701171875, + "epoch": 0.1165, + "grad_norm": 3.1702925639839306, + "kl": 0.08349609375, + "learning_rate": 9.992538174763127e-07, + "loss": -0.0305, + "reward": 2.5292654037475586, + "reward_std": 0.334541991353035, "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02812536619603634, - "rewards/tag_count_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.09226257354021072, + "rewards/tag_count_reward": 0.9895833432674408, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 524.875, - "epoch": 0.234, - "grad_norm": 2.1646989735165527, - "kl": 0.121826171875, - "learning_rate": 9.516636183034564e-07, - "loss": 0.0408, - "reward": 2.791554570198059, - "reward_std": 0.38019663095474243, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03309844061732292, + "completion_length": 564.8125152587891, + "epoch": 0.117, + "grad_norm": 2.3960583772955424, + "kl": 0.07763671875, + "learning_rate": 9.992079225656944e-07, + "loss": 0.029, + "reward": 2.762247085571289, + "reward_std": 0.279547318816185, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.09018350392580032, "rewards/tag_count_reward": 0.984375, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 560.0208435058594, - "epoch": 0.235, - "grad_norm": 2.2756987535611404, - "kl": 0.10693359375, - "learning_rate": 9.509529358847654e-07, - "loss": 0.0085, - "reward": 2.8194351196289062, - "reward_std": 0.3406111150979996, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03473140113055706, + "completion_length": 607.2083435058594, + "epoch": 0.1175, + "grad_norm": 2.042913157935333, + "kl": 0.068115234375, + "learning_rate": 9.9916065928984e-07, + "loss": -0.0039, + "reward": 2.4307457208633423, + "reward_std": 0.44265393912792206, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.11786547303199768, "rewards/tag_count_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 575.375, - "epoch": 0.236, - "grad_norm": 2.174744939943205, - "kl": 0.100830078125, - "learning_rate": 9.502373679810839e-07, - "loss": -0.0023, - "reward": 2.7065417766571045, - "reward_std": 0.15310228383168578, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03651383891701698, + "completion_length": 533.5208435058594, + "epoch": 0.118, + "grad_norm": 2.2072960786944313, + "kl": 0.073974609375, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0585, + "reward": 2.7634425163269043, + "reward_std": 0.2200346365571022, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.10461306571960449, "rewards/tag_count_reward": 1.0, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 484.9583435058594, - "epoch": 0.237, - "grad_norm": 2.6379983576818242, - "kl": 0.119140625, - "learning_rate": 9.495169233113806e-07, - "loss": 0.0566, - "reward": 2.538248300552368, - "reward_std": 0.3112593740224838, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.031196235679090023, + "completion_length": 564.6458435058594, + "epoch": 0.1185, + "grad_norm": 2.2413703328479935, + "kl": 0.076171875, + "learning_rate": 9.990620282224806e-07, + "loss": -0.024, + "reward": 2.489652156829834, + "reward_std": 0.32129333913326263, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.09368134289979935, "rewards/tag_count_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 565.7916870117188, - "epoch": 0.238, - "grad_norm": 2.052140048933374, - "kl": 0.114990234375, - "learning_rate": 9.487916106540465e-07, - "loss": -0.0027, - "reward": 2.778996706008911, - "reward_std": 0.32464583218097687, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03350352682173252, + "completion_length": 509.1041717529297, + "epoch": 0.119, + "grad_norm": 2.255281998818256, + "kl": 0.071044921875, + "learning_rate": 9.990106607314225e-07, + "loss": 0.0002, + "reward": 2.6565502882003784, + "reward_std": 0.40016523003578186, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.07261638343334198, "rewards/tag_count_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 507.2291717529297, - "epoch": 0.239, - "grad_norm": 2.3870284690729915, - "kl": 0.103515625, - "learning_rate": 9.480614388467877e-07, - "loss": 0.0266, - "reward": 2.865510582923889, - "reward_std": 0.2045399323105812, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.037267398089170456, - "rewards/tag_count_reward": 1.0, + "completion_length": 528.0833587646484, + "epoch": 0.1195, + "grad_norm": 2.188877162746338, + "kl": 0.07568359375, + "learning_rate": 9.989579254760224e-07, + "loss": -0.0314, + "reward": 2.826788544654846, + "reward_std": 0.20467501878738403, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.05862821638584137, + "rewards/tag_count_reward": 0.9895833432674408, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 473.0833435058594, - "epoch": 0.24, - "grad_norm": 2.340901673290699, - "kl": 0.120849609375, - "learning_rate": 9.473264167865171e-07, - "loss": -0.0045, - "reward": 2.849257707595825, - "reward_std": 0.26365862786769867, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.018797898665070534, - "rewards/tag_count_reward": 1.0, + "completion_length": 568.8125305175781, + "epoch": 0.12, + "grad_norm": 2.434441145634029, + "kl": 0.074951171875, + "learning_rate": 9.989038226169207e-07, + "loss": 0.1175, + "reward": 2.318480372428894, + "reward_std": 0.2832699418067932, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.07908925786614418, + "rewards/tag_count_reward": 0.9947916865348816, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 502.3125, - "epoch": 0.241, - "grad_norm": 1.9771145576165876, - "kl": 0.10791015625, - "learning_rate": 9.465865534292464e-07, - "loss": 0.0245, - "reward": 2.665071487426758, - "reward_std": 0.25069355964660645, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.036317549645900726, + "completion_length": 533.5416870117188, + "epoch": 0.1205, + "grad_norm": 2.378465168474232, + "kl": 0.081787109375, + "learning_rate": 9.988483523189248e-07, + "loss": 0.0519, + "reward": 2.758083462715149, + "reward_std": 0.313043013215065, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.07524998486042023, "rewards/tag_count_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 469.0625, - "epoch": 0.242, - "grad_norm": 2.429463773148552, - "kl": 0.12890625, - "learning_rate": 9.458418577899774e-07, - "loss": 0.0219, - "reward": 2.832092523574829, - "reward_std": 0.3335033804178238, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.015129742678254843, + "completion_length": 646.6041870117188, + "epoch": 0.121, + "grad_norm": 2.0129363099493394, + "kl": 0.070068359375, + "learning_rate": 9.98791514751006e-07, + "loss": -0.0271, + "reward": 2.4740262031555176, + "reward_std": 0.3102487027645111, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.08847374841570854, "rewards/tag_count_reward": 1.0, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 509.1041717529297, - "epoch": 0.243, - "grad_norm": 2.2888490947191453, - "kl": 0.11669921875, - "learning_rate": 9.450923389425911e-07, - "loss": 0.0679, - "reward": 2.8057637214660645, - "reward_std": 0.3620642125606537, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.027569908648729324, + "completion_length": 495.85418701171875, + "epoch": 0.1215, + "grad_norm": 2.3579635378621226, + "kl": 0.080810546875, + "learning_rate": 9.98733310086302e-07, + "loss": 0.0068, + "reward": 2.6361684799194336, + "reward_std": 0.22484752535820007, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.0860537700355053, "rewards/tag_count_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 462.3958435058594, - "epoch": 0.244, - "grad_norm": 2.7695341753126383, - "kl": 0.12841796875, - "learning_rate": 9.443380060197385e-07, - "loss": 0.028, - "reward": 2.687015175819397, - "reward_std": 0.19807551801204681, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02305438881739974, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 570.1250305175781, + "epoch": 0.122, + "grad_norm": 2.2032148417869277, + "kl": 0.0771484375, + "learning_rate": 9.98673738502114e-07, + "loss": -0.0345, + "reward": 2.1713971495628357, + "reward_std": 0.3244406059384346, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07860295474529266, + "rewards/tag_count_reward": 1.0, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 562.0625152587891, - "epoch": 0.245, - "grad_norm": 2.0280976947569536, - "kl": 0.12060546875, - "learning_rate": 9.43578868212728e-07, - "loss": -0.044, - "reward": 2.79758882522583, - "reward_std": 0.2576555460691452, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.028800049796700478, - "rewards/tag_count_reward": 1.0, + "completion_length": 607.0833435058594, + "epoch": 0.1225, + "grad_norm": 2.343457795616331, + "kl": 0.07861328125, + "learning_rate": 9.986128001799076e-07, + "loss": -0.0081, + "reward": 2.586448907852173, + "reward_std": 0.3658638447523117, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.08889832720160484, + "rewards/tag_count_reward": 0.9947916865348816, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 529.3333435058594, - "epoch": 0.246, - "grad_norm": 2.3299542232310477, - "kl": 0.119873046875, - "learning_rate": 9.428149347714143e-07, - "loss": -0.0061, - "reward": 2.660530924797058, - "reward_std": 0.16930758208036423, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026969049125909805, - "rewards/tag_count_reward": 1.0, - "step": 246 + "completion_length": 528.2708435058594, + "epoch": 0.123, + "grad_norm": 2.1457635532124315, + "kl": 0.09033203125, + "learning_rate": 9.985504953053113e-07, + "loss": 0.0243, + "reward": 2.761418104171753, + "reward_std": 0.19703956693410873, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.07365139201283455, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 488.72918701171875, - "epoch": 0.247, - "grad_norm": 2.300003946060058, - "kl": 0.1318359375, - "learning_rate": 9.420462150040852e-07, - "loss": 0.0401, - "reward": 2.5491377115249634, - "reward_std": 0.3199358731508255, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.027251286432147026, - "rewards/tag_count_reward": 1.0, + "completion_length": 606.6458435058594, + "epoch": 0.1235, + "grad_norm": 2.3768865639458063, + "kl": 0.083740234375, + "learning_rate": 9.984868240681164e-07, + "loss": 0.061, + "reward": 2.5193967819213867, + "reward_std": 0.4161098003387451, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.08129774034023285, + "rewards/tag_count_reward": 0.9895833432674408, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 528.2291870117188, - "epoch": 0.248, - "grad_norm": 2.242455600453007, - "kl": 0.119384765625, - "learning_rate": 9.412727182773486e-07, - "loss": 0.0059, - "reward": 2.7635743618011475, - "reward_std": 0.3389218971133232, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02809229213744402, + "completion_length": 626.9375305175781, + "epoch": 0.124, + "grad_norm": 2.4645417816762683, + "kl": 0.077392578125, + "learning_rate": 9.98421786662277e-07, + "loss": 0.148, + "reward": 2.6498433351516724, + "reward_std": 0.4323410838842392, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.08626773580908775, "rewards/tag_count_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 639.5625305175781, - "epoch": 0.249, - "grad_norm": 1.9733887624956274, - "kl": 0.110595703125, - "learning_rate": 9.404944540160177e-07, - "loss": -0.0684, - "reward": 2.6618722677230835, - "reward_std": 0.2736282553523779, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03951651602983475, - "rewards/tag_count_reward": 1.0, + "completion_length": 543.25, + "epoch": 0.1245, + "grad_norm": 2.3866168405234474, + "kl": 0.081298828125, + "learning_rate": 9.983553832859078e-07, + "loss": 0.0302, + "reward": 2.5024802684783936, + "reward_std": 0.43941864371299744, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.07043641060590744, + "rewards/tag_count_reward": 0.9895833432674408, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 501.12501525878906, - "epoch": 0.25, - "grad_norm": 2.167863484844858, - "kl": 0.13525390625, - "learning_rate": 9.397114317029974e-07, - "loss": 0.004, - "reward": 2.526219606399536, - "reward_std": 0.380461186170578, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.0293360473588109, - "rewards/tag_count_reward": 1.0, + "completion_length": 571.1875305175781, + "epoch": 0.125, + "grad_norm": 2.425419283542172, + "kl": 0.112060546875, + "learning_rate": 9.982876141412855e-07, + "loss": 0.034, + "reward": 2.78093945980072, + "reward_std": 0.2996261715888977, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.07496321201324463, + "rewards/tag_count_reward": 0.9739583730697632, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 592.2083435058594, - "epoch": 0.251, - "grad_norm": 2.441314951742952, - "kl": 0.11767578125, - "learning_rate": 9.38923660879167e-07, - "loss": 0.1345, - "reward": 2.7307329177856445, - "reward_std": 0.2994953393936157, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.033156177029013634, + "completion_length": 620.5, + "epoch": 0.1255, + "grad_norm": 2.129504256319349, + "kl": 0.080322265625, + "learning_rate": 9.982184794348462e-07, + "loss": 0.0166, + "reward": 2.693773031234741, + "reward_std": 0.40914659202098846, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.08400484919548035, "rewards/tag_count_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 564.2708435058594, - "epoch": 0.252, - "grad_norm": 2.49677783518446, - "kl": 0.122802734375, - "learning_rate": 9.381311511432658e-07, - "loss": 0.1067, - "reward": 2.4708348512649536, - "reward_std": 0.36502179503440857, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.036109643056988716, - "rewards/tag_count_reward": 1.0, + "completion_length": 534.3958435058594, + "epoch": 0.126, + "grad_norm": 1.8697440201638618, + "kl": 0.087890625, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0042, + "reward": 2.5290188789367676, + "reward_std": 0.21817893348634243, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.10466165468096733, + "rewards/tag_count_reward": 0.9947916865348816, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 601.3125305175781, - "epoch": 0.253, - "grad_norm": 2.031276195067717, - "kl": 0.119384765625, - "learning_rate": 9.373339121517746e-07, - "loss": 0.0435, - "reward": 2.9231903553009033, - "reward_std": 0.10614164918661118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04382369481027126, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 609.8958587646484, + "epoch": 0.1265, + "grad_norm": 2.3009060679085156, + "kl": 0.078369140625, + "learning_rate": 9.98076114183062e-07, + "loss": 0.1217, + "reward": 2.6637524366378784, + "reward_std": 0.40659724175930023, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06194208189845085, + "rewards/tag_count_reward": 0.96875, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 572.4791870117188, - "epoch": 0.254, - "grad_norm": 2.1020100515227313, - "kl": 0.12060546875, - "learning_rate": 9.36531953618799e-07, - "loss": 0.0657, - "reward": 2.7917503118515015, - "reward_std": 0.26876559667289257, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.029430264607071877, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 720.5416870117188, + "epoch": 0.127, + "grad_norm": 2.2567238155019322, + "kl": 0.080078125, + "learning_rate": 9.98002884071386e-07, + "loss": 0.0781, + "reward": 2.4784340858459473, + "reward_std": 0.4698047339916229, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.11531602591276169, + "rewards/tag_count_reward": 0.9479166865348816, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 553.8333740234375, - "epoch": 0.255, - "grad_norm": 3.5327461787319776, - "kl": 0.13232421875, - "learning_rate": 9.357252853159505e-07, - "loss": 0.0351, - "reward": 2.589944839477539, - "reward_std": 0.22619716823101044, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02811075560748577, - "rewards/tag_count_reward": 1.0, + "completion_length": 644.8125305175781, + "epoch": 0.1275, + "grad_norm": 1.972516265850252, + "kl": 0.081298828125, + "learning_rate": 9.979282892652304e-07, + "loss": 0.0858, + "reward": 2.5695180892944336, + "reward_std": 0.34912221878767014, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.11277362704277039, + "rewards/tag_count_reward": 0.9739583432674408, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 548.0416870117188, - "epoch": 0.256, - "grad_norm": 2.183159179082376, - "kl": 0.13525390625, - "learning_rate": 9.34913917072228e-07, - "loss": -0.0696, - "reward": 2.6049808263778687, - "reward_std": 0.3286217898130417, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.028699966147542, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 641.7708435058594, + "epoch": 0.128, + "grad_norm": 2.042073924733626, + "kl": 0.08740234375, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0308, + "reward": 2.818283200263977, + "reward_std": 0.27264343202114105, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06886980310082436, + "rewards/tag_count_reward": 0.984375, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 580.6041870117188, - "epoch": 0.257, - "grad_norm": 2.401471424731539, - "kl": 0.12744140625, - "learning_rate": 9.340978587738972e-07, - "loss": 0.0462, - "reward": 2.8021459579467773, - "reward_std": 0.3006982207298279, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.031187265180051327, - "rewards/tag_count_reward": 1.0, + "completion_length": 572.0625305175781, + "epoch": 0.1285, + "grad_norm": 2.101899385079832, + "kl": 0.08544921875, + "learning_rate": 9.977750064825519e-07, + "loss": -0.024, + "reward": 2.627909779548645, + "reward_std": 0.3649473935365677, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.08215974271297455, + "rewards/tag_count_reward": 0.9947916865348816, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 570.6041870117188, - "epoch": 0.258, - "grad_norm": 7.315971327716766, - "kl": 0.12353515625, - "learning_rate": 9.332771203643714e-07, - "loss": 0.1692, - "reward": 2.6945351362228394, - "reward_std": 0.3729802221059799, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0398398581892252, - "rewards/tag_count_reward": 0.984375, + "completion_length": 498.64585876464844, + "epoch": 0.129, + "grad_norm": 2.2345487850590944, + "kl": 0.088134765625, + "learning_rate": 9.976963189729547e-07, + "loss": -0.0191, + "reward": 2.7989070415496826, + "reward_std": 0.24870866537094116, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.06220416724681854, + "rewards/tag_count_reward": 1.0, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 462.18751525878906, - "epoch": 0.259, - "grad_norm": 2.249715244876303, - "kl": 0.1328125, - "learning_rate": 9.324517118440888e-07, - "loss": 0.0048, - "reward": 2.757077693939209, - "reward_std": 0.31304194778203964, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.020700284279882908, - "rewards/tag_count_reward": 1.0, + "completion_length": 715.6250305175781, + "epoch": 0.1295, + "grad_norm": 1.8695009477650302, + "kl": 0.07861328125, + "learning_rate": 9.976162677027284e-07, + "loss": 0.0655, + "reward": 2.5826069116592407, + "reward_std": 0.4399617910385132, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.11010143533349037, + "rewards/tag_count_reward": 0.984375, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 508.2083435058594, - "epoch": 0.26, - "grad_norm": 5.193269569172354, - "kl": 0.1416015625, - "learning_rate": 9.316216432703916e-07, - "loss": 0.1727, - "reward": 2.8865363597869873, - "reward_std": 0.19043887220323086, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04228321462869644, - "rewards/tag_count_reward": 0.984375, + "completion_length": 501.1458435058594, + "epoch": 0.13, + "grad_norm": 2.1459813731795068, + "kl": 0.08251953125, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0961, + "reward": 2.8910681009292603, + "reward_std": 0.10162430070340633, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0776820220053196, + "rewards/tag_count_reward": 0.9895833432674408, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 496.58335876464844, - "epoch": 0.261, - "grad_norm": 2.2509057041996443, - "kl": 0.13623046875, - "learning_rate": 9.307869247574038e-07, - "loss": 0.0359, - "reward": 2.8467074632644653, - "reward_std": 0.2996975928544998, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.02829269878566265, - "rewards/tag_count_reward": 1.0, + "completion_length": 555.25, + "epoch": 0.1305, + "grad_norm": 2.0994299368541287, + "kl": 0.0859375, + "learning_rate": 9.974520748599421e-07, + "loss": 0.0963, + "reward": 2.3302276134490967, + "reward_std": 0.17742525041103363, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.07775864750146866, + "rewards/tag_count_reward": 0.984375, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 628.9375, - "epoch": 0.262, - "grad_norm": 4.668495262967337, - "kl": 0.13623046875, - "learning_rate": 9.299475664759068e-07, - "loss": 0.1609, - "reward": 2.720739483833313, - "reward_std": 0.4649975746870041, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.04141336679458618, - "rewards/tag_count_reward": 0.9843750298023224, + "completion_length": 707.6041870117188, + "epoch": 0.131, + "grad_norm": 2.179306110235832, + "kl": 0.09033203125, + "learning_rate": 9.973679337875418e-07, + "loss": 0.0756, + "reward": 2.457505941390991, + "reward_std": 0.49539755284786224, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.09978591650724411, + "rewards/tag_count_reward": 0.9531250298023224, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 519.0, - "epoch": 0.263, - "grad_norm": 2.3541332969660975, - "kl": 0.13671875, - "learning_rate": 9.291035786532163e-07, - "loss": 0.0638, - "reward": 2.4355462789535522, - "reward_std": 0.21204119874164462, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.033203769475221634, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 616.2916870117188, + "epoch": 0.1315, + "grad_norm": 2.0703888587416466, + "kl": 0.08544921875, + "learning_rate": 9.972824299548309e-07, + "loss": 0.0048, + "reward": 2.7762060165405273, + "reward_std": 0.2107193972915411, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.07796090468764305, + "rewards/tag_count_reward": 1.0, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 546.3125, - "epoch": 0.264, - "grad_norm": 2.1456751308727213, - "kl": 0.1416015625, - "learning_rate": 9.282549715730579e-07, - "loss": -0.0358, - "reward": 2.4012598991394043, - "reward_std": 0.44198279082775116, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03971245139837265, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 475.9166717529297, + "epoch": 0.132, + "grad_norm": 2.170217179415964, + "kl": 0.090087890625, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0146, + "reward": 2.930613875389099, + "reward_std": 0.0612574927508831, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.05549720861017704, + "rewards/tag_count_reward": 1.0, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 460.12501525878906, - "epoch": 0.265, - "grad_norm": 2.3449066935567697, - "kl": 0.15087890625, - "learning_rate": 9.274017555754407e-07, - "loss": 0.0136, - "reward": 2.7137099504470825, - "reward_std": 0.326103575527668, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.02240123925730586, + "completion_length": 510.8958435058594, + "epoch": 0.1325, + "grad_norm": 2.0247597136853077, + "kl": 0.093994140625, + "learning_rate": 9.971073350544644e-07, + "loss": 0.0141, + "reward": 2.8639657497406006, + "reward_std": 0.18884775042533875, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06658982113003731, "rewards/tag_count_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 526.7291870117188, - "epoch": 0.266, - "grad_norm": 2.521194502817804, - "kl": 0.1572265625, - "learning_rate": 9.265439410565328e-07, - "loss": 0.1172, - "reward": 2.733197331428528, - "reward_std": 0.43402716517448425, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.028955545276403427, - "rewards/tag_count_reward": 0.984375, + "completion_length": 613.6041870117188, + "epoch": 0.133, + "grad_norm": 2.819432158137642, + "kl": 0.124267578125, + "learning_rate": 9.970177445201783e-07, + "loss": 0.0593, + "reward": 2.438739776611328, + "reward_std": 0.35922619700431824, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07688540033996105, + "rewards/tag_count_reward": 0.9739583432674408, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 580.3541870117188, - "epoch": 0.267, - "grad_norm": 2.2571936139784374, - "kl": 0.15185546875, - "learning_rate": 9.256815384685328e-07, - "loss": 0.0205, - "reward": 2.665441632270813, - "reward_std": 0.2782168686389923, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.029003008268773556, - "rewards/tag_count_reward": 1.0, + "completion_length": 521.1458587646484, + "epoch": 0.1335, + "grad_norm": 2.046082487346144, + "kl": 0.101806640625, + "learning_rate": 9.969267922923188e-07, + "loss": 0.0155, + "reward": 2.68787944316864, + "reward_std": 0.290618859231472, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.09163457155227661, + "rewards/tag_count_reward": 0.9947916865348816, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 528.8541870117188, - "epoch": 0.268, - "grad_norm": 2.527114445757883, - "kl": 0.15478515625, - "learning_rate": 9.248145583195447e-07, - "loss": 0.0642, - "reward": 2.634427309036255, - "reward_std": 0.35257890820503235, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.025295117869973183, - "rewards/tag_count_reward": 1.0, + "completion_length": 531.6041870117188, + "epoch": 0.134, + "grad_norm": 2.415289797288351, + "kl": 0.10546875, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0305, + "reward": 2.6710007190704346, + "reward_std": 0.35774578154087067, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.05469386838376522, + "rewards/tag_count_reward": 0.9895833432674408, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 480.4791717529297, - "epoch": 0.269, - "grad_norm": 2.565433677851082, - "kl": 0.15185546875, - "learning_rate": 9.239430111734476e-07, - "loss": -0.0129, - "reward": 2.6094166040420532, - "reward_std": 0.32510536164045334, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02252791728824377, - "rewards/tag_count_reward": 1.0, + "completion_length": 486.10418701171875, + "epoch": 0.1345, + "grad_norm": 2.0309227835825108, + "kl": 0.10546875, + "learning_rate": 9.967408038682505e-07, + "loss": 0.0142, + "reward": 2.8688745498657227, + "reward_std": 0.1512543261051178, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.06688930839300156, + "rewards/tag_count_reward": 0.984375, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 535.3333435058594, - "epoch": 0.27, - "grad_norm": 7.328177927840423, - "kl": 0.15185546875, - "learning_rate": 9.230669076497687e-07, - "loss": 0.1255, - "reward": 2.5093064308166504, - "reward_std": 0.3799414336681366, - "rewards/accuracy_reward": 0.5625000149011612, + "completion_length": 502.2708435058594, + "epoch": 0.135, + "grad_norm": 2.2773691497710242, + "kl": 0.1025390625, + "learning_rate": 9.96645768238595e-07, + "loss": 0.0202, + "reward": 2.5804989337921143, + "reward_std": 0.20569386333227158, + "rewards/accuracy_reward": 0.708333358168602, "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02367980219423771, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.10873730108141899, + "rewards/tag_count_reward": 0.9947916865348816, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 509.85418701171875, - "epoch": 0.271, - "grad_norm": 2.718362036973218, - "kl": 0.15576171875, - "learning_rate": 9.221862584235526e-07, - "loss": 0.0344, - "reward": 2.775928258895874, - "reward_std": 0.3645378649234772, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.02962728962302208, - "rewards/tag_count_reward": 1.0, + "completion_length": 519.25, + "epoch": 0.1355, + "grad_norm": 2.1749339879371896, + "kl": 0.106689453125, + "learning_rate": 9.965493720484698e-07, + "loss": 0.0813, + "reward": 2.627415657043457, + "reward_std": 0.32304753363132477, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.06702888198196888, + "rewards/tag_count_reward": 0.9791666865348816, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 511.50001525878906, - "epoch": 0.272, - "grad_norm": 2.272195946333135, - "kl": 0.1591796875, - "learning_rate": 9.213010742252327e-07, - "loss": -0.0293, - "reward": 2.742081642150879, - "reward_std": 0.2952452003955841, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.03048785123974085, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 632.9375305175781, + "epoch": 0.136, + "grad_norm": 1.908928580484598, + "kl": 0.12060546875, + "learning_rate": 9.964516155915151e-07, + "loss": 0.061, + "reward": 2.5479973554611206, + "reward_std": 0.32585373520851135, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.0770026333630085, + "rewards/tag_count_reward": 0.9791666865348816, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 585.25, - "epoch": 0.273, - "grad_norm": 2.32139747171617, - "kl": 0.14794921875, - "learning_rate": 9.204113658404989e-07, - "loss": -0.0799, - "reward": 2.521125078201294, - "reward_std": 0.2736772522330284, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.04137510620057583, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 619.9791870117188, + "epoch": 0.1365, + "grad_norm": 1.9025926132999502, + "kl": 0.1201171875, + "learning_rate": 9.963524991655133e-07, + "loss": -0.02, + "reward": 2.6221734285354614, + "reward_std": 0.2964826971292496, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.07053492963314056, + "rewards/tag_count_reward": 0.9635416865348816, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 559.2708435058594, - "epoch": 0.274, - "grad_norm": 3.852408203301354, - "kl": 0.1982421875, - "learning_rate": 9.195171441101668e-07, - "loss": -0.0098, - "reward": 2.5533807277679443, - "reward_std": 0.35553011298179626, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.019536098465323448, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 577.2291870117188, + "epoch": 0.137, + "grad_norm": 2.695248913656631, + "kl": 0.11669921875, + "learning_rate": 9.962520230723906e-07, + "loss": -0.0972, + "reward": 2.581347942352295, + "reward_std": 0.22622781991958618, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.08184656128287315, + "rewards/tag_count_reward": 0.96875, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 633.4583435058594, - "epoch": 0.275, - "grad_norm": 2.0627187344345166, - "kl": 0.1513671875, - "learning_rate": 9.186184199300463e-07, - "loss": -0.0293, - "reward": 2.7091498374938965, - "reward_std": 0.1623903214931488, + "completion_length": 623.5208435058594, + "epoch": 0.1375, + "grad_norm": 3.342402691090167, + "kl": 0.12939453125, + "learning_rate": 9.961501876182148e-07, + "loss": 0.2172, + "reward": 2.620313286781311, + "reward_std": 0.4897034168243408, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04432236962020397, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.0689227245748043, + "rewards/tag_count_reward": 0.953125, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 532.0416717529297, - "epoch": 0.276, - "grad_norm": 2.401556778445366, - "kl": 0.1552734375, - "learning_rate": 9.177152042508077e-07, - "loss": 0.0535, - "reward": 2.728344440460205, - "reward_std": 0.3177375793457031, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.025128038600087166, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 522.1666717529297, + "epoch": 0.138, + "grad_norm": 2.3942345495108803, + "kl": 0.12939453125, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0718, + "reward": 2.6490787267684937, + "reward_std": 0.3025604486465454, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.05925481766462326, + "rewards/tag_count_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 497.7500305175781, - "epoch": 0.277, - "grad_norm": 2.544090680740168, - "kl": 0.17333984375, - "learning_rate": 9.168075080778494e-07, - "loss": 0.0797, - "reward": 2.892575979232788, - "reward_std": 0.2266939841210842, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01714644394814968, - "rewards/tag_count_reward": 1.0, + "completion_length": 506.5, + "epoch": 0.1385, + "grad_norm": 2.3474305496678625, + "kl": 0.1181640625, + "learning_rate": 9.959424398716763e-07, + "loss": 0.0796, + "reward": 2.911013603210449, + "reward_std": 0.09207849018275738, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.06294489465653896, + "rewards/tag_count_reward": 0.9947916865348816, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 496.6666717529297, - "epoch": 0.278, - "grad_norm": 2.35427197118846, - "kl": 0.15576171875, - "learning_rate": 9.158953424711624e-07, - "loss": 0.0443, - "reward": 2.8280975818634033, - "reward_std": 0.15652123093605042, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.029541232623159885, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 627.6666870117188, + "epoch": 0.139, + "grad_norm": 2.6905854255736252, + "kl": 0.15625, + "learning_rate": 9.958365282121496e-07, + "loss": 0.0858, + "reward": 2.7694180011749268, + "reward_std": 0.29491981118917465, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06738761439919472, + "rewards/tag_count_reward": 0.9895833730697632, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 505.7708435058594, - "epoch": 0.279, - "grad_norm": 2.8854860164238882, - "kl": 0.1611328125, - "learning_rate": 9.149787185451969e-07, - "loss": 0.1633, - "reward": 2.6974780559539795, - "reward_std": 0.26681460440158844, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019535831175744534, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 510.91668701171875, + "epoch": 0.1395, + "grad_norm": 2.175289440432667, + "kl": 0.16796875, + "learning_rate": 9.95729258457239e-07, + "loss": 0.0046, + "reward": 2.836862325668335, + "reward_std": 0.21854999661445618, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.08674902468919754, + "rewards/tag_count_reward": 0.9791666865348816, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 544.1458587646484, - "epoch": 0.28, - "grad_norm": 2.1507469824718592, - "kl": 0.15576171875, - "learning_rate": 9.140576474687263e-07, - "loss": 0.0214, - "reward": 2.660393714904785, - "reward_std": 0.35248108208179474, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02884255349636078, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 577.4583435058594, + "epoch": 0.14, + "grad_norm": 5.672542060786621, + "kl": 0.24072265625, + "learning_rate": 9.956206309337066e-07, + "loss": -0.005, + "reward": 2.5967882871627808, + "reward_std": 0.48753632605075836, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.075086809694767, + "rewards/tag_count_reward": 0.984375, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 582.9791870117188, - "epoch": 0.281, - "grad_norm": 2.2081213981330974, - "kl": 0.1484375, - "learning_rate": 9.131321404647109e-07, - "loss": 0.0259, - "reward": 2.749055504798889, - "reward_std": 0.3296816051006317, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.035666871815919876, - "rewards/tag_count_reward": 1.0, + "completion_length": 589.9791870117188, + "epoch": 0.1405, + "grad_norm": 5.053359884020266, + "kl": 0.23046875, + "learning_rate": 9.955106459724508e-07, + "loss": 0.1177, + "reward": 2.557823896408081, + "reward_std": 0.23683376610279083, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0741206556558609, + "rewards/tag_count_reward": 0.9791666865348816, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 495.8958435058594, - "epoch": 0.282, - "grad_norm": 2.6311621368175055, - "kl": 0.185546875, - "learning_rate": 9.122022088101613e-07, - "loss": 0.0722, - "reward": 2.718705654144287, - "reward_std": 0.37944258749485016, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01740554254502058, - "rewards/tag_count_reward": 1.0, + "completion_length": 647.2291870117188, + "epoch": 0.141, + "grad_norm": 3.63123333573302, + "kl": 0.2919921875, + "learning_rate": 9.953993039085048e-07, + "loss": 0.094, + "reward": 2.555752992630005, + "reward_std": 0.38118627667427063, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.06751113198697567, + "rewards/tag_count_reward": 0.9427083432674408, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 585.125, - "epoch": 0.283, - "grad_norm": 2.435559416337875, - "kl": 0.16943359375, - "learning_rate": 9.112678638360015e-07, - "loss": 0.0748, - "reward": 2.535888671875, - "reward_std": 0.3460448384284973, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04918085224926472, - "rewards/tag_count_reward": 0.9739583730697632, + "completion_length": 848.5000305175781, + "epoch": 0.1415, + "grad_norm": 12.94083486478269, + "kl": 0.4365234375, + "learning_rate": 9.952866050810363e-07, + "loss": 0.2525, + "reward": 2.34884250164032, + "reward_std": 0.5896148979663849, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.07476872578263283, + "rewards/tag_count_reward": 0.9375000298023224, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 568.4583740234375, - "epoch": 0.284, - "grad_norm": 2.3346177606876206, - "kl": 0.1689453125, - "learning_rate": 9.103291169269299e-07, - "loss": 0.0921, - "reward": 2.8603535890579224, - "reward_std": 0.25171563029289246, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0389520637691021, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 537.2500305175781, + "epoch": 0.142, + "grad_norm": 921.7663615274216, + "kl": 3.4921875, + "learning_rate": 9.951725498333448e-07, + "loss": 0.2981, + "reward": 2.666219711303711, + "reward_std": 0.38224542140960693, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06121090054512024, + "rewards/tag_count_reward": 0.984375, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 663.0416870117188, - "epoch": 0.285, - "grad_norm": 2.193370842368294, - "kl": 0.15673828125, - "learning_rate": 9.093859795212817e-07, - "loss": 0.1456, - "reward": 2.744846820831299, - "reward_std": 0.27857429534196854, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04508392512798309, - "rewards/tag_count_reward": 0.984375, + "completion_length": 840.1458740234375, + "epoch": 0.1425, + "grad_norm": 1120.8208786375085, + "kl": 8.21875, + "learning_rate": 9.950571385128625e-07, + "loss": 0.488, + "reward": 2.1636295914649963, + "reward_std": 0.5795368552207947, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.07768993638455868, + "rewards/tag_count_reward": 0.921875, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 563.3125305175781, - "epoch": 0.286, - "grad_norm": 2.4347017189444737, - "kl": 0.20166015625, - "learning_rate": 9.084384631108882e-07, - "loss": 0.0819, - "reward": 2.745976209640503, - "reward_std": 0.29341720789670944, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01964883040636778, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 779.0000305175781, + "epoch": 0.143, + "grad_norm": 13.757784684112176, + "kl": 0.984375, + "learning_rate": 9.949403714711526e-07, + "loss": 0.2595, + "reward": 2.427113175392151, + "reward_std": 0.5199109017848969, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.06420639157295227, + "rewards/tag_count_reward": 0.8593750298023224, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 530.8333435058594, - "epoch": 0.287, - "grad_norm": 2.5821740187900097, - "kl": 0.1962890625, - "learning_rate": 9.074865792409381e-07, - "loss": 0.0427, - "reward": 2.4354602098464966, - "reward_std": 0.31608445942401886, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015928767155855894, - "rewards/tag_count_reward": 1.0, + "completion_length": 830.5208435058594, + "epoch": 0.1435, + "grad_norm": 24.88573937996162, + "kl": 0.892578125, + "learning_rate": 9.948222490639075e-07, + "loss": 0.3276, + "reward": 2.313633441925049, + "reward_std": 0.5377289652824402, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.07699155062437057, + "rewards/tag_count_reward": 0.8489583730697632, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 516.6875, - "epoch": 0.288, - "grad_norm": 2.4879515053528927, - "kl": 0.1796875, - "learning_rate": 9.065303395098358e-07, - "loss": 0.0885, - "reward": 2.7357919216156006, - "reward_std": 0.21385888010263443, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.022888831794261932, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 682.3750305175781, + "epoch": 0.144, + "grad_norm": 108.97734325463617, + "kl": 1.734375, + "learning_rate": 9.947027716509488e-07, + "loss": 0.2837, + "reward": 2.294344663619995, + "reward_std": 0.6158420443534851, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.07718315534293652, + "rewards/tag_count_reward": 0.9062500298023224, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 610.2291870117188, - "epoch": 0.289, - "grad_norm": 10.258023745667739, - "kl": 0.24951171875, - "learning_rate": 9.055697555690607e-07, - "loss": 0.2087, - "reward": 2.447946548461914, - "reward_std": 0.3903558999300003, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0399008896201849, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 729.6875, + "epoch": 0.1445, + "grad_norm": 20.647159290728784, + "kl": 0.93359375, + "learning_rate": 9.94581939596225e-07, + "loss": 0.1069, + "reward": 2.5923370122909546, + "reward_std": 0.4817170798778534, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.058704692870378494, + "rewards/tag_count_reward": 0.8802083432674408, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 488.2500305175781, - "epoch": 0.29, - "grad_norm": 2.72857938643871, - "kl": 0.2431640625, - "learning_rate": 9.046048391230247e-07, - "loss": -0.0029, - "reward": 2.764933466911316, - "reward_std": 0.31763769686222076, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019788892939686775, - "rewards/tag_count_reward": 1.0, + "completion_length": 632.1666870117188, + "epoch": 0.145, + "grad_norm": 23.007824810794826, + "kl": 0.37890625, + "learning_rate": 9.944597532678119e-07, + "loss": 0.1326, + "reward": 2.522615909576416, + "reward_std": 0.5047437995672226, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.07460657134652138, + "rewards/tag_count_reward": 0.9375000298023224, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 548.4166717529297, - "epoch": 0.291, - "grad_norm": 13.446114403763556, - "kl": 0.3310546875, - "learning_rate": 9.036356019289309e-07, - "loss": 0.1304, - "reward": 2.442963719367981, - "reward_std": 0.4253944456577301, - "rewards/accuracy_reward": 0.4791666865348816, + "completion_length": 613.8125, + "epoch": 0.1455, + "grad_norm": 17.27026907734221, + "kl": 0.34375, + "learning_rate": 9.943362130379101e-07, + "loss": 0.1261, + "reward": 2.5630831718444824, + "reward_std": 0.35013893246650696, + "rewards/accuracy_reward": 0.6666666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.020578143652528524, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.061916783452034, + "rewards/tag_count_reward": 0.9583333432674408, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 527.5000305175781, - "epoch": 0.292, - "grad_norm": 2.399625776751827, - "kl": 0.275390625, - "learning_rate": 9.026620557966279e-07, - "loss": 0.03, - "reward": 2.5269211530685425, - "reward_std": 0.2096565067768097, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.01474552322179079, - "rewards/tag_count_reward": 1.0, + "completion_length": 596.3125152587891, + "epoch": 0.146, + "grad_norm": 24.305363731366423, + "kl": 0.4072265625, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0284, + "reward": 2.6639617681503296, + "reward_std": 0.3908410295844078, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05826057866215706, + "rewards/tag_count_reward": 0.9583333432674408, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 557.6458435058594, - "epoch": 0.293, - "grad_norm": 2.6309659598581225, - "kl": 0.28955078125, - "learning_rate": 9.016842125884684e-07, - "loss": 0.0651, - "reward": 2.6259918212890625, - "reward_std": 0.3636874854564667, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018105541355907917, - "rewards/tag_count_reward": 0.984375, + "completion_length": 454.125, + "epoch": 0.1465, + "grad_norm": 7.404474348588299, + "kl": 0.22705078125, + "learning_rate": 9.940850723830632e-07, + "loss": 0.0509, + "reward": 2.862213373184204, + "reward_std": 0.18731126189231873, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0613978561013937, + "rewards/tag_count_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 591.3958435058594, - "epoch": 0.294, - "grad_norm": 2.594070997434929, - "kl": 0.2685546875, - "learning_rate": 9.007020842191634e-07, - "loss": 0.0742, - "reward": 2.6963618993759155, - "reward_std": 0.27481937408447266, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0258603785187006, - "rewards/tag_count_reward": 1.0, + "completion_length": 621.3125305175781, + "epoch": 0.147, + "grad_norm": 18.874856559368098, + "kl": 0.2001953125, + "learning_rate": 9.939574727231362e-07, + "loss": 0.0696, + "reward": 2.7581335306167603, + "reward_std": 0.3472418487071991, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.06999156251549721, + "rewards/tag_count_reward": 0.9739583432674408, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 503.7916717529297, - "epoch": 0.295, - "grad_norm": 2.797055325097868, - "kl": 0.3544921875, - "learning_rate": 8.997156826556369e-07, - "loss": 0.0827, - "reward": 2.522792100906372, - "reward_std": 0.4541844576597214, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01887462381273508, + "completion_length": 549.125, + "epoch": 0.1475, + "grad_norm": 3.2744531346829038, + "kl": 0.14990234375, + "learning_rate": 9.93828520691754e-07, + "loss": 0.0335, + "reward": 2.661821484565735, + "reward_std": 0.22525277733802795, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.08123417943716049, "rewards/tag_count_reward": 1.0, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 572.1875305175781, - "epoch": 0.296, - "grad_norm": 3.656654339941009, - "kl": 0.388671875, - "learning_rate": 8.987250199168808e-07, - "loss": 0.0611, - "reward": 2.7958298921585083, - "reward_std": 0.3831692487001419, - "rewards/accuracy_reward": 0.8541666865348816, + "completion_length": 630.3958435058594, + "epoch": 0.148, + "grad_norm": 3.3512313017643782, + "kl": 0.212890625, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0249, + "reward": 2.5591964721679688, + "reward_std": 0.309579461812973, + "rewards/accuracy_reward": 0.6458333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03750366624444723, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.086636982858181, + "rewards/tag_count_reward": 1.0, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 679.375, - "epoch": 0.297, - "grad_norm": 5.294668622093825, - "kl": 0.4609375, - "learning_rate": 8.977301080738079e-07, - "loss": 0.1776, - "reward": 2.7792227268218994, - "reward_std": 0.2825077772140503, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04369393363595009, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 570.3958740234375, + "epoch": 0.1485, + "grad_norm": 19.392543852050114, + "kl": 0.19921875, + "learning_rate": 9.93566561089984e-07, + "loss": 0.1167, + "reward": 2.6523066759109497, + "reward_std": 0.39907996356487274, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.05429063364863396, + "rewards/tag_count_reward": 0.9843750298023224, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 777.9375, - "epoch": 0.298, - "grad_norm": 9.442864310512464, - "kl": 0.658203125, - "learning_rate": 8.967309592491052e-07, - "loss": 0.3089, - "reward": 2.554570198059082, - "reward_std": 0.5709795355796814, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.04265192709863186, - "rewards/tag_count_reward": 0.9375, + "completion_length": 632.3125305175781, + "epoch": 0.149, + "grad_norm": 4.2670800629296775, + "kl": 0.22900390625, + "learning_rate": 9.934335543175705e-07, + "loss": 0.022, + "reward": 2.6927382946014404, + "reward_std": 0.3433649092912674, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.05899788439273834, + "rewards/tag_count_reward": 0.9947916865348816, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 774.7708435058594, - "epoch": 0.299, - "grad_norm": 4.044252023077153, - "kl": 1.21484375, - "learning_rate": 8.957275856170855e-07, - "loss": 0.1666, - "reward": 2.4033294916152954, - "reward_std": 0.4724307656288147, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.028962312266230583, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 618.9583435058594, + "epoch": 0.1495, + "grad_norm": 15.195924787873745, + "kl": 0.28076171875, + "learning_rate": 9.932991967696482e-07, + "loss": 0.1031, + "reward": 2.660310745239258, + "reward_std": 0.16687491163611412, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.041078392416238785, + "rewards/tag_count_reward": 0.9791666865348816, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 775.9166870117188, - "epoch": 0.3, - "grad_norm": 13.271553563881938, - "kl": 1.8515625, - "learning_rate": 8.9471999940354e-07, - "loss": 0.2894, - "reward": 2.3227447271347046, - "reward_std": 0.5490213930606842, - "rewards/accuracy_reward": 0.4583333432674408, + "completion_length": 555.4375305175781, + "epoch": 0.15, + "grad_norm": 23.003706484527033, + "kl": 0.33984375, + "learning_rate": 9.931634888554935e-07, + "loss": 0.1633, + "reward": 2.595449686050415, + "reward_std": 0.35070881247520447, + "rewards/accuracy_reward": 0.6666666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03663041442632675, - "rewards/tag_count_reward": 0.9010416865348816, + "rewards/repetition_penalty_reward": -0.039967115968465805, + "rewards/tag_count_reward": 0.96875, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 862.0625305175781, - "epoch": 0.301, - "grad_norm": 11.329717412687812, - "kl": 2.6328125, - "learning_rate": 8.937082128855891e-07, - "loss": 0.303, - "reward": 2.497220277786255, - "reward_std": 0.5546972751617432, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.025349291041493416, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 591.4791870117188, + "epoch": 0.1505, + "grad_norm": 29.883708582664426, + "kl": 1.10546875, + "learning_rate": 9.930264309884964e-07, + "loss": 0.1903, + "reward": 2.266157388687134, + "reward_std": 0.4054761230945587, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05155107006430626, + "rewards/tag_count_reward": 0.9635416865348816, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 718.0000305175781, - "epoch": 0.302, - "grad_norm": 11.620481772542528, - "kl": 1.78125, - "learning_rate": 8.926922383915315e-07, - "loss": 0.1349, - "reward": 2.820877194404602, - "reward_std": 0.2959998771548271, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.026345071382820606, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 559.6875305175781, + "epoch": 0.151, + "grad_norm": 53.32541543626509, + "kl": 1.33203125, + "learning_rate": 9.928880235861588e-07, + "loss": 0.0845, + "reward": 2.680440068244934, + "reward_std": 0.240939699113369, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033101567067205906, + "rewards/tag_count_reward": 0.984375, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 699.8958740234375, - "epoch": 0.303, - "grad_norm": 18.84955689848848, - "kl": 0.779296875, - "learning_rate": 8.916720883006963e-07, - "loss": 0.2289, - "reward": 2.662257194519043, - "reward_std": 0.4918932765722275, + "completion_length": 569.5833587646484, + "epoch": 0.1515, + "grad_norm": 64.08652020758656, + "kl": 1.0302734375, + "learning_rate": 9.927482670700936e-07, + "loss": 0.1684, + "reward": 2.6656733751296997, + "reward_std": 0.3831760287284851, "rewards/accuracy_reward": 0.7500000298023224, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02697897609323263, - "rewards/tag_count_reward": 0.9531250298023224, + "rewards/repetition_penalty_reward": -0.054812896996736526, + "rewards/tag_count_reward": 0.984375, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 854.3333435058594, - "epoch": 0.304, - "grad_norm": 16.06283643508135, - "kl": 0.9453125, - "learning_rate": 8.906477750432903e-07, - "loss": 0.2394, - "reward": 2.463517665863037, - "reward_std": 0.5290980041027069, - "rewards/accuracy_reward": 0.6041666865348816, + "completion_length": 505.60418701171875, + "epoch": 0.152, + "grad_norm": 4.124901631085503, + "kl": 0.162109375, + "learning_rate": 9.926071618660237e-07, + "loss": -0.017, + "reward": 2.7977336645126343, + "reward_std": 0.2716551870107651, + "rewards/accuracy_reward": 0.8541666865348816, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.034746479243040085, - "rewards/tag_count_reward": 0.9010416865348816, + "rewards/repetition_penalty_reward": -0.04948872700333595, + "rewards/tag_count_reward": 1.0, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 795.7291870117188, - "epoch": 0.305, - "grad_norm": 7.368811609366542, - "kl": 1.072265625, - "learning_rate": 8.896193111002475e-07, - "loss": 0.1781, - "reward": 2.7576704025268555, - "reward_std": 0.3518691807985306, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03399639390408993, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 535.5000305175781, + "epoch": 0.1525, + "grad_norm": 2.3959309961624955, + "kl": 0.169921875, + "learning_rate": 9.924647084037797e-07, + "loss": 0.028, + "reward": 2.503369092941284, + "reward_std": 0.22217638790607452, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.057394931092858315, + "rewards/tag_count_reward": 0.984375, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 646.9583435058594, - "epoch": 0.306, - "grad_norm": 5.727482347866686, - "kl": 1.28125, - "learning_rate": 8.88586709003076e-07, - "loss": 0.1126, - "reward": 2.540480852127075, - "reward_std": 0.4421972632408142, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03243602532893419, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 533.0208435058594, + "epoch": 0.153, + "grad_norm": 2.4185306667509985, + "kl": 0.149658203125, + "learning_rate": 9.923209071172994e-07, + "loss": 0.0305, + "reward": 2.573015809059143, + "reward_std": 0.3675818666815758, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06934532895684242, + "rewards/tag_count_reward": 0.9895833730697632, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 876.6041870117188, - "epoch": 0.307, - "grad_norm": 6.1244435821869905, - "kl": 1.9296875, - "learning_rate": 8.875499813337067e-07, - "loss": 0.2147, - "reward": 2.5565717220306396, - "reward_std": 0.5118266344070435, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026761652901768684, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 533.2500305175781, + "epoch": 0.1535, + "grad_norm": 3.4451894822132405, + "kl": 0.146484375, + "learning_rate": 9.921757584446268e-07, + "loss": 0.038, + "reward": 2.809865355491638, + "reward_std": 0.27805351465940475, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.07902374118566513, + "rewards/tag_count_reward": 1.0, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 735.1041870117188, - "epoch": 0.308, - "grad_norm": 6.2567586393756685, - "kl": 1.25390625, - "learning_rate": 8.865091407243394e-07, - "loss": 0.1521, - "reward": 2.8010218143463135, - "reward_std": 0.3402135968208313, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03057546727359295, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 626.2708740234375, + "epoch": 0.154, + "grad_norm": 4.282437820486171, + "kl": 0.18994140625, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0255, + "reward": 2.4780622720718384, + "reward_std": 0.4932084083557129, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.051451750099658966, + "rewards/tag_count_reward": 0.9947916865348816, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 884.6666870117188, - "epoch": 0.309, - "grad_norm": 11.171414148004054, - "kl": 2.3984375, - "learning_rate": 8.85464199857288e-07, - "loss": 0.2805, - "reward": 2.442511558532715, - "reward_std": 0.5785179138183594, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026238556019961834, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 544.6875305175781, + "epoch": 0.1545, + "grad_norm": 3.8575808680226182, + "kl": 0.16064453125, + "learning_rate": 9.918814207133997e-07, + "loss": -0.0068, + "reward": 2.1725984811782837, + "reward_std": 0.3830345869064331, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.08434605225920677, + "rewards/tag_count_reward": 1.0, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 982.625, - "epoch": 0.31, - "grad_norm": 5.999562164727096, - "kl": 2.0234375, - "learning_rate": 8.844151714648274e-07, - "loss": 0.2776, - "reward": 2.3847498893737793, - "reward_std": 0.6050747036933899, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026708428747951984, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 556.2083740234375, + "epoch": 0.155, + "grad_norm": 2.30299380218455, + "kl": 0.1865234375, + "learning_rate": 9.917322325514487e-07, + "loss": 0.0359, + "reward": 2.8288975954055786, + "reward_std": 0.22805538028478622, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05999153107404709, + "rewards/tag_count_reward": 1.0, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 750.9166870117188, - "epoch": 0.311, - "grad_norm": 10.25459242080789, - "kl": 1.228515625, - "learning_rate": 8.833620683290375e-07, - "loss": 0.2226, - "reward": 2.489295244216919, - "reward_std": 0.3828739821910858, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.031538188457489014, - "rewards/tag_count_reward": 0.9375, + "completion_length": 571.8333435058594, + "epoch": 0.1555, + "grad_norm": 13.544980888404456, + "kl": 0.1787109375, + "learning_rate": 9.915816987965102e-07, + "loss": 0.1167, + "reward": 2.6313341856002808, + "reward_std": 0.3306514471769333, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.06137407571077347, + "rewards/tag_count_reward": 0.984375, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 920.7083435058594, - "epoch": 0.312, - "grad_norm": 11.341166194035626, - "kl": 3.0390625, - "learning_rate": 8.823049032816478e-07, - "loss": 0.3465, - "reward": 2.3931429386138916, - "reward_std": 0.5464552640914917, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.033940425142645836, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 601.6041870117188, + "epoch": 0.156, + "grad_norm": 7.684331357658625, + "kl": 0.29296875, + "learning_rate": 9.91429819907136e-07, + "loss": 0.1471, + "reward": 2.8133013248443604, + "reward_std": 0.28423790633678436, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.06690708734095097, + "rewards/tag_count_reward": 0.9843750298023224, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 824.3541870117188, - "epoch": 0.313, - "grad_norm": 14.499216833551465, - "kl": 2.5703125, - "learning_rate": 8.812436892038805e-07, - "loss": 0.4531, - "reward": 2.4514354467391968, - "reward_std": 0.4615107327699661, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03120344504714012, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 568.6666870117188, + "epoch": 0.1565, + "grad_norm": 16.451352502057013, + "kl": 0.5234375, + "learning_rate": 9.912765963459756e-07, + "loss": -0.0008, + "reward": 2.7371630668640137, + "reward_std": 0.3937046229839325, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.05103146657347679, + "rewards/tag_count_reward": 0.9687500298023224, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 752.2500305175781, - "epoch": 0.314, - "grad_norm": 9.160833779249737, - "kl": 2.52734375, - "learning_rate": 8.801784390262943e-07, - "loss": 0.4578, - "reward": 2.459816098213196, - "reward_std": 0.5396545231342316, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02629512920975685, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 538.75, + "epoch": 0.157, + "grad_norm": 5.827769683111703, + "kl": 0.4130859375, + "learning_rate": 9.911220285797748e-07, + "loss": 0.0415, + "reward": 2.58489727973938, + "reward_std": 0.38202086091041565, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.06614442355930805, + "rewards/tag_count_reward": 0.9635416865348816, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 666.5625305175781, - "epoch": 0.315, - "grad_norm": 16.821527260355506, - "kl": 1.90625, - "learning_rate": 8.791091657286267e-07, - "loss": 0.1424, - "reward": 2.7689753770828247, - "reward_std": 0.27851930260658264, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.03137190733104944, - "rewards/tag_count_reward": 0.953125, + "completion_length": 527.9166870117188, + "epoch": 0.1575, + "grad_norm": 7.880587052881426, + "kl": 0.20703125, + "learning_rate": 9.909661170793733e-07, + "loss": 0.1088, + "reward": 2.4540834426879883, + "reward_std": 0.39223696291446686, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.059805694967508316, + "rewards/tag_count_reward": 1.0, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 749.3958435058594, - "epoch": 0.316, - "grad_norm": 15.034225144072645, - "kl": 2.33984375, - "learning_rate": 8.780358823396352e-07, - "loss": 0.2368, - "reward": 2.8358311653137207, - "reward_std": 0.2934058606624603, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03396068140864372, - "rewards/tag_count_reward": 0.953125, + "completion_length": 550.7291717529297, + "epoch": 0.158, + "grad_norm": 6.6106566025225515, + "kl": 0.314453125, + "learning_rate": 9.908088623197048e-07, + "loss": 0.013, + "reward": 2.4705549478530884, + "reward_std": 0.46846726536750793, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.058959029614925385, + "rewards/tag_count_reward": 0.9739583432674408, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 740.875, - "epoch": 0.317, - "grad_norm": 8.197253327861047, - "kl": 2.1796875, - "learning_rate": 8.769586019369391e-07, - "loss": 0.2262, - "reward": 2.721475839614868, - "reward_std": 0.3922436535358429, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.023315943777561188, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 567.0208435058594, + "epoch": 0.1585, + "grad_norm": 2.7832594906032084, + "kl": 0.24462890625, + "learning_rate": 9.906502647797945e-07, + "loss": -0.0046, + "reward": 2.705116033554077, + "reward_std": 0.33963292837142944, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.058772869408130646, + "rewards/tag_count_reward": 1.0, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 731.8125, - "epoch": 0.318, - "grad_norm": 17.02162446986935, - "kl": 0.9296875, - "learning_rate": 8.758773376468604e-07, - "loss": 0.2494, - "reward": 2.5680298805236816, - "reward_std": 0.4028843492269516, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.030928438529372215, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 566.4583435058594, + "epoch": 0.159, + "grad_norm": 6.607393683777124, + "kl": 0.19873046875, + "learning_rate": 9.904903249427582e-07, + "loss": 0.0072, + "reward": 2.756907343864441, + "reward_std": 0.3417035788297653, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.074690081179142, + "rewards/tag_count_reward": 0.984375, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 646.0625305175781, - "epoch": 0.319, - "grad_norm": 8.403943914356434, - "kl": 0.6796875, - "learning_rate": 8.747921026442629e-07, - "loss": 0.1368, - "reward": 2.6740156412124634, - "reward_std": 0.3067634850740433, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.018692771438509226, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 525.0833587646484, + "epoch": 0.1595, + "grad_norm": 12.632626492261867, + "kl": 0.22802734375, + "learning_rate": 9.903290432958003e-07, + "loss": 0.0901, + "reward": 2.739717960357666, + "reward_std": 0.4267748296260834, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03979583643376827, + "rewards/tag_count_reward": 0.9947916865348816, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 612.8125, - "epoch": 0.32, - "grad_norm": 3.055603498237129, - "kl": 0.384765625, - "learning_rate": 8.737029101523929e-07, - "loss": 0.0287, - "reward": 2.7557495832443237, - "reward_std": 0.21633769571781158, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.028972744941711426, + "completion_length": 434.00001525878906, + "epoch": 0.16, + "grad_norm": 3.2432048734417296, + "kl": 0.169921875, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0588, + "reward": 2.945529341697693, + "reward_std": 0.02130332589149475, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.054470714181661606, "rewards/tag_count_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 619.375, - "epoch": 0.321, - "grad_norm": 3.8882839855022113, - "kl": 0.3203125, - "learning_rate": 8.726097734427172e-07, - "loss": 0.0372, - "reward": 2.877520799636841, - "reward_std": 0.2280464619398117, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.028729302808642387, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 511.7708435058594, + "epoch": 0.1605, + "grad_norm": 13.83323285922156, + "kl": 0.26025390625, + "learning_rate": 9.900024565413727e-07, + "loss": 0.075, + "reward": 2.590381622314453, + "reward_std": 0.46982041001319885, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.06760460883378983, + "rewards/tag_count_reward": 0.9843750298023224, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 642.2291870117188, - "epoch": 0.322, - "grad_norm": 5.934518493796481, - "kl": 0.5390625, - "learning_rate": 8.715127058347614e-07, - "loss": 0.1573, - "reward": 2.8018211126327515, - "reward_std": 0.2535254070535302, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.029776128008961678, - "rewards/tag_count_reward": 0.984375, + "completion_length": 568.3958435058594, + "epoch": 0.161, + "grad_norm": 15.041988051632119, + "kl": 0.603515625, + "learning_rate": 9.89837152428743e-07, + "loss": 0.0217, + "reward": 2.7006497383117676, + "reward_std": 0.3142661973834038, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04761423170566559, + "rewards/tag_count_reward": 0.9843750298023224, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 584.3958435058594, - "epoch": 0.323, - "grad_norm": 12.30823020424776, - "kl": 0.6533203125, - "learning_rate": 8.704117206959484e-07, - "loss": -0.0165, - "reward": 2.274023652076721, - "reward_std": 0.3736593574285507, - "rewards/accuracy_reward": 0.3125000111758709, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.022851460613310337, - "rewards/tag_count_reward": 0.984375, + "completion_length": 498.0833435058594, + "epoch": 0.1615, + "grad_norm": 8.377756641490475, + "kl": 0.564453125, + "learning_rate": 9.896705084958687e-07, + "loss": 0.0812, + "reward": 2.581056237220764, + "reward_std": 0.424957811832428, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04741601645946503, + "rewards/tag_count_reward": 0.9895833432674408, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 616.7083435058594, - "epoch": 0.324, - "grad_norm": 11.03962394142098, + "completion_length": 561.6875, + "epoch": 0.162, + "grad_norm": 24.710240478593196, "kl": 0.708984375, - "learning_rate": 8.693068314414344e-07, - "loss": 0.2153, - "reward": 2.9491621255874634, - "reward_std": 0.0671940129250288, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.030004790984094143, - "rewards/tag_count_reward": 0.9791666865348816, + "learning_rate": 9.895025252503755e-07, + "loss": 0.2563, + "reward": 2.6197162866592407, + "reward_std": 0.4363028407096863, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05736706592142582, + "rewards/tag_count_reward": 0.9270833432674408, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 586.2916870117188, - "epoch": 0.325, - "grad_norm": 21.7519973984484, - "kl": 0.9091796875, - "learning_rate": 8.681980515339463e-07, - "loss": 0.1511, - "reward": 2.7508959770202637, - "reward_std": 0.3181898444890976, - "rewards/accuracy_reward": 0.7916666865348816, + "completion_length": 469.5625, + "epoch": 0.1625, + "grad_norm": 9.070483774061584, + "kl": 0.58203125, + "learning_rate": 9.8933320320397e-07, + "loss": 0.0186, + "reward": 2.583137035369873, + "reward_std": 0.3166651949286461, + "rewards/accuracy_reward": 0.6458333432674408, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.025145714171230793, + "rewards/repetition_penalty_reward": -0.04707140102982521, "rewards/tag_count_reward": 0.9843750298023224, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 609.3125305175781, - "epoch": 0.326, - "grad_norm": 7.475624041841047, - "kl": 1.08984375, - "learning_rate": 8.670853944836176e-07, - "loss": 0.1854, - "reward": 2.718400478363037, - "reward_std": 0.3192872703075409, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024655278772115707, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 480.0416717529297, + "epoch": 0.163, + "grad_norm": 4.3751653300427655, + "kl": 0.4638671875, + "learning_rate": 9.891625428724364e-07, + "loss": 0.0585, + "reward": 2.6971195936203003, + "reward_std": 0.33363979309797287, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04072774015367031, + "rewards/tag_count_reward": 0.9947916865348816, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 537.0833435058594, - "epoch": 0.327, - "grad_norm": 3.934696523513004, - "kl": 0.4482421875, - "learning_rate": 8.659688738478231e-07, - "loss": 0.0237, - "reward": 2.5368727445602417, - "reward_std": 0.2816409021615982, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.029099617153406143, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 469.9791717529297, + "epoch": 0.1635, + "grad_norm": 8.17049817832142, + "kl": 0.4169921875, + "learning_rate": 9.889905447756355e-07, + "loss": -0.0211, + "reward": 2.66062331199646, + "reward_std": 0.09492377191781998, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.05639079958200455, + "rewards/tag_count_reward": 0.9947916865348816, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 561.1875, - "epoch": 0.328, - "grad_norm": 4.769237705857611, - "kl": 0.576171875, - "learning_rate": 8.648485032310144e-07, - "loss": 0.0438, - "reward": 2.7067657709121704, - "reward_std": 0.287298321723938, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.029345519840717316, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 553.0833435058594, + "epoch": 0.164, + "grad_norm": 11.036171582992713, + "kl": 0.36328125, + "learning_rate": 9.888172094375033e-07, + "loss": -0.0311, + "reward": 2.6875771284103394, + "reward_std": 0.2717094421386719, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.053742485120892525, + "rewards/tag_count_reward": 0.984375, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 630.4375, - "epoch": 0.329, - "grad_norm": 10.245397889363776, - "kl": 2.02734375, - "learning_rate": 8.63724296284554e-07, - "loss": 0.2262, - "reward": 2.599774479866028, - "reward_std": 0.4478468745946884, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023489387705922127, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 475.93751525878906, + "epoch": 0.1645, + "grad_norm": 2.6290357341375863, + "kl": 0.166015625, + "learning_rate": 9.886425373860496e-07, + "loss": -0.0091, + "reward": 2.445455312728882, + "reward_std": 0.3463872969150543, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04239194467663765, + "rewards/tag_count_reward": 0.9947916865348816, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 571.2916870117188, - "epoch": 0.33, - "grad_norm": 22.88975372356627, - "kl": 1.796875, - "learning_rate": 8.625962667065487e-07, - "loss": 0.206, - "reward": 2.5127453804016113, - "reward_std": 0.4001367390155792, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.028921468183398247, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 549.9791870117188, + "epoch": 0.165, + "grad_norm": 2.8947233993694232, + "kl": 0.15380859375, + "learning_rate": 9.88466529153356e-07, + "loss": 0.0504, + "reward": 2.676490306854248, + "reward_std": 0.26038385927677155, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07350974902510643, + "rewards/tag_count_reward": 1.0, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 524.7083740234375, - "epoch": 0.331, - "grad_norm": 5.9516087463505025, - "kl": 0.42626953125, - "learning_rate": 8.614644282416831e-07, - "loss": 0.0881, - "reward": 2.731919050216675, - "reward_std": 0.31574640423059464, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.040650567039847374, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 442.12501525878906, + "epoch": 0.1655, + "grad_norm": 2.304611040563405, + "kl": 0.1357421875, + "learning_rate": 9.882891852755732e-07, + "loss": 0.0241, + "reward": 2.857384443283081, + "reward_std": 0.18227218464016914, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06622675433754921, + "rewards/tag_count_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 558.9583740234375, - "epoch": 0.332, - "grad_norm": 5.083242340459772, - "kl": 0.27294921875, - "learning_rate": 8.603287946810513e-07, - "loss": 0.05, - "reward": 2.7102267742156982, - "reward_std": 0.2585765942931175, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03456501290202141, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 457.7708435058594, + "epoch": 0.166, + "grad_norm": 2.2368549675600944, + "kl": 0.13671875, + "learning_rate": 9.881105062929221e-07, + "loss": -0.0458, + "reward": 2.5673773288726807, + "reward_std": 0.4164246767759323, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.040261661633849144, + "rewards/tag_count_reward": 0.9895833432674408, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 618.1041870117188, - "epoch": 0.333, - "grad_norm": 6.1273870030040065, - "kl": 1.0595703125, - "learning_rate": 8.591893798619903e-07, - "loss": 0.2075, - "reward": 2.7363253831863403, - "reward_std": 0.37927868962287903, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.018883120268583298, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 583.4375152587891, + "epoch": 0.1665, + "grad_norm": 3.020419228903663, + "kl": 0.12939453125, + "learning_rate": 9.879304927496896e-07, + "loss": 0.1167, + "reward": 2.7053964138031006, + "reward_std": 0.3210095912218094, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06196486949920654, + "rewards/tag_count_reward": 0.9895833432674408, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 603.9375, - "epoch": 0.334, - "grad_norm": 5.897033604873569, - "kl": 1.12890625, - "learning_rate": 8.580461976679099e-07, - "loss": 0.2075, - "reward": 2.6124191284179688, - "reward_std": 0.4683973491191864, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017789172008633614, - "rewards/tag_count_reward": 0.921875, + "completion_length": 529.2708587646484, + "epoch": 0.167, + "grad_norm": 2.375485512826028, + "kl": 0.142578125, + "learning_rate": 9.877491451942284e-07, + "loss": 0.0589, + "reward": 2.5137102603912354, + "reward_std": 0.3925721198320389, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05747038125991821, + "rewards/tag_count_reward": 0.9947916865348816, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 515.75, - "epoch": 0.335, - "grad_norm": 7.88708659023299, - "kl": 0.53515625, - "learning_rate": 8.568992620281243e-07, - "loss": 0.0447, - "reward": 2.5799560546875, - "reward_std": 0.4440341293811798, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.019002487882971764, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 510.5, + "epoch": 0.1675, + "grad_norm": 4.018502569391315, + "kl": 0.146484375, + "learning_rate": 9.875664641789543e-07, + "loss": 0.1097, + "reward": 2.4702670574188232, + "reward_std": 0.21461456269025803, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.050566382706165314, + "rewards/tag_count_reward": 1.0, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 618.2291870117188, - "epoch": 0.336, - "grad_norm": 13.855588768970593, - "kl": 1.8125, - "learning_rate": 8.557485869176825e-07, - "loss": 0.1842, - "reward": 2.804866313934326, - "reward_std": 0.27131783962249756, - "rewards/accuracy_reward": 0.8958333730697632, + "completion_length": 563.6875, + "epoch": 0.168, + "grad_norm": 2.9361596954162597, + "kl": 0.14208984375, + "learning_rate": 9.873824502603459e-07, + "loss": -0.0059, + "reward": 2.709442138671875, + "reward_std": 0.3301195055246353, + "rewards/accuracy_reward": 0.7708333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.023258699104189873, - "rewards/tag_count_reward": 0.9322916865348816, + "rewards/repetition_penalty_reward": -0.06139139086008072, + "rewards/tag_count_reward": 1.0, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 544.0416870117188, - "epoch": 0.337, - "grad_norm": 10.22565696968507, - "kl": 0.556640625, - "learning_rate": 8.545941863571973e-07, - "loss": 0.0697, - "reward": 2.7841432094573975, - "reward_std": 0.22680224478244781, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.028356771916151047, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 590.7916870117188, + "epoch": 0.1685, + "grad_norm": 4.349944868969375, + "kl": 0.1650390625, + "learning_rate": 9.871971039989407e-07, + "loss": 0.0989, + "reward": 2.6142622232437134, + "reward_std": 0.44250747561454773, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.07844629138708115, + "rewards/tag_count_reward": 0.984375, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 660.1875305175781, - "epoch": 0.338, - "grad_norm": 13.275393254857, - "kl": 1.57421875, - "learning_rate": 8.534360744126753e-07, - "loss": 0.3522, - "reward": 2.626600503921509, - "reward_std": 0.5416260808706284, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.026177333667874336, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 454.06251525878906, + "epoch": 0.169, + "grad_norm": 2.19737492930891, + "kl": 0.16650390625, + "learning_rate": 9.870104259593362e-07, + "loss": 0.0282, + "reward": 2.822553515434265, + "reward_std": 0.14519703015685081, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04550204798579216, + "rewards/tag_count_reward": 1.0, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 538.2916870117188, - "epoch": 0.339, - "grad_norm": 9.284294976639861, - "kl": 0.890625, - "learning_rate": 8.522742651953456e-07, - "loss": 0.2327, - "reward": 2.6350467205047607, - "reward_std": 0.30000850558280945, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024675646796822548, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 521.3333435058594, + "epoch": 0.1695, + "grad_norm": 2.595702991755517, + "kl": 0.21337890625, + "learning_rate": 9.86822416710186e-07, + "loss": -0.0125, + "reward": 2.728282928466797, + "reward_std": 0.35869090259075165, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04255035147070885, + "rewards/tag_count_reward": 1.0, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 572.5208740234375, - "epoch": 0.34, - "grad_norm": 11.869158855207635, - "kl": 1.427734375, - "learning_rate": 8.511087728614862e-07, - "loss": 0.2899, - "reward": 2.693272352218628, - "reward_std": 0.4559956192970276, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023741761222481728, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 487.3333435058594, + "epoch": 0.17, + "grad_norm": 2.150403718440524, + "kl": 0.18505859375, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0148, + "reward": 2.5233949422836304, + "reward_std": 0.3504791557788849, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.06688292883336544, + "rewards/tag_count_reward": 1.0, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 559.7083435058594, - "epoch": 0.341, - "grad_norm": 16.084642786041737, - "kl": 2.78125, - "learning_rate": 8.499396116122535e-07, - "loss": 0.404, - "reward": 2.5177054405212402, - "reward_std": 0.358899861574173, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.018752962350845337, - "rewards/tag_count_reward": 0.953125, + "completion_length": 499.25, + "epoch": 0.1705, + "grad_norm": 2.1347353681012384, + "kl": 0.1826171875, + "learning_rate": 9.86442406878136e-07, + "loss": 0.0095, + "reward": 2.8164087533950806, + "reward_std": 0.22795867174863815, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.06032741814851761, + "rewards/tag_count_reward": 0.9947916865348816, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 552.3125305175781, - "epoch": 0.342, - "grad_norm": 22.00620591226436, - "kl": 2.71875, - "learning_rate": 8.487667956935087e-07, - "loss": 0.1789, - "reward": 2.8170549869537354, - "reward_std": 0.35434219241142273, - "rewards/accuracy_reward": 0.8958333432674408, + "completion_length": 545.9791870117188, + "epoch": 0.171, + "grad_norm": 2.0936215044851685, + "kl": 0.14453125, + "learning_rate": 9.862504074528126e-07, + "loss": -0.0212, + "reward": 2.819482445716858, + "reward_std": 0.2626145929098129, + "rewards/accuracy_reward": 0.8958333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026695125736296177, - "rewards/tag_count_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.07635113224387169, + "rewards/tag_count_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 522.9791717529297, - "epoch": 0.343, - "grad_norm": 22.634127347183696, - "kl": 3.59375, - "learning_rate": 8.475903393956433e-07, - "loss": 0.3774, - "reward": 2.7711684703826904, - "reward_std": 0.3556895852088928, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018762326799333096, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 541.4375, + "epoch": 0.1715, + "grad_norm": 2.104187790019279, + "kl": 0.15478515625, + "learning_rate": 9.860570791330911e-07, + "loss": 0.0196, + "reward": 2.7721027135849, + "reward_std": 0.3143990561366081, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.07164733856916428, + "rewards/tag_count_reward": 0.9895833432674408, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 594.7291870117188, - "epoch": 0.344, - "grad_norm": 25.37619859311342, - "kl": 2.046875, - "learning_rate": 8.464102570534061e-07, - "loss": 0.0985, - "reward": 2.51139497756958, - "reward_std": 0.4084962010383606, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.03374398034065962, - "rewards/tag_count_reward": 0.9687500298023224, + "completion_length": 584.7291870117188, + "epoch": 0.172, + "grad_norm": 3.299557850274513, + "kl": 0.16455078125, + "learning_rate": 9.85862422507884e-07, + "loss": 0.1283, + "reward": 2.278268575668335, + "reward_std": 0.44967466592788696, + "rewards/accuracy_reward": 0.395833358168602, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.04812048375606537, + "rewards/tag_count_reward": 0.9791666865348816, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 584.375, - "epoch": 0.345, - "grad_norm": 8.691345932821196, - "kl": 2.03515625, - "learning_rate": 8.452265630457282e-07, - "loss": 0.2564, - "reward": 2.603990077972412, - "reward_std": 0.5245843231678009, + "completion_length": 579.9583435058594, + "epoch": 0.1725, + "grad_norm": 4.038057650027608, + "kl": 0.16552734375, + "learning_rate": 9.856664381701483e-07, + "loss": 0.0722, + "reward": 2.5613759756088257, + "reward_std": 0.376383513212204, "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02448232937604189, - "rewards/tag_count_reward": 0.9479166865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.06188800558447838, + "rewards/tag_count_reward": 0.984375, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 533.9166870117188, - "epoch": 0.346, - "grad_norm": 27.37252896321387, - "kl": 1.34375, - "learning_rate": 8.440392717955475e-07, - "loss": 0.4947, - "reward": 2.6430338621139526, - "reward_std": 0.4521474093198776, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.014952600467950106, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 485.10418701171875, + "epoch": 0.173, + "grad_norm": 2.3341365558471643, + "kl": 0.15625, + "learning_rate": 9.854691267168871e-07, + "loss": 0.059, + "reward": 2.5120105743408203, + "reward_std": 0.23116411548107862, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.057433972135186195, + "rewards/tag_count_reward": 1.0, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 528.8541870117188, - "epoch": 0.347, - "grad_norm": 3.9349545954124743, - "kl": 0.39453125, - "learning_rate": 8.428483977696328e-07, - "loss": 0.0493, - "reward": 2.7178937196731567, - "reward_std": 0.36671870201826096, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026898046024143696, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 540.1875305175781, + "epoch": 0.1735, + "grad_norm": 2.6154155854500614, + "kl": 0.1572265625, + "learning_rate": 9.852704887491445e-07, + "loss": -0.0386, + "reward": 2.24581515789032, + "reward_std": 0.4455568790435791, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.05974029190838337, + "rewards/tag_count_reward": 1.0, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 536.8333740234375, - "epoch": 0.348, - "grad_norm": 5.777524675922901, - "kl": 0.3681640625, - "learning_rate": 8.416539554784089e-07, - "loss": 0.049, - "reward": 2.5611852407455444, - "reward_std": 0.3319101259112358, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.025620201602578163, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 543.5, + "epoch": 0.174, + "grad_norm": 2.774589039274609, + "kl": 0.16796875, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0403, + "reward": 2.727095603942871, + "reward_std": 0.4202606528997421, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.047210052609443665, + "rewards/tag_count_reward": 0.9895833730697632, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 553.625, - "epoch": 0.349, - "grad_norm": 5.2707839803098055, - "kl": 0.34375, - "learning_rate": 8.404559594757777e-07, - "loss": 0.0007, - "reward": 2.545145034790039, - "reward_std": 0.22191336005926132, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.026035414077341557, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 659.2083435058594, + "epoch": 0.1745, + "grad_norm": 4.335912070161832, + "kl": 0.18359375, + "learning_rate": 9.848692356945981e-07, + "loss": 0.083, + "reward": 2.5149093866348267, + "reward_std": 0.48283551633358, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.07884062826633453, + "rewards/tag_count_reward": 0.96875, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 462.12501525878906, - "epoch": 0.35, - "grad_norm": 3.643489393286096, - "kl": 0.2626953125, - "learning_rate": 8.392544243589427e-07, - "loss": 0.0118, - "reward": 2.5923283100128174, - "reward_std": 0.3613494336605072, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.022255297750234604, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 424.60418701171875, + "epoch": 0.175, + "grad_norm": 2.911920115218028, + "kl": 0.1474609375, + "learning_rate": 9.846666218300807e-07, + "loss": 0.0423, + "reward": 2.867781400680542, + "reward_std": 0.18726971745491028, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06798264011740685, + "rewards/tag_count_reward": 0.984375, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 563.3333740234375, - "epoch": 0.351, - "grad_norm": 7.73557681949032, - "kl": 0.56640625, - "learning_rate": 8.3804936476823e-07, - "loss": 0.1536, - "reward": 2.4618531465530396, - "reward_std": 0.3520192950963974, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.024257982149720192, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 625.1875, + "epoch": 0.1755, + "grad_norm": 4.16432723769119, + "kl": 0.263671875, + "learning_rate": 9.844626838956513e-07, + "loss": 0.0903, + "reward": 2.526502013206482, + "reward_std": 0.4054105877876282, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.055095236748456955, + "rewards/tag_count_reward": 0.9843750298023224, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 523.5625305175781, - "epoch": 0.352, - "grad_norm": 3.134584879525316, - "kl": 0.16259765625, - "learning_rate": 8.368407953869103e-07, - "loss": 0.0164, - "reward": 2.8710557222366333, - "reward_std": 0.16795307025313377, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03345821052789688, - "rewards/tag_count_reward": 0.9947916865348816, - "step": 352 - }, - { - "clip_ratio": 0.0, - "completion_length": 526.0208435058594, - "epoch": 0.353, - "grad_norm": 4.312118426726596, - "kl": 0.224609375, - "learning_rate": 8.356287309410204e-07, - "loss": 0.0462, - "reward": 2.8158966302871704, - "reward_std": 0.22574415802955627, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0226451912894845, + "completion_length": 596.2083740234375, + "epoch": 0.176, + "grad_norm": 4.351440295785368, + "kl": 0.3583984375, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0664, + "reward": 2.6749093532562256, + "reward_std": 0.30670662224292755, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03863241523504257, + "rewards/tag_count_reward": 0.9843750298023224, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.6458587646484, + "epoch": 0.1765, + "grad_norm": 4.117439533311138, + "kl": 0.6171875, + "learning_rate": 9.84050838306009e-07, + "loss": 0.1052, + "reward": 2.7304115295410156, + "reward_std": 0.3043531756848097, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0734081044793129, "rewards/tag_count_reward": 0.984375, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 490.1666717529297, - "epoch": 0.354, - "grad_norm": 2.993028075064908, - "kl": 0.1904296875, - "learning_rate": 8.344131861991828e-07, - "loss": -0.0266, - "reward": 2.8288527727127075, - "reward_std": 0.15067671798169613, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01836968120187521, - "rewards/tag_count_reward": 1.0, + "completion_length": 593.0, + "epoch": 0.177, + "grad_norm": 7.403178869874331, + "kl": 1.310546875, + "learning_rate": 9.838429319053495e-07, + "loss": 0.1308, + "reward": 2.756809949874878, + "reward_std": 0.3105107471346855, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.08173201233148575, + "rewards/tag_count_reward": 0.9843750298023224, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 577.9583435058594, - "epoch": 0.355, - "grad_norm": 4.692169147728827, - "kl": 0.69189453125, - "learning_rate": 8.331941759724268e-07, - "loss": 0.1281, - "reward": 2.5701699256896973, - "reward_std": 0.3895218074321747, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02705218270421028, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 587.6250305175781, + "epoch": 0.1775, + "grad_norm": 21.446192643297536, + "kl": 2.4765625, + "learning_rate": 9.836337039438803e-07, + "loss": 0.2692, + "reward": 2.3152072429656982, + "reward_std": 0.29953232035040855, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04763999953866005, + "rewards/tag_count_reward": 0.9739583432674408, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 601.1458435058594, - "epoch": 0.356, - "grad_norm": 10.169896342116992, - "kl": 0.734375, - "learning_rate": 8.319717151140072e-07, - "loss": 0.2155, - "reward": 2.395414352416992, - "reward_std": 0.416961133480072, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026460560970008373, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 559.5625305175781, + "epoch": 0.178, + "grad_norm": 28.939767658793734, + "kl": 3.359375, + "learning_rate": 9.83423155058946e-07, + "loss": 0.3177, + "reward": 2.3154940605163574, + "reward_std": 0.4568801373243332, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03693648986518383, + "rewards/tag_count_reward": 0.9635416865348816, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 546.5416870117188, - "epoch": 0.357, - "grad_norm": 11.244917008930972, - "kl": 0.6279296875, - "learning_rate": 8.307458185192238e-07, - "loss": 0.1956, - "reward": 2.8225170373916626, - "reward_std": 0.34238358587026596, - "rewards/accuracy_reward": 0.8958333730697632, + "completion_length": 534.0, + "epoch": 0.1785, + "grad_norm": 32.29729153403837, + "kl": 2.34375, + "learning_rate": 9.832112858919155e-07, + "loss": 0.0816, + "reward": 2.6441575288772583, + "reward_std": 0.3644861727952957, + "rewards/accuracy_reward": 0.6875000298023224, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.021233050152659416, - "rewards/tag_count_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.038134196773171425, + "rewards/tag_count_reward": 0.9947916865348816, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 601.2916870117188, - "epoch": 0.358, - "grad_norm": 6.159922849123736, - "kl": 0.908203125, - "learning_rate": 8.295165011252396e-07, - "loss": 0.1542, - "reward": 2.7077341079711914, - "reward_std": 0.3084152042865753, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03184933587908745, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 563.75, + "epoch": 0.179, + "grad_norm": 13.6168708257534, + "kl": 1.30078125, + "learning_rate": 9.829980970881784e-07, + "loss": 0.0706, + "reward": 2.5547882318496704, + "reward_std": 0.30551889538764954, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03548960853368044, + "rewards/tag_count_reward": 1.0, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 506.85418701171875, - "epoch": 0.359, - "grad_norm": 4.104290958705583, - "kl": 0.3466796875, - "learning_rate": 8.282837779108993e-07, - "loss": 0.0297, - "reward": 2.7414658069610596, - "reward_std": 0.21230606734752655, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02415929874405265, - "rewards/tag_count_reward": 0.9947916865348816, + "completion_length": 569.4791870117188, + "epoch": 0.1795, + "grad_norm": 9.408634350051774, + "kl": 0.673828125, + "learning_rate": 9.82783589297145e-07, + "loss": 0.0145, + "reward": 2.756742000579834, + "reward_std": 0.25605448335409164, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06443855538964272, + "rewards/tag_count_reward": 0.9739583432674408, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 547.5000305175781, - "epoch": 0.36, - "grad_norm": 20.57792012790243, - "kl": 2.328125, - "learning_rate": 8.270476638965461e-07, - "loss": 0.502, - "reward": 2.5797927379608154, - "reward_std": 0.31256987154483795, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01916569098830223, - "rewards/tag_count_reward": 0.9531250298023224, + "completion_length": 503.06251525878906, + "epoch": 0.18, + "grad_norm": 3.6640521264364514, + "kl": 0.25537109375, + "learning_rate": 9.825677631722435e-07, + "loss": -0.0335, + "reward": 2.7158700227737427, + "reward_std": 0.3013303726911545, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.07926897704601288, + "rewards/tag_count_reward": 0.9895833432674408, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 729.3541870117188, - "epoch": 0.361, - "grad_norm": 42.22333450542258, - "kl": 6.875, - "learning_rate": 8.258081741438394e-07, - "loss": 0.8517, - "reward": 2.3151514530181885, - "reward_std": 0.5804566144943237, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.021654206328094006, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 513.1250305175781, + "epoch": 0.1805, + "grad_norm": 5.09067870684325, + "kl": 0.1796875, + "learning_rate": 9.823506193709174e-07, + "loss": -0.0324, + "reward": 2.6296072006225586, + "reward_std": 0.17554676160216331, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.059629036113619804, + "rewards/tag_count_reward": 0.9947916865348816, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 597.5416870117188, - "epoch": 0.362, - "grad_norm": 20.91301673590996, - "kl": 3.27734375, - "learning_rate": 8.245653237555705e-07, - "loss": 0.4718, - "reward": 2.5699336528778076, - "reward_std": 0.4157796800136566, - "rewards/accuracy_reward": 0.6666666865348816, + "completion_length": 527.1875152587891, + "epoch": 0.181, + "grad_norm": 2.3488865190438, + "kl": 0.1416015625, + "learning_rate": 9.821321585546243e-07, + "loss": -0.0337, + "reward": 2.4458130598068237, + "reward_std": 0.36342713236808777, + "rewards/accuracy_reward": 0.5416666865348816, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03076085541397333, - "rewards/tag_count_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.08196471631526947, + "rewards/tag_count_reward": 1.0, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 526.9166717529297, - "epoch": 0.363, - "grad_norm": 15.16442092292686, - "kl": 1.78515625, - "learning_rate": 8.23319127875479e-07, - "loss": 0.2643, - "reward": 2.8612154722213745, - "reward_std": 0.2645837068557739, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.020729007199406624, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 511.06251525878906, + "epoch": 0.1815, + "grad_norm": 2.429968776707808, + "kl": 0.13134765625, + "learning_rate": 9.81912381388834e-07, + "loss": 0.0443, + "reward": 2.6360349655151367, + "reward_std": 0.3865511268377304, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06535414233803749, + "rewards/tag_count_reward": 1.0, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 574.6250152587891, - "epoch": 0.364, - "grad_norm": 27.79134380026425, - "kl": 2.84375, - "learning_rate": 8.220696016880687e-07, - "loss": 0.4106, - "reward": 2.6281174421310425, - "reward_std": 0.4037089943885803, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.015979719813913107, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 567.7708740234375, + "epoch": 0.182, + "grad_norm": 2.0606641273830903, + "kl": 0.13134765625, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0193, + "reward": 2.468637228012085, + "reward_std": 0.42131057381629944, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07302946597337723, + "rewards/tag_count_reward": 1.0, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 736.8958435058594, - "epoch": 0.365, - "grad_norm": 20.741864587098874, - "kl": 3.328125, - "learning_rate": 8.208167604184217e-07, - "loss": 0.5224, - "reward": 2.400440216064453, - "reward_std": 0.5106681287288666, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024907216429710388, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 610.7291870117188, + "epoch": 0.1825, + "grad_norm": 2.0345217984138086, + "kl": 0.127197265625, + "learning_rate": 9.814688806906868e-07, + "loss": 0.0449, + "reward": 2.4708261489868164, + "reward_std": 0.3200060650706291, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.07778490334749222, + "rewards/tag_count_reward": 1.0, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 651.7708435058594, - "epoch": 0.366, - "grad_norm": 14.227390419980614, - "kl": 2.10546875, - "learning_rate": 8.195606193320136e-07, - "loss": 0.3847, - "reward": 2.4665273427963257, - "reward_std": 0.6216453611850739, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03347271308302879, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 524.7083435058594, + "epoch": 0.183, + "grad_norm": 2.1808261277456205, + "kl": 0.12158203125, + "learning_rate": 9.812451585093098e-07, + "loss": 0.0236, + "reward": 2.7193844318389893, + "reward_std": 0.34990330785512924, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.05839330144226551, + "rewards/tag_count_reward": 1.0, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 714.1458740234375, - "epoch": 0.367, - "grad_norm": 15.698648427320858, - "kl": 2.15625, - "learning_rate": 8.183011937345271e-07, - "loss": 0.3797, - "reward": 2.530668020248413, - "reward_std": 0.5888173580169678, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02141545619815588, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 483.68751525878906, + "epoch": 0.1835, + "grad_norm": 2.2192226472108367, + "kl": 0.12939453125, + "learning_rate": 9.810201226803917e-07, + "loss": 0.0069, + "reward": 2.4758822917938232, + "reward_std": 0.3382147550582886, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.058839818462729454, + "rewards/tag_count_reward": 1.0, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 646.0000305175781, - "epoch": 0.368, - "grad_norm": 24.843348556432016, - "kl": 1.98828125, - "learning_rate": 8.170384989716657e-07, - "loss": 0.5184, - "reward": 2.5058876276016235, - "reward_std": 0.6075495481491089, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023626457899808884, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 519.3750152587891, + "epoch": 0.184, + "grad_norm": 2.367140341215985, + "kl": 0.128662109375, + "learning_rate": 9.807937738894303e-07, + "loss": -0.0302, + "reward": 2.6772682666778564, + "reward_std": 0.2594939023256302, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.051898419857025146, + "rewards/tag_count_reward": 1.0, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 597.1875, - "epoch": 0.369, - "grad_norm": 16.338985391404897, - "kl": 1.15625, - "learning_rate": 8.157725504289664e-07, - "loss": 0.3953, - "reward": 2.7404117584228516, - "reward_std": 0.3872096836566925, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02347709983587265, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 513.6875, + "epoch": 0.1845, + "grad_norm": 2.3819557955341617, + "kl": 0.13525390625, + "learning_rate": 9.805661128259235e-07, + "loss": -0.0399, + "reward": 2.5830910205841064, + "reward_std": 0.43722137808799744, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.06968686729669571, + "rewards/tag_count_reward": 1.0, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 596.0833435058594, - "epoch": 0.37, - "grad_norm": 13.06241235347548, - "kl": 1.5, - "learning_rate": 8.145033635316128e-07, - "loss": 0.3765, - "reward": 2.6401225328445435, - "reward_std": 0.5106720924377441, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01959980558604002, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 563.4166870117188, + "epoch": 0.185, + "grad_norm": 2.4301633780659135, + "kl": 0.131103515625, + "learning_rate": 9.80337140183366e-07, + "loss": 0.0573, + "reward": 2.719074010848999, + "reward_std": 0.37694530189037323, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06564832478761673, + "rewards/tag_count_reward": 1.0, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 726.5208435058594, - "epoch": 0.371, - "grad_norm": 14.668565421161508, - "kl": 2.65625, - "learning_rate": 8.13230953744247e-07, - "loss": 0.5462, - "reward": 2.342048168182373, - "reward_std": 0.4522154927253723, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02774341218173504, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 482.1875, + "epoch": 0.1855, + "grad_norm": 2.4697642245743032, + "kl": 0.134521484375, + "learning_rate": 9.801068566592483e-07, + "loss": 0.0284, + "reward": 2.754256248474121, + "reward_std": 0.2568470761179924, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06692435592412949, + "rewards/tag_count_reward": 0.9947916865348816, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 634.5416870117188, - "epoch": 0.372, - "grad_norm": 10.126972374008618, - "kl": 2.734375, - "learning_rate": 8.119553365707802e-07, - "loss": 0.5343, - "reward": 2.6384775638580322, - "reward_std": 0.559327244758606, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016036429908126593, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 509.1666717529297, + "epoch": 0.186, + "grad_norm": 2.448475553737196, + "kl": 0.116943359375, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0663, + "reward": 2.611233115196228, + "reward_std": 0.36806730926036835, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.07800298929214478, + "rewards/tag_count_reward": 0.9947916865348816, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 620.6041870117188, - "epoch": 0.373, - "grad_norm": 12.491483118539476, - "kl": 3.515625, - "learning_rate": 8.106765275542053e-07, - "loss": 0.608, - "reward": 2.324834704399109, - "reward_std": 0.6106734275817871, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.022387592121958733, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 533.8125305175781, + "epoch": 0.1865, + "grad_norm": 2.0486712055274343, + "kl": 0.118896484375, + "learning_rate": 9.796423597762588e-07, + "loss": -0.0294, + "reward": 2.593945264816284, + "reward_std": 0.43071576952934265, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.09702702611684799, + "rewards/tag_count_reward": 0.9895833432674408, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 595.9166870117188, - "epoch": 0.374, - "grad_norm": 34.92698420205304, - "kl": 4.8046875, - "learning_rate": 8.093945422764069e-07, - "loss": 0.7281, - "reward": 2.6930631399154663, - "reward_std": 0.4513649195432663, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015270282980054617, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 500.25001525878906, + "epoch": 0.187, + "grad_norm": 2.3264130204399525, + "kl": 0.120849609375, + "learning_rate": 9.794081478323245e-07, + "loss": 0.0417, + "reward": 2.555578351020813, + "reward_std": 0.32255755364894867, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.10414392501115799, + "rewards/tag_count_reward": 1.0, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 555.0000152587891, - "epoch": 0.375, - "grad_norm": 20.20783740449395, - "kl": 3.421875, - "learning_rate": 8.081093963579707e-07, - "loss": 0.6528, - "reward": 2.706920862197876, - "reward_std": 0.5360372513532639, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.01703750714659691, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 563.0625305175781, + "epoch": 0.1875, + "grad_norm": 2.1622259964161303, + "kl": 0.12548828125, + "learning_rate": 9.791726278367021e-07, + "loss": -0.0156, + "reward": 2.6402961015701294, + "reward_std": 0.40212856233119965, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.07671798765659332, + "rewards/tag_count_reward": 0.9947916865348816, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 809.8125, - "epoch": 0.376, - "grad_norm": 42.45590838876723, - "kl": 8.359375, - "learning_rate": 8.068211054579943e-07, - "loss": 0.939, - "reward": 2.4243576526641846, - "reward_std": 0.6719434857368469, - "rewards/accuracy_reward": 0.6041666865348816, + "completion_length": 503.00001525878906, + "epoch": 0.188, + "grad_norm": 1.9354230871668425, + "kl": 0.109375, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0093, + "reward": 2.648646354675293, + "reward_std": 0.22648237645626068, + "rewards/accuracy_reward": 0.7708333730697632, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.033975621685385704, - "rewards/tag_count_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.10135376825928688, + "rewards/tag_count_reward": 1.0, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 618.2083435058594, - "epoch": 0.377, - "grad_norm": 8.055288053209214, - "kl": 2.9609375, - "learning_rate": 8.055296852738956e-07, - "loss": 0.5277, - "reward": 2.515875458717346, - "reward_std": 0.6309832334518433, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018846786580979824, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 452.4791717529297, + "epoch": 0.1885, + "grad_norm": 2.4900006677038897, + "kl": 0.1298828125, + "learning_rate": 9.786976665641138e-07, + "loss": 0.0358, + "reward": 2.7121907472610474, + "reward_std": 0.29230934381484985, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.07600371912121773, + "rewards/tag_count_reward": 0.9895833730697632, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 531.5625, - "epoch": 0.378, - "grad_norm": 10.73266293127411, - "kl": 2.0234375, - "learning_rate": 8.04235151541222e-07, - "loss": 0.5295, - "reward": 2.736036777496338, - "reward_std": 0.5330559611320496, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012227283790707588, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 510.14585876464844, + "epoch": 0.189, + "grad_norm": 2.3438629757333143, + "kl": 0.12646484375, + "learning_rate": 9.784582267339622e-07, + "loss": 0.0447, + "reward": 2.517542004585266, + "reward_std": 0.44227664172649384, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.07968007400631905, + "rewards/tag_count_reward": 1.0, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 577.9166717529297, - "epoch": 0.379, - "grad_norm": 30.827479254698062, - "kl": 1.96484375, - "learning_rate": 8.029375200334587e-07, - "loss": 0.4051, - "reward": 2.63044536113739, - "reward_std": 0.4179770350456238, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.02059647301211953, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 498.12501525878906, + "epoch": 0.1895, + "grad_norm": 2.352684614662993, + "kl": 0.1328125, + "learning_rate": 9.78217481745747e-07, + "loss": -0.0119, + "reward": 2.3344022035598755, + "reward_std": 0.4069585055112839, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.049278499558568, + "rewards/tag_count_reward": 0.9947916865348816, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 571.625, - "epoch": 0.38, - "grad_norm": 11.564342505897628, - "kl": 0.927734375, - "learning_rate": 8.01636806561836e-07, - "loss": 0.286, - "reward": 2.6674641370773315, - "reward_std": 0.36413097381591797, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.030452590435743332, - "rewards/tag_count_reward": 0.96875, + "completion_length": 446.18751525878906, + "epoch": 0.19, + "grad_norm": 2.4157444132247923, + "kl": 0.12841796875, + "learning_rate": 9.779754323328192e-07, + "loss": 0.057, + "reward": 2.842611312866211, + "reward_std": 0.24086056649684906, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.060166530311107635, + "rewards/tag_count_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 627.7083587646484, - "epoch": 0.381, - "grad_norm": 20.79576339395751, - "kl": 2.859375, - "learning_rate": 8.003330269751372e-07, - "loss": 0.473, - "reward": 2.5600701570510864, - "reward_std": 0.5540246367454529, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.02152709010988474, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 444.62501525878906, + "epoch": 0.1905, + "grad_norm": 2.419479532121062, + "kl": 0.14306640625, + "learning_rate": 9.777320792325025e-07, + "loss": 0.1048, + "reward": 2.7526127099990845, + "reward_std": 0.14365122094750404, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05467919073998928, + "rewards/tag_count_reward": 0.9947916865348816, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 549.8125305175781, - "epoch": 0.382, - "grad_norm": 5.584732514662178, - "kl": 0.400390625, - "learning_rate": 7.990261971595048e-07, - "loss": 0.037, - "reward": 2.806509017944336, - "reward_std": 0.29304996132850647, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.030296694487333298, - "rewards/tag_count_reward": 0.9895833432674408, + "completion_length": 459.0833435058594, + "epoch": 0.191, + "grad_norm": 2.372783987416792, + "kl": 0.12890625, + "learning_rate": 9.774874231860935e-07, + "loss": 0.0066, + "reward": 2.371148109436035, + "reward_std": 0.40637652575969696, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.07329631596803665, + "rewards/tag_count_reward": 1.0, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 804.3541870117188, - "epoch": 0.383, - "grad_norm": 11.399359693772158, - "kl": 2.4296875, - "learning_rate": 7.977163330382479e-07, - "loss": 0.4973, - "reward": 2.5117881298065186, - "reward_std": 0.5285173058509827, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.028142395429313183, - "rewards/tag_count_reward": 0.8177083730697632, + "completion_length": 471.2291717529297, + "epoch": 0.1915, + "grad_norm": 2.226194607367302, + "kl": 0.13232421875, + "learning_rate": 9.772414649388568e-07, + "loss": 0.044, + "reward": 2.6466528177261353, + "reward_std": 0.2924405038356781, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.08251398801803589, + "rewards/tag_count_reward": 1.0, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 529.2291870117188, - "epoch": 0.384, - "grad_norm": 15.184188235037553, - "kl": 1.04296875, - "learning_rate": 7.964034505716476e-07, - "loss": 0.3124, - "reward": 2.5725693702697754, - "reward_std": 0.3802667409181595, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.015972374938428402, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 500.6041717529297, + "epoch": 0.192, + "grad_norm": 2.132124535999422, + "kl": 0.14892578125, + "learning_rate": 9.769942052400235e-07, + "loss": -0.031, + "reward": 2.7737842798233032, + "reward_std": 0.23282310366630554, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.07343816570937634, + "rewards/tag_count_reward": 1.0, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 677.3333435058594, - "epoch": 0.385, - "grad_norm": 6.513600915558159, - "kl": 2.9375, - "learning_rate": 7.950875657567621e-07, - "loss": 0.5198, - "reward": 2.4022648334503174, - "reward_std": 0.6920067071914673, - "rewards/accuracy_reward": 0.5833333730697632, + "completion_length": 534.7708435058594, + "epoch": 0.1925, + "grad_norm": 2.1795003146204626, + "kl": 0.1416015625, + "learning_rate": 9.767456448427896e-07, + "loss": 0.0287, + "reward": 2.5758708715438843, + "reward_std": 0.38380637764930725, + "rewards/accuracy_reward": 0.6666666865348816, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0178740993142128, - "rewards/tag_count_reward": 0.84375, + "rewards/repetition_penalty_reward": -0.07864313945174217, + "rewards/tag_count_reward": 0.9947916865348816, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 572.7708435058594, - "epoch": 0.386, - "grad_norm": 10.893931836031141, - "kl": 1.95703125, - "learning_rate": 7.93768694627233e-07, - "loss": 0.4043, - "reward": 2.5226422548294067, - "reward_std": 0.502282902598381, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.022496800869703293, - "rewards/tag_count_reward": 0.90625, + "completion_length": 500.5625305175781, + "epoch": 0.193, + "grad_norm": 2.3063809528831736, + "kl": 0.14013671875, + "learning_rate": 9.764957845043135e-07, + "loss": -0.021, + "reward": 2.755507707595825, + "reward_std": 0.29851196706295013, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.0500478558242321, + "rewards/tag_count_reward": 1.0, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 650.7083435058594, - "epoch": 0.387, - "grad_norm": 11.694285585261715, - "kl": 2.919921875, - "learning_rate": 7.924468532530883e-07, - "loss": 0.3373, - "reward": 2.491190791130066, - "reward_std": 0.4670180529356003, - "rewards/accuracy_reward": 0.645833358168602, + "completion_length": 503.33335876464844, + "epoch": 0.1935, + "grad_norm": 2.1645139159061717, + "kl": 0.131591796875, + "learning_rate": 9.76244624985713e-07, + "loss": 0.0266, + "reward": 2.502621650695801, + "reward_std": 0.42333585023880005, + "rewards/accuracy_reward": 0.583333358168602, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03311488404870033, - "rewards/tag_count_reward": 0.8854166865348816, + "rewards/repetition_penalty_reward": -0.06335068121552467, + "rewards/tag_count_reward": 0.9895833432674408, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 834.1041870117188, - "epoch": 0.388, - "grad_norm": 11.93709525868574, - "kl": 5.203125, - "learning_rate": 7.911220577405484e-07, - "loss": 0.6908, - "reward": 2.1863056421279907, - "reward_std": 0.6824186444282532, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.020291661843657494, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 477.0833435058594, + "epoch": 0.194, + "grad_norm": 2.2028964179218256, + "kl": 0.14697265625, + "learning_rate": 9.759921670520634e-07, + "loss": -0.0024, + "reward": 2.746849298477173, + "reward_std": 0.3060501739382744, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.058706341311335564, + "rewards/tag_count_reward": 1.0, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 692.3333435058594, - "epoch": 0.389, - "grad_norm": 9.021930665026163, - "kl": 3.2265625, - "learning_rate": 7.897943242318285e-07, - "loss": 0.5433, - "reward": 2.5233638286590576, - "reward_std": 0.5364340543746948, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.026983547024428844, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 488.56251525878906, + "epoch": 0.1945, + "grad_norm": 2.6481468308941714, + "kl": 0.1396484375, + "learning_rate": 9.757384114723953e-07, + "loss": -0.0251, + "reward": 2.859673857688904, + "reward_std": 0.1833506003022194, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.06220110505819321, + "rewards/tag_count_reward": 0.9843750298023224, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 705.8750305175781, - "epoch": 0.39, - "grad_norm": 13.205610036016818, - "kl": 3.453125, - "learning_rate": 7.884636689049422e-07, - "loss": 0.76, - "reward": 2.53139591217041, - "reward_std": 0.6152072250843048, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.018951344303786755, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 490.0625, + "epoch": 0.195, + "grad_norm": 2.245242392422714, + "kl": 0.14990234375, + "learning_rate": 9.754833590196926e-07, + "loss": -0.0097, + "reward": 2.5275352001190186, + "reward_std": 0.3732472062110901, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.055798135697841644, + "rewards/tag_count_reward": 1.0, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 733.6250305175781, - "epoch": 0.391, - "grad_norm": 7.373593323078646, - "kl": 3.265625, - "learning_rate": 7.871301079735049e-07, - "loss": 0.5816, - "reward": 2.51271653175354, - "reward_std": 0.5442458987236023, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.028950226493179798, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 527.5833587646484, + "epoch": 0.1955, + "grad_norm": 2.988740973444905, + "kl": 0.15478515625, + "learning_rate": 9.752270104708888e-07, + "loss": 0.1089, + "reward": 2.5857293605804443, + "reward_std": 0.4840521216392517, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.06531241536140442, + "rewards/tag_count_reward": 0.984375, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 897.1666870117188, - "epoch": 0.392, - "grad_norm": 24.00799478501999, - "kl": 6.640625, - "learning_rate": 7.857936576865356e-07, - "loss": 0.9816, - "reward": 2.3230772018432617, - "reward_std": 0.7363496124744415, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.013728468678891659, - "rewards/tag_count_reward": 0.6979166865348816, + "completion_length": 483.75001525878906, + "epoch": 0.196, + "grad_norm": 2.355763241309473, + "kl": 0.146484375, + "learning_rate": 9.749693666068663e-07, + "loss": 0.024, + "reward": 2.677127242088318, + "reward_std": 0.3999278247356415, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.06072022393345833, + "rewards/tag_count_reward": 0.9947916865348816, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 753.1666870117188, - "epoch": 0.393, - "grad_norm": 12.878430832163207, - "kl": 3.63671875, - "learning_rate": 7.844543343282595e-07, - "loss": 0.7256, - "reward": 2.5792795419692993, - "reward_std": 0.5005324482917786, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.026623360812664032, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 443.3541717529297, + "epoch": 0.1965, + "grad_norm": 2.370253055532675, + "kl": 0.15625, + "learning_rate": 9.747104282124531e-07, + "loss": 0.0625, + "reward": 2.6037063598632812, + "reward_std": 0.2899606078863144, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.0490715391933918, + "rewards/tag_count_reward": 1.0, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 723.9791870117188, - "epoch": 0.394, - "grad_norm": 22.790620068840962, - "kl": 3.4140625, - "learning_rate": 7.831121542179086e-07, - "loss": 0.794, - "reward": 2.396026372909546, - "reward_std": 0.5379204005002975, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02237648330628872, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 456.3333435058594, + "epoch": 0.197, + "grad_norm": 2.4610089541965694, + "kl": 0.15625, + "learning_rate": 9.744501960764203e-07, + "loss": 0.0754, + "reward": 2.8426754474639893, + "reward_std": 0.21852993965148926, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.07051906362175941, + "rewards/tag_count_reward": 0.9895833432674408, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 863.2291870117188, - "epoch": 0.395, - "grad_norm": 19.51867731167796, - "kl": 5.0625, - "learning_rate": 7.817671337095244e-07, - "loss": 0.5991, - "reward": 2.26111102104187, - "reward_std": 0.6565420031547546, - "rewards/accuracy_reward": 0.5000000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02013904694467783, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 597.6875, + "epoch": 0.1975, + "grad_norm": 3.405083047196158, + "kl": 0.166015625, + "learning_rate": 9.741886709914803e-07, + "loss": 0.1408, + "reward": 2.701059341430664, + "reward_std": 0.3547600954771042, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.06803778372704983, + "rewards/tag_count_reward": 0.984375, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 900.2708740234375, - "epoch": 0.396, - "grad_norm": 17.88490632655815, - "kl": 6.6171875, - "learning_rate": 7.804192891917571e-07, - "loss": 0.7442, - "reward": 2.096524953842163, - "reward_std": 0.6609176695346832, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01458633691072464, - "rewards/tag_count_reward": 0.6875, + "completion_length": 473.7916717529297, + "epoch": 0.198, + "grad_norm": 2.7966864501073188, + "kl": 0.17822265625, + "learning_rate": 9.739258537542835e-07, + "loss": -0.039, + "reward": 2.5628772974014282, + "reward_std": 0.3083910197019577, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.0725395604968071, + "rewards/tag_count_reward": 0.9895833432674408, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 727.7083435058594, - "epoch": 0.397, - "grad_norm": 7.902766105189536, - "kl": 3.4375, - "learning_rate": 7.79068637087667e-07, - "loss": 0.619, - "reward": 2.3182718753814697, - "reward_std": 0.6975450813770294, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.027214372530579567, - "rewards/tag_count_reward": 0.796875, + "completion_length": 553.5833435058594, + "epoch": 0.1985, + "grad_norm": 5.916243349638, + "kl": 0.1669921875, + "learning_rate": 9.73661745165417e-07, + "loss": 0.1781, + "reward": 2.6928629875183105, + "reward_std": 0.3948093354701996, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.08144245855510235, + "rewards/tag_count_reward": 0.96875, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 923.0833435058594, - "epoch": 0.398, - "grad_norm": 8.813050784830676, - "kl": 4.828125, - "learning_rate": 7.777151938545235e-07, - "loss": 0.744, - "reward": 2.218664765357971, - "reward_std": 0.687986433506012, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026126965880393982, - "rewards/tag_count_reward": 0.7239583730697632, + "completion_length": 466.97918701171875, + "epoch": 0.199, + "grad_norm": 2.568535579007926, + "kl": 0.17041015625, + "learning_rate": 9.733963460294015e-07, + "loss": -0.0109, + "reward": 2.8347532749176025, + "reward_std": 0.23304447531700134, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.061080172657966614, + "rewards/tag_count_reward": 1.0, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 731.9583435058594, - "epoch": 0.399, - "grad_norm": 17.369257892310884, - "kl": 3.0703125, - "learning_rate": 7.763589759836058e-07, - "loss": 0.5943, - "reward": 2.4915345907211304, - "reward_std": 0.6682489514350891, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.020618132315576077, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 444.31251525878906, + "epoch": 0.1995, + "grad_norm": 2.6070633311402096, + "kl": 0.1962890625, + "learning_rate": 9.731296571546885e-07, + "loss": 0.0098, + "reward": 2.824558138847351, + "reward_std": 0.24229364097118378, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.07821962051093578, + "rewards/tag_count_reward": 1.0, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 669.4583435058594, - "epoch": 0.4, - "grad_norm": 28.5217854549646, - "kl": 2.2109375, - "learning_rate": 7.75e-07, - "loss": 0.495, - "reward": 2.6985886096954346, - "reward_std": 0.4976983517408371, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.027106057852506638, - "rewards/tag_count_reward": 0.8645833730697632, + "completion_length": 473.3541717529297, + "epoch": 0.2, + "grad_norm": 3.67637754483244, + "kl": 0.19482421875, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0453, + "reward": 2.5268030166625977, + "reward_std": 0.3516158163547516, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06521105766296387, + "rewards/tag_count_reward": 0.9947916865348816, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 747.625, - "epoch": 0.401, - "grad_norm": 8.8439464948951, - "kl": 2.84375, - "learning_rate": 7.736382824623999e-07, - "loss": 0.4971, - "reward": 2.371925711631775, - "reward_std": 0.5836665034294128, + "completion_length": 438.5416717529297, + "epoch": 0.2005, + "grad_norm": 6.51901226376256, + "kl": 0.208984375, + "learning_rate": 9.72592413442619e-07, + "loss": -0.0584, + "reward": 2.4788198471069336, + "reward_std": 0.30706703662872314, "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02390767727047205, - "rewards/tag_count_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.0489580724388361, + "rewards/tag_count_reward": 0.9791666865348816, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 632.9791870117188, - "epoch": 0.402, - "grad_norm": 7.326080260643342, - "kl": 2.3515625, - "learning_rate": 7.72273839962904e-07, - "loss": 0.4332, - "reward": 2.304866313934326, - "reward_std": 0.5863041877746582, - "rewards/accuracy_reward": 0.5, + "completion_length": 488.50001525878906, + "epoch": 0.201, + "grad_norm": 2.34033624330632, + "kl": 0.18994140625, + "learning_rate": 9.723218602418e-07, + "loss": -0.016, + "reward": 2.7911014556884766, + "reward_std": 0.32635878026485443, + "rewards/accuracy_reward": 0.8750000298023224, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.026731058955192566, - "rewards/tag_count_reward": 0.859375, + "rewards/repetition_penalty_reward": -0.05091256648302078, + "rewards/tag_count_reward": 0.9947916865348816, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 852.3958740234375, - "epoch": 0.403, - "grad_norm": 12.84430423544328, - "kl": 4.61328125, - "learning_rate": 7.709066891268133e-07, - "loss": 0.5593, - "reward": 2.23075133562088, - "reward_std": 0.5370394438505173, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.029665381647646427, - "rewards/tag_count_reward": 0.7395833432674408, + "completion_length": 476.56251525878906, + "epoch": 0.2015, + "grad_norm": 2.5623947724894274, + "kl": 0.189453125, + "learning_rate": 9.720500205753538e-07, + "loss": -0.0322, + "reward": 2.698399066925049, + "reward_std": 0.403295561671257, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.06548996269702911, + "rewards/tag_count_reward": 1.0, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 815.9375, - "epoch": 0.404, - "grad_norm": 19.101834754723175, - "kl": 5.0, - "learning_rate": 7.695368466124296e-07, - "loss": 0.5902, - "reward": 2.4460870027542114, - "reward_std": 0.5804774314165115, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0174547852948308, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 471.64585876464844, + "epoch": 0.202, + "grad_norm": 2.6351050859979885, + "kl": 0.20703125, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0172, + "reward": 2.64251971244812, + "reward_std": 0.2700469493865967, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04845273122191429, + "rewards/tag_count_reward": 0.9895833432674408, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 951.1458740234375, - "epoch": 0.405, - "grad_norm": 11.760380958455592, - "kl": 6.1875, - "learning_rate": 7.681643291108517e-07, - "loss": 0.8655, - "reward": 2.131704032421112, - "reward_std": 0.7297748029232025, - "rewards/accuracy_reward": 0.4375000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017601476050913334, - "rewards/tag_count_reward": 0.7187500298023224, + "completion_length": 490.85418701171875, + "epoch": 0.2025, + "grad_norm": 2.5295824552139585, + "kl": 0.18994140625, + "learning_rate": 9.71502485161779e-07, + "loss": -0.0095, + "reward": 2.682486653327942, + "reward_std": 0.47308868169784546, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.0432078093290329, + "rewards/tag_count_reward": 0.9895833432674408, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 657.4375305175781, - "epoch": 0.406, - "grad_norm": 8.483281318881998, - "kl": 2.7734375, - "learning_rate": 7.667891533457718e-07, - "loss": 0.3648, - "reward": 2.377819776535034, - "reward_std": 0.6631312072277069, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.019749819301068783, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 449.97918701171875, + "epoch": 0.203, + "grad_norm": 4.590231605072331, + "kl": 0.1875, + "learning_rate": 9.71226791082538e-07, + "loss": 0.0868, + "reward": 2.853319525718689, + "reward_std": 0.2601998746395111, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03730554319918156, + "rewards/tag_count_reward": 0.9947916865348816, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 532.6458587646484, - "epoch": 0.407, - "grad_norm": 14.061905199941569, - "kl": 0.779296875, - "learning_rate": 7.654113360732732e-07, - "loss": 0.1726, - "reward": 2.7471961975097656, - "reward_std": 0.508865624666214, - "rewards/accuracy_reward": 0.8333333730697632, + "completion_length": 459.7083435058594, + "epoch": 0.2035, + "grad_norm": 3.798867450655633, + "kl": 0.1826171875, + "learning_rate": 9.709498138734403e-07, + "loss": 0.0078, + "reward": 2.8343251943588257, + "reward_std": 0.3080275356769562, + "rewards/accuracy_reward": 0.8958333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.027109289541840553, - "rewards/tag_count_reward": 0.9479166865348816, + "rewards/repetition_penalty_reward": -0.03893873654305935, + "rewards/tag_count_reward": 0.9843750298023224, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 706.9583435058594, - "epoch": 0.408, - "grad_norm": 11.779388411702078, - "kl": 2.3046875, - "learning_rate": 7.640308940816239e-07, - "loss": 0.4889, - "reward": 2.349950671195984, - "reward_std": 0.6774967312812805, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.014632808044552803, - "rewards/tag_count_reward": 0.8020833730697632, + "completion_length": 480.1041717529297, + "epoch": 0.204, + "grad_norm": 3.339773656845583, + "kl": 0.2431640625, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0546, + "reward": 2.397212266921997, + "reward_std": 0.38606369495391846, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04376017488539219, + "rewards/tag_count_reward": 0.9895833432674408, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 765.8125305175781, - "epoch": 0.409, - "grad_norm": 15.586280796607687, - "kl": 3.109375, - "learning_rate": 7.626478441910744e-07, - "loss": 0.578, - "reward": 2.306677460670471, - "reward_std": 0.7071038484573364, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016239337623119354, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 438.5833435058594, + "epoch": 0.2045, + "grad_norm": 4.374047431584956, + "kl": 0.2890625, + "learning_rate": 9.703920134444632e-07, + "loss": -0.0037, + "reward": 2.664395570755005, + "reward_std": 0.20635664463043213, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.08039623126387596, + "rewards/tag_count_reward": 0.9947916865348816, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 665.75, - "epoch": 0.41, - "grad_norm": 7.201254956040354, - "kl": 2.546875, - "learning_rate": 7.612622032536507e-07, - "loss": 0.3043, - "reward": 2.2321949005126953, - "reward_std": 0.5692197978496552, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.019541208632290363, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 437.2708435058594, + "epoch": 0.205, + "grad_norm": 4.577231643859023, + "kl": 0.33203125, + "learning_rate": 9.701111919237408e-07, + "loss": 0.0983, + "reward": 2.6673845052719116, + "reward_std": 0.28793033957481384, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03574047051370144, + "rewards/tag_count_reward": 0.9947916865348816, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 557.1250305175781, - "epoch": 0.411, - "grad_norm": 12.200566811103256, - "kl": 1.060546875, - "learning_rate": 7.59873988152951e-07, - "loss": 0.2802, - "reward": 2.6842163801193237, - "reward_std": 0.292233943939209, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.031061380170285702, - "rewards/tag_count_reward": 0.9583333730697632, + "completion_length": 411.68751525878906, + "epoch": 0.2055, + "grad_norm": 3.9132489725820614, + "kl": 0.314453125, + "learning_rate": 9.698290906714702e-07, + "loss": 0.058, + "reward": 2.741675853729248, + "reward_std": 0.25958670675754547, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.07256042212247849, + "rewards/tag_count_reward": 0.9947916865348816, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 589.4375305175781, - "epoch": 0.412, - "grad_norm": 20.90214024265737, - "kl": 1.5390625, - "learning_rate": 7.584832158039378e-07, - "loss": 0.2278, - "reward": 2.5838751792907715, - "reward_std": 0.5441871881484985, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03070823848247528, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 489.3125, + "epoch": 0.206, + "grad_norm": 11.654921545662503, + "kl": 0.4736328125, + "learning_rate": 9.695457105469804e-07, + "loss": 0.1564, + "reward": 2.6907659769058228, + "reward_std": 0.36029884219169617, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.07659525983035564, + "rewards/tag_count_reward": 0.9479166865348816, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 641.4791870117188, - "epoch": 0.413, - "grad_norm": 15.755392833494586, - "kl": 3.9609375, - "learning_rate": 7.570899031527332e-07, - "loss": 0.4663, - "reward": 2.525357723236084, - "reward_std": 0.607357531785965, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.0145728699862957, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 472.91668701171875, + "epoch": 0.2065, + "grad_norm": 8.960365959818047, + "kl": 0.87109375, + "learning_rate": 9.69261052413497e-07, + "loss": 0.1558, + "reward": 2.5001312494277954, + "reward_std": 0.5244300961494446, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.05542440339922905, + "rewards/tag_count_reward": 0.8958333730697632, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 899.6250305175781, - "epoch": 0.414, - "grad_norm": 21.34036856462466, - "kl": 6.265625, - "learning_rate": 7.556940671764124e-07, - "loss": 0.7498, - "reward": 2.206377148628235, - "reward_std": 0.7690772414207458, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.021053357981145382, + "completion_length": 789.2500305175781, + "epoch": 0.207, + "grad_norm": 13.034420101645079, + "kl": 2.1484375, + "learning_rate": 9.689751171381377e-07, + "loss": 0.4389, + "reward": 2.2836403846740723, + "reward_std": 0.55134117603302, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.06184585019946098, "rewards/tag_count_reward": 0.7135416865348816, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 902.6250305175781, - "epoch": 0.415, - "grad_norm": 14.645548610311325, - "kl": 5.703125, - "learning_rate": 7.54295724882796e-07, - "loss": 0.7326, - "reward": 2.3041797876358032, - "reward_std": 0.5700157135725021, - "rewards/accuracy_reward": 0.6041666865348816, + "completion_length": 678.8750305175781, + "epoch": 0.2075, + "grad_norm": 24.87345626627445, + "kl": 2.734375, + "learning_rate": 9.68687905591911e-07, + "loss": 0.2528, + "reward": 2.3144073486328125, + "reward_std": 0.4746939539909363, + "rewards/accuracy_reward": 0.6250000298023224, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.020473050884902477, + "rewards/repetition_penalty_reward": -0.031078746542334557, "rewards/tag_count_reward": 0.7343750298023224, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 601.4375, - "epoch": 0.416, - "grad_norm": 9.196475410607844, - "kl": 1.935546875, - "learning_rate": 7.528948933102438e-07, - "loss": 0.3524, - "reward": 2.62975537776947, - "reward_std": 0.5219498127698898, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024758631363511086, - "rewards/tag_count_reward": 0.890625, + "completion_length": 756.8958435058594, + "epoch": 0.208, + "grad_norm": 10.29769838863137, + "kl": 1.921875, + "learning_rate": 9.683994186497132e-07, + "loss": 0.1948, + "reward": 2.4876515865325928, + "reward_std": 0.3638898953795433, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.05227917805314064, + "rewards/tag_count_reward": 0.8177083432674408, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 541.1666870117188, - "epoch": 0.417, - "grad_norm": 6.279974669254126, - "kl": 1.14453125, - "learning_rate": 7.514915895274463e-07, - "loss": 0.2295, - "reward": 2.4813188314437866, - "reward_std": 0.307245125528425, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.022153427824378014, - "rewards/tag_count_reward": 0.96875, + "completion_length": 458.4583435058594, + "epoch": 0.2085, + "grad_norm": 4.792568485298934, + "kl": 0.650390625, + "learning_rate": 9.681096571903252e-07, + "loss": 0.0885, + "reward": 2.4742881059646606, + "reward_std": 0.4346305727958679, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.050017524510622025, + "rewards/tag_count_reward": 0.9479166865348816, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 608.6666870117188, - "epoch": 0.418, - "grad_norm": 13.670357428378258, - "kl": 2.1640625, - "learning_rate": 7.500858306332172e-07, - "loss": 0.4449, - "reward": 2.4430500268936157, - "reward_std": 0.5130402147769928, - "rewards/accuracy_reward": 0.5625, + "completion_length": 424.3125, + "epoch": 0.209, + "grad_norm": 5.038602331803735, + "kl": 0.322265625, + "learning_rate": 9.67818622096411e-07, + "loss": 0.0884, + "reward": 2.7949297428131104, + "reward_std": 0.28741036355495453, + "rewards/accuracy_reward": 0.8333333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015283319167792797, - "rewards/tag_count_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.038403624668717384, + "rewards/tag_count_reward": 1.0, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 513.375, - "epoch": 0.419, - "grad_norm": 8.83072473507965, - "kl": 1.2265625, - "learning_rate": 7.486776337562853e-07, - "loss": 0.2575, - "reward": 2.5676904916763306, - "reward_std": 0.4607061445713043, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026059521362185478, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 450.9375, + "epoch": 0.2095, + "grad_norm": 5.384359454777853, + "kl": 0.287109375, + "learning_rate": 9.67526314254514e-07, + "loss": 0.0816, + "reward": 2.835049033164978, + "reward_std": 0.24460270255804062, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.048631489276885986, + "rewards/tag_count_reward": 0.9947916865348816, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 527.9166717529297, - "epoch": 0.42, - "grad_norm": 9.224829985446288, - "kl": 1.548828125, - "learning_rate": 7.472670160550848e-07, - "loss": 0.1764, - "reward": 2.311278820037842, - "reward_std": 0.4501464366912842, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.023790807463228703, - "rewards/tag_count_reward": 0.953125, + "completion_length": 541.2083435058594, + "epoch": 0.21, + "grad_norm": 20.48316004254315, + "kl": 0.9375, + "learning_rate": 9.672327345550543e-07, + "loss": 0.2865, + "reward": 2.627635359764099, + "reward_std": 0.5200231969356537, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.023406220600008965, + "rewards/tag_count_reward": 0.9010416865348816, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 707.0208435058594, - "epoch": 0.421, - "grad_norm": 9.815342189617178, - "kl": 3.53125, - "learning_rate": 7.458539947175473e-07, - "loss": 0.7108, - "reward": 2.3707098960876465, - "reward_std": 0.5124354809522629, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.019915098324418068, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 564.5833435058594, + "epoch": 0.2105, + "grad_norm": 12.49478929221166, + "kl": 1.31640625, + "learning_rate": 9.669378838923267e-07, + "loss": 0.4444, + "reward": 2.715390205383301, + "reward_std": 0.4231880307197571, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.03460977412760258, + "rewards/tag_count_reward": 0.8958333432674408, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 500.8750305175781, - "epoch": 0.422, - "grad_norm": 5.188831712175436, - "kl": 1.2841796875, - "learning_rate": 7.444385869608921e-07, - "loss": 0.2224, - "reward": 2.582339286804199, - "reward_std": 0.25641510635614395, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02529969811439514, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 962.4583435058594, + "epoch": 0.211, + "grad_norm": 23.747666802030537, + "kl": 3.1328125, + "learning_rate": 9.666417631644976e-07, + "loss": 0.4393, + "reward": 2.1522072553634644, + "reward_std": 0.5904508531093597, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02487611025571823, + "rewards/tag_count_reward": 0.7812500298023224, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 470.8333435058594, - "epoch": 0.423, - "grad_norm": 8.589041831141525, - "kl": 0.720703125, - "learning_rate": 7.430208100314156e-07, - "loss": 0.1102, - "reward": 2.710915446281433, - "reward_std": 0.390904039144516, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.026931931264698505, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 1850.354248046875, + "epoch": 0.2115, + "grad_norm": 22.007156665570427, + "kl": 4.5625, + "learning_rate": 9.66344373273602e-07, + "loss": 0.3166, + "reward": 1.7171184420585632, + "reward_std": 0.5808897018432617, + "rewards/accuracy_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0033677266910672188, + "rewards/tag_count_reward": 0.5885416716337204, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 489.8333435058594, - "epoch": 0.424, - "grad_norm": 10.226503973140625, - "kl": 1.58984375, - "learning_rate": 7.416006812042827e-07, - "loss": 0.3074, - "reward": 2.462380290031433, - "reward_std": 0.39044664800167084, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016786448657512665, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 1172.25, + "epoch": 0.212, + "grad_norm": 13.046087807963628, + "kl": 3.3359375, + "learning_rate": 9.66045715125541e-07, + "loss": 0.5982, + "reward": 2.00005042552948, + "reward_std": 0.6750081181526184, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.0121024283580482, + "rewards/tag_count_reward": 0.6927083432674408, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 642.6666870117188, - "epoch": 0.425, - "grad_norm": 14.863563634108994, - "kl": 4.3671875, - "learning_rate": 7.401782177833147e-07, - "loss": 0.7175, - "reward": 2.3808945417404175, - "reward_std": 0.6382828205823898, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.03750831447541714, - "rewards/tag_count_reward": 0.890625, + "completion_length": 777.1666870117188, + "epoch": 0.2125, + "grad_norm": 8.697075914848647, + "kl": 1.41015625, + "learning_rate": 9.657457896300791e-07, + "loss": 0.4061, + "reward": 2.4428231716156006, + "reward_std": 0.44499611109495163, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03634357824921608, + "rewards/tag_count_reward": 0.8541666865348816, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 619.1666870117188, - "epoch": 0.426, - "grad_norm": 10.866155494400388, - "kl": 2.935546875, - "learning_rate": 7.387534371007797e-07, - "loss": 0.3152, - "reward": 2.6118627786636353, - "reward_std": 0.29477719962596893, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.028762279078364372, - "rewards/tag_count_reward": 0.9114583730697632, + "completion_length": 629.6875152587891, + "epoch": 0.213, + "grad_norm": 12.127551785913408, + "kl": 1.099609375, + "learning_rate": 9.654445977008414e-07, + "loss": 0.4644, + "reward": 2.2689449787139893, + "reward_std": 0.3217324912548065, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.026193935424089432, + "rewards/tag_count_reward": 0.9062500298023224, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 638.8333435058594, - "epoch": 0.427, - "grad_norm": 11.888028322568847, - "kl": 3.5625, - "learning_rate": 7.373263565171805e-07, - "loss": 0.3712, - "reward": 2.460385322570801, - "reward_std": 0.48997509479522705, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.025725997984409332, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 435.3958435058594, + "epoch": 0.2135, + "grad_norm": 14.647819824321752, + "kl": 0.37890625, + "learning_rate": 9.651421402553108e-07, + "loss": 0.2392, + "reward": 2.611443519592285, + "reward_std": 0.21422222256660461, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04654255323112011, + "rewards/tag_count_reward": 0.9635416865348816, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 579.4375, - "epoch": 0.428, - "grad_norm": 12.847512777948014, - "kl": 2.53515625, - "learning_rate": 7.358969934210438e-07, - "loss": 0.6338, - "reward": 2.6187379360198975, - "reward_std": 0.53090900182724, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.02362314984202385, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 741.8125, + "epoch": 0.214, + "grad_norm": 6.865030392191665, + "kl": 1.41015625, + "learning_rate": 9.648384182148252e-07, + "loss": 0.3917, + "reward": 2.2415451407432556, + "reward_std": 0.5755177438259125, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03970504552125931, + "rewards/tag_count_reward": 0.8229166865348816, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 483.70835876464844, - "epoch": 0.429, - "grad_norm": 11.418979135114895, - "kl": 1.544921875, - "learning_rate": 7.344653652287077e-07, - "loss": 0.2723, - "reward": 2.708911895751953, - "reward_std": 0.45899534225463867, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.02199092786759138, - "rewards/tag_count_reward": 0.953125, + "completion_length": 539.2083435058594, + "epoch": 0.2145, + "grad_norm": 4.7239397954996205, + "kl": 0.71875, + "learning_rate": 9.645334325045745e-07, + "loss": 0.1712, + "reward": 2.2595887184143066, + "reward_std": 0.39921020716428757, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.04423084668815136, + "rewards/tag_count_reward": 0.9010416865348816, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 705.2083435058594, - "epoch": 0.43, - "grad_norm": 25.61496750803982, - "kl": 5.7578125, - "learning_rate": 7.330314893841101e-07, - "loss": 0.6231, - "reward": 2.4273844957351685, - "reward_std": 0.5663270652294159, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.02400454506278038, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 687.2083435058594, + "epoch": 0.215, + "grad_norm": 5.41675055528973, + "kl": 1.40234375, + "learning_rate": 9.64227184053598e-07, + "loss": 0.5379, + "reward": 2.37734055519104, + "reward_std": 0.4606352895498276, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.05668727494776249, + "rewards/tag_count_reward": 0.8854166865348816, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 462.1666717529297, - "epoch": 0.431, - "grad_norm": 16.91868590026825, - "kl": 1.2421875, - "learning_rate": 7.315953833585755e-07, - "loss": 0.1736, - "reward": 2.821496605873108, - "reward_std": 0.40228843688964844, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.020517424680292606, - "rewards/tag_count_reward": 0.9531250298023224, - "step": 431 - }, - { + "completion_length": 852.3958435058594, + "epoch": 0.2155, + "grad_norm": 12.180552388615105, + "kl": 2.3359375, + "learning_rate": 9.63919673794782e-07, + "loss": 0.55, + "reward": 2.2885630130767822, + "reward_std": 0.5578717291355133, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03088139370083809, + "rewards/tag_count_reward": 0.8333333730697632, + "step": 431 + }, + { "clip_ratio": 0.0, - "completion_length": 564.7708435058594, - "epoch": 0.432, - "grad_norm": 10.05316608237381, - "kl": 3.078125, - "learning_rate": 7.301570646506027e-07, - "loss": 0.7108, - "reward": 2.748389720916748, - "reward_std": 0.52855084836483, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02070755325257778, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 1157.7291870117188, + "epoch": 0.216, + "grad_norm": 21.263145759520075, + "kl": 3.4140625, + "learning_rate": 9.636109026648554e-07, + "loss": 0.514, + "reward": 1.711805820465088, + "reward_std": 0.5387972891330719, + "rewards/accuracy_reward": 0.1458333358168602, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.02777751348912716, + "rewards/tag_count_reward": 0.6562500298023224, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 779.3333435058594, - "epoch": 0.433, - "grad_norm": 27.40142017382295, - "kl": 7.359375, - "learning_rate": 7.287165507856512e-07, - "loss": 0.8687, - "reward": 2.269049644470215, - "reward_std": 0.7071286737918854, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.015672642271965742, - "rewards/tag_count_reward": 0.75, + "completion_length": 740.2500305175781, + "epoch": 0.2165, + "grad_norm": 8.545418331851147, + "kl": 1.56640625, + "learning_rate": 9.633008716043892e-07, + "loss": 0.3121, + "reward": 2.3099186420440674, + "reward_std": 0.5152548253536224, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.037303626537323, + "rewards/tag_count_reward": 0.8333333730697632, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 671.7916870117188, - "epoch": 0.434, - "grad_norm": 21.75332490966816, - "kl": 4.8125, - "learning_rate": 7.27273859315928e-07, - "loss": 0.6014, - "reward": 2.483176350593567, - "reward_std": 0.5736861526966095, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.02550439164042473, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 547.2708435058594, + "epoch": 0.217, + "grad_norm": 4.67948611938176, + "kl": 1.0, + "learning_rate": 9.629895815577915e-07, + "loss": 0.358, + "reward": 2.493169069290161, + "reward_std": 0.6183354258537292, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.022456128150224686, + "rewards/tag_count_reward": 0.8906250298023224, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 847.4375305175781, - "epoch": 0.435, - "grad_norm": 27.079037368491598, - "kl": 6.625, - "learning_rate": 7.258290078201731e-07, - "loss": 0.8844, - "reward": 1.987673282623291, - "reward_std": 0.6426667273044586, - "rewards/accuracy_reward": 0.2708333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.027951689437031746, - "rewards/tag_count_reward": 0.7447916865348816, + "completion_length": 468.25, + "epoch": 0.2175, + "grad_norm": 5.989166621610532, + "kl": 0.51123046875, + "learning_rate": 9.626770334733058e-07, + "loss": 0.108, + "reward": 2.392013192176819, + "reward_std": 0.47137293219566345, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.036806222051382065, + "rewards/tag_count_reward": 0.921875, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 611.125, - "epoch": 0.436, - "grad_norm": 10.091665515824934, - "kl": 4.046875, - "learning_rate": 7.243820139034464e-07, - "loss": 0.6799, - "reward": 2.2479928731918335, - "reward_std": 0.6026458740234375, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015896069817245007, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 488.8958435058594, + "epoch": 0.218, + "grad_norm": 9.738518456979653, + "kl": 0.54296875, + "learning_rate": 9.623632283030077e-07, + "loss": 0.1873, + "reward": 2.3263269662857056, + "reward_std": 0.3005019724369049, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.05561743676662445, + "rewards/tag_count_reward": 0.8958333432674408, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 750.5833740234375, - "epoch": 0.437, - "grad_norm": 9.40390565894805, - "kl": 4.390625, - "learning_rate": 7.229328951969115e-07, - "loss": 0.7124, - "reward": 2.4180572032928467, - "reward_std": 0.6964816451072693, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01944286935031414, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 400.2083435058594, + "epoch": 0.2185, + "grad_norm": 8.158385637923423, + "kl": 0.357421875, + "learning_rate": 9.620481670028026e-07, + "loss": 0.1567, + "reward": 2.596799850463867, + "reward_std": 0.24532540142536163, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.033408566378057, + "rewards/tag_count_reward": 0.9635416865348816, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 597.7500305175781, - "epoch": 0.438, - "grad_norm": 14.732573596337566, - "kl": 1.73828125, - "learning_rate": 7.214816693576234e-07, - "loss": 0.4652, - "reward": 2.5944327116012573, - "reward_std": 0.4662973880767822, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.021886682137846947, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 425.375, + "epoch": 0.219, + "grad_norm": 5.659616408154006, + "kl": 0.5888671875, + "learning_rate": 9.617318505324212e-07, + "loss": 0.0578, + "reward": 2.552333116531372, + "reward_std": 0.32321399450302124, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04488925263285637, + "rewards/tag_count_reward": 0.9375, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 666.2083587646484, - "epoch": 0.439, - "grad_norm": 15.776670792271592, - "kl": 3.07421875, - "learning_rate": 7.200283540683102e-07, - "loss": 0.5074, - "reward": 2.3149194717407227, - "reward_std": 0.568027138710022, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.02014992106705904, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 420.0833435058594, + "epoch": 0.2195, + "grad_norm": 8.173779954105225, + "kl": 0.58984375, + "learning_rate": 9.614142798554186e-07, + "loss": 0.2351, + "reward": 2.7584837675094604, + "reward_std": 0.42886343598365784, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.040127404034137726, + "rewards/tag_count_reward": 0.9583333432674408, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 579.7500305175781, - "epoch": 0.44, - "grad_norm": 13.110846882696643, - "kl": 1.84765625, - "learning_rate": 7.185729670371604e-07, - "loss": 0.3424, - "reward": 2.5865591764450073, - "reward_std": 0.5619917958974838, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.024551907554268837, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 584.5208740234375, + "epoch": 0.22, + "grad_norm": 12.102598421917286, + "kl": 1.78515625, + "learning_rate": 9.610954559391704e-07, + "loss": 0.4177, + "reward": 2.3829479217529297, + "reward_std": 0.5593430995941162, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.06496894918382168, + "rewards/tag_count_reward": 0.8854166865348816, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 691.2916870117188, - "epoch": 0.441, - "grad_norm": 13.083680161708491, - "kl": 2.3515625, - "learning_rate": 7.171155259976057e-07, - "loss": 0.5228, - "reward": 2.60994291305542, - "reward_std": 0.5069815963506699, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.034154389053583145, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 666.3541870117188, + "epoch": 0.2205, + "grad_norm": 12.188104046012809, + "kl": 2.4296875, + "learning_rate": 9.607753797548691e-07, + "loss": 0.5983, + "reward": 2.4352097511291504, + "reward_std": 0.7042646259069443, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.052637550979852676, + "rewards/tag_count_reward": 0.8489583432674408, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 646.6041870117188, - "epoch": 0.442, - "grad_norm": 50.887150157363536, - "kl": 4.890625, - "learning_rate": 7.156560487081051e-07, - "loss": 0.9085, - "reward": 2.2685351371765137, - "reward_std": 0.7841026484966278, + "completion_length": 619.4166870117188, + "epoch": 0.221, + "grad_norm": 17.355391790641967, + "kl": 2.421875, + "learning_rate": 9.604540522775227e-07, + "loss": 0.3384, + "reward": 2.2807594537734985, + "reward_std": 0.47133713960647583, "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03354842588305473, - "rewards/tag_count_reward": 0.7812500298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.03521277289837599, + "rewards/tag_count_reward": 0.8645833730697632, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 755.8750305175781, - "epoch": 0.443, - "grad_norm": 15.956258650623388, - "kl": 5.609375, - "learning_rate": 7.141945529519288e-07, - "loss": 0.7977, - "reward": 2.418252944946289, - "reward_std": 0.683345377445221, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.027927756309509277, - "rewards/tag_count_reward": 0.828125, + "completion_length": 727.7291870117188, + "epoch": 0.2215, + "grad_norm": 19.28638169980206, + "kl": 2.24609375, + "learning_rate": 9.601314744859504e-07, + "loss": 0.692, + "reward": 2.0420679450035095, + "reward_std": 0.7488097846508026, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.8819444179534912, + "rewards/repetition_penalty_reward": -0.02737656608223915, + "rewards/tag_count_reward": 0.7708333730697632, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 743.5416870117188, - "epoch": 0.444, - "grad_norm": 24.6724782512694, - "kl": 7.890625, - "learning_rate": 7.127310565369415e-07, - "loss": 1.0148, - "reward": 2.227971076965332, - "reward_std": 0.8724583983421326, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.04980688448995352, - "rewards/tag_count_reward": 0.7500000298023224, + "completion_length": 447.93751525878906, + "epoch": 0.222, + "grad_norm": 3862503.859265399, + "kl": 2170.0, + "learning_rate": 9.598076473627796e-07, + "loss": 259.2312, + "reward": 2.4890825748443604, + "reward_std": 0.6550872325897217, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.03695919178426266, + "rewards/tag_count_reward": 0.9218750298023224, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 617.1458435058594, - "epoch": 0.445, - "grad_norm": 18.521282427965453, - "kl": 4.4140625, - "learning_rate": 7.11265577295385e-07, - "loss": 0.5287, - "reward": 2.4945348501205444, - "reward_std": 0.5981617867946625, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.01761806895956397, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 389.00001525878906, + "epoch": 0.2225, + "grad_norm": 20458711.67176828, + "kl": 37120.201171875, + "learning_rate": 9.594825718944444e-07, + "loss": 1892.2664, + "reward": 2.576767325401306, + "reward_std": 0.32869401574134827, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.8611111044883728, + "rewards/repetition_penalty_reward": -0.055177152156829834, + "rewards/tag_count_reward": 0.9583333432674408, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 732.5625305175781, - "epoch": 0.446, - "grad_norm": 34.24653385750582, - "kl": 6.484375, - "learning_rate": 7.097981330836616e-07, - "loss": 0.7675, - "reward": 1.965806484222412, - "reward_std": 0.6163901686668396, - "rewards/accuracy_reward": 0.2708333358168602, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04113807622343302, - "rewards/tag_count_reward": 0.75, + "completion_length": 375.6666717529297, + "epoch": 0.223, + "grad_norm": 6.226734549137509, + "kl": 0.50830078125, + "learning_rate": 9.59156249071181e-07, + "loss": 0.01, + "reward": 2.7551599740982056, + "reward_std": 0.20863118767738342, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.04345105215907097, + "rewards/tag_count_reward": 0.9791666865348816, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 672.2708435058594, - "epoch": 0.447, - "grad_norm": 15.152108234478831, - "kl": 5.34375, - "learning_rate": 7.083287417821157e-07, - "loss": 0.7173, - "reward": 2.316531181335449, - "reward_std": 0.5732046067714691, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.018538246862590313, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 473.45835876464844, + "epoch": 0.2235, + "grad_norm": 5.567081842645192, + "kl": 0.68359375, + "learning_rate": 9.588286798870248e-07, + "loss": 0.1501, + "reward": 2.574015259742737, + "reward_std": 0.45759186148643494, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.049248674884438515, + "rewards/tag_count_reward": 0.9427083432674408, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 730.1875305175781, - "epoch": 0.448, - "grad_norm": 9.314353876415918, - "kl": 4.265625, - "learning_rate": 7.068574212948169e-07, - "loss": 0.7717, - "reward": 2.5467538833618164, - "reward_std": 0.5365928113460541, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03137107007205486, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 493.04168701171875, + "epoch": 0.224, + "grad_norm": 4.077704475094219, + "kl": 0.634765625, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0452, + "reward": 2.5271689891815186, + "reward_std": 0.3818424344062805, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.05963664874434471, + "rewards/tag_count_reward": 0.9479166865348816, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 650.6875305175781, - "epoch": 0.449, - "grad_norm": 46.24553233658842, - "kl": 2.59375, - "learning_rate": 7.053841895493406e-07, - "loss": 0.778, - "reward": 2.618269443511963, - "reward_std": 0.6291141211986542, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.029300065711140633, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 453.5833435058594, + "epoch": 0.2245, + "grad_norm": 4.624099127542048, + "kl": 0.5810546875, + "learning_rate": 9.581698064311592e-07, + "loss": 0.0303, + "reward": 2.3410807847976685, + "reward_std": 0.4072131812572479, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.054752614349126816, + "rewards/tag_count_reward": 0.9583333432674408, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 628.1458435058594, - "epoch": 0.45, - "grad_norm": 22.01768222586756, - "kl": 1.95703125, - "learning_rate": 7.039090644965509e-07, - "loss": 0.4547, - "reward": 2.60259747505188, - "reward_std": 0.5832863450050354, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02934711705893278, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 336.81251525878906, + "epoch": 0.225, + "grad_norm": 4.480015408899521, + "kl": 0.248046875, + "learning_rate": 9.578385041664925e-07, + "loss": 0.03, + "reward": 2.6452146768569946, + "reward_std": 0.1535702757537365, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.04228539019823074, + "rewards/tag_count_reward": 1.0, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 517.1458435058594, - "epoch": 0.451, - "grad_norm": 33.402974244613176, - "kl": 1.6484375, - "learning_rate": 7.024320641103811e-07, - "loss": 0.4482, - "reward": 2.561974287033081, - "reward_std": 0.5507599115371704, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.017886925488710403, - "rewards/tag_count_reward": 0.90625, + "completion_length": 464.5833435058594, + "epoch": 0.2255, + "grad_norm": 6.047533958166858, + "kl": 0.5625, + "learning_rate": 9.575059595550127e-07, + "loss": 0.1545, + "reward": 2.5978249311447144, + "reward_std": 0.3733315169811249, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.041063896380364895, + "rewards/tag_count_reward": 0.9583333432674408, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 501.41668701171875, - "epoch": 0.452, - "grad_norm": 12.388889897731671, - "kl": 1.1015625, - "learning_rate": 7.009532063876148e-07, - "loss": -0.028, - "reward": 2.4829607009887695, - "reward_std": 0.4351721853017807, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.022247745655477047, - "rewards/tag_count_reward": 0.921875, + "completion_length": 398.43751525878906, + "epoch": 0.226, + "grad_norm": 11.490949893657614, + "kl": 0.505859375, + "learning_rate": 9.571721736097088e-07, + "loss": 0.1815, + "reward": 2.7292104959487915, + "reward_std": 0.36046914756298065, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.05898405984044075, + "rewards/tag_count_reward": 0.96875, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 663.8125305175781, - "epoch": 0.453, - "grad_norm": 8.122371694455087, - "kl": 3.46875, - "learning_rate": 6.994725093476664e-07, - "loss": 0.586, - "reward": 2.5677295923233032, - "reward_std": 0.5909285247325897, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0173398619517684, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 461.81251525878906, + "epoch": 0.2265, + "grad_norm": 6.831195619894885, + "kl": 1.2734375, + "learning_rate": 9.568371473473503e-07, + "loss": 0.2574, + "reward": 2.4620362520217896, + "reward_std": 0.4237401932477951, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03796376567333937, + "rewards/tag_count_reward": 0.9375000298023224, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 724.4375457763672, - "epoch": 0.454, - "grad_norm": 24.750328325907926, - "kl": 3.8046875, - "learning_rate": 6.979899910323624e-07, - "loss": 0.4282, - "reward": 2.418397545814514, - "reward_std": 0.5793856680393219, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.022574756294488907, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 518.1458435058594, + "epoch": 0.227, + "grad_norm": 17.14178008843787, + "kl": 2.23828125, + "learning_rate": 9.565008817884854e-07, + "loss": 0.3425, + "reward": 2.2992377281188965, + "reward_std": 0.7252504229545593, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.86111119389534, + "rewards/repetition_penalty_reward": -0.025415126234292984, + "rewards/tag_count_reward": 0.8593750298023224, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 670.1875, - "epoch": 0.455, - "grad_norm": 32.10532320367166, - "kl": 2.8515625, - "learning_rate": 6.965056695057204e-07, - "loss": 0.5623, - "reward": 2.5078253746032715, - "reward_std": 0.7500589489936829, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.04252194054424763, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 386.7083435058594, + "epoch": 0.2275, + "grad_norm": 11.113475621778726, + "kl": 1.48828125, + "learning_rate": 9.561633779574372e-07, + "loss": 0.1915, + "reward": 2.620706558227539, + "reward_std": 0.4370867908000946, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.03554350510239601, + "rewards/tag_count_reward": 0.9479166865348816, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 769.7083435058594, - "epoch": 0.456, - "grad_norm": 35.5370005747395, - "kl": 5.8203125, - "learning_rate": 6.950195628537299e-07, - "loss": 0.7239, - "reward": 2.236100912094116, - "reward_std": 0.6354574412107468, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.024315819144248962, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 441.91668701171875, + "epoch": 0.228, + "grad_norm": 4.925397785816655, + "kl": 0.98046875, + "learning_rate": 9.55824636882301e-07, + "loss": 0.3033, + "reward": 2.390235185623169, + "reward_std": 0.44205035269260406, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.042056621983647346, + "rewards/tag_count_reward": 0.9322916865348816, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 598.4166870117188, - "epoch": 0.457, - "grad_norm": 15.54748803668102, - "kl": 2.38671875, - "learning_rate": 6.935316891841315e-07, - "loss": 0.5143, - "reward": 2.461942434310913, - "reward_std": 0.5678964406251907, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02764105051755905, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 543.9583435058594, + "epoch": 0.2285, + "grad_norm": 6.065645620729275, + "kl": 0.68359375, + "learning_rate": 9.554846595949413e-07, + "loss": 0.2455, + "reward": 2.332500398159027, + "reward_std": 0.3640214204788208, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.06159696541726589, + "rewards/tag_count_reward": 0.9010416865348816, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 687.0208740234375, - "epoch": 0.458, - "grad_norm": 9.674950437917838, - "kl": 3.5078125, - "learning_rate": 6.920420666261961e-07, - "loss": 0.525, - "reward": 2.373473048210144, - "reward_std": 0.6431159377098083, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02409642282873392, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 538.5416717529297, + "epoch": 0.229, + "grad_norm": 13.405805070703696, + "kl": 0.806640625, + "learning_rate": 9.55143447130987e-07, + "loss": 0.2765, + "reward": 2.613925814628601, + "reward_std": 0.4678662419319153, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.04926872253417969, + "rewards/tag_count_reward": 0.9270833432674408, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 597.1666870117188, - "epoch": 0.459, - "grad_norm": 11.267949423552796, - "kl": 2.6328125, - "learning_rate": 6.905507133305047e-07, - "loss": 0.5289, - "reward": 2.4548171758651733, - "reward_std": 0.5164096057415009, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.017405156511813402, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 684.75, + "epoch": 0.2295, + "grad_norm": 5.353993038328121, + "kl": 1.541015625, + "learning_rate": 9.54801000529831e-07, + "loss": 0.3747, + "reward": 2.072951376438141, + "reward_std": 0.6868791580200195, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.8402778506278992, + "rewards/repetition_penalty_reward": -0.03295143134891987, + "rewards/tag_count_reward": 0.8072916865348816, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 624.125, - "epoch": 0.46, - "grad_norm": 11.88684382845764, - "kl": 3.703125, - "learning_rate": 6.890576474687263e-07, - "loss": 0.6898, - "reward": 2.5472575426101685, - "reward_std": 0.6651158332824707, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.016978577245026827, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 697.5416870117188, + "epoch": 0.23, + "grad_norm": 6.2141087927742555, + "kl": 1.39453125, + "learning_rate": 9.54457320834625e-07, + "loss": 0.5803, + "reward": 2.205751061439514, + "reward_std": 0.5500113666057587, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.018207357730716467, + "rewards/tag_count_reward": 0.8489583432674408, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 752.25, - "epoch": 0.461, - "grad_norm": 14.448078905297072, - "kl": 5.765625, - "learning_rate": 6.875628872333975e-07, - "loss": 0.7562, - "reward": 2.2633214592933655, - "reward_std": 0.6216214001178741, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02660918142646551, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 374.7708435058594, + "epoch": 0.2305, + "grad_norm": 4.32996026619283, + "kl": 0.51171875, + "learning_rate": 9.54112409092277e-07, + "loss": 0.1318, + "reward": 2.5193281173706055, + "reward_std": 0.2269902043044567, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03969983570277691, + "rewards/tag_count_reward": 0.96875, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 708.3958740234375, - "epoch": 0.462, - "grad_norm": 20.449164875872643, - "kl": 7.078125, - "learning_rate": 6.860664508377001e-07, - "loss": 0.7331, - "reward": 2.1556172370910645, - "reward_std": 0.7438893914222717, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01799399685114622, - "rewards/tag_count_reward": 0.7291666865348816, + "completion_length": 522.4791870117188, + "epoch": 0.231, + "grad_norm": 6.138638623677164, + "kl": 0.767578125, + "learning_rate": 9.537662663534477e-07, + "loss": 0.2944, + "reward": 2.0977752208709717, + "reward_std": 0.47963058948516846, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.06368312053382397, + "rewards/tag_count_reward": 0.9322916865348816, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 650.5000305175781, - "epoch": 0.463, - "grad_norm": 14.487854173063429, - "kl": 4.7265625, - "learning_rate": 6.84568356515239e-07, - "loss": 0.818, - "reward": 2.314304828643799, - "reward_std": 0.6960805356502533, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.025973046198487282, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 549.1666870117188, + "epoch": 0.2315, + "grad_norm": 4.983489188907526, + "kl": 0.802734375, + "learning_rate": 9.534188936725483e-07, + "loss": 0.209, + "reward": 2.581421971321106, + "reward_std": 0.44028259813785553, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03316138405352831, + "rewards/tag_count_reward": 0.9270833432674408, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 727.3125, - "epoch": 0.464, - "grad_norm": 26.696071790504657, - "kl": 7.0625, - "learning_rate": 6.83068622519821e-07, - "loss": 0.8028, - "reward": 2.3517472743988037, - "reward_std": 0.6738306879997253, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02498907968401909, - "rewards/tag_count_reward": 0.7864583432674408, + "completion_length": 517.0000152587891, + "epoch": 0.232, + "grad_norm": 519611.1496287819, + "kl": 284.578125, + "learning_rate": 9.530702921077358e-07, + "loss": 46.5488, + "reward": 2.31371533870697, + "reward_std": 0.47075986862182617, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.028298573568463326, + "rewards/tag_count_reward": 0.9322916865348816, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 644.5000305175781, - "epoch": 0.465, - "grad_norm": 13.62258323479659, - "kl": 3.65625, - "learning_rate": 6.815672671252315e-07, - "loss": 0.8709, - "reward": 2.5122939348220825, - "reward_std": 0.6799641847610474, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0224284203723073, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 600.5208587646484, + "epoch": 0.2325, + "grad_norm": 17.962574871609352, + "kl": 1.216796875, + "learning_rate": 9.527204627209112e-07, + "loss": 0.357, + "reward": 2.464834451675415, + "reward_std": 0.5079529285430908, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.07162389159202576, + "rewards/tag_count_reward": 0.9114583432674408, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 696.3125305175781, - "epoch": 0.466, - "grad_norm": 16.003897480590176, - "kl": 4.6328125, - "learning_rate": 6.800643086250121e-07, - "loss": 0.8121, - "reward": 2.034106969833374, - "reward_std": 0.537964329123497, - "rewards/accuracy_reward": 0.2500000074505806, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.021448652260005474, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 338.62501525878906, + "epoch": 0.233, + "grad_norm": 14.124197845323842, + "kl": 0.435546875, + "learning_rate": 9.523694065777156e-07, + "loss": 0.3118, + "reward": 2.8386720418930054, + "reward_std": 0.370292603969574, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.020702947862446308, + "rewards/tag_count_reward": 0.984375, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 834.3333740234375, - "epoch": 0.467, - "grad_norm": 78.01009343549657, - "kl": 9.53125, - "learning_rate": 6.78559765332238e-07, - "loss": 0.9044, - "reward": 1.9304699301719666, - "reward_std": 0.6387233734130859, - "rewards/accuracy_reward": 0.3125, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02786353789269924, - "rewards/tag_count_reward": 0.6666666865348816, + "completion_length": 639.2708740234375, + "epoch": 0.2335, + "grad_norm": 22.217309088723397, + "kl": 2.3359375, + "learning_rate": 9.520171247475268e-07, + "loss": 0.5155, + "reward": 2.592587947845459, + "reward_std": 0.3498579263687134, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.0410926416516304, + "rewards/tag_count_reward": 0.9114583432674408, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 653.5000305175781, - "epoch": 0.468, - "grad_norm": 26.98679371952424, - "kl": 4.2109375, - "learning_rate": 6.770536555792944e-07, - "loss": 0.8219, - "reward": 2.4220420122146606, - "reward_std": 0.6534742116928101, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018930173479020596, - "rewards/tag_count_reward": 0.84375, + "completion_length": 587.0208435058594, + "epoch": 0.234, + "grad_norm": 26.970069655903107, + "kl": 1.96484375, + "learning_rate": 9.516636183034564e-07, + "loss": 0.4617, + "reward": 2.5388882160186768, + "reward_std": 0.5101897567510605, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02882016822695732, + "rewards/tag_count_reward": 0.921875, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 627.3750305175781, - "epoch": 0.469, - "grad_norm": 149.59812362533228, - "kl": 6.53125, - "learning_rate": 6.755459977176532e-07, - "loss": 0.8048, - "reward": 2.1985827684402466, - "reward_std": 0.603963702917099, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03405630309134722, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 551.1666870117188, + "epoch": 0.2345, + "grad_norm": 10.814287456638, + "kl": 1.375, + "learning_rate": 9.513088883223463e-07, + "loss": 0.3882, + "reward": 2.5434218645095825, + "reward_std": 0.49620410799980164, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.03817532956600189, + "rewards/tag_count_reward": 0.9218750298023224, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 470.20835876464844, - "epoch": 0.47, - "grad_norm": 17.189852933380237, - "kl": 1.8984375, - "learning_rate": 6.740368101176495e-07, - "loss": 0.2061, - "reward": 2.4457980394363403, - "reward_std": 0.44828473031520844, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.01774365920573473, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 476.70835876464844, + "epoch": 0.235, + "grad_norm": 15.36470721622306, + "kl": 1.408203125, + "learning_rate": 9.509529358847654e-07, + "loss": 0.2803, + "reward": 2.6564310789108276, + "reward_std": 0.321887843310833, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.024124516174197197, + "rewards/tag_count_reward": 0.9166666865348816, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 547.5625, - "epoch": 0.471, - "grad_norm": 15.50803120857332, - "kl": 2.32421875, - "learning_rate": 6.725261111682584e-07, - "loss": 0.4077, - "reward": 2.327059745788574, - "reward_std": 0.5171791017055511, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.023634711280465126, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 579.4791717529297, + "epoch": 0.2355, + "grad_norm": 26.432404929557944, + "kl": 1.341796875, + "learning_rate": 9.505957620750069e-07, + "loss": 0.3973, + "reward": 2.486717700958252, + "reward_std": 0.4216475263237953, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.05494917556643486, + "rewards/tag_count_reward": 0.8958333432674408, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 642.6666870117188, - "epoch": 0.472, - "grad_norm": 21.870962603035192, - "kl": 3.46875, - "learning_rate": 6.710139192768694e-07, - "loss": 0.7413, - "reward": 2.383070707321167, - "reward_std": 0.5808857977390289, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02838774584233761, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 638.1666870117188, + "epoch": 0.236, + "grad_norm": 34.94887870173885, + "kl": 1.388671875, + "learning_rate": 9.502373679810839e-07, + "loss": 0.5501, + "reward": 2.225524663925171, + "reward_std": 0.6959312558174133, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.05225317180156708, + "rewards/tag_count_reward": 0.8958333432674408, "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 642.8958435058594, - "epoch": 0.473, - "grad_norm": 10.842744450293143, - "kl": 3.8203125, - "learning_rate": 6.695002528690639e-07, - "loss": 0.6264, - "reward": 2.4516608715057373, - "reward_std": 0.5648539513349533, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.024033674970269203, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 379.0416717529297, + "epoch": 0.2365, + "grad_norm": 5.72045110026632, + "kl": 0.39453125, + "learning_rate": 9.49877754694727e-07, + "loss": -0.0209, + "reward": 2.2306947112083435, + "reward_std": 0.19987352192401886, + "rewards/accuracy_reward": 0.29166667722165585, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.027985903434455395, + "rewards/tag_count_reward": 0.9739583432674408, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 724.2708435058594, - "epoch": 0.474, - "grad_norm": 29.595685417395913, - "kl": 5.6640625, - "learning_rate": 6.679851303883891e-07, - "loss": 0.4416, - "reward": 2.2453513741493225, - "reward_std": 0.5234881341457367, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.020273788832128048, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 407.0833435058594, + "epoch": 0.237, + "grad_norm": 32.792554292092795, + "kl": 0.365234375, + "learning_rate": 9.495169233113806e-07, + "loss": 0.1724, + "reward": 2.5822925567626953, + "reward_std": 0.33261267840862274, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.061804771423339844, + "rewards/tag_count_reward": 0.984375, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 648.6458435058594, - "epoch": 0.475, - "grad_norm": 12.791295320250482, - "kl": 4.8125, - "learning_rate": 6.664685702961344e-07, - "loss": 0.6559, - "reward": 2.5282609462738037, - "reward_std": 0.6692875623703003, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.030766917392611504, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 486.6458435058594, + "epoch": 0.2375, + "grad_norm": 74.12575189964451, + "kl": 1.287109375, + "learning_rate": 9.491548749301997e-07, + "loss": 0.4262, + "reward": 2.6630338430404663, + "reward_std": 0.4112485349178314, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.07134135626256466, + "rewards/tag_count_reward": 0.9635416865348816, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 548.0833435058594, - "epoch": 0.476, - "grad_norm": 14.816045187511754, - "kl": 2.8046875, - "learning_rate": 6.649505910711058e-07, - "loss": 0.4964, - "reward": 2.352592945098877, - "reward_std": 0.5124609172344208, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.03282391466200352, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 404.4375, + "epoch": 0.238, + "grad_norm": 18.64237541974552, + "kl": 0.95751953125, + "learning_rate": 9.487916106540465e-07, + "loss": 0.2883, + "reward": 2.847753882408142, + "reward_std": 0.25422170013189316, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05676013603806496, + "rewards/tag_count_reward": 0.9739583432674408, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 653.0208435058594, - "epoch": 0.477, - "grad_norm": 17.711933807676367, - "kl": 4.5625, - "learning_rate": 6.634312112094013e-07, - "loss": 0.5825, - "reward": 2.517646074295044, - "reward_std": 0.46930187940597534, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.020548363216221333, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 495.93751525878906, + "epoch": 0.2385, + "grad_norm": 528.7163050198875, + "kl": 15.3125, + "learning_rate": 9.484271315894871e-07, + "loss": 0.9612, + "reward": 2.6119762659072876, + "reward_std": 0.4331580549478531, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.04427382908761501, + "rewards/tag_count_reward": 0.9479166865348816, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 569.8541870117188, - "epoch": 0.478, - "grad_norm": 15.060185969581552, - "kl": 2.9296875, - "learning_rate": 6.619104492241847e-07, - "loss": 0.6103, - "reward": 2.419161558151245, - "reward_std": 0.5570693910121918, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01833842322230339, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 383.9375, + "epoch": 0.239, + "grad_norm": 132.3126145140169, + "kl": 3.9453125, + "learning_rate": 9.480614388467877e-07, + "loss": 0.2224, + "reward": 2.751030921936035, + "reward_std": 0.23816969990730286, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.047580257058143616, + "rewards/tag_count_reward": 1.0, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 820.8541870117188, - "epoch": 0.479, - "grad_norm": 67.25826820013505, - "kl": 5.484375, - "learning_rate": 6.603883236454612e-07, - "loss": 0.759, - "reward": 2.107418715953827, - "reward_std": 0.7182769775390625, - "rewards/accuracy_reward": 0.4375000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.050567377358675, - "rewards/tag_count_reward": 0.7343750298023224, + "completion_length": 376.375, + "epoch": 0.2395, + "grad_norm": 44.2595079452565, + "kl": 0.7900390625, + "learning_rate": 9.47694533539912e-07, + "loss": 0.2293, + "reward": 2.831836700439453, + "reward_std": 0.23659071326255798, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04142730962485075, + "rewards/tag_count_reward": 0.9843750298023224, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 534.2291870117188, - "epoch": 0.48, - "grad_norm": 19.548359681394896, - "kl": 2.03515625, - "learning_rate": 6.588648530198504e-07, - "loss": 0.5902, - "reward": 2.236539840698242, - "reward_std": 0.47467684745788574, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.03082139603793621, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 381.4583435058594, + "epoch": 0.24, + "grad_norm": 5.3725546070452115, + "kl": 0.3857421875, + "learning_rate": 9.473264167865171e-07, + "loss": 0.1157, + "reward": 2.626276731491089, + "reward_std": 0.19565383344888687, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9097222089767456, + "rewards/repetition_penalty_reward": -0.05427892133593559, + "rewards/tag_count_reward": 1.0, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 557.5833587646484, - "epoch": 0.481, - "grad_norm": 27.83836565318464, - "kl": 2.5, - "learning_rate": 6.573400559103613e-07, - "loss": 0.6866, - "reward": 2.6256160736083984, - "reward_std": 0.6559298038482666, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.018481258302927017, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 394.4166717529297, + "epoch": 0.2405, + "grad_norm": 17.822823313036796, + "kl": 0.291015625, + "learning_rate": 9.469570897079504e-07, + "loss": 0.1667, + "reward": 2.8159937858581543, + "reward_std": 0.23275446519255638, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.036437010392546654, + "rewards/tag_count_reward": 0.9635416865348816, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 544.4791870117188, - "epoch": 0.482, - "grad_norm": 16.75197360847501, - "kl": 2.359375, - "learning_rate": 6.558139508961654e-07, - "loss": 0.4691, - "reward": 2.4990181922912598, - "reward_std": 0.5695154368877411, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02528748568147421, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 494.6250305175781, + "epoch": 0.241, + "grad_norm": 37.12601936582567, + "kl": 0.4287109375, + "learning_rate": 9.465865534292464e-07, + "loss": 0.3572, + "reward": 2.4508873224258423, + "reward_std": 0.49121496081352234, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.06300175003707409, + "rewards/tag_count_reward": 0.9583333432674408, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 513.3125152587891, - "epoch": 0.483, - "grad_norm": 11.150322642215151, - "kl": 3.21875, - "learning_rate": 6.542865565723707e-07, - "loss": 0.6304, - "reward": 2.340551257133484, - "reward_std": 0.5843985080718994, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.02229604311287403, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 472.6875, + "epoch": 0.2415, + "grad_norm": 17.990973842263713, + "kl": 0.3642578125, + "learning_rate": 9.462148090791228e-07, + "loss": 0.0626, + "reward": 2.607269763946533, + "reward_std": 0.3627200424671173, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666269302368, + "rewards/repetition_penalty_reward": -0.08543861098587513, + "rewards/tag_count_reward": 0.9635416865348816, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 555.0, - "epoch": 0.484, - "grad_norm": 25.161999743535905, - "kl": 4.7734375, - "learning_rate": 6.527578915497951e-07, - "loss": 0.7346, - "reward": 2.534891724586487, - "reward_std": 0.6275463998317719, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.029344591312110424, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 348.5625, + "epoch": 0.242, + "grad_norm": 3.9182155917466623, + "kl": 0.2919921875, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0877, + "reward": 2.6882437467575073, + "reward_std": 0.4514298141002655, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.05828405171632767, + "rewards/tag_count_reward": 0.9895833730697632, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 780.125, - "epoch": 0.485, - "grad_norm": 51.74758202769363, - "kl": 10.09375, - "learning_rate": 6.512279744547392e-07, - "loss": 0.9894, - "reward": 1.9237075448036194, - "reward_std": 0.6222628057003021, - "rewards/accuracy_reward": 0.2708333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.01726479548960924, - "rewards/tag_count_reward": 0.6979166865348816, + "completion_length": 449.2916717529297, + "epoch": 0.2425, + "grad_norm": 25.86980665658515, + "kl": 0.513671875, + "learning_rate": 9.454677006978842e-07, + "loss": 0.2189, + "reward": 2.6299872398376465, + "reward_std": 0.3183176666498184, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07834616675972939, + "rewards/tag_count_reward": 0.9791666865348816, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 694.1875305175781, - "epoch": 0.486, - "grad_norm": 26.567315708037814, - "kl": 5.890625, - "learning_rate": 6.496968239287603e-07, - "loss": 0.7402, - "reward": 2.210293173789978, - "reward_std": 0.7222527861595154, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02060973085463047, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 400.9583435058594, + "epoch": 0.243, + "grad_norm": 5.7429495684505, + "kl": 1.07421875, + "learning_rate": 9.450923389425911e-07, + "loss": 0.2514, + "reward": 2.870607614517212, + "reward_std": 0.19876050017774105, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05647587403655052, + "rewards/tag_count_reward": 0.9895833432674408, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 701.7291870117188, - "epoch": 0.487, - "grad_norm": 23.60086720039633, - "kl": 5.703125, - "learning_rate": 6.481644586284442e-07, - "loss": 0.8466, - "reward": 2.353806734085083, - "reward_std": 0.6038880944252014, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03334605507552624, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 434.25001525878906, + "epoch": 0.2435, + "grad_norm": 13.342575140209078, + "kl": 0.84912109375, + "learning_rate": 9.44715773667515e-07, + "loss": 0.2413, + "reward": 2.700625419616699, + "reward_std": 0.31798748672008514, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.06673579290509224, + "rewards/tag_count_reward": 0.9895833432674408, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 748.4583435058594, - "epoch": 0.488, - "grad_norm": 19.03851418441834, - "kl": 6.2734375, - "learning_rate": 6.466308972251785e-07, - "loss": 1.0066, - "reward": 2.5121508836746216, - "reward_std": 0.7225759625434875, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.038196416571736336, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 322.5, + "epoch": 0.244, + "grad_norm": 5.343676568554896, + "kl": 0.31591796875, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0725, + "reward": 2.9111135005950928, + "reward_std": 0.15393263846635818, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9722221791744232, + "rewards/repetition_penalty_reward": -0.03506710007786751, + "rewards/tag_count_reward": 0.9947916865348816, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 558.0416870117188, - "epoch": 0.489, - "grad_norm": 12.839961288051784, - "kl": 3.6796875, - "learning_rate": 6.45096158404925e-07, - "loss": 0.7412, - "reward": 2.5207144021987915, - "reward_std": 0.6570230424404144, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.015743907541036606, - "rewards/tag_count_reward": 0.890625, + "completion_length": 386.1458435058594, + "epoch": 0.2445, + "grad_norm": 5.09132953467604, + "kl": 0.3408203125, + "learning_rate": 9.43959037150008e-07, + "loss": 0.0758, + "reward": 2.8277556896209717, + "reward_std": 0.2439712956547737, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.06286950036883354, + "rewards/tag_count_reward": 0.9947916865348816, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 594.1666870117188, - "epoch": 0.49, - "grad_norm": 11.918218758688853, - "kl": 4.0390625, - "learning_rate": 6.435602608679916e-07, - "loss": 0.5896, - "reward": 2.576896905899048, - "reward_std": 0.6758164465427399, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.018589303828775883, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 448.2916717529297, + "epoch": 0.245, + "grad_norm": 23.327613224213003, + "kl": 1.8515625, + "learning_rate": 9.43578868212728e-07, + "loss": 0.2664, + "reward": 2.1128222346305847, + "reward_std": 0.39175570011138916, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.03821949101984501, + "rewards/tag_count_reward": 0.9427083432674408, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 531.25, - "epoch": 0.491, - "grad_norm": 12.572669494859374, - "kl": 4.7421875, - "learning_rate": 6.420232233288055e-07, - "loss": 0.4827, - "reward": 2.256177067756653, - "reward_std": 0.543413519859314, - "rewards/accuracy_reward": 0.4375000223517418, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.01639253133907914, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 404.3958435058594, + "epoch": 0.2455, + "grad_norm": 6.480996990735909, + "kl": 0.669921875, + "learning_rate": 9.431975003659594e-07, + "loss": 0.1923, + "reward": 2.416967749595642, + "reward_std": 0.4523526728153229, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.044837912544608116, + "rewards/tag_count_reward": 0.96875, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 400.9375, - "epoch": 0.492, - "grad_norm": 25.196826952891467, - "kl": 1.765625, - "learning_rate": 6.404850645156841e-07, - "loss": 0.3313, - "reward": 2.747916102409363, - "reward_std": 0.490363210439682, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.022917456924915314, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 474.22918701171875, + "epoch": 0.246, + "grad_norm": 17.872104813321265, + "kl": 2.296875, + "learning_rate": 9.428149347714143e-07, + "loss": 0.3238, + "reward": 2.4794113636016846, + "reward_std": 0.3524549901485443, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.05531090311706066, + "rewards/tag_count_reward": 0.9375, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 536.6458435058594, - "epoch": 0.493, - "grad_norm": 12.14349711767654, - "kl": 2.984375, - "learning_rate": 6.389458031706068e-07, - "loss": 0.5739, - "reward": 2.6953213214874268, - "reward_std": 0.5101250112056732, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.026901046745479107, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 429.5625305175781, + "epoch": 0.2465, + "grad_norm": 25.50926751724998, + "kl": 2.3203125, + "learning_rate": 9.424311725944543e-07, + "loss": 0.3623, + "reward": 2.5525494813919067, + "reward_std": 0.40840910375118256, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.037728333845734596, + "rewards/tag_count_reward": 0.9583333730697632, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 399.7083435058594, - "epoch": 0.494, - "grad_norm": 19.752623374452337, - "kl": 2.17578125, - "learning_rate": 6.374054580489873e-07, - "loss": 0.1944, - "reward": 2.5966198444366455, - "reward_std": 0.4636584371328354, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02317184768617153, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 353.12501525878906, + "epoch": 0.247, + "grad_norm": 5.9043836446965186, + "kl": 0.50390625, + "learning_rate": 9.420462150040852e-07, + "loss": 0.0674, + "reward": 2.6686885356903076, + "reward_std": 0.2424444481730461, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0465893279761076, + "rewards/tag_count_reward": 1.0, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 465.91668701171875, - "epoch": 0.495, - "grad_norm": 14.16827974955118, - "kl": 1.6328125, - "learning_rate": 6.358640479194451e-07, - "loss": 0.3459, - "reward": 2.6907471418380737, - "reward_std": 0.4236067831516266, + "completion_length": 375.7708435058594, + "epoch": 0.2475, + "grad_norm": 8.837736782220068, + "kl": 0.705078125, + "learning_rate": 9.416600631729548e-07, + "loss": 0.1302, + "reward": 2.721538782119751, + "reward_std": 0.13726279512047768, "rewards/accuracy_reward": 0.7916666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0332112442702055, - "rewards/tag_count_reward": 0.9322916865348816, + "rewards/repetition_penalty_reward": -0.0597112700343132, + "rewards/tag_count_reward": 0.9895833730697632, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 587.9791870117188, - "epoch": 0.496, - "grad_norm": 28.995423975163437, - "kl": 3.40625, - "learning_rate": 6.343215915635761e-07, - "loss": 0.5617, - "reward": 2.512778878211975, - "reward_std": 0.5804081857204437, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.028887868858873844, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 385.7708435058594, + "epoch": 0.248, + "grad_norm": 11.405430982188452, + "kl": 1.044921875, + "learning_rate": 9.412727182773486e-07, + "loss": 0.201, + "reward": 2.7786643505096436, + "reward_std": 0.3927687704563141, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.03730800375342369, + "rewards/tag_count_reward": 0.96875, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 411.7083435058594, - "epoch": 0.497, - "grad_norm": 14.922976729616561, - "kl": 2.7109375, - "learning_rate": 6.327781077757241e-07, - "loss": 0.3736, - "reward": 2.570963978767395, - "reward_std": 0.578457772731781, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.019313913770020008, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 473.85418701171875, + "epoch": 0.2485, + "grad_norm": 11.676094147579299, + "kl": 1.74609375, + "learning_rate": 9.408841814971861e-07, + "loss": 0.3357, + "reward": 2.485311985015869, + "reward_std": 0.4508149325847626, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.045938010327517986, + "rewards/tag_count_reward": 0.8854166865348816, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 570.1875, - "epoch": 0.498, - "grad_norm": 12.22816448398453, - "kl": 3.7890625, - "learning_rate": 6.31233615362752e-07, - "loss": 0.7994, - "reward": 2.4899239540100098, - "reward_std": 0.45974400639533997, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.0222289408557117, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 342.25001525878906, + "epoch": 0.249, + "grad_norm": 6.336940055320027, + "kl": 0.8125, + "learning_rate": 9.404944540160177e-07, + "loss": 0.0604, + "reward": 2.6701611280441284, + "reward_std": 0.355712890625, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.055533334612846375, + "rewards/tag_count_reward": 0.9687500298023224, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 541.2083435058594, - "epoch": 0.499, - "grad_norm": 24.07992840689996, - "kl": 3.2890625, - "learning_rate": 6.296881331438126e-07, - "loss": 0.6263, - "reward": 2.4488918781280518, - "reward_std": 0.6255036890506744, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02680276893079281, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 369.79168701171875, + "epoch": 0.2495, + "grad_norm": 18.051875973959618, + "kl": 1.072265625, + "learning_rate": 9.401035370210212e-07, + "loss": 0.2629, + "reward": 2.47028648853302, + "reward_std": 0.44121938943862915, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.05401917174458504, + "rewards/tag_count_reward": 0.9479166865348816, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 539.4375, - "epoch": 0.5, - "grad_norm": 13.625300381502933, - "kl": 3.3359375, - "learning_rate": 6.281416799501187e-07, - "loss": 0.3615, - "reward": 2.6264405250549316, - "reward_std": 0.5198075622320175, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01939287781715393, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 440.31251525878906, + "epoch": 0.25, + "grad_norm": 20.96572788542454, + "kl": 2.51171875, + "learning_rate": 9.397114317029974e-07, + "loss": 0.4272, + "reward": 2.3335598707199097, + "reward_std": 0.625691831111908, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.02581525407731533, + "rewards/tag_count_reward": 0.9010416865348816, "step": 500 }, { "clip_ratio": 0.0, - "completion_length": 527.1041870117188, - "epoch": 0.501, - "grad_norm": 14.175570746863878, - "kl": 4.4375, - "learning_rate": 6.265942746247146e-07, - "loss": 0.5509, - "reward": 2.34303081035614, - "reward_std": 0.5519561469554901, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.021552613005042076, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 325.9583435058594, + "epoch": 0.2505, + "grad_norm": 4.461987602762651, + "kl": 0.57421875, + "learning_rate": 9.393181392563669e-07, + "loss": 0.0403, + "reward": 2.7603014707565308, + "reward_std": 0.20193683356046677, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.03136515896767378, + "rewards/tag_count_reward": 0.9791666865348816, "step": 501 }, { "clip_ratio": 0.0, - "completion_length": 698.0625305175781, - "epoch": 0.502, - "grad_norm": 11.766159336622879, - "kl": 5.6875, - "learning_rate": 6.25045936022246e-07, - "loss": 0.9051, - "reward": 2.2460073232650757, - "reward_std": 0.7623308002948761, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.019617782905697823, - "rewards/tag_count_reward": 0.765625, + "completion_length": 325.9583435058594, + "epoch": 0.251, + "grad_norm": 3.5280127016605, + "kl": 0.2861328125, + "learning_rate": 9.38923660879167e-07, + "loss": 0.0561, + "reward": 2.8909952640533447, + "reward_std": 0.14273904263973236, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03435206413269043, + "rewards/tag_count_reward": 0.9947916865348816, "step": 502 }, { "clip_ratio": 0.0, - "completion_length": 680.4375305175781, - "epoch": 0.503, - "grad_norm": 29.510968471399966, - "kl": 7.453125, - "learning_rate": 6.2349668300873e-07, - "loss": 1.0744, - "reward": 2.1472885608673096, - "reward_std": 0.7113551497459412, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.03847557958215475, - "rewards/tag_count_reward": 0.7552083432674408, + "completion_length": 347.4583435058594, + "epoch": 0.2515, + "grad_norm": 12.53255748366457, + "kl": 1.65576171875, + "learning_rate": 9.385279977730472e-07, + "loss": 0.2711, + "reward": 2.432243227958679, + "reward_std": 0.06772075779736042, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.033034625463187695, + "rewards/tag_count_reward": 0.9791666865348816, "step": 503 }, { "clip_ratio": 0.0, - "completion_length": 477.1041717529297, - "epoch": 0.504, - "grad_norm": 14.23791373020675, - "kl": 3.9609375, - "learning_rate": 6.219465344613258e-07, - "loss": 0.6783, - "reward": 2.5481714010238647, - "reward_std": 0.5729511976242065, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.012592657934874296, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 365.0, + "epoch": 0.252, + "grad_norm": 12.118654747376015, + "kl": 1.572265625, + "learning_rate": 9.381311511432658e-07, + "loss": 0.1095, + "reward": 2.758858323097229, + "reward_std": 0.3722696304321289, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.025863975286483765, + "rewards/tag_count_reward": 0.9583333432674408, "step": 504 }, { "clip_ratio": 0.0, - "completion_length": 497.35418701171875, - "epoch": 0.505, - "grad_norm": 11.708756841965316, - "kl": 3.3359375, - "learning_rate": 6.203955092681039e-07, - "loss": 0.4639, - "reward": 2.479733467102051, - "reward_std": 0.38708843290805817, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.022002640645951033, - "rewards/tag_count_reward": 0.890625, + "completion_length": 332.56251525878906, + "epoch": 0.2525, + "grad_norm": 12.675358816220538, + "kl": 0.609375, + "learning_rate": 9.377331221986866e-07, + "loss": 0.1461, + "reward": 2.7681565284729004, + "reward_std": 0.3903198540210724, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.04087108187377453, + "rewards/tag_count_reward": 0.96875, "step": 505 }, { "clip_ratio": 0.0, - "completion_length": 681.0625305175781, - "epoch": 0.506, - "grad_norm": 18.2185049016805, - "kl": 5.5390625, - "learning_rate": 6.188436263278172e-07, - "loss": 0.7, - "reward": 2.064489424228668, - "reward_std": 0.6310321092605591, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.022316260263323784, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 366.9166717529297, + "epoch": 0.253, + "grad_norm": 8.279601199477703, + "kl": 0.5146484375, + "learning_rate": 9.373339121517746e-07, + "loss": 0.0372, + "reward": 2.770949602127075, + "reward_std": 0.34754781424999237, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.046758661046624184, + "rewards/tag_count_reward": 0.984375, "step": 506 }, { "clip_ratio": 0.0, - "completion_length": 550.75, - "epoch": 0.507, - "grad_norm": 17.7803570241784, - "kl": 3.546875, - "learning_rate": 6.172909045496694e-07, - "loss": 0.8123, - "reward": 2.2492624521255493, - "reward_std": 0.5181691646575928, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02330716885626316, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 474.625, + "epoch": 0.2535, + "grad_norm": 12.183520807470723, + "kl": 1.2109375, + "learning_rate": 9.36933522218593e-07, + "loss": 0.2878, + "reward": 2.3224886655807495, + "reward_std": 0.6696373820304871, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.026469644159078598, + "rewards/tag_count_reward": 0.9322916865348816, "step": 507 }, { "clip_ratio": 0.0, - "completion_length": 807.4375305175781, - "epoch": 0.508, - "grad_norm": 42.53879608641054, - "kl": 6.71875, - "learning_rate": 6.157373628530852e-07, - "loss": 0.9312, - "reward": 2.1224422454833984, - "reward_std": 0.8008854687213898, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.06505792587995529, - "rewards/tag_count_reward": 0.75, + "completion_length": 355.1458435058594, + "epoch": 0.254, + "grad_norm": 17.57821101350406, + "kl": 0.7890625, + "learning_rate": 9.36531953618799e-07, + "loss": 0.2123, + "reward": 2.774104595184326, + "reward_std": 0.31567811965942383, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.026242737658321857, + "rewards/tag_count_reward": 0.9739583730697632, "step": 508 }, { "clip_ratio": 0.0, - "completion_length": 703.625, - "epoch": 0.509, - "grad_norm": 38.46406571699496, - "kl": 5.875, - "learning_rate": 6.141830201674802e-07, - "loss": 1.0649, - "reward": 2.162044405937195, - "reward_std": 0.7470089793205261, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9097223281860352, - "rewards/repetition_penalty_reward": -0.04976117052137852, - "rewards/tag_count_reward": 0.78125, + "completion_length": 477.3333435058594, + "epoch": 0.2545, + "grad_norm": 20.44793363137805, + "kl": 2.091796875, + "learning_rate": 9.361292075756401e-07, + "loss": 0.4205, + "reward": 2.6159251928329468, + "reward_std": 0.4438074082136154, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03338045813143253, + "rewards/tag_count_reward": 0.9270833730697632, "step": 509 }, { "clip_ratio": 0.0, - "completion_length": 537.3541870117188, - "epoch": 0.51, - "grad_norm": 23.200007837390707, - "kl": 2.9609375, - "learning_rate": 6.126278954320294e-07, - "loss": 0.8415, - "reward": 2.5392757654190063, - "reward_std": 0.7050909399986267, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.02148817665874958, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 320.56251525878906, + "epoch": 0.255, + "grad_norm": 7.717284653235476, + "kl": 0.5712890625, + "learning_rate": 9.357252853159505e-07, + "loss": 0.0217, + "reward": 2.717986583709717, + "reward_std": 0.2879996597766876, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03895781189203262, + "rewards/tag_count_reward": 1.0, "step": 510 }, { "clip_ratio": 0.0, - "completion_length": 605.9375152587891, - "epoch": 0.511, - "grad_norm": 12.217834862993339, - "kl": 4.390625, - "learning_rate": 6.11072007595437e-07, - "loss": 0.5492, - "reward": 2.281681537628174, - "reward_std": 0.5939317345619202, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.018665821757167578, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 503.25001525878906, + "epoch": 0.2555, + "grad_norm": 23.04840840134793, + "kl": 2.5546875, + "learning_rate": 9.353201880701477e-07, + "loss": 0.7477, + "reward": 2.436654567718506, + "reward_std": 0.581574410200119, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.039039863273501396, + "rewards/tag_count_reward": 0.9270833432674408, "step": 511 }, { "clip_ratio": 0.0, - "completion_length": 659.0000305175781, - "epoch": 0.512, - "grad_norm": 11.142187913544113, - "kl": 5.203125, - "learning_rate": 6.095153756157051e-07, - "loss": 0.8536, - "reward": 2.404225468635559, - "reward_std": 0.7626966536045074, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.01591357495635748, - "rewards/tag_count_reward": 0.8020833730697632, + "completion_length": 369.75001525878906, + "epoch": 0.256, + "grad_norm": 6.140144067454673, + "kl": 1.4453125, + "learning_rate": 9.34913917072228e-07, + "loss": 0.1252, + "reward": 2.3770639896392822, + "reward_std": 0.4367944300174713, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.04654715396463871, + "rewards/tag_count_reward": 0.9791666865348816, "step": 512 }, { "clip_ratio": 0.0, - "completion_length": 482.1875, - "epoch": 0.513, - "grad_norm": 27.609945341526714, - "kl": 2.7734375, - "learning_rate": 6.079580184599032e-07, - "loss": 0.5479, - "reward": 2.167789936065674, - "reward_std": 0.3920954018831253, - "rewards/accuracy_reward": 0.27083333395421505, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.019710074178874493, + "completion_length": 412.6458435058594, + "epoch": 0.2565, + "grad_norm": 14.724218719110599, + "kl": 1.90625, + "learning_rate": 9.345064735597633e-07, + "loss": 0.2505, + "reward": 2.666745901107788, + "reward_std": 0.5124035775661469, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.04158730432391167, "rewards/tag_count_reward": 0.9375000298023224, "step": 513 }, { "clip_ratio": 0.0, - "completion_length": 602.0208435058594, - "epoch": 0.514, - "grad_norm": 21.276603242655675, - "kl": 4.65625, - "learning_rate": 6.06399955103937e-07, - "loss": 0.9838, - "reward": 2.233444333076477, - "reward_std": 0.626254141330719, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03218065481632948, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 344.0833435058594, + "epoch": 0.257, + "grad_norm": 12.147136815173782, + "kl": 2.13671875, + "learning_rate": 9.340978587738972e-07, + "loss": 0.2299, + "reward": 2.7790675163269043, + "reward_std": 0.28115659207105637, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.042113007977604866, + "rewards/tag_count_reward": 0.9739583432674408, "step": 514 }, { "clip_ratio": 0.0, - "completion_length": 548.7708587646484, - "epoch": 0.515, - "grad_norm": 8.99915341715262, - "kl": 2.984375, - "learning_rate": 6.048412045323164e-07, - "loss": 0.5247, - "reward": 2.4330430030822754, - "reward_std": 0.4966660887002945, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02702661231160164, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 465.04168701171875, + "epoch": 0.2575, + "grad_norm": 42.65752880873406, + "kl": 4.375, + "learning_rate": 9.336880739593415e-07, + "loss": 0.3814, + "reward": 2.218036413192749, + "reward_std": 0.5327357053756714, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.021547013893723488, + "rewards/tag_count_reward": 0.9270833730697632, "step": 515 }, { "clip_ratio": 0.0, - "completion_length": 607.3125305175781, - "epoch": 0.516, - "grad_norm": 12.744716558501036, - "kl": 4.21875, - "learning_rate": 6.032817857379256e-07, - "loss": 0.6478, - "reward": 2.4581196308135986, - "reward_std": 0.674341470003128, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.017574850469827652, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 355.1458435058594, + "epoch": 0.258, + "grad_norm": 12.695454347179494, + "kl": 2.56640625, + "learning_rate": 9.332771203643714e-07, + "loss": 0.3547, + "reward": 2.7579511404037476, + "reward_std": 0.3366604894399643, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023298936896026134, + "rewards/tag_count_reward": 0.9270833432674408, "step": 516 }, { "clip_ratio": 0.0, - "completion_length": 550.0833587646484, - "epoch": 0.517, - "grad_norm": 25.040001378142637, - "kl": 3.984375, - "learning_rate": 6.017217177217899e-07, - "loss": 0.8487, - "reward": 2.251497983932495, - "reward_std": 0.6527323424816132, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.015863300301134586, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 314.2708435058594, + "epoch": 0.2585, + "grad_norm": 8.577846989702975, + "kl": 0.6416015625, + "learning_rate": 9.328649992408231e-07, + "loss": 0.0241, + "reward": 2.9229161739349365, + "reward_std": 0.11198093183338642, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.042361509054899216, + "rewards/tag_count_reward": 1.0, "step": 517 }, { "clip_ratio": 0.0, - "completion_length": 413.8333435058594, - "epoch": 0.518, - "grad_norm": 25.626709792783128, - "kl": 2.48046875, - "learning_rate": 6.001610194928464e-07, - "loss": 0.4396, - "reward": 2.7058017253875732, - "reward_std": 0.5824707746505737, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.019892807584255934, - "rewards/tag_count_reward": 0.9270833730697632, + "completion_length": 386.1458435058594, + "epoch": 0.259, + "grad_norm": 17.4553779455926, + "kl": 2.1640625, + "learning_rate": 9.324517118440888e-07, + "loss": 0.4768, + "reward": 2.671400547027588, + "reward_std": 0.4989718794822693, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.02825235854834318, + "rewards/tag_count_reward": 0.9427083730697632, "step": 518 }, { "clip_ratio": 0.0, - "completion_length": 741.0833435058594, - "epoch": 0.519, - "grad_norm": 59.3566513287147, - "kl": 8.140625, - "learning_rate": 5.985997100677103e-07, - "loss": 1.4268, - "reward": 2.335380792617798, - "reward_std": 0.8169020414352417, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.03441094420850277, - "rewards/tag_count_reward": 0.7864583432674408, + "completion_length": 299.3333435058594, + "epoch": 0.2595, + "grad_norm": 11.830503067220192, + "kl": 1.0546875, + "learning_rate": 9.320372594331137e-07, + "loss": 0.0964, + "reward": 2.7533164024353027, + "reward_std": 0.3934124857187271, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03140602447092533, + "rewards/tag_count_reward": 1.0, "step": 519 }, { "clip_ratio": 0.0, - "completion_length": 692.6875, - "epoch": 0.52, - "grad_norm": 30.718619323707678, - "kl": 7.515625, - "learning_rate": 5.97037808470444e-07, - "loss": 0.9893, - "reward": 1.9578059911727905, - "reward_std": 0.5474497377872467, - "rewards/accuracy_reward": 0.2083333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.014416326768696308, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 347.8333435058594, + "epoch": 0.26, + "grad_norm": 6.457880743365504, + "kl": 0.3310546875, + "learning_rate": 9.316216432703916e-07, + "loss": -0.0572, + "reward": 2.5639514923095703, + "reward_std": 0.2705356106162071, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04021530598402023, + "rewards/tag_count_reward": 1.0, "step": 520 }, { "clip_ratio": 0.0, - "completion_length": 486.4166717529297, - "epoch": 0.521, - "grad_norm": 18.357835870055244, - "kl": 4.421875, - "learning_rate": 5.954753337323259e-07, - "loss": 0.4358, - "reward": 2.387876033782959, - "reward_std": 0.5994940996170044, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.02184621151536703, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 315.2916717529297, + "epoch": 0.2605, + "grad_norm": 6.392279281489095, + "kl": 0.775390625, + "learning_rate": 9.312048646219617e-07, + "loss": 0.1502, + "reward": 2.8109084367752075, + "reward_std": 0.21814071387052536, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03978624939918518, + "rewards/tag_count_reward": 0.96875, "step": 521 }, { "clip_ratio": 0.0, - "completion_length": 589.25, - "epoch": 0.522, - "grad_norm": 23.01587053197016, - "kl": 4.734375, - "learning_rate": 5.939123048916173e-07, - "loss": 0.7259, - "reward": 2.393532395362854, - "reward_std": 0.7403541803359985, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.028342580422759056, - "rewards/tag_count_reward": 0.8177083730697632, + "completion_length": 343.22918701171875, + "epoch": 0.261, + "grad_norm": 6.132812824609326, + "kl": 0.546875, + "learning_rate": 9.307869247574038e-07, + "loss": 0.0556, + "reward": 2.6175711154937744, + "reward_std": 0.26942718029022217, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02826231624931097, + "rewards/tag_count_reward": 1.0, "step": 522 }, { "clip_ratio": 0.0, - "completion_length": 502.4791717529297, - "epoch": 0.523, - "grad_norm": 81.64058877460842, - "kl": 4.53125, - "learning_rate": 5.923487409933315e-07, - "loss": 0.9372, - "reward": 2.3321489691734314, - "reward_std": 0.47031281888484955, - "rewards/accuracy_reward": 0.4791666679084301, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.018545696511864662, - "rewards/tag_count_reward": 0.90625, + "completion_length": 304.93751525878906, + "epoch": 0.2615, + "grad_norm": 4.409570648820208, + "kl": 0.3115234375, + "learning_rate": 9.303678249498352e-07, + "loss": 0.0109, + "reward": 2.9457967281341553, + "reward_std": 0.0824959184974432, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033369969576597214, + "rewards/tag_count_reward": 1.0, "step": 523 }, { "clip_ratio": 0.0, - "completion_length": 536.5416870117188, - "epoch": 0.524, - "grad_norm": 15.605773200090558, - "kl": 4.109375, - "learning_rate": 5.907846610890011e-07, - "loss": 0.5794, - "reward": 2.2709691524505615, - "reward_std": 0.644355833530426, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01375302067026496, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 335.1458435058594, + "epoch": 0.262, + "grad_norm": 10.6806703810238, + "kl": 0.740234375, + "learning_rate": 9.299475664759068e-07, + "loss": 0.1467, + "reward": 2.5661017894744873, + "reward_std": 0.3743235468864441, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03632892295718193, + "rewards/tag_count_reward": 0.9635416865348816, "step": 524 }, { "clip_ratio": 0.0, - "completion_length": 655.5833740234375, - "epoch": 0.525, - "grad_norm": 13.665522547941404, - "kl": 5.3125, - "learning_rate": 5.892200842364462e-07, - "loss": 0.839, - "reward": 2.3515175580978394, - "reward_std": 0.7731278240680695, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.028690868988633156, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 346.7916717529297, + "epoch": 0.2625, + "grad_norm": 7.022946103666297, + "kl": 0.4912109375, + "learning_rate": 9.295261506157985e-07, + "loss": -0.0496, + "reward": 2.487242579460144, + "reward_std": 0.4531702846288681, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.028382405638694763, + "rewards/tag_count_reward": 0.9947916865348816, "step": 525 }, { "clip_ratio": 0.0, - "completion_length": 641.8750305175781, - "epoch": 0.526, - "grad_norm": 9.492149650000716, - "kl": 3.7890625, - "learning_rate": 5.87655029499542e-07, - "loss": 0.7068, - "reward": 2.473028063774109, - "reward_std": 0.6603610515594482, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.018291576765477657, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 344.6666717529297, + "epoch": 0.263, + "grad_norm": 5.783429974481017, + "kl": 0.4072265625, + "learning_rate": 9.291035786532163e-07, + "loss": 0.0516, + "reward": 2.7101194858551025, + "reward_std": 0.01704893447458744, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03988067805767059, + "rewards/tag_count_reward": 1.0, "step": 526 }, { "clip_ratio": 0.0, - "completion_length": 498.0625, - "epoch": 0.527, - "grad_norm": 9.861486616032105, - "kl": 3.2421875, - "learning_rate": 5.860895159479864e-07, - "loss": 0.5474, - "reward": 2.4325687885284424, - "reward_std": 0.5291797816753387, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.020556333474814892, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 356.72918701171875, + "epoch": 0.2635, + "grad_norm": 5.172841688416704, + "kl": 0.6357421875, + "learning_rate": 9.286798518753878e-07, + "loss": 0.0223, + "reward": 2.5386565923690796, + "reward_std": 0.35626935213804245, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.022107405588030815, + "rewards/tag_count_reward": 0.984375, "step": 527 }, { "clip_ratio": 0.0, - "completion_length": 562.8958587646484, - "epoch": 0.528, - "grad_norm": 11.54211464068912, - "kl": 4.3984375, - "learning_rate": 5.845235626570683e-07, - "loss": 0.6108, - "reward": 2.3765957355499268, - "reward_std": 0.6778049767017365, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01229327404871583, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 335.0625, + "epoch": 0.264, + "grad_norm": 4.496157225131026, + "kl": 0.2734375, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0545, + "reward": 2.803345203399658, + "reward_std": 0.2169586569070816, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.043877096846699715, + "rewards/tag_count_reward": 1.0, "step": 528 }, { "clip_ratio": 0.0, - "completion_length": 580.6041870117188, - "epoch": 0.529, - "grad_norm": 12.470353874580706, - "kl": 4.125, - "learning_rate": 5.829571887074343e-07, - "loss": 0.5239, - "reward": 2.4933345317840576, - "reward_std": 0.509730190038681, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.02229050174355507, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 417.2708435058594, + "epoch": 0.2645, + "grad_norm": 6.658335679549266, + "kl": 0.66796875, + "learning_rate": 9.278289390404859e-07, + "loss": 0.0364, + "reward": 2.3210874795913696, + "reward_std": 0.5226413607597351, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05217655561864376, + "rewards/tag_count_reward": 0.9843750298023224, "step": 529 }, { "clip_ratio": 0.0, - "completion_length": 833.7917175292969, - "epoch": 0.53, - "grad_norm": 18.321092863583786, - "kl": 6.75, - "learning_rate": 5.813904131848564e-07, - "loss": 0.6788, - "reward": 1.967248022556305, - "reward_std": 0.6098497807979584, - "rewards/accuracy_reward": 0.3333333358168602, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.0327520202845335, - "rewards/tag_count_reward": 0.7291666865348816, + "completion_length": 357.1458435058594, + "epoch": 0.265, + "grad_norm": 3.9330369236217146, + "kl": 0.5556640625, + "learning_rate": 9.274017555754407e-07, + "loss": 0.0323, + "reward": 2.767940640449524, + "reward_std": 0.2660471647977829, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722221791744232, + "rewards/repetition_penalty_reward": -0.03240684233605862, + "rewards/tag_count_reward": 0.9739583432674408, "step": 530 }, { "clip_ratio": 0.0, - "completion_length": 697.2916870117188, - "epoch": 0.531, - "grad_norm": 19.33500544989958, - "kl": 5.546875, - "learning_rate": 5.798232551800002e-07, - "loss": 0.6624, - "reward": 2.311483144760132, - "reward_std": 0.6170333027839661, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.018378185108304024, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 328.41668701171875, + "epoch": 0.2655, + "grad_norm": 9.064114909150554, + "kl": 0.44140625, + "learning_rate": 9.269734224791974e-07, + "loss": 0.1061, + "reward": 2.6510632038116455, + "reward_std": 0.4317702651023865, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.03643686696887016, + "rewards/tag_count_reward": 0.9583333730697632, "step": 531 }, { "clip_ratio": 0.0, - "completion_length": 734.2916870117188, - "epoch": 0.532, - "grad_norm": 16.207591684417004, - "kl": 4.796875, - "learning_rate": 5.78255733788191e-07, - "loss": 0.8369, - "reward": 2.042443633079529, - "reward_std": 0.6627461314201355, - "rewards/accuracy_reward": 0.354166679084301, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02873703371733427, - "rewards/tag_count_reward": 0.7447916865348816, + "completion_length": 356.25001525878906, + "epoch": 0.266, + "grad_norm": 8.929164182079782, + "kl": 0.3369140625, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0927, + "reward": 2.6343066692352295, + "reward_std": 0.19984594732522964, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04624889977276325, + "rewards/tag_count_reward": 1.0, "step": 532 }, { "clip_ratio": 0.0, - "completion_length": 670.3333435058594, - "epoch": 0.533, - "grad_norm": 17.149503798992292, - "kl": 5.671875, - "learning_rate": 5.766878681091828e-07, - "loss": 0.8688, - "reward": 1.991923749446869, - "reward_std": 0.7662231922149658, - "rewards/accuracy_reward": 0.3541666679084301, - "rewards/reasoning_steps_reward": 0.9305556416511536, - "rewards/repetition_penalty_reward": -0.04279853031039238, - "rewards/tag_count_reward": 0.7500000298023224, + "completion_length": 351.93751525878906, + "epoch": 0.2665, + "grad_norm": 12.113702687047534, + "kl": 0.48828125, + "learning_rate": 9.261133126157217e-07, + "loss": 0.1091, + "reward": 2.534152388572693, + "reward_std": 0.40387988090515137, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.05438925884664059, + "rewards/tag_count_reward": 0.9635416865348816, "step": 533 }, { "clip_ratio": 0.0, - "completion_length": 579.8125, - "epoch": 0.534, - "grad_norm": 18.63845179074038, - "kl": 4.59375, - "learning_rate": 5.751196772469237e-07, - "loss": 0.7095, - "reward": 2.073606848716736, - "reward_std": 0.7093684077262878, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.023615523241460323, - "rewards/tag_count_reward": 0.7500000298023224, + "completion_length": 401.18751525878906, + "epoch": 0.267, + "grad_norm": 11.16084804643565, + "kl": 1.84375, + "learning_rate": 9.256815384685328e-07, + "loss": 0.3193, + "reward": 2.4775713682174683, + "reward_std": 0.46769386529922485, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03631755895912647, + "rewards/tag_count_reward": 0.9583333432674408, "step": 534 }, { "clip_ratio": 0.0, - "completion_length": 618.0, - "epoch": 0.535, - "grad_norm": 26.379982268973194, - "kl": 4.984375, - "learning_rate": 5.735511803093248e-07, - "loss": 0.8534, - "reward": 2.1693702936172485, - "reward_std": 0.578357994556427, - "rewards/accuracy_reward": 0.4375000298023224, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.0389630775898695, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 492.7708435058594, + "epoch": 0.2675, + "grad_norm": 31.0072808202828, + "kl": 3.375, + "learning_rate": 9.252486199302256e-07, + "loss": 0.4648, + "reward": 2.288800835609436, + "reward_std": 0.569076657295227, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.02890750952064991, + "rewards/tag_count_reward": 0.8802083432674408, "step": 535 }, { "clip_ratio": 0.0, - "completion_length": 630.9791870117188, - "epoch": 0.536, - "grad_norm": 16.107255305572483, - "kl": 3.328125, - "learning_rate": 5.71982396408026e-07, - "loss": 0.6713, - "reward": 2.3328845500946045, - "reward_std": 0.762814462184906, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.024754411540925503, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 418.3541717529297, + "epoch": 0.268, + "grad_norm": 21.076285472286582, + "kl": 2.7109375, + "learning_rate": 9.248145583195447e-07, + "loss": 0.3675, + "reward": 2.3963736295700073, + "reward_std": 0.5206663012504578, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.05501541867852211, + "rewards/tag_count_reward": 0.9375000298023224, "step": 536 }, { "clip_ratio": 0.0, - "completion_length": 644.5416870117188, - "epoch": 0.537, - "grad_norm": 9.431209455223057, - "kl": 4.125, - "learning_rate": 5.704133446581642e-07, - "loss": 0.5649, - "reward": 2.1833406686782837, - "reward_std": 0.6518435776233673, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.023256704211235046, - "rewards/tag_count_reward": 0.796875, - "step": 537 - }, + "completion_length": 446.3958435058594, + "epoch": 0.2685, + "grad_norm": 21.190596871719485, + "kl": 3.33203125, + "learning_rate": 9.243793549587171e-07, + "loss": 0.4056, + "reward": 2.491150140762329, + "reward_std": 0.5588173568248749, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.027947167865931988, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 537 + }, { "clip_ratio": 0.0, - "completion_length": 521.5625, - "epoch": 0.538, - "grad_norm": 13.691614334173481, - "kl": 2.828125, - "learning_rate": 5.688440441781398e-07, - "loss": 0.532, - "reward": 2.5351574420928955, - "reward_std": 0.5137946009635925, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02213435433804989, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 453.68751525878906, + "epoch": 0.269, + "grad_norm": 15.39662311115678, + "kl": 3.171875, + "learning_rate": 9.239430111734476e-07, + "loss": 0.3442, + "reward": 2.3229269981384277, + "reward_std": 0.5908633470535278, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.0347119364887476, + "rewards/tag_count_reward": 0.9062500298023224, "step": 538 }, { "clip_ratio": 0.0, - "completion_length": 504.20835876464844, - "epoch": 0.539, - "grad_norm": 15.514611157640976, - "kl": 3.296875, - "learning_rate": 5.672745140893839e-07, - "loss": 0.4842, - "reward": 2.3555127382278442, - "reward_std": 0.6762471795082092, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.017751268576830626, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 402.1458435058594, + "epoch": 0.2695, + "grad_norm": 16.818966771875186, + "kl": 1.31640625, + "learning_rate": 9.235055282929153e-07, + "loss": 0.3005, + "reward": 2.69978666305542, + "reward_std": 0.5072591304779053, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.025908033829182386, + "rewards/tag_count_reward": 0.9479166865348816, "step": 539 }, { "clip_ratio": 0.0, - "completion_length": 500.2083435058594, - "epoch": 0.54, - "grad_norm": 33.71631385555809, - "kl": 4.0390625, - "learning_rate": 5.657047735161255e-07, - "loss": 0.5623, - "reward": 2.4260315895080566, - "reward_std": 0.7543806433677673, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.02014908567070961, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 387.8125, + "epoch": 0.27, + "grad_norm": 12.06656265047348, + "kl": 0.79296875, + "learning_rate": 9.230669076497687e-07, + "loss": 0.2646, + "reward": 2.6891995668411255, + "reward_std": 0.4540497958660126, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722223281860352, + "rewards/repetition_penalty_reward": -0.038231078535318375, + "rewards/tag_count_reward": 0.9427083432674408, "step": 540 }, { "clip_ratio": 0.0, - "completion_length": 645.2083435058594, - "epoch": 0.541, - "grad_norm": 18.05553096101935, - "kl": 5.75, - "learning_rate": 5.641348415851577e-07, - "loss": 0.9497, - "reward": 2.2081546783447266, - "reward_std": 0.5377081632614136, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.017539918422698975, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 411.1458435058594, + "epoch": 0.2705, + "grad_norm": 10.66189870366724, + "kl": 0.7666015625, + "learning_rate": 9.226271505801224e-07, + "loss": 0.2545, + "reward": 2.7011055946350098, + "reward_std": 0.16866168193519115, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05410290136933327, + "rewards/tag_count_reward": 0.9635416865348816, "step": 541 }, { "clip_ratio": 0.0, - "completion_length": 739.2291870117188, - "epoch": 0.542, - "grad_norm": 16.273564979773667, - "kl": 7.109375, - "learning_rate": 5.625647374256061e-07, - "loss": 0.902, - "reward": 2.1837064027786255, - "reward_std": 0.6955267190933228, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.014210469089448452, - "rewards/tag_count_reward": 0.71875, + "completion_length": 433.5625, + "epoch": 0.271, + "grad_norm": 8.633799282198257, + "kl": 1.1171875, + "learning_rate": 9.221862584235526e-07, + "loss": 0.1229, + "reward": 2.806559443473816, + "reward_std": 0.2913419157266617, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.035454532131552696, + "rewards/tag_count_reward": 0.9739583730697632, "step": 542 }, { "clip_ratio": 0.0, - "completion_length": 586.3125, - "epoch": 0.543, - "grad_norm": 13.748732846527533, - "kl": 3.7109375, - "learning_rate": 5.60994480168694e-07, - "loss": 0.8548, - "reward": 2.5483158826828003, - "reward_std": 0.593627005815506, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.024600773118436337, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 450.375, + "epoch": 0.2715, + "grad_norm": 12.521383665249783, + "kl": 2.5703125, + "learning_rate": 9.217442325230936e-07, + "loss": 0.3894, + "reward": 2.494984745979309, + "reward_std": 0.5038753598928452, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03626537322998047, + "rewards/tag_count_reward": 0.9270833432674408, "step": 543 }, { "clip_ratio": 0.0, - "completion_length": 609.5416870117188, - "epoch": 0.544, - "grad_norm": 17.36578790885452, - "kl": 6.140625, - "learning_rate": 5.594240889475106e-07, - "loss": 1.0892, - "reward": 2.431919813156128, - "reward_std": 0.8149698972702026, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.8958333730697632, - "rewards/repetition_penalty_reward": -0.021205355413258076, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 462.29168701171875, + "epoch": 0.272, + "grad_norm": 13.38780899364238, + "kl": 2.8515625, + "learning_rate": 9.213010742252327e-07, + "loss": 0.4016, + "reward": 2.420200288295746, + "reward_std": 0.4867652505636215, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.930555522441864, + "rewards/repetition_penalty_reward": -0.025980187579989433, + "rewards/tag_count_reward": 0.9114583432674408, "step": 544 }, { "clip_ratio": 0.0, - "completion_length": 527.9791870117188, - "epoch": 0.545, - "grad_norm": 30.417556701818675, - "kl": 2.8984375, - "learning_rate": 5.578535828967777e-07, - "loss": 0.8559, - "reward": 2.590558648109436, - "reward_std": 0.6466452777385712, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.030969200655817986, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 420.22918701171875, + "epoch": 0.2725, + "grad_norm": 29.596911334695022, + "kl": 2.2265625, + "learning_rate": 9.208567848799069e-07, + "loss": 0.4394, + "reward": 2.488977313041687, + "reward_std": 0.3840087577700615, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.040536582469940186, + "rewards/tag_count_reward": 0.9531250298023224, "step": 545 }, { "clip_ratio": 0.0, - "completion_length": 633.8958435058594, - "epoch": 0.546, - "grad_norm": 15.846035378270587, - "kl": 6.0234375, - "learning_rate": 5.562829811526154e-07, - "loss": 1.0087, - "reward": 2.4130187034606934, - "reward_std": 0.680559515953064, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01580093428492546, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 510.3541717529297, + "epoch": 0.273, + "grad_norm": 26.31735095796572, + "kl": 4.09375, + "learning_rate": 9.204113658404989e-07, + "loss": 0.706, + "reward": 2.371821641921997, + "reward_std": 0.6036520600318909, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02574794925749302, + "rewards/tag_count_reward": 0.8906250298023224, "step": 546 }, { "clip_ratio": 0.0, - "completion_length": 432.8958435058594, - "epoch": 0.547, - "grad_norm": 11.403437149970372, - "kl": 1.345703125, - "learning_rate": 5.547123028523106e-07, - "loss": 0.0869, - "reward": 2.532930016517639, - "reward_std": 0.4439847320318222, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.03130619879812002, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 454.18751525878906, + "epoch": 0.2735, + "grad_norm": 30.066527015445274, + "kl": 4.25, + "learning_rate": 9.199648184638318e-07, + "loss": 0.3264, + "reward": 2.3577940464019775, + "reward_std": 0.4556938707828522, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.05019213631749153, + "rewards/tag_count_reward": 0.8802083432674408, "step": 547 }, { "clip_ratio": 0.0, - "completion_length": 646.3750305175781, - "epoch": 0.548, - "grad_norm": 67.38569099061115, - "kl": 7.3125, - "learning_rate": 5.531415671340826e-07, - "loss": 1.0126, - "reward": 2.4480226039886475, - "reward_std": 0.7663466334342957, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.02767193131148815, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 548.5, + "epoch": 0.274, + "grad_norm": 12.579666635260093, + "kl": 2.2412109375, + "learning_rate": 9.195171441101668e-07, + "loss": 0.2961, + "reward": 2.1001862287521362, + "reward_std": 0.2963993363082409, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030022156424820423, + "rewards/tag_count_reward": 0.9010416865348816, "step": 548 }, { "clip_ratio": 0.0, - "completion_length": 534.4166717529297, - "epoch": 0.549, - "grad_norm": 15.126469736221692, - "kl": 3.61328125, - "learning_rate": 5.515707931368507e-07, - "loss": 0.4266, - "reward": 2.385190963745117, - "reward_std": 0.47373223304748535, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.01932289730757475, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 341.3333435058594, + "epoch": 0.2745, + "grad_norm": 6.4673055298014965, + "kl": 0.5400390625, + "learning_rate": 9.190683441431974e-07, + "loss": 0.0892, + "reward": 2.7759108543395996, + "reward_std": 0.3490236699581146, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.047005822882056236, + "rewards/tag_count_reward": 0.9895833730697632, "step": 549 }, { "clip_ratio": 0.0, - "completion_length": 622.2916870117188, - "epoch": 0.55, - "grad_norm": 16.89430814413906, - "kl": 5.453125, - "learning_rate": 5.5e-07, - "loss": 0.6763, - "reward": 2.388463854789734, - "reward_std": 0.5397979617118835, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.017786113545298576, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 361.62501525878906, + "epoch": 0.275, + "grad_norm": 12.47875427754187, + "kl": 0.83984375, + "learning_rate": 9.186184199300463e-07, + "loss": 0.082, + "reward": 2.453283429145813, + "reward_std": 0.3730267733335495, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.04498073272407055, + "rewards/tag_count_reward": 0.9635416865348816, "step": 550 }, { "clip_ratio": 0.0, - "completion_length": 533.1666717529297, - "epoch": 0.551, - "grad_norm": 13.91558589732855, - "kl": 4.0078125, - "learning_rate": 5.484292068631494e-07, - "loss": 0.6629, - "reward": 2.265714168548584, - "reward_std": 0.6671868860721588, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.020744211971759796, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 372.97918701171875, + "epoch": 0.2755, + "grad_norm": 6.003413536757677, + "kl": 0.533203125, + "learning_rate": 9.181673728412605e-07, + "loss": 0.0929, + "reward": 2.6774885654449463, + "reward_std": 0.32011160254478455, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.06556697562336922, + "rewards/tag_count_reward": 0.9791666865348816, "step": 551 }, { "clip_ratio": 0.0, - "completion_length": 573.5625305175781, - "epoch": 0.552, - "grad_norm": 16.883404148942905, - "kl": 5.0, - "learning_rate": 5.468584328659172e-07, - "loss": 0.8272, - "reward": 2.377629041671753, - "reward_std": 0.6626934707164764, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.021676572039723396, - "rewards/tag_count_reward": 0.84375, + "completion_length": 371.2083435058594, + "epoch": 0.276, + "grad_norm": 11.901672638559532, + "kl": 0.51806640625, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0808, + "reward": 2.7159184217453003, + "reward_std": 0.22979847341775894, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.05317908897995949, + "rewards/tag_count_reward": 0.9635416865348816, "step": 552 }, { "clip_ratio": 0.0, - "completion_length": 641.9583435058594, - "epoch": 0.553, - "grad_norm": 12.510249676603848, - "kl": 6.515625, - "learning_rate": 5.452876971476896e-07, - "loss": 1.1594, - "reward": 2.224699914455414, - "reward_std": 0.7095433175563812, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.020091742277145386, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 487.4791717529297, + "epoch": 0.2765, + "grad_norm": 15.136748639101315, + "kl": 1.328125, + "learning_rate": 9.17261915536072e-07, + "loss": 0.334, + "reward": 2.423860192298889, + "reward_std": 0.5218981206417084, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.04836214520037174, + "rewards/tag_count_reward": 0.9166666865348816, "step": 553 }, { "clip_ratio": 0.0, - "completion_length": 621.0000305175781, - "epoch": 0.554, - "grad_norm": 20.15924923331073, - "kl": 4.90625, - "learning_rate": 5.437170188473847e-07, - "loss": 1.009, - "reward": 2.4554017782211304, - "reward_std": 0.6323766112327576, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.025501138530671597, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 461.7291717529297, + "epoch": 0.277, + "grad_norm": 26.981007155408008, + "kl": 1.1640625, + "learning_rate": 9.168075080778494e-07, + "loss": 0.5053, + "reward": 2.441664218902588, + "reward_std": 0.5236180424690247, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04791930224746466, + "rewards/tag_count_reward": 0.9062500298023224, "step": 554 }, { "clip_ratio": 0.0, - "completion_length": 516.3125305175781, - "epoch": 0.555, - "grad_norm": 15.439703900925965, - "kl": 3.703125, - "learning_rate": 5.421464171032224e-07, - "loss": 0.6462, - "reward": 2.5977327823638916, - "reward_std": 0.5676628202199936, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.016850699670612812, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 317.31251525878906, + "epoch": 0.2775, + "grad_norm": 11.002217558072193, + "kl": 0.91796875, + "learning_rate": 9.163519832603436e-07, + "loss": 0.147, + "reward": 2.499881386756897, + "reward_std": 0.4323354959487915, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03831310383975506, + "rewards/tag_count_reward": 0.9479166865348816, "step": 555 }, { "clip_ratio": 0.0, - "completion_length": 483.5625, - "epoch": 0.556, - "grad_norm": 11.490664107960336, - "kl": 3.9921875, - "learning_rate": 5.405759110524894e-07, - "loss": 0.5193, - "reward": 2.5783601999282837, - "reward_std": 0.6397126764059067, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.0292786480858922, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 379.5208435058594, + "epoch": 0.278, + "grad_norm": 12.09872643997718, + "kl": 1.6318359375, + "learning_rate": 9.158953424711624e-07, + "loss": 0.1784, + "reward": 2.6038191318511963, + "reward_std": 0.46002739667892456, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.04027818236500025, + "rewards/tag_count_reward": 0.9635416865348816, "step": 556 }, { "clip_ratio": 0.0, - "completion_length": 516.6041870117188, - "epoch": 0.557, - "grad_norm": 27.558157304060057, - "kl": 4.078125, - "learning_rate": 5.390055198313061e-07, - "loss": 0.7408, - "reward": 2.3512425422668457, - "reward_std": 0.5882950127124786, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.025493742898106575, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 318.625, + "epoch": 0.2785, + "grad_norm": 10.050531964018983, + "kl": 1.021484375, + "learning_rate": 9.154375871013128e-07, + "loss": 0.1225, + "reward": 2.6270726919174194, + "reward_std": 0.3049708902835846, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04306626692414284, + "rewards/tag_count_reward": 0.9895833432674408, "step": 557 }, { "clip_ratio": 0.0, - "completion_length": 540.9166870117188, - "epoch": 0.558, - "grad_norm": 22.15871149302614, - "kl": 5.25, - "learning_rate": 5.37435262574394e-07, - "loss": 0.7217, - "reward": 2.273875594139099, - "reward_std": 0.604903370141983, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.01605505309998989, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 459.52085876464844, + "epoch": 0.279, + "grad_norm": 39.99093411330565, + "kl": 5.265625, + "learning_rate": 9.149787185451969e-07, + "loss": 0.4026, + "reward": 2.456482768058777, + "reward_std": 0.5135739296674728, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03830893710255623, + "rewards/tag_count_reward": 0.9114583432674408, "step": 558 }, { "clip_ratio": 0.0, - "completion_length": 564.1875305175781, - "epoch": 0.559, - "grad_norm": 20.132907345221618, - "kl": 5.90625, - "learning_rate": 5.358651584148423e-07, - "loss": 0.975, - "reward": 2.391770362854004, - "reward_std": 0.7469497919082642, + "completion_length": 568.2291870117188, + "epoch": 0.2795, + "grad_norm": 28.34326913260782, + "kl": 4.4140625, + "learning_rate": 9.145187382006081e-07, + "loss": 0.9447, + "reward": 2.393509268760681, + "reward_std": 0.7060167789459229, "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.0248964074999094, - "rewards/tag_count_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.01794920302927494, + "rewards/tag_count_reward": 0.8489583730697632, "step": 559 }, { "clip_ratio": 0.0, - "completion_length": 602.7500305175781, - "epoch": 0.56, - "grad_norm": 20.768809175821836, - "kl": 5.4296875, - "learning_rate": 5.342952264838747e-07, - "loss": 0.7266, - "reward": 1.9830502271652222, - "reward_std": 0.5142233371734619, - "rewards/accuracy_reward": 0.2500000111758709, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.02563036046922207, - "rewards/tag_count_reward": 0.7864583432674408, + "completion_length": 346.6666717529297, + "epoch": 0.28, + "grad_norm": 16.615012130478203, + "kl": 1.669921875, + "learning_rate": 9.140576474687263e-07, + "loss": 0.3664, + "reward": 2.7198301553726196, + "reward_std": 0.38381946086883545, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030169978737831116, + "rewards/tag_count_reward": 0.9791666865348816, "step": 560 }, { "clip_ratio": 0.0, - "completion_length": 522.9583435058594, - "epoch": 0.561, - "grad_norm": 17.71687219992085, - "kl": 6.890625, - "learning_rate": 5.32725485910616e-07, - "loss": 0.4285, - "reward": 2.2837836742401123, - "reward_std": 0.7183063626289368, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9583333134651184, - "rewards/repetition_penalty_reward": -0.02350800298154354, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 556.7916870117188, + "epoch": 0.2805, + "grad_norm": 17.593736355103562, + "kl": 2.7265625, + "learning_rate": 9.135954477541137e-07, + "loss": 0.8128, + "reward": 2.2776578664779663, + "reward_std": 0.6524731516838074, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.024425473995506763, + "rewards/tag_count_reward": 0.8854166865348816, "step": 561 }, { "clip_ratio": 0.0, - "completion_length": 533.6666870117188, - "epoch": 0.562, - "grad_norm": 17.98874232563595, - "kl": 2.96875, - "learning_rate": 5.311559558218603e-07, - "loss": 0.7934, - "reward": 2.4511380195617676, - "reward_std": 0.5958435237407684, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.021084393840283155, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 403.9583435058594, + "epoch": 0.281, + "grad_norm": 9.443969242253909, + "kl": 0.826171875, + "learning_rate": 9.131321404647109e-07, + "loss": 0.3193, + "reward": 2.3959869742393494, + "reward_std": 0.171494722366333, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.034568555653095245, + "rewards/tag_count_reward": 0.9583333432674408, "step": 562 }, { "clip_ratio": 0.0, - "completion_length": 678.0625305175781, - "epoch": 0.563, - "grad_norm": 20.342890063735087, - "kl": 6.2265625, - "learning_rate": 5.295866553418358e-07, - "loss": 0.8234, - "reward": 2.3168532848358154, - "reward_std": 0.6717122793197632, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.025160694494843483, - "rewards/tag_count_reward": 0.7864583730697632, + "completion_length": 376.12501525878906, + "epoch": 0.2815, + "grad_norm": 235.68207779933957, + "kl": 1.48046875, + "learning_rate": 9.126677270118322e-07, + "loss": 0.3893, + "reward": 2.7154510021209717, + "reward_std": 0.47927525639533997, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.031076885759830475, + "rewards/tag_count_reward": 0.9479166865348816, "step": 563 }, { "clip_ratio": 0.0, - "completion_length": 617.2500305175781, - "epoch": 0.564, - "grad_norm": 16.061198883154272, - "kl": 3.953125, - "learning_rate": 5.28017603591974e-07, - "loss": 0.9804, - "reward": 2.5708781480789185, - "reward_std": 0.6792239546775818, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.024608048610389233, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 512.9375305175781, + "epoch": 0.282, + "grad_norm": 14.828955399249537, + "kl": 1.9921875, + "learning_rate": 9.122022088101613e-07, + "loss": 0.6226, + "reward": 2.4416096210479736, + "reward_std": 0.424076110124588, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03408493287861347, + "rewards/tag_count_reward": 0.9062500298023224, "step": 564 }, { "clip_ratio": 0.0, - "completion_length": 500.29168701171875, - "epoch": 0.565, - "grad_norm": 17.72408984099671, - "kl": 2.89453125, - "learning_rate": 5.264488196906752e-07, - "loss": 0.7968, - "reward": 2.518498420715332, - "reward_std": 0.6503982543945312, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.024904441088438034, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 453.0208435058594, + "epoch": 0.2825, + "grad_norm": 71.8537331742799, + "kl": 3.9765625, + "learning_rate": 9.117355872777477e-07, + "loss": 0.6687, + "reward": 2.5574655532836914, + "reward_std": 0.44046418368816376, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.046701231971383095, + "rewards/tag_count_reward": 0.9166666865348816, "step": 565 }, { "clip_ratio": 0.0, - "completion_length": 576.8958435058594, - "epoch": 0.566, - "grad_norm": 22.26454438337593, - "kl": 4.609375, - "learning_rate": 5.248803227530763e-07, - "loss": 1.0678, - "reward": 2.483291506767273, - "reward_std": 0.4923284649848938, - "rewards/accuracy_reward": 0.6666666865348816, + "completion_length": 585.7708435058594, + "epoch": 0.283, + "grad_norm": 133.27147379184413, + "kl": 6.51171875, + "learning_rate": 9.112678638360015e-07, + "loss": 0.8637, + "reward": 2.124269187450409, + "reward_std": 0.5485763549804688, + "rewards/accuracy_reward": 0.3333333358168602, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.020180770196020603, - "rewards/tag_count_reward": 0.8437500298023224, + "rewards/repetition_penalty_reward": -0.025036394596099854, + "rewards/tag_count_reward": 0.8229166865348816, "step": 566 }, { "clip_ratio": 0.0, - "completion_length": 555.2708587646484, - "epoch": 0.567, - "grad_norm": 15.919462093451902, - "kl": 5.5703125, - "learning_rate": 5.233121318908173e-07, - "loss": 0.7808, - "reward": 2.2207025289535522, - "reward_std": 0.6656961143016815, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9375001192092896, - "rewards/repetition_penalty_reward": -0.018881036899983883, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 390.8333435058594, + "epoch": 0.2835, + "grad_norm": 68.07666811218944, + "kl": 2.1826171875, + "learning_rate": 9.107990399096893e-07, + "loss": 0.3028, + "reward": 2.555752396583557, + "reward_std": 0.39118392765522003, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03452543169260025, + "rewards/tag_count_reward": 0.9375, "step": 567 }, { "clip_ratio": 0.0, - "completion_length": 642.1041870117188, - "epoch": 0.568, - "grad_norm": 21.902991875998683, - "kl": 6.234375, - "learning_rate": 5.21744266211809e-07, - "loss": 0.7828, - "reward": 2.2312402725219727, - "reward_std": 0.6396611332893372, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.023968255147337914, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 411.5416717529297, + "epoch": 0.284, + "grad_norm": 15.00691010233856, + "kl": 1.05078125, + "learning_rate": 9.103291169269299e-07, + "loss": 0.3439, + "reward": 2.596803307533264, + "reward_std": 0.5693235397338867, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.031668941490352154, + "rewards/tag_count_reward": 0.9479166865348816, "step": 568 }, { "clip_ratio": 0.0, - "completion_length": 493.2708435058594, - "epoch": 0.569, - "grad_norm": 11.796721143909934, - "kl": 2.9140625, - "learning_rate": 5.2017674482e-07, - "loss": 0.4568, - "reward": 2.5521020889282227, - "reward_std": 0.6523496210575104, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.02081457804888487, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 466.6666717529297, + "epoch": 0.2845, + "grad_norm": 444.847404957193, + "kl": 3.9453125, + "learning_rate": 9.098580963191907e-07, + "loss": 0.8048, + "reward": 2.4301793575286865, + "reward_std": 0.7720136642456055, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.021209627389907837, + "rewards/tag_count_reward": 0.8541666865348816, "step": 569 }, { "clip_ratio": 0.0, - "completion_length": 563.8750305175781, - "epoch": 0.57, - "grad_norm": 16.743725013071696, - "kl": 4.6015625, - "learning_rate": 5.186095868151436e-07, - "loss": 0.542, - "reward": 2.138919949531555, - "reward_std": 0.6218420267105103, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.017330123111605644, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 373.87501525878906, + "epoch": 0.285, + "grad_norm": 32.33608353360448, + "kl": 1.00390625, + "learning_rate": 9.093859795212817e-07, + "loss": 0.2574, + "reward": 2.688089966773987, + "reward_std": 0.47320832312107086, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.039340706542134285, + "rewards/tag_count_reward": 0.9635416865348816, "step": 570 }, { "clip_ratio": 0.0, - "completion_length": 593.0625305175781, - "epoch": 0.571, - "grad_norm": 31.64391481670669, - "kl": 5.453125, - "learning_rate": 5.170428112925659e-07, - "loss": 0.6923, - "reward": 2.3365062475204468, - "reward_std": 0.6305623352527618, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.02460480574518442, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 374.75001525878906, + "epoch": 0.2855, + "grad_norm": 17.560098791576593, + "kl": 0.44921875, + "learning_rate": 9.089127679713529e-07, + "loss": 0.1948, + "reward": 2.7453384399414062, + "reward_std": 0.3349706828594208, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.048064423725008965, + "rewards/tag_count_reward": 0.9739583432674408, "step": 571 }, { "clip_ratio": 0.0, - "completion_length": 431.81251525878906, - "epoch": 0.572, - "grad_norm": 18.846581957559952, - "kl": 1.83203125, - "learning_rate": 5.154764373429315e-07, - "loss": 0.3648, - "reward": 2.5100269317626953, - "reward_std": 0.38545116782188416, - "rewards/accuracy_reward": 0.625, + "completion_length": 322.18751525878906, + "epoch": 0.286, + "grad_norm": 18.129855941015013, + "kl": 0.556640625, + "learning_rate": 9.084384631108882e-07, + "loss": 0.1961, + "reward": 2.65604305267334, + "reward_std": 0.520334392786026, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.022959282621741295, - "rewards/tag_count_reward": 0.9218750298023224, + "rewards/repetition_penalty_reward": -0.027984846383333206, + "rewards/tag_count_reward": 0.9687500298023224, "step": 572 }, { "clip_ratio": 0.0, - "completion_length": 553.2291717529297, - "epoch": 0.573, - "grad_norm": 13.121698712408023, - "kl": 3.3359375, - "learning_rate": 5.139104840520135e-07, - "loss": 0.7169, - "reward": 2.331916570663452, - "reward_std": 0.6915982961654663, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.030930647626519203, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 459.5208435058594, + "epoch": 0.2865, + "grad_norm": 18.09227325152774, + "kl": 2.9375, + "learning_rate": 9.079630663847031e-07, + "loss": 0.5215, + "reward": 2.2212284803390503, + "reward_std": 0.5788050144910812, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.027035493403673172, + "rewards/tag_count_reward": 0.8385416865348816, "step": 573 }, { "clip_ratio": 0.0, - "completion_length": 730.5625, - "epoch": 0.574, - "grad_norm": 35.573664674103505, - "kl": 7.890625, - "learning_rate": 5.123449705004581e-07, - "loss": 0.8329, - "reward": 1.8582618832588196, - "reward_std": 0.7164104580879211, - "rewards/accuracy_reward": 0.2708333432674408, - "rewards/reasoning_steps_reward": 0.9027777910232544, - "rewards/repetition_penalty_reward": -0.018474191427230835, - "rewards/tag_count_reward": 0.703125, + "completion_length": 358.81251525878906, + "epoch": 0.287, + "grad_norm": 29.6582060019414, + "kl": 2.4140625, + "learning_rate": 9.074865792409381e-07, + "loss": 0.4198, + "reward": 2.4573017358779907, + "reward_std": 0.4562261551618576, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.027073362842202187, + "rewards/tag_count_reward": 0.921875, "step": 574 }, { "clip_ratio": 0.0, - "completion_length": 706.9166870117188, - "epoch": 0.575, - "grad_norm": 12.890226305523278, - "kl": 5.65625, - "learning_rate": 5.107799157635538e-07, - "loss": 0.7742, - "reward": 2.287819743156433, - "reward_std": 0.8128792345523834, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9027777910232544, - "rewards/repetition_penalty_reward": -0.02641639020293951, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 335.18751525878906, + "epoch": 0.2875, + "grad_norm": 59.93954067426397, + "kl": 2.78857421875, + "learning_rate": 9.070090031310558e-07, + "loss": 0.3862, + "reward": 2.782563328742981, + "reward_std": 0.31483452301472425, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.04382561706006527, + "rewards/tag_count_reward": 0.9791666865348816, "step": 575 }, { "clip_ratio": 0.0, - "completion_length": 678.9166870117188, - "epoch": 0.576, - "grad_norm": 22.0467723806393, - "kl": 6.984375, - "learning_rate": 5.09215338910999e-07, - "loss": 0.8976, - "reward": 2.1274478435516357, - "reward_std": 0.8857446014881134, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.016649481374770403, - "rewards/tag_count_reward": 0.7552083432674408, + "completion_length": 397.4166717529297, + "epoch": 0.288, + "grad_norm": 23.817675077634586, + "kl": 3.94921875, + "learning_rate": 9.065303395098358e-07, + "loss": 0.48, + "reward": 2.5549936294555664, + "reward_std": 0.3741450160741806, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017923136241734028, + "rewards/tag_count_reward": 0.8854166865348816, "step": 576 }, { "clip_ratio": 0.0, - "completion_length": 843.8333435058594, - "epoch": 0.577, - "grad_norm": 24.290171025234606, - "kl": 7.203125, - "learning_rate": 5.076512590066685e-07, - "loss": 0.9397, - "reward": 1.9734002351760864, - "reward_std": 0.8574240803718567, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.012710887007415295, - "rewards/tag_count_reward": 0.6458333432674408, + "completion_length": 289.75, + "epoch": 0.2885, + "grad_norm": 9.736727012422476, + "kl": 0.5712890625, + "learning_rate": 9.060505898353705e-07, + "loss": 0.1109, + "reward": 2.759944438934326, + "reward_std": 0.2943428307771683, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.02130552940070629, + "rewards/tag_count_reward": 0.9895833730697632, "step": 577 }, { "clip_ratio": 0.0, - "completion_length": 601.8958435058594, - "epoch": 0.578, - "grad_norm": 13.84698671695797, - "kl": 4.265625, - "learning_rate": 5.060876951083828e-07, - "loss": 0.6815, - "reward": 2.2862409353256226, - "reward_std": 0.7043590843677521, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.01757870987057686, - "rewards/tag_count_reward": 0.796875, + "completion_length": 458.87501525878906, + "epoch": 0.289, + "grad_norm": 32.414744280583555, + "kl": 5.390625, + "learning_rate": 9.055697555690607e-07, + "loss": 0.549, + "reward": 2.2597694396972656, + "reward_std": 0.8498954474925995, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.01974454615265131, + "rewards/tag_count_reward": 0.8281250298023224, "step": 578 }, { "clip_ratio": 0.0, - "completion_length": 473.2083435058594, - "epoch": 0.579, - "grad_norm": 21.25841736364304, - "kl": 2.7265625, - "learning_rate": 5.045246662676741e-07, - "loss": 0.4115, - "reward": 2.3248329162597656, - "reward_std": 0.6839044392108917, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.022389397025108337, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 442.91668701171875, + "epoch": 0.2895, + "grad_norm": 18.23142677889983, + "kl": 3.25390625, + "learning_rate": 9.050878381756107e-07, + "loss": 0.4236, + "reward": 2.289909839630127, + "reward_std": 0.6958013772964478, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.02432657964527607, + "rewards/tag_count_reward": 0.8072916865348816, "step": 579 }, { "clip_ratio": 0.0, - "completion_length": 506.2708435058594, - "epoch": 0.58, - "grad_norm": 13.768352502526005, - "kl": 2.55859375, - "learning_rate": 5.02962191529556e-07, - "loss": 0.4207, - "reward": 2.34650194644928, - "reward_std": 0.4788215011358261, - "rewards/accuracy_reward": 0.5208333358168602, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.03023417294025421, - "rewards/tag_count_reward": 0.890625, + "completion_length": 504.58335876464844, + "epoch": 0.29, + "grad_norm": 29.70121123782455, + "kl": 2.6796875, + "learning_rate": 9.046048391230247e-07, + "loss": 0.4642, + "reward": 2.1951472759246826, + "reward_std": 0.8341259658336639, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02013048715889454, + "rewards/tag_count_reward": 0.7916666865348816, "step": 580 }, { "clip_ratio": 0.0, - "completion_length": 509.14585876464844, - "epoch": 0.581, - "grad_norm": 31.447352351340843, - "kl": 2.5546875, - "learning_rate": 5.014002899322896e-07, - "loss": 0.7482, - "reward": 2.202039122581482, - "reward_std": 0.689550518989563, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.02712748385965824, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 460.6458435058594, + "epoch": 0.2905, + "grad_norm": 19.307347308311925, + "kl": 2.3359375, + "learning_rate": 9.041207598826017e-07, + "loss": 0.3662, + "reward": 2.1872280836105347, + "reward_std": 0.5830141305923462, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02804980892688036, + "rewards/tag_count_reward": 0.8333333730697632, "step": 581 }, { "clip_ratio": 0.0, - "completion_length": 636.8750305175781, - "epoch": 0.582, - "grad_norm": 31.988630155115366, - "kl": 5.734375, - "learning_rate": 4.998389805071536e-07, - "loss": 0.9076, - "reward": 2.169505000114441, - "reward_std": 0.861674964427948, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.028411878272891045, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 397.6666717529297, + "epoch": 0.291, + "grad_norm": 15.271088740961835, + "kl": 1.3125, + "learning_rate": 9.036356019289309e-07, + "loss": 0.3569, + "reward": 2.5574233531951904, + "reward_std": 0.5160808861255646, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04847951419651508, + "rewards/tag_count_reward": 0.9114583730697632, "step": 582 }, { "clip_ratio": 0.0, - "completion_length": 605.1666870117188, - "epoch": 0.583, - "grad_norm": 18.313268799229025, - "kl": 3.3671875, - "learning_rate": 4.982782822782101e-07, - "loss": 0.8556, - "reward": 2.253504157066345, - "reward_std": 0.5843206644058228, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.020801476202905178, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 376.75, + "epoch": 0.2915, + "grad_norm": 17.403821452003328, + "kl": 1.63671875, + "learning_rate": 9.031493667398872e-07, + "loss": 0.2783, + "reward": 2.41110360622406, + "reward_std": 0.37185367196798325, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.0524381659924984, + "rewards/tag_count_reward": 0.9218750298023224, "step": 583 }, { "clip_ratio": 0.0, - "completion_length": 519.5000305175781, - "epoch": 0.584, - "grad_norm": 12.893361447903395, - "kl": 5.90625, - "learning_rate": 4.967182142620745e-07, - "loss": 0.5758, - "reward": 2.2402533292770386, - "reward_std": 0.9014249742031097, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.916666716337204, - "rewards/repetition_penalty_reward": -0.020163366571068764, - "rewards/tag_count_reward": 0.7604166865348816, + "completion_length": 439.8333435058594, + "epoch": 0.292, + "grad_norm": 63.805319864699065, + "kl": 7.046875, + "learning_rate": 9.026620557966279e-07, + "loss": 0.713, + "reward": 2.4353259801864624, + "reward_std": 0.5490872263908386, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.01432684762403369, + "rewards/tag_count_reward": 0.8385416865348816, "step": 584 }, { "clip_ratio": 0.0, - "completion_length": 507.9583435058594, - "epoch": 0.585, - "grad_norm": 2061.671060113234, - "kl": 9.4453125, - "learning_rate": 4.951587954676837e-07, - "loss": 1.0213, - "reward": 2.4675891399383545, - "reward_std": 0.6850857138633728, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.02546628937125206, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 344.2708435058594, + "epoch": 0.2925, + "grad_norm": 42.56798762474608, + "kl": 3.458984375, + "learning_rate": 9.021736705835862e-07, + "loss": 0.2748, + "reward": 2.646929144859314, + "reward_std": 0.3856794238090515, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02841815911233425, + "rewards/tag_count_reward": 0.953125, "step": 585 }, { "clip_ratio": 0.0, - "completion_length": 554.5208587646484, - "epoch": 0.586, - "grad_norm": 19.335025088394126, - "kl": 3.296875, - "learning_rate": 4.93600044896063e-07, - "loss": 0.7192, - "reward": 2.120868682861328, - "reward_std": 0.5526419132947922, - "rewards/accuracy_reward": 0.2916666679084301, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.01802036538720131, - "rewards/tag_count_reward": 0.875, + "completion_length": 294.3958435058594, + "epoch": 0.293, + "grad_norm": 19.403283253518403, + "kl": 2.52734375, + "learning_rate": 9.016842125884684e-07, + "loss": 0.3443, + "reward": 2.468275308609009, + "reward_std": 0.40393635630607605, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03346090763807297, + "rewards/tag_count_reward": 0.9322916865348816, "step": 586 }, { "clip_ratio": 0.0, - "completion_length": 629.6041870117188, - "epoch": 0.587, - "grad_norm": 18.146859923722392, - "kl": 4.8671875, - "learning_rate": 4.920419815400968e-07, - "loss": 0.886, - "reward": 2.283421516418457, - "reward_std": 0.7014163732528687, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02039794623851776, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 469.7291717529297, + "epoch": 0.2935, + "grad_norm": 73.87564910344872, + "kl": 6.8203125, + "learning_rate": 9.011936833022484e-07, + "loss": 0.8008, + "reward": 2.222065567970276, + "reward_std": 0.6714163422584534, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.019253874197602272, + "rewards/tag_count_reward": 0.8385416865348816, "step": 587 }, { "clip_ratio": 0.0, - "completion_length": 870.6041870117188, - "epoch": 0.588, - "grad_norm": 40.04439172717602, - "kl": 9.53125, - "learning_rate": 4.904846243842949e-07, - "loss": 0.8823, - "reward": 1.9214681386947632, - "reward_std": 0.8858973979949951, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.0368652418255806, - "rewards/tag_count_reward": 0.6875000298023224, + "completion_length": 461.56251525878906, + "epoch": 0.294, + "grad_norm": 93.860844445193, + "kl": 4.3984375, + "learning_rate": 9.007020842191634e-07, + "loss": 0.7156, + "reward": 2.255267858505249, + "reward_std": 0.670578122138977, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.03466268070042133, + "rewards/tag_count_reward": 0.859375, "step": 588 }, { "clip_ratio": 0.0, - "completion_length": 655.1250305175781, - "epoch": 0.589, - "grad_norm": 38.61615762624294, - "kl": 6.953125, - "learning_rate": 4.88927992404563e-07, - "loss": 0.9102, - "reward": 2.024886429309845, - "reward_std": 0.7311058044433594, - "rewards/accuracy_reward": 0.3333333358168602, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.028933134861290455, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 596.9583740234375, + "epoch": 0.2945, + "grad_norm": 12.13076498842532, + "kl": 3.4375, + "learning_rate": 9.002094168367095e-07, + "loss": 0.602, + "reward": 2.0464794039726257, + "reward_std": 0.7266620993614197, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.019492903724312782, + "rewards/tag_count_reward": 0.7604166865348816, "step": 589 }, { "clip_ratio": 0.0, - "completion_length": 536.8333435058594, - "epoch": 0.59, - "grad_norm": 18.369342242727228, - "kl": 4.1875, - "learning_rate": 4.873721045679706e-07, - "loss": 0.7111, - "reward": 2.163462996482849, - "reward_std": 0.677480012178421, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.020564796403050423, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 320.5, + "epoch": 0.295, + "grad_norm": 24.895394841833685, + "kl": 1.365234375, + "learning_rate": 8.997156826556369e-07, + "loss": 0.4022, + "reward": 2.6690471172332764, + "reward_std": 0.5990716367959976, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.037550284527242184, + "rewards/tag_count_reward": 0.9218750298023224, "step": 590 }, { "clip_ratio": 0.0, - "completion_length": 424.18751525878906, - "epoch": 0.591, - "grad_norm": 16.75415987323558, - "kl": 2.5, - "learning_rate": 4.858169798325198e-07, - "loss": 0.3791, - "reward": 2.6984031200408936, - "reward_std": 0.4382772147655487, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.018610700964927673, - "rewards/tag_count_reward": 0.9114583730697632, + "completion_length": 374.56251525878906, + "epoch": 0.2955, + "grad_norm": 22.56888130354956, + "kl": 1.072265625, + "learning_rate": 8.992208831799456e-07, + "loss": 0.451, + "reward": 2.7043356895446777, + "reward_std": 0.4858996868133545, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03698389232158661, + "rewards/tag_count_reward": 0.921875, "step": 591 }, { "clip_ratio": 0.0, - "completion_length": 541.2083435058594, - "epoch": 0.592, - "grad_norm": 29.77449375477728, - "kl": 5.0546875, - "learning_rate": 4.842626371469149e-07, - "loss": 0.6774, - "reward": 2.1399881839752197, - "reward_std": 0.769228607416153, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.0197342149913311, - "rewards/tag_count_reward": 0.7708333432674408, + "completion_length": 393.0833435058594, + "epoch": 0.296, + "grad_norm": 15.07305486993295, + "kl": 1.38671875, + "learning_rate": 8.987250199168808e-07, + "loss": 0.3822, + "reward": 2.2811846137046814, + "reward_std": 0.35361041128635406, + "rewards/accuracy_reward": 0.5000000204890966, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.03825983218848705, + "rewards/tag_count_reward": 0.875, "step": 592 }, { "clip_ratio": 0.0, - "completion_length": 536.8125305175781, - "epoch": 0.593, - "grad_norm": 18.37611006786208, - "kl": 2.578125, - "learning_rate": 4.827090954503308e-07, - "loss": 0.5711, - "reward": 2.4687711000442505, - "reward_std": 0.6630526483058929, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02428449969738722, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 682.1458435058594, + "epoch": 0.2965, + "grad_norm": 15.040586253928733, + "kl": 3.90625, + "learning_rate": 8.982280943769278e-07, + "loss": 0.8728, + "reward": 2.138270854949951, + "reward_std": 0.7431787848472595, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.8611111342906952, + "rewards/repetition_penalty_reward": -0.030132046900689602, + "rewards/tag_count_reward": 0.8072916865348816, "step": 593 }, { "clip_ratio": 0.0, - "completion_length": 586.25, - "epoch": 0.594, - "grad_norm": 14.12630099692924, - "kl": 4.84375, - "learning_rate": 4.811563736721829e-07, - "loss": 0.7276, - "reward": 2.257407784461975, - "reward_std": 0.7551295161247253, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.030786780640482903, - "rewards/tag_count_reward": 0.7395833730697632, + "completion_length": 564.4583435058594, + "epoch": 0.297, + "grad_norm": 22.6147129609363, + "kl": 5.03125, + "learning_rate": 8.977301080738079e-07, + "loss": 0.6899, + "reward": 2.386793375015259, + "reward_std": 0.6881283521652222, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.03681784123182297, + "rewards/tag_count_reward": 0.8333333432674408, "step": 594 }, { "clip_ratio": 0.0, - "completion_length": 642.9583435058594, - "epoch": 0.595, - "grad_norm": 21.92582517717831, - "kl": 4.5703125, - "learning_rate": 4.79604490731896e-07, - "loss": 1.0746, - "reward": 2.405961036682129, - "reward_std": 0.8549763560295105, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.012441786471754313, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 401.4583435058594, + "epoch": 0.2975, + "grad_norm": 3434.828082451659, + "kl": 46.5859375, + "learning_rate": 8.97231062524474e-07, + "loss": 3.8228, + "reward": 2.6234763860702515, + "reward_std": 0.7114923894405365, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02582914289087057, + "rewards/tag_count_reward": 0.9270833432674408, "step": 595 }, { "clip_ratio": 0.0, - "completion_length": 626.7500305175781, - "epoch": 0.596, - "grad_norm": 17.21164988814367, - "kl": 4.7109375, - "learning_rate": 4.780534655386743e-07, - "loss": 0.8805, - "reward": 2.197944164276123, - "reward_std": 0.7794502377510071, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.020805937238037586, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 522.4791870117188, + "epoch": 0.298, + "grad_norm": 81.28452749558095, + "kl": 4.79296875, + "learning_rate": 8.967309592491052e-07, + "loss": 0.9035, + "reward": 2.28587806224823, + "reward_std": 0.5564829260110855, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.0613443311303854, + "rewards/tag_count_reward": 0.9166666865348816, "step": 596 }, { "clip_ratio": 0.0, - "completion_length": 835.3958435058594, - "epoch": 0.597, - "grad_norm": 15.129508307858261, - "kl": 5.84375, - "learning_rate": 4.7650331699127013e-07, - "loss": 0.8528, - "reward": 2.195970058441162, - "reward_std": 0.8639613389968872, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.01930786296725273, - "rewards/tag_count_reward": 0.6875000298023224, + "completion_length": 384.18751525878906, + "epoch": 0.2985, + "grad_norm": 27.314706299556043, + "kl": 2.41015625, + "learning_rate": 8.962297997711027e-07, + "loss": 0.6414, + "reward": 2.3818459510803223, + "reward_std": 0.5823481678962708, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.04697362706065178, + "rewards/tag_count_reward": 0.9427083432674408, "step": 597 }, { "clip_ratio": 0.0, - "completion_length": 786.5208435058594, - "epoch": 0.598, - "grad_norm": 11.69120320460148, - "kl": 5.765625, - "learning_rate": 4.749540639777539e-07, - "loss": 1.0647, - "reward": 2.122739553451538, - "reward_std": 0.8401265442371368, - "rewards/accuracy_reward": 0.5000000298023224, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.014413274824619293, - "rewards/tag_count_reward": 0.6927083432674408, + "completion_length": 362.56251525878906, + "epoch": 0.299, + "grad_norm": 24.978178060621413, + "kl": 3.5078125, + "learning_rate": 8.957275856170855e-07, + "loss": 0.6036, + "reward": 2.4027575254440308, + "reward_std": 0.4491366446018219, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.017381365410983562, + "rewards/tag_count_reward": 0.9270833730697632, "step": 598 }, { "clip_ratio": 0.0, - "completion_length": 647.9375305175781, - "epoch": 0.599, - "grad_norm": 56.51129938563486, - "kl": 5.953125, - "learning_rate": 4.7340572537528547e-07, - "loss": 0.9006, - "reward": 1.922993242740631, - "reward_std": 0.7214177250862122, - "rewards/accuracy_reward": 0.2500000074505806, - "rewards/reasoning_steps_reward": 0.888888955116272, - "rewards/repetition_penalty_reward": -0.023187357001006603, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 610.6041870117188, + "epoch": 0.2995, + "grad_norm": 56.7569904139385, + "kl": 5.6875, + "learning_rate": 8.952243183168848e-07, + "loss": 0.4682, + "reward": 2.0318877696990967, + "reward_std": 0.40879975259304047, + "rewards/accuracy_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.9375000894069672, + "rewards/repetition_penalty_reward": -0.0358206108212471, + "rewards/tag_count_reward": 0.8177083730697632, "step": 599 }, { "clip_ratio": 0.0, - "completion_length": 613.0833587646484, - "epoch": 0.6, - "grad_norm": 19.124513938177042, - "kl": 4.421875, - "learning_rate": 4.7185832004988133e-07, - "loss": 0.8773, - "reward": 2.302823781967163, - "reward_std": 0.793424665927887, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.03745407238602638, - "rewards/tag_count_reward": 0.8125, + "completion_length": 329.6041717529297, + "epoch": 0.3, + "grad_norm": 12.091825230727677, + "kl": 1.107421875, + "learning_rate": 8.9471999940354e-07, + "loss": 0.28, + "reward": 2.2527668476104736, + "reward_std": 0.4062964767217636, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.026747104711830616, + "rewards/tag_count_reward": 0.9739583432674408, "step": 600 }, { "clip_ratio": 0.0, - "completion_length": 573.4375305175781, - "epoch": 0.601, - "grad_norm": 21.633820801259493, - "kl": 5.6875, - "learning_rate": 4.703118668561875e-07, - "loss": 0.8408, - "reward": 1.9621968269348145, - "reward_std": 0.7473039925098419, - "rewards/accuracy_reward": 0.2916666716337204, - "rewards/reasoning_steps_reward": 0.9166666567325592, - "rewards/repetition_penalty_reward": -0.016969827935099602, - "rewards/tag_count_reward": 0.7708333432674408, + "completion_length": 358.3541717529297, + "epoch": 0.3005, + "grad_norm": 18.582827147329553, + "kl": 1.2119140625, + "learning_rate": 8.942146304132943e-07, + "loss": 0.3149, + "reward": 2.3439489603042603, + "reward_std": 0.46688786149024963, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.03799547627568245, + "rewards/tag_count_reward": 0.9583333432674408, "step": 601 }, { "clip_ratio": 0.0, - "completion_length": 409.9583435058594, - "epoch": 0.602, - "grad_norm": 21.57642847298448, - "kl": 2.21875, - "learning_rate": 4.68766384637248e-07, - "loss": 0.595, - "reward": 2.3019765615463257, - "reward_std": 0.5564347207546234, - "rewards/accuracy_reward": 0.4166666679084301, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.01573185622692108, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 347.68751525878906, + "epoch": 0.301, + "grad_norm": 25.38946154820617, + "kl": 0.908203125, + "learning_rate": 8.937082128855891e-07, + "loss": 0.3989, + "reward": 2.2832257747650146, + "reward_std": 0.44448164105415344, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03795480914413929, + "rewards/tag_count_reward": 0.953125, "step": 602 }, { "clip_ratio": 0.0, - "completion_length": 656.0000305175781, - "epoch": 0.603, - "grad_norm": 18.78803423208506, - "kl": 4.390625, - "learning_rate": 4.672218922242759e-07, - "loss": 0.8124, - "reward": 2.2104203701019287, - "reward_std": 0.60920250415802, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.0170103432610631, - "rewards/tag_count_reward": 0.796875, + "completion_length": 391.10418701171875, + "epoch": 0.3015, + "grad_norm": 37.47070105859472, + "kl": 2.890625, + "learning_rate": 8.932007483630596e-07, + "loss": 0.673, + "reward": 2.4643146991729736, + "reward_std": 0.42459146678447723, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.025268660858273506, + "rewards/tag_count_reward": 0.9270833432674408, "step": 603 }, { "clip_ratio": 0.0, - "completion_length": 704.3541870117188, - "epoch": 0.604, - "grad_norm": 17.09263062702093, - "kl": 5.625, - "learning_rate": 4.656784084364238e-07, - "loss": 0.8064, - "reward": 1.9496487379074097, - "reward_std": 0.6896158158779144, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.017365206964313984, - "rewards/tag_count_reward": 0.7031250298023224, + "completion_length": 499.81251525878906, + "epoch": 0.302, + "grad_norm": 27.80855047913277, + "kl": 3.8515625, + "learning_rate": 8.926922383915315e-07, + "loss": 0.6036, + "reward": 2.298141598701477, + "reward_std": 0.6866291165351868, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03171971254050732, + "rewards/tag_count_reward": 0.8854166865348816, "step": 604 }, { "clip_ratio": 0.0, - "completion_length": 709.5416870117188, - "epoch": 0.605, - "grad_norm": 14.18186552493172, - "kl": 5.40625, - "learning_rate": 4.641359520805548e-07, - "loss": 0.9465, - "reward": 1.9041990041732788, - "reward_std": 0.5497189462184906, - "rewards/accuracy_reward": 0.25000000558793545, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.014204025734215975, - "rewards/tag_count_reward": 0.7447916865348816, + "completion_length": 437.6041717529297, + "epoch": 0.3025, + "grad_norm": 14.735625956308642, + "kl": 2.66015625, + "learning_rate": 8.921826845200138e-07, + "loss": 0.4696, + "reward": 2.3992427587509155, + "reward_std": 0.566670224070549, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.029576689936220646, + "rewards/tag_count_reward": 0.9010416865348816, "step": 605 }, { "clip_ratio": 0.0, - "completion_length": 644.1875, - "epoch": 0.606, - "grad_norm": 21.61369420565558, - "kl": 4.0625, - "learning_rate": 4.6259454195101267e-07, - "loss": 1.0074, - "reward": 2.424658179283142, - "reward_std": 0.7420503497123718, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.021522408351302147, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 311.0, + "epoch": 0.303, + "grad_norm": 12.206231699378542, + "kl": 1.1015625, + "learning_rate": 8.916720883006963e-07, + "loss": 0.1574, + "reward": 2.774844169616699, + "reward_std": 0.4380817115306854, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.034183867275714874, + "rewards/tag_count_reward": 0.9687500298023224, "step": 606 }, { "clip_ratio": 0.0, - "completion_length": 675.5833740234375, - "epoch": 0.607, - "grad_norm": 20.01514537964165, - "kl": 4.484375, - "learning_rate": 4.6105419682939316e-07, - "loss": 0.94, - "reward": 2.2794888019561768, - "reward_std": 0.6631845533847809, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.015650033485144377, - "rewards/tag_count_reward": 0.7604166865348816, + "completion_length": 292.18751525878906, + "epoch": 0.3035, + "grad_norm": 31.971619676180616, + "kl": 2.32421875, + "learning_rate": 8.911604512889434e-07, + "loss": 0.3176, + "reward": 2.726539731025696, + "reward_std": 0.508837565779686, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.028668691404163837, + "rewards/tag_count_reward": 0.9635416865348816, "step": 607 }, { "clip_ratio": 0.0, - "completion_length": 588.3125, - "epoch": 0.608, - "grad_norm": 15.233743739571594, - "kl": 3.671875, - "learning_rate": 4.59514935484316e-07, - "loss": 0.6811, - "reward": 2.3253376483917236, - "reward_std": 0.7875647842884064, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.013204073999077082, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 377.7708435058594, + "epoch": 0.304, + "grad_norm": 47.81529047217983, + "kl": 5.484375, + "learning_rate": 8.906477750432903e-07, + "loss": 0.7836, + "reward": 2.3868411779403687, + "reward_std": 0.6196691393852234, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02114514308050275, + "rewards/tag_count_reward": 0.9218750298023224, "step": 608 }, { "clip_ratio": 0.0, - "completion_length": 443.0625, - "epoch": 0.609, - "grad_norm": 23.918832923297472, - "kl": 2.390625, - "learning_rate": 4.579767766711944e-07, - "loss": 0.5455, - "reward": 2.240389823913574, - "reward_std": 0.6115424335002899, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.02349921502172947, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 377.00001525878906, + "epoch": 0.3045, + "grad_norm": 64.95300495357336, + "kl": 3.546875, + "learning_rate": 8.901340611254378e-07, + "loss": 0.7444, + "reward": 2.6928662061691284, + "reward_std": 0.6587081551551819, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03630049154162407, + "rewards/tag_count_reward": 0.9375000298023224, "step": 609 }, { "clip_ratio": 0.0, - "completion_length": 501.1458435058594, - "epoch": 0.61, - "grad_norm": 21.038617061450985, - "kl": 4.1015625, - "learning_rate": 4.5643973913200837e-07, - "loss": 0.809, - "reward": 2.464809775352478, - "reward_std": 0.7636359930038452, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9374999701976776, - "rewards/repetition_penalty_reward": -0.019565261900424957, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 409.3958435058594, + "epoch": 0.305, + "grad_norm": 40.39290779152927, + "kl": 5.671875, + "learning_rate": 8.896193111002475e-07, + "loss": 0.4324, + "reward": 2.182259440422058, + "reward_std": 0.708200603723526, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.8541666865348816, + "rewards/repetition_penalty_reward": -0.0364905521273613, + "rewards/tag_count_reward": 0.8854166865348816, "step": 610 }, { "clip_ratio": 0.0, - "completion_length": 681.8125, - "epoch": 0.611, - "grad_norm": 166.2589762047824, - "kl": 12.0625, - "learning_rate": 4.549038415950751e-07, - "loss": 1.4371, - "reward": 1.9376187920570374, - "reward_std": 0.9399848580360413, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.8888888955116272, - "rewards/repetition_penalty_reward": -0.013770201243460178, - "rewards/tag_count_reward": 0.6875000298023224, + "completion_length": 382.5416717529297, + "epoch": 0.3055, + "grad_norm": 18.330556875068268, + "kl": 2.4296875, + "learning_rate": 8.891035265357371e-07, + "loss": 0.5719, + "reward": 2.4720453023910522, + "reward_std": 0.5455569326877594, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.024482558481395245, + "rewards/tag_count_reward": 0.9270833432674408, "step": 611 }, { "clip_ratio": 0.0, - "completion_length": 600.0625152587891, - "epoch": 0.612, - "grad_norm": 63.22955290170651, - "kl": 7.67578125, - "learning_rate": 4.5336910277482155e-07, - "loss": 1.1131, - "reward": 2.076914668083191, - "reward_std": 0.6576991975307465, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.8472222685813904, - "rewards/repetition_penalty_reward": -0.02030777558684349, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 463.68751525878906, + "epoch": 0.306, + "grad_norm": 20.965812940157566, + "kl": 3.12109375, + "learning_rate": 8.88586709003076e-07, + "loss": 0.5898, + "reward": 2.5473427772521973, + "reward_std": 0.7215102910995483, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.02904626540839672, + "rewards/tag_count_reward": 0.875, "step": 612 }, { "clip_ratio": 0.0, - "completion_length": 442.7291717529297, - "epoch": 0.613, - "grad_norm": 41.71612091835444, - "kl": 4.90625, - "learning_rate": 4.51835541371556e-07, - "loss": 0.4885, - "reward": 2.2476359605789185, - "reward_std": 0.6959330439567566, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.01972527615725994, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 277.93751525878906, + "epoch": 0.3065, + "grad_norm": 21.659936470606215, + "kl": 1.1328125, + "learning_rate": 8.8806886007658e-07, + "loss": 0.1303, + "reward": 2.7708282470703125, + "reward_std": 0.48425713181495667, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.026046608574688435, + "rewards/tag_count_reward": 0.9635416865348816, "step": 613 }, { "clip_ratio": 0.0, - "completion_length": 576.3750305175781, - "epoch": 0.614, - "grad_norm": 22.018133081081007, - "kl": 4.546875, - "learning_rate": 4.503031760712397e-07, - "loss": 0.813, - "reward": 2.5740822553634644, - "reward_std": 0.7949818074703217, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01619557524099946, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 278.68751525878906, + "epoch": 0.307, + "grad_norm": 22.568162104962866, + "kl": 0.58203125, + "learning_rate": 8.875499813337067e-07, + "loss": 0.2044, + "reward": 2.8232574462890625, + "reward_std": 0.385304257273674, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.022228870075196028, + "rewards/tag_count_reward": 0.9843750298023224, "step": 614 }, { "clip_ratio": 0.0, - "completion_length": 558.2916870117188, - "epoch": 0.615, - "grad_norm": 20.90403864764217, - "kl": 4.046875, - "learning_rate": 4.4877202554526084e-07, - "loss": 0.8765, - "reward": 2.31480872631073, - "reward_std": 0.6609582602977753, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.018524575512856245, - "rewards/tag_count_reward": 0.875, + "completion_length": 345.8333435058594, + "epoch": 0.3075, + "grad_norm": 13.0831888313229, + "kl": 0.63818359375, + "learning_rate": 8.87030074355051e-07, + "loss": 0.2361, + "reward": 2.7106308937072754, + "reward_std": 0.3397009428590536, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652777314186096, + "rewards/repetition_penalty_reward": -0.030688603408634663, + "rewards/tag_count_reward": 0.9635416865348816, "step": 615 }, { "clip_ratio": 0.0, - "completion_length": 571.6666870117188, - "epoch": 0.616, - "grad_norm": 20.967646246131107, - "kl": 5.28125, - "learning_rate": 4.4724210845020494e-07, - "loss": 0.7339, - "reward": 2.366945743560791, - "reward_std": 0.5788512378931046, + "completion_length": 297.50001525878906, + "epoch": 0.308, + "grad_norm": 9.770757617195565, + "kl": 0.42138671875, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0876, + "reward": 2.564429521560669, + "reward_std": 0.381553053855896, "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.014998776838183403, - "rewards/tag_count_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.041473302990198135, + "rewards/tag_count_reward": 0.9947916865348816, "step": 616 }, { "clip_ratio": 0.0, - "completion_length": 597.75, - "epoch": 0.617, - "grad_norm": 62.998128656761025, - "kl": 8.015625, - "learning_rate": 4.457134434276293e-07, - "loss": 1.0981, - "reward": 2.1459895372390747, - "reward_std": 0.7351883053779602, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9444444477558136, - "rewards/repetition_penalty_reward": -0.011996725108474493, - "rewards/tag_count_reward": 0.7552083432674408, - "step": 617 + "completion_length": 433.37501525878906, + "epoch": 0.3085, + "grad_norm": 15.238026778188452, + "kl": 2.79296875, + "learning_rate": 8.859871820284261e-07, + "loss": 0.6701, + "reward": 2.4649064540863037, + "reward_std": 0.5701871514320374, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.029885290190577507, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 617 }, { "clip_ratio": 0.0, - "completion_length": 578.9583435058594, - "epoch": 0.618, - "grad_norm": 35.2952233783846, - "kl": 7.296875, - "learning_rate": 4.441860491038345e-07, - "loss": 1.1757, - "reward": 1.9162747263908386, - "reward_std": 0.7140241265296936, - "rewards/accuracy_reward": 0.2708333432674408, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.029905788600444794, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 383.4166717529297, + "epoch": 0.309, + "grad_norm": 22.09258325323561, + "kl": 2.6484375, + "learning_rate": 8.85464199857288e-07, + "loss": 0.3808, + "reward": 2.376816987991333, + "reward_std": 0.6643776893615723, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.04158583842217922, + "rewards/tag_count_reward": 0.9322916865348816, "step": 618 }, { "clip_ratio": 0.0, - "completion_length": 412.0208435058594, - "epoch": 0.619, - "grad_norm": 20.183860058369124, - "kl": 3.5234375, - "learning_rate": 4.4265994408963867e-07, - "loss": 0.6553, - "reward": 2.6022950410842896, - "reward_std": 0.690807044506073, + "completion_length": 418.75001525878906, + "epoch": 0.3095, + "grad_norm": 18.162555409765705, + "kl": 2.8828125, + "learning_rate": 8.849401958040192e-07, + "loss": 0.5727, + "reward": 2.6696484088897705, + "reward_std": 0.5072681605815887, "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.01749663334339857, - "rewards/tag_count_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.019587615504860878, + "rewards/tag_count_reward": 0.9322916865348816, "step": 619 }, { "clip_ratio": 0.0, - "completion_length": 551.7083587646484, - "epoch": 0.62, - "grad_norm": 30.2741526195306, - "kl": 3.6640625, - "learning_rate": 4.4113514698014953e-07, - "loss": 1.2213, - "reward": 2.3724182844161987, - "reward_std": 0.7322799563407898, - "rewards/accuracy_reward": 0.5625000149011612, + "completion_length": 368.75, + "epoch": 0.31, + "grad_norm": 26.7814555750565, + "kl": 2.5703125, + "learning_rate": 8.844151714648274e-07, + "loss": 0.5849, + "reward": 2.5200384855270386, + "reward_std": 0.5991145968437195, + "rewards/accuracy_reward": 0.6875000298023224, "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.021679001860320568, - "rewards/tag_count_reward": 0.8802083730697632, + "rewards/repetition_penalty_reward": -0.025100392289459705, + "rewards/tag_count_reward": 0.90625, "step": 620 }, { "clip_ratio": 0.0, - "completion_length": 477.89585876464844, - "epoch": 0.621, - "grad_norm": 23.62241565191626, - "kl": 4.26171875, - "learning_rate": 4.3961167635453876e-07, - "loss": 0.6591, - "reward": 2.3674203753471375, - "reward_std": 0.6224862933158875, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.012787933927029371, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 393.2083435058594, + "epoch": 0.3105, + "grad_norm": 14.7849673987086, + "kl": 2.59765625, + "learning_rate": 8.838891284390273e-07, + "loss": 0.5688, + "reward": 2.561677932739258, + "reward_std": 0.5532724261283875, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.033808158710598946, + "rewards/tag_count_reward": 0.9427083730697632, "step": 621 }, { "clip_ratio": 0.0, - "completion_length": 589.2500305175781, - "epoch": 0.622, - "grad_norm": 26.80506804197189, - "kl": 4.28125, - "learning_rate": 4.3808955077581546e-07, - "loss": 1.3034, - "reward": 2.1128806471824646, - "reward_std": 0.6486698687076569, - "rewards/accuracy_reward": 0.37500002048909664, - "rewards/reasoning_steps_reward": 0.9236111044883728, - "rewards/repetition_penalty_reward": -0.01906377449631691, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 471.14585876464844, + "epoch": 0.311, + "grad_norm": 20.53699662396806, + "kl": 3.734375, + "learning_rate": 8.833620683290375e-07, + "loss": 0.8009, + "reward": 2.2843284606933594, + "reward_std": 0.7857449054718018, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8819444179534912, + "rewards/repetition_penalty_reward": -0.029907681047916412, + "rewards/tag_count_reward": 0.8697916865348816, "step": 622 }, { "clip_ratio": 0.0, - "completion_length": 591.5416870117188, - "epoch": 0.623, - "grad_norm": 17.029635227087233, - "kl": 4.71875, - "learning_rate": 4.365687887905988e-07, - "loss": 0.7684, - "reward": 2.364261269569397, - "reward_std": 0.7807703912258148, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9236111640930176, - "rewards/repetition_penalty_reward": -0.017683234997093678, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 650.0000152587891, + "epoch": 0.3115, + "grad_norm": 69.27168548723343, + "kl": 9.65625, + "learning_rate": 8.828339927403745e-07, + "loss": 1.1394, + "reward": 1.8982658386230469, + "reward_std": 0.9247550368309021, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.7361111044883728, + "rewards/repetition_penalty_reward": -0.020136947743594646, + "rewards/tag_count_reward": 0.7447916865348816, "step": 623 }, { "clip_ratio": 0.0, - "completion_length": 657.8541870117188, - "epoch": 0.624, - "grad_norm": 29.359681860338085, - "kl": 8.28125, - "learning_rate": 4.350494089288943e-07, - "loss": 1.1389, - "reward": 2.2420893907546997, - "reward_std": 0.8185697495937347, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01485508494079113, - "rewards/tag_count_reward": 0.7083333432674408, + "completion_length": 845.4792175292969, + "epoch": 0.312, + "grad_norm": 31.60208939904907, + "kl": 7.09375, + "learning_rate": 8.823049032816478e-07, + "loss": 0.8724, + "reward": 1.8869645595550537, + "reward_std": 0.8868106305599213, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.7152778208255768, + "rewards/repetition_penalty_reward": -0.021021696738898754, + "rewards/tag_count_reward": 0.7343750298023224, "step": 624 }, { "clip_ratio": 0.0, - "completion_length": 572.8958435058594, - "epoch": 0.625, - "grad_norm": 25.429883504952183, - "kl": 5.6875, - "learning_rate": 4.3353142970386557e-07, - "loss": 0.8027, - "reward": 2.140592336654663, - "reward_std": 0.777917891740799, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.019129890482872725, - "rewards/tag_count_reward": 0.7708333730697632, + "completion_length": 501.29168701171875, + "epoch": 0.3125, + "grad_norm": 37.663529836109795, + "kl": 3.2734375, + "learning_rate": 8.817748015645558e-07, + "loss": 0.8728, + "reward": 2.3407769203186035, + "reward_std": 0.8801598846912384, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.86111119389534, + "rewards/repetition_penalty_reward": -0.025542644783854485, + "rewards/tag_count_reward": 0.8385416865348816, "step": 625 }, { "clip_ratio": 0.0, - "completion_length": 713.5833435058594, - "epoch": 0.626, - "grad_norm": 43.92929389982015, - "kl": 8.28125, - "learning_rate": 4.3201486961161093e-07, - "loss": 1.2015, - "reward": 1.720135509967804, - "reward_std": 0.6745887994766235, - "rewards/accuracy_reward": 0.2083333358168602, + "completion_length": 470.6041717529297, + "epoch": 0.313, + "grad_norm": 29.75409371364291, + "kl": 2.640625, + "learning_rate": 8.812436892038805e-07, + "loss": 0.7078, + "reward": 2.3025213479995728, + "reward_std": 0.6479816734790802, + "rewards/accuracy_reward": 0.520833358168602, "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.012503455393016338, - "rewards/tag_count_reward": 0.6145833730697632, + "rewards/repetition_penalty_reward": -0.023867566138505936, + "rewards/tag_count_reward": 0.8958333432674408, "step": 626 }, { "clip_ratio": 0.0, - "completion_length": 645.3125, - "epoch": 0.627, - "grad_norm": 57.87651440683306, - "kl": 8.28125, - "learning_rate": 4.304997471309361e-07, - "loss": 1.0589, - "reward": 2.1112887859344482, - "reward_std": 0.8265305161476135, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9027777910232544, - "rewards/repetition_penalty_reward": -0.010239105205982924, - "rewards/tag_count_reward": 0.6979166865348816, + "completion_length": 459.4375, + "epoch": 0.3135, + "grad_norm": 20.212825860910232, + "kl": 2.25390625, + "learning_rate": 8.807115678174819e-07, + "loss": 0.5701, + "reward": 2.5997543334960938, + "reward_std": 0.5478241741657257, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.03045404888689518, + "rewards/tag_count_reward": 0.9427083432674408, "step": 627 }, { "clip_ratio": 0.0, - "completion_length": 648.2916870117188, - "epoch": 0.628, - "grad_norm": 50.008615375469205, - "kl": 5.703125, - "learning_rate": 4.2898608072313045e-07, - "loss": 1.1558, - "reward": 2.0761906504631042, - "reward_std": 0.8007937371730804, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9027778208255768, - "rewards/repetition_penalty_reward": -0.02450390998274088, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 447.72918701171875, + "epoch": 0.314, + "grad_norm": 29.197351580965048, + "kl": 4.796875, + "learning_rate": 8.801784390262943e-07, + "loss": 0.8627, + "reward": 2.3921085596084595, + "reward_std": 0.6133331656455994, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.8819444477558136, + "rewards/repetition_penalty_reward": -0.02108590304851532, + "rewards/tag_count_reward": 0.9062500298023224, "step": 628 }, { "clip_ratio": 0.0, - "completion_length": 510.45835876464844, - "epoch": 0.629, - "grad_norm": 29.38809691299576, - "kl": 3.390625, - "learning_rate": 4.2747388883174154e-07, - "loss": 0.6864, - "reward": 2.3009127378463745, - "reward_std": 0.7014258801937103, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.009851326234638691, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 463.91668701171875, + "epoch": 0.3145, + "grad_norm": 44.79167908730396, + "kl": 4.546875, + "learning_rate": 8.796443044543203e-07, + "loss": 0.8839, + "reward": 2.317416787147522, + "reward_std": 0.7653204798698425, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.026333278976380825, + "rewards/tag_count_reward": 0.9062500298023224, "step": 629 }, { "clip_ratio": 0.0, - "completion_length": 722.8750305175781, - "epoch": 0.63, - "grad_norm": 15.997709703522245, - "kl": 5.765625, - "learning_rate": 4.2596318988235037e-07, - "loss": 0.7532, - "reward": 1.911007285118103, - "reward_std": 0.7613692283630371, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.8958333730697632, - "rewards/repetition_penalty_reward": -0.021284431219100952, - "rewards/tag_count_reward": 0.6822916865348816, + "completion_length": 412.25001525878906, + "epoch": 0.315, + "grad_norm": 44.27467343188964, + "kl": 3.2734375, + "learning_rate": 8.791091657286267e-07, + "loss": 0.8308, + "reward": 2.6593304872512817, + "reward_std": 0.5533206462860107, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.024697369895875454, + "rewards/tag_count_reward": 0.9479166865348816, "step": 630 }, { "clip_ratio": 0.0, - "completion_length": 539.125, - "epoch": 0.631, - "grad_norm": 21.914178804717245, - "kl": 3.2734375, - "learning_rate": 4.2445400228234687e-07, - "loss": 0.7349, - "reward": 2.5217862129211426, - "reward_std": 0.7204074263572693, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.018144560046494007, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 368.5833435058594, + "epoch": 0.3155, + "grad_norm": 30.80486315018559, + "kl": 2.9765625, + "learning_rate": 8.785730244793386e-07, + "loss": 0.8518, + "reward": 2.6147913932800293, + "reward_std": 0.5551804751157761, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.03451400436460972, + "rewards/tag_count_reward": 0.9479166865348816, "step": 631 }, { "clip_ratio": 0.0, - "completion_length": 582.5000152587891, - "epoch": 0.632, - "grad_norm": 23.574151149835284, - "kl": 4.3125, - "learning_rate": 4.2294634442070553e-07, - "loss": 0.776, - "reward": 2.2236552238464355, - "reward_std": 0.7735611796379089, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014192146249115467, - "rewards/tag_count_reward": 0.7656250298023224, + "completion_length": 411.91668701171875, + "epoch": 0.316, + "grad_norm": 20.520000609846605, + "kl": 3.0859375, + "learning_rate": 8.780358823396352e-07, + "loss": 0.5721, + "reward": 2.6789560317993164, + "reward_std": 0.5304727852344513, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.039794113487005234, + "rewards/tag_count_reward": 0.9479166865348816, "step": 632 }, { "clip_ratio": 0.0, - "completion_length": 606.8125305175781, - "epoch": 0.633, - "grad_norm": 27.523457434836338, - "kl": 3.375, - "learning_rate": 4.214402346677619e-07, - "loss": 0.8245, - "reward": 2.0943539142608643, - "reward_std": 0.7719534635543823, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.020229644142091274, - "rewards/tag_count_reward": 0.78125, + "completion_length": 363.5, + "epoch": 0.3165, + "grad_norm": 63.575635135783344, + "kl": 2.2265625, + "learning_rate": 8.774977409457447e-07, + "loss": 0.5012, + "reward": 2.500579595565796, + "reward_std": 0.2963787615299225, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.03935109078884125, + "rewards/tag_count_reward": 0.9635416865348816, "step": 633 }, { "clip_ratio": 0.0, - "completion_length": 468.04168701171875, - "epoch": 0.634, - "grad_norm": 26.042764464725323, - "kl": 3.5234375, - "learning_rate": 4.1993569137498776e-07, - "loss": 0.6013, - "reward": 2.314329147338867, - "reward_std": 0.48445817828178406, - "rewards/accuracy_reward": 0.541666679084301, - "rewards/reasoning_steps_reward": 0.9305555522441864, - "rewards/repetition_penalty_reward": -0.01205990044400096, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 379.9791717529297, + "epoch": 0.317, + "grad_norm": 12.662953099547957, + "kl": 1.4296875, + "learning_rate": 8.769586019369391e-07, + "loss": 0.3589, + "reward": 2.5281978845596313, + "reward_std": 0.6068725287914276, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.04645505174994469, + "rewards/tag_count_reward": 0.9635416865348816, "step": 634 }, { "clip_ratio": 0.0, - "completion_length": 585.9791717529297, - "epoch": 0.635, - "grad_norm": 110.89920440001275, - "kl": 5.375, - "learning_rate": 4.1843273287476854e-07, - "loss": 1.2961, - "reward": 2.3673198223114014, - "reward_std": 0.7202720046043396, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.014624850358814001, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 461.27085876464844, + "epoch": 0.3175, + "grad_norm": 12.945423045438853, + "kl": 1.072265625, + "learning_rate": 8.764184669555293e-07, + "loss": 0.7605, + "reward": 2.4953324794769287, + "reward_std": 0.6080846786499023, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.039389871060848236, + "rewards/tag_count_reward": 0.9375000298023224, "step": 635 }, { "clip_ratio": 0.0, - "completion_length": 543.1250152587891, - "epoch": 0.636, - "grad_norm": 168.91845413040815, - "kl": 6.78125, - "learning_rate": 4.1693137748017915e-07, - "loss": 0.9638, - "reward": 2.172168493270874, - "reward_std": 0.7465826869010925, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.01880389917641878, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 443.5208435058594, + "epoch": 0.318, + "grad_norm": 7.650991561257087, + "kl": 1.1640625, + "learning_rate": 8.758773376468604e-07, + "loss": 0.3799, + "reward": 2.4011194705963135, + "reward_std": 0.4720783084630966, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9166666567325592, + "rewards/repetition_penalty_reward": -0.025964029133319855, + "rewards/tag_count_reward": 0.9270833432674408, "step": 636 }, { "clip_ratio": 0.0, - "completion_length": 367.12501525878906, - "epoch": 0.637, - "grad_norm": 14.167427077773384, - "kl": 1.734375, - "learning_rate": 4.15431643484761e-07, - "loss": 0.2434, - "reward": 2.3709245920181274, - "reward_std": 0.41279859840869904, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.011019973549991846, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 408.93751525878906, + "epoch": 0.3185, + "grad_norm": 15.77947801589448, + "kl": 0.68359375, + "learning_rate": 8.753352156593055e-07, + "loss": 0.5988, + "reward": 2.430037498474121, + "reward_std": 0.7513845264911652, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.03697636350989342, + "rewards/tag_count_reward": 0.9322916865348816, "step": 637 }, { "clip_ratio": 0.0, - "completion_length": 409.0208435058594, - "epoch": 0.638, - "grad_norm": 44.523191376668905, - "kl": 2.34765625, - "learning_rate": 4.1393354916230005e-07, - "loss": 0.6225, - "reward": 2.353254556655884, - "reward_std": 0.5010079145431519, - "rewards/accuracy_reward": 0.5000000111758709, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.025217789225280285, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 375.37501525878906, + "epoch": 0.319, + "grad_norm": 13.718169867441816, + "kl": 0.548828125, + "learning_rate": 8.747921026442629e-07, + "loss": 0.424, + "reward": 2.6864466667175293, + "reward_std": 0.45896846055984497, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.027095147408545017, + "rewards/tag_count_reward": 0.9635416865348816, "step": 638 }, { "clip_ratio": 0.0, - "completion_length": 375.56251525878906, - "epoch": 0.639, - "grad_norm": 19.368462137693786, - "kl": 1.640625, - "learning_rate": 4.124371127666024e-07, - "loss": 0.4301, - "reward": 2.260585904121399, - "reward_std": 0.4507751762866974, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.017191945109516382, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 341.875, + "epoch": 0.3195, + "grad_norm": 17.467108774461327, + "kl": 0.828125, + "learning_rate": 8.742480002561487e-07, + "loss": 0.3762, + "reward": 2.4510825872421265, + "reward_std": 0.5428140312433243, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02808418497443199, + "rewards/tag_count_reward": 0.9583333432674408, "step": 639 }, { "clip_ratio": 0.0, - "completion_length": 527.1666870117188, - "epoch": 0.64, - "grad_norm": 23.408643386653363, - "kl": 5.40625, - "learning_rate": 4.1094235253127374e-07, - "loss": 1.0619, - "reward": 2.11304247379303, - "reward_std": 0.6343182921409607, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.017165867146104574, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 345.0208435058594, + "epoch": 0.32, + "grad_norm": 9.631118470397297, + "kl": 1.0517578125, + "learning_rate": 8.737029101523929e-07, + "loss": 0.2993, + "reward": 2.2707436084747314, + "reward_std": 0.6229533404111862, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.02439526654779911, + "rewards/tag_count_reward": 0.9270833730697632, "step": 640 }, { "clip_ratio": 0.0, - "completion_length": 338.7083435058594, - "epoch": 0.641, - "grad_norm": 26.252165148333535, - "kl": 2.26953125, - "learning_rate": 4.0944928666949527e-07, - "loss": 0.0696, - "reward": 2.4866459369659424, - "reward_std": 0.29595404863357544, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.020298474468290806, - "rewards/tag_count_reward": 0.9583333730697632, + "completion_length": 456.97918701171875, + "epoch": 0.3205, + "grad_norm": 18.87763752323685, + "kl": 1.7734375, + "learning_rate": 8.731568339934348e-07, + "loss": 0.5908, + "reward": 1.9133847951889038, + "reward_std": 0.4975127577781677, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/reasoning_steps_reward": 0.8611111640930176, + "rewards/repetition_penalty_reward": -0.036268092691898346, + "rewards/tag_count_reward": 0.9010416865348816, "step": 641 }, { "clip_ratio": 0.0, - "completion_length": 613.9166717529297, - "epoch": 0.642, - "grad_norm": 83.06728366905014, - "kl": 9.09375, - "learning_rate": 4.079579333738039e-07, - "loss": 1.4448, - "reward": 2.021218776702881, - "reward_std": 0.7841091454029083, - "rewards/accuracy_reward": 0.3541666679084301, - "rewards/reasoning_steps_reward": 0.881944477558136, - "rewards/repetition_penalty_reward": -0.011767241638153791, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 352.35418701171875, + "epoch": 0.321, + "grad_norm": 9.424502666967252, + "kl": 1.328125, + "learning_rate": 8.726097734427172e-07, + "loss": 0.1237, + "reward": 2.5691460371017456, + "reward_std": 0.49686890840530396, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.05759007856249809, + "rewards/tag_count_reward": 0.9531250298023224, "step": 642 }, { "clip_ratio": 0.0, - "completion_length": 517.4375, - "epoch": 0.643, - "grad_norm": 29.352168555788648, - "kl": 4.1328125, - "learning_rate": 4.064683108158685e-07, - "loss": 0.8538, - "reward": 2.1417415142059326, - "reward_std": 0.7165846824645996, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.019716893322765827, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 334.6666717529297, + "epoch": 0.3215, + "grad_norm": 8.243342051209725, + "kl": 1.203125, + "learning_rate": 8.72061730166681e-07, + "loss": 0.2726, + "reward": 2.5343031883239746, + "reward_std": 0.46366532891988754, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.024724683724343777, + "rewards/tag_count_reward": 0.90625, "step": 643 }, { "clip_ratio": 0.0, - "completion_length": 472.8958435058594, - "epoch": 0.644, - "grad_norm": 27.512095159522406, - "kl": 5.0078125, - "learning_rate": 4.0498043714627006e-07, - "loss": 0.4818, - "reward": 2.178748071193695, - "reward_std": 0.5007181763648987, - "rewards/accuracy_reward": 0.3958333544433117, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.01569645293056965, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 359.8333435058594, + "epoch": 0.322, + "grad_norm": 13.81419209256365, + "kl": 1.3359375, + "learning_rate": 8.715127058347614e-07, + "loss": 0.4082, + "reward": 2.7587969303131104, + "reward_std": 0.44490741193294525, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04849480465054512, + "rewards/tag_count_reward": 0.9531250298023224, "step": 644 }, { "clip_ratio": 0.0, - "completion_length": 487.25001525878906, - "epoch": 0.645, - "grad_norm": 42.088759648308134, - "kl": 2.484375, - "learning_rate": 4.034943304942796e-07, - "loss": 0.9438, - "reward": 2.6129757165908813, - "reward_std": 0.6440701186656952, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.02244096901267767, - "rewards/tag_count_reward": 0.9270833730697632, + "completion_length": 306.0208435058594, + "epoch": 0.3225, + "grad_norm": 7.722256504528858, + "kl": 1.244140625, + "learning_rate": 8.709627021193816e-07, + "loss": 0.259, + "reward": 2.2697885036468506, + "reward_std": 0.5568395256996155, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.04097543843090534, + "rewards/tag_count_reward": 0.9635416865348816, "step": 645 }, { "clip_ratio": 0.0, - "completion_length": 506.22918701171875, - "epoch": 0.646, - "grad_norm": 15.525366461402383, - "kl": 3.5234375, - "learning_rate": 4.020100089676376e-07, - "loss": 0.5343, - "reward": 2.1641972064971924, - "reward_std": 0.6387200355529785, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9374999701976776, - "rewards/repetition_penalty_reward": -0.018094514962285757, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 317.5833435058594, + "epoch": 0.323, + "grad_norm": 20.616365592900145, + "kl": 3.88671875, + "learning_rate": 8.704117206959484e-07, + "loss": 0.4825, + "reward": 2.6422855854034424, + "reward_std": 0.6127668023109436, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.027853261679410934, + "rewards/tag_count_reward": 0.9270833730697632, "step": 646 }, { "clip_ratio": 0.0, - "completion_length": 676.6875305175781, - "epoch": 0.647, - "grad_norm": 27.736362696423306, - "kl": 5.53125, - "learning_rate": 4.005274906523336e-07, - "loss": 1.0089, - "reward": 2.418774127960205, - "reward_std": 0.8770462274551392, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.022198081016540527, - "rewards/tag_count_reward": 0.8020833730697632, + "completion_length": 302.8958435058594, + "epoch": 0.3235, + "grad_norm": 8.074414625169473, + "kl": 1.1787109375, + "learning_rate": 8.698597632428466e-07, + "loss": 0.1034, + "reward": 2.6797112226486206, + "reward_std": 0.35129064321517944, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.03383052162826061, + "rewards/tag_count_reward": 0.984375, "step": 647 }, { "clip_ratio": 0.0, - "completion_length": 477.10418701171875, - "epoch": 0.648, - "grad_norm": 32.594519011116475, - "kl": 3.15625, - "learning_rate": 3.9904679361238526e-07, - "loss": 0.8488, - "reward": 2.172785520553589, - "reward_std": 0.6095466017723083, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.014714594930410385, - "rewards/tag_count_reward": 0.875, + "completion_length": 305.7708435058594, + "epoch": 0.324, + "grad_norm": 16.085041721886572, + "kl": 1.09375, + "learning_rate": 8.693068314414344e-07, + "loss": 0.2719, + "reward": 2.777750849723816, + "reward_std": 0.49438077211380005, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.041693685576319695, + "rewards/tag_count_reward": 0.9583333432674408, "step": 648 }, { "clip_ratio": 0.0, - "completion_length": 466.7083435058594, - "epoch": 0.649, - "grad_norm": 16.053982100692465, - "kl": 2.6982421875, - "learning_rate": 3.975679358896189e-07, - "loss": 0.6344, - "reward": 2.492353320121765, - "reward_std": 0.48535051196813583, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.014591319020837545, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 385.125, + "epoch": 0.3245, + "grad_norm": 16.9777467202549, + "kl": 1.3984375, + "learning_rate": 8.687529269760379e-07, + "loss": 0.4593, + "reward": 2.6736103296279907, + "reward_std": 0.5818447470664978, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.022570334374904633, + "rewards/tag_count_reward": 0.9322916865348816, "step": 649 }, { "clip_ratio": 0.0, - "completion_length": 421.4583435058594, - "epoch": 0.65, - "grad_norm": 33.807152135246525, - "kl": 4.578125, - "learning_rate": 3.9609093550344907e-07, - "loss": 0.9576, - "reward": 2.364701509475708, - "reward_std": 0.6279256045818329, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.01377071999013424, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 391.6875, + "epoch": 0.325, + "grad_norm": 16.236705823940458, + "kl": 0.869140625, + "learning_rate": 8.681980515339463e-07, + "loss": 0.5449, + "reward": 2.599353790283203, + "reward_std": 0.6066733002662659, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03953541815280914, + "rewards/tag_count_reward": 0.9583333730697632, "step": 650 }, { "clip_ratio": 0.0, - "completion_length": 430.08335876464844, - "epoch": 0.651, - "grad_norm": 54.92633243016837, - "kl": 4.2021484375, - "learning_rate": 3.946158104506594e-07, - "loss": 0.7082, - "reward": 2.6497297286987305, - "reward_std": 0.48640553653240204, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02214530110359192, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 360.8333435058594, + "epoch": 0.3255, + "grad_norm": 8.416276221716757, + "kl": 0.7939453125, + "learning_rate": 8.676422068054064e-07, + "loss": 0.163, + "reward": 2.606260299682617, + "reward_std": 0.40933507680892944, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03957311250269413, + "rewards/tag_count_reward": 0.9583333730697632, "step": 651 }, { "clip_ratio": 0.0, - "completion_length": 449.62501525878906, - "epoch": 0.652, - "grad_norm": 338.42582974401023, - "kl": 9.8125, - "learning_rate": 3.931425787051832e-07, - "loss": 1.7215, - "reward": 2.6626009941101074, - "reward_std": 0.6703141033649445, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.019690793938934803, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 283.0625, + "epoch": 0.326, + "grad_norm": 15.69003304653512, + "kl": 0.537109375, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0603, + "reward": 2.657088875770569, + "reward_std": 0.4240477532148361, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026939059607684612, + "rewards/tag_count_reward": 0.9895833432674408, "step": 652 }, { "clip_ratio": 0.0, - "completion_length": 304.85418701171875, - "epoch": 0.653, - "grad_norm": 17.319403464457796, - "kl": 1.7109375, - "learning_rate": 3.9167125821788416e-07, - "loss": 0.14, - "reward": 2.6247986555099487, - "reward_std": 0.37955524772405624, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.015826416201889515, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 282.4166717529297, + "epoch": 0.3265, + "grad_norm": 14.606960299212915, + "kl": 1.259765625, + "learning_rate": 8.665276162647267e-07, + "loss": 0.1115, + "reward": 2.3430339097976685, + "reward_std": 0.4984329789876938, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.0232857009395957, + "rewards/tag_count_reward": 0.9427083730697632, "step": 653 }, { "clip_ratio": 0.0, - "completion_length": 333.5, - "epoch": 0.654, - "grad_norm": 73.877312813324, - "kl": 1.705078125, - "learning_rate": 3.902018669163384e-07, - "loss": 0.4122, - "reward": 2.905174732208252, - "reward_std": 0.26645080000162125, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01843650173395872, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 359.8333435058594, + "epoch": 0.327, + "grad_norm": 17.83111188587982, + "kl": 1.36328125, + "learning_rate": 8.659688738478231e-07, + "loss": 0.336, + "reward": 2.423835873603821, + "reward_std": 0.32535358518362045, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02408089768141508, + "rewards/tag_count_reward": 0.9270833730697632, "step": 654 }, { "clip_ratio": 0.0, - "completion_length": 458.52085876464844, - "epoch": 0.655, - "grad_norm": 87.77289847817504, - "kl": 6.890625, - "learning_rate": 3.8873442270461485e-07, - "loss": 1.0085, - "reward": 2.5801607370376587, - "reward_std": 0.7309716641902924, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.02574214804917574, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 341.375, + "epoch": 0.3275, + "grad_norm": 11.748552434251017, + "kl": 1.251953125, + "learning_rate": 8.654091689349329e-07, + "loss": 0.2916, + "reward": 2.4952768087387085, + "reward_std": 0.5520404577255249, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03770934417843819, + "rewards/tag_count_reward": 0.9218750298023224, "step": 655 }, { "clip_ratio": 0.0, - "completion_length": 542.7500305175781, - "epoch": 0.656, - "grad_norm": 104.02522400010055, - "kl": 8.09375, - "learning_rate": 3.872689434630585e-07, - "loss": 1.0897, - "reward": 2.465412139892578, - "reward_std": 0.540164515376091, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.010282414965331554, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 436.0, + "epoch": 0.328, + "grad_norm": 12.053678746654692, + "kl": 1.6171875, + "learning_rate": 8.648485032310144e-07, + "loss": 0.687, + "reward": 2.4294373989105225, + "reward_std": 0.5722118020057678, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.023687631357461214, + "rewards/tag_count_reward": 0.9114583432674408, "step": 656 }, { "clip_ratio": 0.0, - "completion_length": 510.3958435058594, - "epoch": 0.657, - "grad_norm": 43.13318047502928, - "kl": 6.53125, - "learning_rate": 3.8580544704807117e-07, - "loss": 0.8464, - "reward": 2.118070363998413, - "reward_std": 0.7317387461662292, - "rewards/accuracy_reward": 0.3541666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.02255462296307087, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 437.93751525878906, + "epoch": 0.3285, + "grad_norm": 6.092908560579486, + "kl": 1.51171875, + "learning_rate": 8.642868784439527e-07, + "loss": 0.4096, + "reward": 2.373212218284607, + "reward_std": 0.5252508372068405, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.02435732912272215, + "rewards/tag_count_reward": 0.8906250298023224, "step": 657 }, { "clip_ratio": 0.0, - "completion_length": 491.0833435058594, - "epoch": 0.658, - "grad_norm": 40.435901700616085, - "kl": 2.390625, - "learning_rate": 3.843439512918949e-07, - "loss": 0.7348, - "reward": 2.421813726425171, - "reward_std": 0.6320691108703613, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.017422514967620373, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 469.1458435058594, + "epoch": 0.329, + "grad_norm": 7.258415740610364, + "kl": 1.6015625, + "learning_rate": 8.63724296284554e-07, + "loss": 0.6144, + "reward": 2.5958104133605957, + "reward_std": 0.7455363571643829, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.030925797298550606, + "rewards/tag_count_reward": 0.8697916865348816, "step": 658 }, { "clip_ratio": 0.0, - "completion_length": 508.5000305175781, - "epoch": 0.659, - "grad_norm": 13.800630290866199, - "kl": 2.7109375, - "learning_rate": 3.8288447400239443e-07, - "loss": 0.4474, - "reward": 2.462868094444275, - "reward_std": 0.6122282147407532, - "rewards/accuracy_reward": 0.6041666865348816, + "completion_length": 470.0208435058594, + "epoch": 0.3295, + "grad_norm": 12.390886242134757, + "kl": 1.51953125, + "learning_rate": 8.631607584665413e-07, + "loss": 0.4942, + "reward": 2.3429232835769653, + "reward_std": 0.5768862962722778, + "rewards/accuracy_reward": 0.5000000298023224, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.016298661939799786, - "rewards/tag_count_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.04249337501823902, + "rewards/tag_count_reward": 0.9062500298023224, "step": 659 }, { "clip_ratio": 0.0, - "completion_length": 857.5000305175781, - "epoch": 0.66, - "grad_norm": 26.939008770933444, - "kl": 8.09375, - "learning_rate": 3.8142703296283953e-07, - "loss": 0.9327, - "reward": 1.7486651539802551, - "reward_std": 0.8026353716850281, - "rewards/accuracy_reward": 0.2708333358168602, - "rewards/reasoning_steps_reward": 0.8750000298023224, - "rewards/repetition_penalty_reward": -0.027376560494303703, - "rewards/tag_count_reward": 0.6302083432674408, + "completion_length": 527.1458435058594, + "epoch": 0.33, + "grad_norm": 9.725057262385192, + "kl": 1.94140625, + "learning_rate": 8.625962667065487e-07, + "loss": 0.4938, + "reward": 2.334376096725464, + "reward_std": 0.6681367456912994, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.05277678743004799, + "rewards/tag_count_reward": 0.8802083432674408, "step": 660 }, { "clip_ratio": 0.0, - "completion_length": 504.16668701171875, - "epoch": 0.661, - "grad_norm": 40.18446601488726, - "kl": 2.203125, - "learning_rate": 3.7997164593168983e-07, - "loss": 0.8515, - "reward": 2.3131070137023926, - "reward_std": 0.6172126531600952, - "rewards/accuracy_reward": 0.479166679084301, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.02543477900326252, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 334.9791717529297, + "epoch": 0.3305, + "grad_norm": 7.932623599953271, + "kl": 1.439453125, + "learning_rate": 8.620308227241157e-07, + "loss": 0.3623, + "reward": 2.632012963294983, + "reward_std": 0.5029560029506683, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.03465373069047928, + "rewards/tag_count_reward": 0.9375, "step": 661 }, { "clip_ratio": 0.0, - "completion_length": 404.04168701171875, - "epoch": 0.662, - "grad_norm": 19.64806111061822, - "kl": 1.80078125, - "learning_rate": 3.785183306423767e-07, - "loss": 0.3147, - "reward": 2.165276527404785, - "reward_std": 0.5021775662899017, - "rewards/accuracy_reward": 0.2708333358168602, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.018751220777630806, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 446.8958435058594, + "epoch": 0.331, + "grad_norm": 7.989726237635147, + "kl": 1.38671875, + "learning_rate": 8.614644282416831e-07, + "loss": 0.5033, + "reward": 2.527227759361267, + "reward_std": 0.6098933517932892, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04048066772520542, + "rewards/tag_count_reward": 0.9010416865348816, "step": 662 }, { "clip_ratio": 0.0, - "completion_length": 457.5416717529297, - "epoch": 0.663, - "grad_norm": 55.11379027399269, - "kl": 2.46875, - "learning_rate": 3.7706710480308835e-07, - "loss": 0.9405, - "reward": 2.3307788372039795, - "reward_std": 0.5423199832439423, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.014707335270941257, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 411.3333435058594, + "epoch": 0.3315, + "grad_norm": 10.307046200594176, + "kl": 0.96875, + "learning_rate": 8.608970849845862e-07, + "loss": 0.7632, + "reward": 2.6601351499557495, + "reward_std": 0.5646943897008896, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.025628680363297462, + "rewards/tag_count_reward": 0.9218750298023224, "step": 663 }, { "clip_ratio": 0.0, - "completion_length": 376.9583435058594, - "epoch": 0.664, - "grad_norm": 46.90834721507952, - "kl": 3.6171875, - "learning_rate": 3.7561798609655373e-07, - "loss": 0.7765, - "reward": 2.5231001377105713, - "reward_std": 0.43632885813713074, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.018566494807600975, - "rewards/tag_count_reward": 0.9375, + "completion_length": 404.5416717529297, + "epoch": 0.332, + "grad_norm": 16.627402867924832, + "kl": 2.0234375, + "learning_rate": 8.603287946810513e-07, + "loss": 0.5452, + "reward": 2.565666437149048, + "reward_std": 0.7432173788547516, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.03155587986111641, + "rewards/tag_count_reward": 0.9166666865348816, "step": 664 }, { "clip_ratio": 0.0, - "completion_length": 472.875, - "epoch": 0.665, - "grad_norm": 36.36032137924947, - "kl": 4.66015625, - "learning_rate": 3.7417099217982686e-07, - "loss": 0.6785, - "reward": 2.34568190574646, - "reward_std": 0.5171235352754593, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.013693147338926792, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 428.9166717529297, + "epoch": 0.3325, + "grad_norm": 10.078394182499805, + "kl": 1.65234375, + "learning_rate": 8.597595590621892e-07, + "loss": 0.4888, + "reward": 2.448451519012451, + "reward_std": 0.7055695652961731, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.03765958361327648, + "rewards/tag_count_reward": 0.9166666865348816, "step": 665 }, { "clip_ratio": 0.0, - "completion_length": 389.18751525878906, - "epoch": 0.666, - "grad_norm": 43.19617147385512, - "kl": 4.390625, - "learning_rate": 3.72726140684072e-07, - "loss": 0.7594, - "reward": 2.619339108467102, - "reward_std": 0.5449521988630295, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.0160775538533926, + "completion_length": 480.7916717529297, + "epoch": 0.333, + "grad_norm": 11.542957521544354, + "kl": 1.796875, + "learning_rate": 8.591893798619903e-07, + "loss": 0.3699, + "reward": 2.4829243421554565, + "reward_std": 0.61925408244133, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.04138128738850355, "rewards/tag_count_reward": 0.9270833432674408, "step": 666 }, { "clip_ratio": 0.0, - "completion_length": 399.2708435058594, - "epoch": 0.667, - "grad_norm": 34.057601735761196, - "kl": 3.2890625, - "learning_rate": 3.712834492143487e-07, - "loss": 0.504, - "reward": 2.495935320854187, - "reward_std": 0.37501704692840576, - "rewards/accuracy_reward": 0.6041666865348816, + "completion_length": 306.1666717529297, + "epoch": 0.3335, + "grad_norm": 12.408852597847376, + "kl": 0.822265625, + "learning_rate": 8.586182588173194e-07, + "loss": 0.2134, + "reward": 2.452653646469116, + "reward_std": 0.415747806429863, + "rewards/accuracy_reward": 0.5208333432674408, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01448146067559719, - "rewards/tag_count_reward": 0.9270833432674408, + "rewards/repetition_penalty_reward": -0.03172139264643192, + "rewards/tag_count_reward": 0.9843750298023224, "step": 667 }, { "clip_ratio": 0.0, - "completion_length": 516.7500305175781, - "epoch": 0.668, - "grad_norm": 69.82503772312313, - "kl": 5.609375, - "learning_rate": 3.6984293534939737e-07, - "loss": 0.9785, - "reward": 2.399042248725891, - "reward_std": 0.7354680299758911, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.015888389199972153, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 321.1666717529297, + "epoch": 0.334, + "grad_norm": 5.871197681642463, + "kl": 0.3994140625, + "learning_rate": 8.580461976679099e-07, + "loss": 0.2745, + "reward": 2.681954860687256, + "reward_std": 0.4819119870662689, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.0472117830067873, + "rewards/tag_count_reward": 0.9791666865348816, "step": 668 }, { "clip_ratio": 0.0, - "completion_length": 462.50001525878906, - "epoch": 0.669, - "grad_norm": 32.586128695642, - "kl": 4.359375, - "learning_rate": 3.6840461664142444e-07, - "loss": 0.7185, - "reward": 2.410151481628418, - "reward_std": 0.44753579795360565, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.016932022757828236, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 290.1458435058594, + "epoch": 0.3345, + "grad_norm": 20.400573211519653, + "kl": 1.712890625, + "learning_rate": 8.574731981563597e-07, + "loss": -0.0559, + "reward": 2.527996778488159, + "reward_std": 0.21311798691749573, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.037975482642650604, + "rewards/tag_count_reward": 0.9895833432674408, "step": 669 }, { "clip_ratio": 0.0, - "completion_length": 551.4375305175781, - "epoch": 0.67, - "grad_norm": 22.702344571228725, - "kl": 3.390625, - "learning_rate": 3.6696851061588994e-07, - "loss": 0.7687, - "reward": 2.3318662643432617, - "reward_std": 0.6909976303577423, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.017092150636017323, - "rewards/tag_count_reward": 0.890625, + "completion_length": 338.2083435058594, + "epoch": 0.335, + "grad_norm": 16.428046693280866, + "kl": 0.92578125, + "learning_rate": 8.568992620281243e-07, + "loss": 0.1399, + "reward": 2.6338796615600586, + "reward_std": 0.30435943603515625, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.05014814343303442, + "rewards/tag_count_reward": 0.9895833432674408, "step": 670 }, { "clip_ratio": 0.0, - "completion_length": 565.4791870117188, - "epoch": 0.671, - "grad_norm": 27.618008409549773, - "kl": 4.2265625, - "learning_rate": 3.655346347712922e-07, - "loss": 0.8779, - "reward": 2.0886669158935547, - "reward_std": 0.6510388255119324, - "rewards/accuracy_reward": 0.2916666716337204, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.017235863022506237, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 283.06251525878906, + "epoch": 0.3355, + "grad_norm": 9.998010496236825, + "kl": 0.4912109375, + "learning_rate": 8.56324391031513e-07, + "loss": 0.1636, + "reward": 2.781676411628723, + "reward_std": 0.2892776355147362, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.04124024324119091, + "rewards/tag_count_reward": 0.9895833432674408, "step": 671 }, { "clip_ratio": 0.0, - "completion_length": 487.18751525878906, - "epoch": 0.672, - "grad_norm": 24.355953329836403, - "kl": 2.40625, - "learning_rate": 3.641030065789562e-07, - "loss": 0.5358, - "reward": 2.412584662437439, - "reward_std": 0.6757764220237732, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.021443082485347986, - "rewards/tag_count_reward": 0.9062500298023224, - "step": 672 - }, - { + "completion_length": 328.75001525878906, + "epoch": 0.336, + "grad_norm": 15.737714162396987, + "kl": 0.54296875, + "learning_rate": 8.557485869176825e-07, + "loss": 0.3096, + "reward": 2.5456645488739014, + "reward_std": 0.3961578607559204, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04114115610718727, + "rewards/tag_count_reward": 0.96875, + "step": 672 + }, + { "clip_ratio": 0.0, - "completion_length": 558.0833435058594, - "epoch": 0.673, - "grad_norm": 45.77951130491503, - "kl": 3.328125, - "learning_rate": 3.6267364348281946e-07, - "loss": 1.0366, - "reward": 2.355557918548584, - "reward_std": 0.6195343136787415, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02465049270540476, - "rewards/tag_count_reward": 0.859375, + "completion_length": 274.3333435058594, + "epoch": 0.3365, + "grad_norm": 18.76746009667458, + "kl": 0.302734375, + "learning_rate": 8.551718514406318e-07, + "loss": 0.0533, + "reward": 2.44698166847229, + "reward_std": 0.15537243708968163, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03912944719195366, + "rewards/tag_count_reward": 1.0, "step": 673 }, { "clip_ratio": 0.0, - "completion_length": 529.1666870117188, - "epoch": 0.674, - "grad_norm": 41.84743305028243, - "kl": 4.65625, - "learning_rate": 3.612465628992203e-07, - "loss": 0.8446, - "reward": 2.305051267147064, - "reward_std": 0.6280795484781265, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.010920950677245855, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 259.5208435058594, + "epoch": 0.337, + "grad_norm": 6.288258024651295, + "kl": 1.1064453125, + "learning_rate": 8.545941863571973e-07, + "loss": 0.0915, + "reward": 2.7296632528305054, + "reward_std": 0.41032251715660095, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.029017308726906776, + "rewards/tag_count_reward": 0.9947916865348816, "step": 674 }, { "clip_ratio": 0.0, - "completion_length": 442.2083435058594, - "epoch": 0.675, - "grad_norm": 27.763872293665937, - "kl": 2.62890625, - "learning_rate": 3.5982178221668533e-07, - "loss": 0.7363, - "reward": 2.520777702331543, - "reward_std": 0.6029552221298218, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.019152968656271696, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 288.62501525878906, + "epoch": 0.3375, + "grad_norm": 5.025259032815617, + "kl": 0.3076171875, + "learning_rate": 8.540155934270471e-07, + "loss": 0.0915, + "reward": 2.7451287508010864, + "reward_std": 0.32976970821619034, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04653800651431084, + "rewards/tag_count_reward": 1.0, "step": 675 }, { "clip_ratio": 0.0, - "completion_length": 626.0833435058594, - "epoch": 0.676, - "grad_norm": 146.14695704648105, - "kl": 11.421875, - "learning_rate": 3.5839931879571725e-07, - "loss": 1.492, - "reward": 1.9430179595947266, - "reward_std": 0.49505507946014404, - "rewards/accuracy_reward": 0.2291666716337204, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.027468254789710045, - "rewards/tag_count_reward": 0.7552083432674408, + "completion_length": 325.7083435058594, + "epoch": 0.338, + "grad_norm": 9.159439405693467, + "kl": 1.34375, + "learning_rate": 8.534360744126753e-07, + "loss": 0.4539, + "reward": 2.591364622116089, + "reward_std": 0.5737589597702026, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.047524161636829376, + "rewards/tag_count_reward": 0.9583333432674408, "step": 676 }, { "clip_ratio": 0.0, - "completion_length": 448.0000305175781, - "epoch": 0.677, - "grad_norm": 147.52034576763307, - "kl": 6.8095703125, - "learning_rate": 3.5697918996858443e-07, - "loss": 0.9345, - "reward": 2.3936002254486084, - "reward_std": 0.4429845064878464, - "rewards/accuracy_reward": 0.5208333358168602, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.010913772508502007, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 329.0833435058594, + "epoch": 0.3385, + "grad_norm": 8.458585993366595, + "kl": 1.11328125, + "learning_rate": 8.528556310793979e-07, + "loss": 0.4093, + "reward": 2.7048439979553223, + "reward_std": 0.45828860998153687, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.034739382565021515, + "rewards/tag_count_reward": 0.9687500298023224, "step": 677 }, { "clip_ratio": 0.0, - "completion_length": 512.9166870117188, - "epoch": 0.678, - "grad_norm": 140.08402872570105, - "kl": 7.609375, - "learning_rate": 3.555614130391079e-07, - "loss": 1.0663, - "reward": 2.3663707971572876, - "reward_std": 0.6558624804019928, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.02772645093500614, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 332.0208435058594, + "epoch": 0.339, + "grad_norm": 8.46266477495005, + "kl": 0.63671875, + "learning_rate": 8.522742651953456e-07, + "loss": 0.444, + "reward": 2.682199239730835, + "reward_std": 0.5500051826238632, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02960638701915741, + "rewards/tag_count_reward": 0.96875, "step": 678 }, { "clip_ratio": 0.0, - "completion_length": 417.0, - "epoch": 0.679, - "grad_norm": 96.73929298933032, - "kl": 4.5390625, - "learning_rate": 3.5414600528245266e-07, - "loss": 1.0437, - "reward": 2.23331880569458, - "reward_std": 0.6037196218967438, - "rewards/accuracy_reward": 0.37500002048909664, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.02883412316441536, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 417.56251525878906, + "epoch": 0.3395, + "grad_norm": 10.566916898300722, + "kl": 1.509765625, + "learning_rate": 8.516919785314595e-07, + "loss": 0.3333, + "reward": 2.5015060901641846, + "reward_std": 0.57795649766922, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03495226427912712, + "rewards/tag_count_reward": 0.9114583432674408, "step": 679 }, { "clip_ratio": 0.0, - "completion_length": 357.5625, - "epoch": 0.68, - "grad_norm": 21.200219203477406, - "kl": 2.416015625, - "learning_rate": 3.5273298394491515e-07, - "loss": 0.5258, - "reward": 2.639436721801758, - "reward_std": 0.38805626332759857, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009868907742202282, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 333.6666717529297, + "epoch": 0.34, + "grad_norm": 11.247764389759753, + "kl": 0.75390625, + "learning_rate": 8.511087728614862e-07, + "loss": 0.5781, + "reward": 2.7087562084198, + "reward_std": 0.4665229171514511, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04471610300242901, + "rewards/tag_count_reward": 0.96875, "step": 680 }, { "clip_ratio": 0.0, - "completion_length": 387.7708435058594, - "epoch": 0.681, - "grad_norm": 14.11167765417183, - "kl": 1.080078125, - "learning_rate": 3.513223662437147e-07, - "loss": 0.1954, - "reward": 2.5773390531539917, - "reward_std": 0.42192623019218445, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.02856375463306904, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 308.6458435058594, + "epoch": 0.3405, + "grad_norm": 6.290960445037793, + "kl": 0.47705078125, + "learning_rate": 8.50524649961971e-07, + "loss": 0.3798, + "reward": 2.7960026264190674, + "reward_std": 0.2514218669384718, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03385855816304684, + "rewards/tag_count_reward": 0.9895833432674408, "step": 681 }, { "clip_ratio": 0.0, - "completion_length": 463.89585876464844, - "epoch": 0.682, - "grad_norm": 35.11880079033104, - "kl": 5.19921875, - "learning_rate": 3.4991416936678276e-07, - "loss": 0.666, - "reward": 2.4289733171463013, - "reward_std": 0.7871748507022858, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.025887836702167988, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 262.375, + "epoch": 0.341, + "grad_norm": 3.8670578486681526, + "kl": 0.21484375, + "learning_rate": 8.499396116122535e-07, + "loss": -0.0315, + "reward": 2.5164601802825928, + "reward_std": 0.1599935106933117, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03215097542852163, + "rewards/tag_count_reward": 1.0, "step": 682 }, { "clip_ratio": 0.0, - "completion_length": 596.7500305175781, - "epoch": 0.683, - "grad_norm": 21.947154239119367, - "kl": 5.1796875, - "learning_rate": 3.4850841047255364e-07, - "loss": 0.7009, - "reward": 1.906419038772583, - "reward_std": 0.5553164780139923, - "rewards/accuracy_reward": 0.1666666716337204, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.015455981716513634, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 307.4791717529297, + "epoch": 0.3415, + "grad_norm": 5.1795240406789285, + "kl": 0.51318359375, + "learning_rate": 8.493536595944622e-07, + "loss": 0.2556, + "reward": 2.6397162675857544, + "reward_std": 0.3590812534093857, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.04778374172747135, + "rewards/tag_count_reward": 0.9791666865348816, "step": 683 }, { "clip_ratio": 0.0, - "completion_length": 661.0208435058594, - "epoch": 0.684, - "grad_norm": 25.598616282466537, - "kl": 4.953125, - "learning_rate": 3.471051066897562e-07, - "loss": 0.9532, - "reward": 2.2370306253433228, - "reward_std": 0.6496289968490601, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.016441638581454754, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 347.79168701171875, + "epoch": 0.342, + "grad_norm": 22.46580378549878, + "kl": 1.2060546875, + "learning_rate": 8.487667956935087e-07, + "loss": 0.5061, + "reward": 2.5328707695007324, + "reward_std": 0.5347878038883209, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.05740702152252197, + "rewards/tag_count_reward": 0.9791666865348816, "step": 684 }, { "clip_ratio": 0.0, - "completion_length": 464.70835876464844, - "epoch": 0.685, - "grad_norm": 49.95217737848169, - "kl": 1.669921875, - "learning_rate": 3.45704275117204e-07, - "loss": 0.614, - "reward": 2.6332918405532837, - "reward_std": 0.49084220826625824, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01948598213493824, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 322.66668701171875, + "epoch": 0.3425, + "grad_norm": 25.14822551370442, + "kl": 1.689453125, + "learning_rate": 8.481790216970819e-07, + "loss": 0.386, + "reward": 2.6461331844329834, + "reward_std": 0.41380666196346283, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03442266769707203, + "rewards/tag_count_reward": 0.9791666865348816, "step": 685 }, { "clip_ratio": 0.0, - "completion_length": 686.3958435058594, - "epoch": 0.686, - "grad_norm": 68.70749191055191, - "kl": 5.984375, - "learning_rate": 3.4430593282358777e-07, - "loss": 0.9271, - "reward": 2.089155912399292, - "reward_std": 0.5732367038726807, - "rewards/accuracy_reward": 0.3750000223517418, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.013274761848151684, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 395.5208435058594, + "epoch": 0.343, + "grad_norm": 78.13941184528808, + "kl": 3.5234375, + "learning_rate": 8.475903393956433e-07, + "loss": 0.7695, + "reward": 2.565727114677429, + "reward_std": 0.7066747546195984, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.03496745601296425, + "rewards/tag_count_reward": 0.9270833730697632, "step": 686 }, { "clip_ratio": 0.0, - "completion_length": 432.2083435058594, - "epoch": 0.687, - "grad_norm": 32.08822044000933, - "kl": 1.79296875, - "learning_rate": 3.429100968472668e-07, - "loss": 0.5082, - "reward": 2.5415300130844116, - "reward_std": 0.48894260823726654, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.017497885040938854, - "rewards/tag_count_reward": 0.9687500298023224, + "completion_length": 333.4375, + "epoch": 0.3435, + "grad_norm": 10.49194552043806, + "kl": 0.970703125, + "learning_rate": 8.470007505824215e-07, + "loss": 0.3382, + "reward": 2.6929080486297607, + "reward_std": 0.45839959383010864, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03973084129393101, + "rewards/tag_count_reward": 0.96875, "step": 687 }, { "clip_ratio": 0.0, - "completion_length": 427.12501525878906, - "epoch": 0.688, - "grad_norm": 44.25186835660776, - "kl": 1.40625, - "learning_rate": 3.4151678419606233e-07, - "loss": 0.6032, - "reward": 2.457394003868103, - "reward_std": 0.4042099863290787, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.020036617293953896, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 340.00001525878906, + "epoch": 0.344, + "grad_norm": 24.890945025863278, + "kl": 1.193359375, + "learning_rate": 8.464102570534061e-07, + "loss": 0.5374, + "reward": 2.5198949575424194, + "reward_std": 0.4044055640697479, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02698008343577385, + "rewards/tag_count_reward": 0.9635416865348816, "step": 688 }, { "clip_ratio": 0.0, - "completion_length": 475.31251525878906, - "epoch": 0.689, - "grad_norm": 29.033085916995276, - "kl": 3.84375, - "learning_rate": 3.4012601184704904e-07, - "loss": 0.7725, - "reward": 2.554749846458435, - "reward_std": 0.610569179058075, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.023375309072434902, - "rewards/tag_count_reward": 0.890625, + "completion_length": 338.6666717529297, + "epoch": 0.3445, + "grad_norm": 18.61334570722872, + "kl": 0.78125, + "learning_rate": 8.458188606073431e-07, + "loss": 0.4962, + "reward": 2.715654969215393, + "reward_std": 0.4276605248451233, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.030872873961925507, + "rewards/tag_count_reward": 0.96875, "step": 689 }, { "clip_ratio": 0.0, - "completion_length": 494.12501525878906, - "epoch": 0.69, - "grad_norm": 57.99082028977846, - "kl": 5.2578125, - "learning_rate": 3.387377967463493e-07, - "loss": 0.5999, - "reward": 2.3892362117767334, - "reward_std": 0.5195634961128235, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.020485990215092897, - "rewards/tag_count_reward": 0.875, + "completion_length": 270.2083435058594, + "epoch": 0.345, + "grad_norm": 95.37811997865627, + "kl": 2.6484375, + "learning_rate": 8.452265630457282e-07, + "loss": 0.161, + "reward": 2.8763829469680786, + "reward_std": 0.2372975405305624, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.026394893415272236, + "rewards/tag_count_reward": 1.0, "step": 690 }, { "clip_ratio": 0.0, - "completion_length": 491.29168701171875, - "epoch": 0.691, - "grad_norm": 63.801944338558975, - "kl": 6.546875, - "learning_rate": 3.3735215580892575e-07, - "loss": 0.8708, - "reward": 2.57023286819458, - "reward_std": 0.5704822838306427, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.020044888369739056, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 285.7708435058594, + "epoch": 0.3455, + "grad_norm": 4.725098859206571, + "kl": 0.3291015625, + "learning_rate": 8.446333661728028e-07, + "loss": 0.0645, + "reward": 2.569468140602112, + "reward_std": 0.23410103470087051, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.027754205279052258, + "rewards/tag_count_reward": 1.0, "step": 691 }, { "clip_ratio": 0.0, - "completion_length": 438.83335876464844, - "epoch": 0.692, - "grad_norm": 63.66620073917318, - "kl": 4.0234375, - "learning_rate": 3.359691059183761e-07, - "loss": 0.7175, - "reward": 2.389517307281494, - "reward_std": 0.6298649907112122, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.03756603505462408, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 287.62501525878906, + "epoch": 0.346, + "grad_norm": 12.97522462997103, + "kl": 1.1708984375, + "learning_rate": 8.440392717955475e-07, + "loss": 0.121, + "reward": 2.6713898181915283, + "reward_std": 0.5372248291969299, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.04909628629684448, + "rewards/tag_count_reward": 0.984375, "step": 692 }, { "clip_ratio": 0.0, - "completion_length": 559.5416870117188, - "epoch": 0.693, - "grad_norm": 261.1873431552047, - "kl": 10.6875, - "learning_rate": 3.3458866392672694e-07, - "loss": 1.6707, - "reward": 2.32285612821579, - "reward_std": 0.4213666617870331, - "rewards/accuracy_reward": 0.5000000111758709, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019157718401402235, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 288.9583435058594, + "epoch": 0.3465, + "grad_norm": 77.5201396944965, + "kl": 4.1875, + "learning_rate": 8.434442817236765e-07, + "loss": 0.6809, + "reward": 2.826512098312378, + "reward_std": 0.32751043140888214, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03112685587257147, + "rewards/tag_count_reward": 0.9687500298023224, "step": 693 }, { "clip_ratio": 0.0, - "completion_length": 548.9375, - "epoch": 0.694, - "grad_norm": 30.493282980019504, - "kl": 5.34375, - "learning_rate": 3.3321084665422803e-07, - "loss": 1.2128, - "reward": 2.4720669984817505, - "reward_std": 0.7172423005104065, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.015780417248606682, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 396.9166717529297, + "epoch": 0.347, + "grad_norm": 65.84497722604661, + "kl": 4.5859375, + "learning_rate": 8.428483977696328e-07, + "loss": 0.9671, + "reward": 2.560770630836487, + "reward_std": 0.5635119080543518, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.03645172622054815, + "rewards/tag_count_reward": 0.9375, "step": 694 }, { "clip_ratio": 0.0, - "completion_length": 527.9375305175781, - "epoch": 0.695, - "grad_norm": 32.828395091508476, - "kl": 5.03125, - "learning_rate": 3.3183567088914833e-07, - "loss": 0.8333, - "reward": 2.6700429916381836, - "reward_std": 0.5373901128768921, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.02092924155294895, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 381.16668701171875, + "epoch": 0.3475, + "grad_norm": 18.561448766988402, + "kl": 2.12109375, + "learning_rate": 8.422516217485825e-07, + "loss": 0.6426, + "reward": 2.567765712738037, + "reward_std": 0.5413838922977448, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.025984477251768112, + "rewards/tag_count_reward": 0.96875, "step": 695 }, { "clip_ratio": 0.0, - "completion_length": 374.8333435058594, - "epoch": 0.696, - "grad_norm": 13.216131597812279, - "kl": 2.4765625, - "learning_rate": 3.3046315338757026e-07, - "loss": 0.3144, - "reward": 2.609209179878235, - "reward_std": 0.655095100402832, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01926326658576727, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 315.6666717529297, + "epoch": 0.348, + "grad_norm": 12.793725754335805, + "kl": 0.51318359375, + "learning_rate": 8.416539554784089e-07, + "loss": 0.1811, + "reward": 2.8265974521636963, + "reward_std": 0.3042995482683182, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03451373428106308, + "rewards/tag_count_reward": 0.9791666865348816, "step": 696 }, { "clip_ratio": 0.0, - "completion_length": 318.6666717529297, - "epoch": 0.697, - "grad_norm": 10.118006800111877, - "kl": 0.5234375, - "learning_rate": 3.290933108731866e-07, - "loss": -0.0809, - "reward": 2.3212504386901855, - "reward_std": 0.5412976741790771, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.020763473585247993, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 312.1458435058594, + "epoch": 0.3485, + "grad_norm": 12.387839093466074, + "kl": 1.2890625, + "learning_rate": 8.410554007797068e-07, + "loss": 0.2936, + "reward": 2.6493937969207764, + "reward_std": 0.44927215576171875, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022481410764157772, + "rewards/tag_count_reward": 0.9635416865348816, "step": 697 }, { "clip_ratio": 0.0, - "completion_length": 742.3541870117188, - "epoch": 0.698, - "grad_norm": 28.054949929765485, - "kl": 5.546875, - "learning_rate": 3.2772616003709616e-07, - "loss": 0.7595, - "reward": 2.193854570388794, - "reward_std": 0.8353064060211182, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.035312142223119736, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 364.125, + "epoch": 0.349, + "grad_norm": 21.88721713904914, + "kl": 1.349609375, + "learning_rate": 8.404559594757777e-07, + "loss": 0.2126, + "reward": 2.453208088874817, + "reward_std": 0.3716874122619629, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03463919833302498, + "rewards/tag_count_reward": 0.9739583432674408, "step": 698 }, { "clip_ratio": 0.0, - "completion_length": 452.37501525878906, - "epoch": 0.699, - "grad_norm": 41.658117016977116, - "kl": 2.1875, - "learning_rate": 3.263617175376001e-07, - "loss": 0.8256, - "reward": 2.7566497325897217, - "reward_std": 0.49461202323436737, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01418376062065363, - "rewards/tag_count_reward": 0.9375, + "completion_length": 319.54168701171875, + "epoch": 0.3495, + "grad_norm": 23.304107901739318, + "kl": 3.1298828125, + "learning_rate": 8.398556333926239e-07, + "loss": 0.396, + "reward": 2.7863314151763916, + "reward_std": 0.40553222596645355, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.040057627484202385, + "rewards/tag_count_reward": 0.9583333432674408, "step": 699 }, { "clip_ratio": 0.0, - "completion_length": 663.5000305175781, - "epoch": 0.7, - "grad_norm": 26.592270906723318, - "kl": 6.0625, - "learning_rate": 3.250000000000001e-07, - "loss": 0.7409, - "reward": 2.101960301399231, - "reward_std": 0.7690182626247406, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.02824808470904827, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 347.87501525878906, + "epoch": 0.35, + "grad_norm": 38.63296720283707, + "kl": 3.421875, + "learning_rate": 8.392544243589427e-07, + "loss": 0.9736, + "reward": 2.3871514797210693, + "reward_std": 0.5459754467010498, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.027779086492955685, + "rewards/tag_count_reward": 0.9635416865348816, "step": 700 }, { "clip_ratio": 0.0, - "completion_length": 576.25, - "epoch": 0.701, - "grad_norm": 19.522315074592626, - "kl": 4.1015625, - "learning_rate": 3.2364102401639423e-07, - "loss": 0.6651, - "reward": 2.648020029067993, - "reward_std": 0.46788084506988525, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.009966201148927212, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 302.5416717529297, + "epoch": 0.3505, + "grad_norm": 16.95817212965768, + "kl": 1.1083984375, + "learning_rate": 8.38652334206121e-07, + "loss": 0.4275, + "reward": 2.909200429916382, + "reward_std": 0.14170600473880768, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04566095769405365, + "rewards/tag_count_reward": 0.9895833432674408, "step": 701 }, { "clip_ratio": 0.0, - "completion_length": 593.4375305175781, - "epoch": 0.702, - "grad_norm": 21.19308541158298, - "kl": 4.4375, - "learning_rate": 3.222848061454764e-07, - "loss": 0.8651, - "reward": 2.0901803970336914, - "reward_std": 0.6151553392410278, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9444444179534912, - "rewards/repetition_penalty_reward": -0.020930853206664324, - "rewards/tag_count_reward": 0.8333333730697632, + "completion_length": 389.2708435058594, + "epoch": 0.351, + "grad_norm": 254.15785403417703, + "kl": 8.96875, + "learning_rate": 8.3804936476823e-07, + "loss": 1.3585, + "reward": 2.471347212791443, + "reward_std": 0.4235590100288391, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.03038894012570381, + "rewards/tag_count_reward": 0.953125, "step": 702 }, { "clip_ratio": 0.0, - "completion_length": 448.5416717529297, - "epoch": 0.703, - "grad_norm": 14.569396100243361, - "kl": 2.4140625, - "learning_rate": 3.209313629123329e-07, - "loss": 0.3438, - "reward": 2.424591898918152, - "reward_std": 0.4449867308139801, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.025061040185391903, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 304.0208435058594, + "epoch": 0.3515, + "grad_norm": 19.759137725479157, + "kl": 1.263671875, + "learning_rate": 8.374455178820189e-07, + "loss": 0.4074, + "reward": 2.781951069831848, + "reward_std": 0.3059141989797354, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.032285166904330254, + "rewards/tag_count_reward": 0.9947916865348816, "step": 703 }, { "clip_ratio": 0.0, - "completion_length": 571.1458435058594, - "epoch": 0.704, - "grad_norm": 40.600173749644895, - "kl": 6.4453125, - "learning_rate": 3.195807108082429e-07, - "loss": 0.9898, - "reward": 2.4354113340377808, - "reward_std": 0.6632784008979797, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.014241641853004694, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 400.4583435058594, + "epoch": 0.352, + "grad_norm": 87.69689094374543, + "kl": 7.359375, + "learning_rate": 8.368407953869103e-07, + "loss": 0.4764, + "reward": 2.40726637840271, + "reward_std": 0.5606774091720581, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.031969884410500526, + "rewards/tag_count_reward": 0.9322916865348816, "step": 704 }, { "clip_ratio": 0.0, - "completion_length": 480.1458435058594, - "epoch": 0.705, - "grad_norm": 39.53208480397733, - "kl": 3.4765625, - "learning_rate": 3.182328662904756e-07, - "loss": 0.7597, - "reward": 2.5596699714660645, - "reward_std": 0.6091938018798828, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.02366339974105358, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 365.6041717529297, + "epoch": 0.3525, + "grad_norm": 55.10624289780696, + "kl": 2.1630859375, + "learning_rate": 8.362351991249937e-07, + "loss": 0.3022, + "reward": 2.5830687284469604, + "reward_std": 0.543460875749588, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04193143546581268, + "rewards/tag_count_reward": 0.9583333432674408, "step": 705 }, { "clip_ratio": 0.0, - "completion_length": 376.5416717529297, - "epoch": 0.706, - "grad_norm": 26.171445195886303, - "kl": 2.21484375, - "learning_rate": 3.168878457820915e-07, - "loss": 0.5183, - "reward": 2.515265941619873, - "reward_std": 0.4705822169780731, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.017720218747854233, - "rewards/tag_count_reward": 0.921875, + "completion_length": 370.1458435058594, + "epoch": 0.353, + "grad_norm": 38.93542768766942, + "kl": 1.20703125, + "learning_rate": 8.356287309410204e-07, + "loss": 0.4547, + "reward": 2.704145312309265, + "reward_std": 0.5362651348114014, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.037174249067902565, + "rewards/tag_count_reward": 0.9635416865348816, "step": 706 }, { "clip_ratio": 0.0, - "completion_length": 421.41668701171875, - "epoch": 0.707, - "grad_norm": 23.58547734465505, - "kl": 5.1875, - "learning_rate": 3.155456656717408e-07, - "loss": 0.9749, - "reward": 2.6543229818344116, - "reward_std": 0.67611363530159, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.010607585310935974, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 443.0208435058594, + "epoch": 0.3535, + "grad_norm": 32.744332762719786, + "kl": 0.75, + "learning_rate": 8.350213926823974e-07, + "loss": 0.44, + "reward": 2.5751278400421143, + "reward_std": 0.5337015986442566, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04987236298620701, + "rewards/tag_count_reward": 0.9583333432674408, "step": 707 }, { "clip_ratio": 0.0, - "completion_length": 420.1458435058594, - "epoch": 0.708, - "grad_norm": 22.53505575124777, - "kl": 3.265625, - "learning_rate": 3.142063423134644e-07, - "loss": 0.5001, - "reward": 2.434324622154236, - "reward_std": 0.42964955419301987, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01706432970240712, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 372.5625, + "epoch": 0.354, + "grad_norm": 42.03312426418706, + "kl": 0.68359375, + "learning_rate": 8.344131861991828e-07, + "loss": 0.4816, + "reward": 2.814732551574707, + "reward_std": 0.4671812057495117, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.04117002338171005, + "rewards/tag_count_reward": 0.9739583432674408, "step": 708 }, { "clip_ratio": 0.0, - "completion_length": 604.5000305175781, - "epoch": 0.709, - "grad_norm": 100.60447189912114, - "kl": 10.71875, - "learning_rate": 3.1286989202649503e-07, - "loss": 1.2098, - "reward": 2.214650869369507, - "reward_std": 0.7704206705093384, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.021460428833961487, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 402.8333435058594, + "epoch": 0.3545, + "grad_norm": 14.798336180101453, + "kl": 2.26171875, + "learning_rate": 8.338041133440788e-07, + "loss": 0.6263, + "reward": 2.585590362548828, + "reward_std": 0.5589672327041626, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.03593753091990948, + "rewards/tag_count_reward": 0.9479166865348816, "step": 709 }, { "clip_ratio": 0.0, - "completion_length": 441.5208435058594, - "epoch": 0.71, - "grad_norm": 62.885652673494626, - "kl": 4.6171875, - "learning_rate": 3.115363310950578e-07, - "loss": 0.6795, - "reward": 2.4361919164657593, - "reward_std": 0.6633725464344025, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.022141442634165287, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 484.1666717529297, + "epoch": 0.355, + "grad_norm": 329.43975783332286, + "kl": 9.34375, + "learning_rate": 8.331941759724268e-07, + "loss": 1.4746, + "reward": 2.464582681655884, + "reward_std": 0.6360695958137512, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.03541737608611584, + "rewards/tag_count_reward": 0.9375000298023224, "step": 710 }, { "clip_ratio": 0.0, - "completion_length": 505.9791717529297, - "epoch": 0.711, - "grad_norm": 91.4316669194455, - "kl": 6.65625, - "learning_rate": 3.102056757681715e-07, - "loss": 1.2926, - "reward": 2.3233230113983154, - "reward_std": 0.7938812673091888, + "completion_length": 458.4791717529297, + "epoch": 0.3555, + "grad_norm": 367.74155787665666, + "kl": 11.8759765625, + "learning_rate": 8.325833759422021e-07, + "loss": 1.0719, + "reward": 2.316919684410095, + "reward_std": 0.4943684861063957, "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9375001192092896, - "rewards/repetition_penalty_reward": -0.01001034933142364, - "rewards/tag_count_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.032038720324635506, + "rewards/tag_count_reward": 0.9322916865348816, "step": 711 }, { "clip_ratio": 0.0, - "completion_length": 497.7708435058594, - "epoch": 0.712, - "grad_norm": 49.94435657649251, - "kl": 5.265625, - "learning_rate": 3.0887794225945143e-07, - "loss": 0.7182, - "reward": 2.6127805709838867, - "reward_std": 0.5191345363855362, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.019164070021361113, - "rewards/tag_count_reward": 0.875, + "completion_length": 483.7916717529297, + "epoch": 0.356, + "grad_norm": 47.212787106240015, + "kl": 4.234375, + "learning_rate": 8.319717151140072e-07, + "loss": 1.1337, + "reward": 2.351717948913574, + "reward_std": 0.6439139246940613, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.04758777469396591, + "rewards/tag_count_reward": 0.9062500298023224, "step": 712 }, { "clip_ratio": 0.0, - "completion_length": 408.9791717529297, - "epoch": 0.713, - "grad_norm": 24.539952113509884, - "kl": 3.46875, - "learning_rate": 3.075531467469116e-07, - "loss": 0.5752, - "reward": 2.349856376647949, - "reward_std": 0.4715754985809326, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023407643660902977, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 324.25, + "epoch": 0.3565, + "grad_norm": 3.183005147595655, + "kl": 0.24560546875, + "learning_rate": 8.313591953510673e-07, + "loss": -0.005, + "reward": 2.9382745027542114, + "reward_std": 0.0876419385895133, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04089214652776718, + "rewards/tag_count_reward": 1.0, "step": 713 }, { "clip_ratio": 0.0, - "completion_length": 506.7708435058594, - "epoch": 0.714, - "grad_norm": 32.90722497302991, - "kl": 4.640625, - "learning_rate": 3.062313053727671e-07, - "loss": 0.8249, - "reward": 2.2763147354125977, - "reward_std": 0.7002745866775513, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9513889849185944, - "rewards/repetition_penalty_reward": -0.013615832198411226, - "rewards/tag_count_reward": 0.859375, + "completion_length": 322.5208435058594, + "epoch": 0.357, + "grad_norm": 21.014869886703735, + "kl": 0.328125, + "learning_rate": 8.307458185192238e-07, + "loss": 0.259, + "reward": 2.6625412702560425, + "reward_std": 0.37747399508953094, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.035375405102968216, + "rewards/tag_count_reward": 0.9895833432674408, "step": 714 }, { "clip_ratio": 0.0, - "completion_length": 362.0833435058594, - "epoch": 0.715, - "grad_norm": 27.678515846159396, - "kl": 2.14453125, - "learning_rate": 3.0491243424323783e-07, - "loss": 0.3263, - "reward": 2.8031435012817383, - "reward_std": 0.45686857402324677, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.023245446383953094, - "rewards/tag_count_reward": 0.9375, + "completion_length": 349.4375, + "epoch": 0.3575, + "grad_norm": 20.703256092729664, + "kl": 0.3984375, + "learning_rate": 8.301315864869289e-07, + "loss": 0.2191, + "reward": 2.67917537689209, + "reward_std": 0.3260675296187401, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.06908857077360153, + "rewards/tag_count_reward": 0.984375, "step": 715 }, { "clip_ratio": 0.0, - "completion_length": 466.6666717529297, - "epoch": 0.716, - "grad_norm": 29.560245962070923, - "kl": 3.8515625, - "learning_rate": 3.0359654942835247e-07, - "loss": 0.8846, - "reward": 2.3151395320892334, - "reward_std": 0.7553818225860596, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.03381876181811094, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 356.5208435058594, + "epoch": 0.358, + "grad_norm": 9.846153349140325, + "kl": 0.90625, + "learning_rate": 8.295165011252396e-07, + "loss": 0.1475, + "reward": 2.4251718521118164, + "reward_std": 0.41564081609249115, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.04878660198301077, + "rewards/tag_count_reward": 0.9739583730697632, "step": 716 }, { "clip_ratio": 0.0, - "completion_length": 677.5625, - "epoch": 0.717, - "grad_norm": 28.56711039404196, - "kl": 6.859375, - "learning_rate": 3.02283666961752e-07, - "loss": 0.8948, - "reward": 2.116614818572998, - "reward_std": 0.61385178565979, - "rewards/accuracy_reward": 0.3750000223517418, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.018802037462592125, - "rewards/tag_count_reward": 0.8020833730697632, + "completion_length": 426.5625, + "epoch": 0.3585, + "grad_norm": 22.268351473633047, + "kl": 2.8046875, + "learning_rate": 8.289005643078131e-07, + "loss": 1.0537, + "reward": 2.636265277862549, + "reward_std": 0.5826082229614258, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.04602637514472008, + "rewards/tag_count_reward": 0.9739583432674408, "step": 717 }, { "clip_ratio": 0.0, - "completion_length": 651.3125305175781, - "epoch": 0.718, - "grad_norm": 28.83560352945601, - "kl": 6.46875, - "learning_rate": 3.0097380284049523e-07, - "loss": 1.0283, - "reward": 2.1972588300704956, - "reward_std": 0.6690528243780136, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.014546706806868315, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 345.04168701171875, + "epoch": 0.359, + "grad_norm": 49.628296082541894, + "kl": 2.771484375, + "learning_rate": 8.282837779108993e-07, + "loss": 0.5828, + "reward": 2.618976354598999, + "reward_std": 0.40643376111984253, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.044218238443136215, + "rewards/tag_count_reward": 0.96875, "step": 718 }, { "clip_ratio": 0.0, - "completion_length": 389.4375, - "epoch": 0.719, - "grad_norm": 10.934491674221926, - "kl": 1.875, - "learning_rate": 2.996669730248628e-07, - "loss": 0.2297, - "reward": 2.1007025241851807, - "reward_std": 0.48380589485168457, - "rewards/accuracy_reward": 0.2291666716337204, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.017353057861328125, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 376.6041717529297, + "epoch": 0.3595, + "grad_norm": 17.745954704948424, + "kl": 1.49609375, + "learning_rate": 8.276661438133368e-07, + "loss": 0.4342, + "reward": 2.6793724298477173, + "reward_std": 0.35265351831912994, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03937762975692749, + "rewards/tag_count_reward": 0.9895833730697632, "step": 719 }, { "clip_ratio": 0.0, - "completion_length": 394.1041717529297, - "epoch": 0.72, - "grad_norm": 29.82240224098748, - "kl": 3.19921875, - "learning_rate": 2.9836319343816397e-07, - "loss": 0.6818, - "reward": 2.698520064353943, - "reward_std": 0.6881579607725143, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.011549403425306082, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 353.66668701171875, + "epoch": 0.36, + "grad_norm": 27.380512621837557, + "kl": 0.939453125, + "learning_rate": 8.270476638965461e-07, + "loss": 0.5235, + "reward": 2.807437300682068, + "reward_std": 0.43915510177612305, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03978483937680721, + "rewards/tag_count_reward": 0.9791666865348816, "step": 720 }, { "clip_ratio": 0.0, - "completion_length": 467.0208435058594, - "epoch": 0.721, - "grad_norm": 25.761812478587363, - "kl": 3.8984375, - "learning_rate": 2.9706247996654134e-07, - "loss": 0.5726, - "reward": 2.4670268297195435, - "reward_std": 0.6666984856128693, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.0173482783138752, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 389.6875, + "epoch": 0.3605, + "grad_norm": 24.933099331429055, + "kl": 0.765625, + "learning_rate": 8.264283400445243e-07, + "loss": 0.4441, + "reward": 2.6986876726150513, + "reward_std": 0.5908840298652649, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.06346515193581581, + "rewards/tag_count_reward": 0.9635416865348816, "step": 721 }, { "clip_ratio": 0.0, - "completion_length": 459.50001525878906, - "epoch": 0.722, - "grad_norm": 66.05822501836576, - "kl": 4.71875, - "learning_rate": 2.9576484845877793e-07, - "loss": 0.7149, - "reward": 2.672752857208252, - "reward_std": 0.6598548293113708, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.02516369242221117, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 427.4583435058594, + "epoch": 0.361, + "grad_norm": 21.155555874267726, + "kl": 1.40625, + "learning_rate": 8.258081741438394e-07, + "loss": 0.6891, + "reward": 2.5299737453460693, + "reward_std": 0.5110133141279221, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.06377626396715641, + "rewards/tag_count_reward": 0.96875, "step": 722 }, { "clip_ratio": 0.0, - "completion_length": 469.5, - "epoch": 0.723, - "grad_norm": 33.50765624110453, - "kl": 3.296875, - "learning_rate": 2.944703147261046e-07, - "loss": 1.0137, - "reward": 2.1091710329055786, - "reward_std": 0.5909655094146729, - "rewards/accuracy_reward": 0.2916666716337204, - "rewards/reasoning_steps_reward": 0.9305555522441864, - "rewards/repetition_penalty_reward": -0.008884491166099906, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 358.2708435058594, + "epoch": 0.3615, + "grad_norm": 22.1107293389388, + "kl": 1.6025390625, + "learning_rate": 8.25187168083624e-07, + "loss": 0.3977, + "reward": 2.5182286500930786, + "reward_std": 0.3348126895725727, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.047743676230311394, + "rewards/tag_count_reward": 0.96875, "step": 723 }, { "clip_ratio": 0.0, - "completion_length": 475.9583435058594, - "epoch": 0.724, - "grad_norm": 32.701475311916404, - "kl": 4.078125, - "learning_rate": 2.931788945420058e-07, - "loss": 1.2, - "reward": 2.4817965030670166, - "reward_std": 0.5851520895957947, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.012995375785976648, - "rewards/tag_count_reward": 0.890625, + "completion_length": 336.125, + "epoch": 0.362, + "grad_norm": 7.252900143586577, + "kl": 0.908203125, + "learning_rate": 8.245653237555705e-07, + "loss": 0.2413, + "reward": 2.539818525314331, + "reward_std": 0.4269237220287323, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.04698727838695049, + "rewards/tag_count_reward": 0.9895833432674408, "step": 724 }, { "clip_ratio": 0.0, - "completion_length": 554.7083435058594, - "epoch": 0.725, - "grad_norm": 32.79842010966518, - "kl": 5.91796875, - "learning_rate": 2.918906036420294e-07, - "loss": 0.8106, - "reward": 2.432934522628784, - "reward_std": 0.6692648828029633, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01671838667243719, - "rewards/tag_count_reward": 0.859375, - "step": 725 - }, - { + "completion_length": 460.52085876464844, + "epoch": 0.3625, + "grad_norm": 35.246874774684656, + "kl": 3.2578125, + "learning_rate": 8.239426430539243e-07, + "loss": 0.9074, + "reward": 2.555440068244934, + "reward_std": 0.6377770006656647, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.05914325639605522, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 725 + }, + { "clip_ratio": 0.0, - "completion_length": 572.5208435058594, - "epoch": 0.726, - "grad_norm": 34.07001019061104, - "kl": 4.34375, - "learning_rate": 2.9060545772359305e-07, - "loss": 0.985, - "reward": 2.4105674028396606, - "reward_std": 0.5044368803501129, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02866880688816309, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 350.3125, + "epoch": 0.363, + "grad_norm": 9.204556850773619, + "kl": 1.97265625, + "learning_rate": 8.23319127875479e-07, + "loss": 0.4855, + "reward": 2.772892713546753, + "reward_std": 0.43467026948928833, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.055232321843504906, + "rewards/tag_count_reward": 0.953125, "step": 726 }, { "clip_ratio": 0.0, - "completion_length": 485.41668701171875, - "epoch": 0.727, - "grad_norm": 22.090198544561794, - "kl": 3.53125, - "learning_rate": 2.893234724457946e-07, - "loss": 0.662, - "reward": 2.4937909841537476, - "reward_std": 0.6231936812400818, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.027042274363338947, - "rewards/tag_count_reward": 0.875, + "completion_length": 363.9166717529297, + "epoch": 0.3635, + "grad_norm": 6.6860563949651475, + "kl": 1.0283203125, + "learning_rate": 8.226947801195699e-07, + "loss": 0.1283, + "reward": 2.6459619998931885, + "reward_std": 0.2583989417180419, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.057163089513778687, + "rewards/tag_count_reward": 0.9947916865348816, "step": 727 }, { "clip_ratio": 0.0, - "completion_length": 596.9375, - "epoch": 0.728, - "grad_norm": 39.74856940380198, - "kl": 5.5, - "learning_rate": 2.8804466342921987e-07, - "loss": 1.4125, - "reward": 2.3083548545837402, - "reward_std": 0.7532355189323425, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9513888657093048, - "rewards/repetition_penalty_reward": -0.02324253274127841, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 261.0416717529297, + "epoch": 0.364, + "grad_norm": 3.671168469586843, + "kl": 0.251953125, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0623, + "reward": 2.95377254486084, + "reward_std": 0.04473966360092163, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03233869280666113, + "rewards/tag_count_reward": 1.0, "step": 728 }, { "clip_ratio": 0.0, - "completion_length": 644.2708587646484, - "epoch": 0.729, - "grad_norm": 27.585966675541705, - "kl": 5.046875, - "learning_rate": 2.86769046255753e-07, - "loss": 0.8454, - "reward": 2.0349258184432983, - "reward_std": 0.7431914508342743, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.9236111640930176, - "rewards/repetition_penalty_reward": -0.024102029390633106, - "rewards/tag_count_reward": 0.7604166865348816, + "completion_length": 476.3333435058594, + "epoch": 0.3645, + "grad_norm": 33.63088386822394, + "kl": 0.951171875, + "learning_rate": 8.21443594485377e-07, + "loss": 0.5476, + "reward": 2.48537540435791, + "reward_std": 0.7262991070747375, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722221791744232, + "rewards/repetition_penalty_reward": -0.08580523729324341, + "rewards/tag_count_reward": 0.953125, "step": 729 }, { "clip_ratio": 0.0, - "completion_length": 416.8958435058594, - "epoch": 0.73, - "grad_norm": 38.15694283580282, - "kl": 1.8203125, - "learning_rate": 2.854966364683872e-07, - "loss": 0.4531, - "reward": 2.5842431783676147, - "reward_std": 0.4230002462863922, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.021659548394382, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 392.16668701171875, + "epoch": 0.365, + "grad_norm": 30.396775140349977, + "kl": 0.5615234375, + "learning_rate": 8.208167604184217e-07, + "loss": 0.3701, + "reward": 2.5822075605392456, + "reward_std": 0.48322977125644684, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.07404248043894768, + "rewards/tag_count_reward": 0.9895833432674408, "step": 730 }, { "clip_ratio": 0.0, - "completion_length": 522.5416870117188, - "epoch": 0.731, - "grad_norm": 19.973765241704356, - "kl": 3.59375, - "learning_rate": 2.842274495710335e-07, - "loss": 0.5592, - "reward": 2.3725064992904663, - "reward_std": 0.5484490990638733, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.016382555477321148, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 446.97918701171875, + "epoch": 0.3655, + "grad_norm": 39.11126689033728, + "kl": 1.0234375, + "learning_rate": 8.201891013966478e-07, + "loss": 0.5577, + "reward": 2.379446864128113, + "reward_std": 0.5678260624408722, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.08062271773815155, + "rewards/tag_count_reward": 0.953125, "step": 731 }, { "clip_ratio": 0.0, - "completion_length": 775.3125305175781, - "epoch": 0.732, - "grad_norm": 19.54218927587623, - "kl": 8.140625, - "learning_rate": 2.829615010283344e-07, - "loss": 1.0839, - "reward": 2.0963175296783447, - "reward_std": 0.8241714835166931, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.06340477429330349, - "rewards/tag_count_reward": 0.7291666865348816, + "completion_length": 537.5416870117188, + "epoch": 0.366, + "grad_norm": 15.068758274769461, + "kl": 1.6875, + "learning_rate": 8.195606193320136e-07, + "loss": 0.6393, + "reward": 2.3407026529312134, + "reward_std": 0.7115332186222076, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.07075561210513115, + "rewards/tag_count_reward": 0.9322916865348816, "step": 732 }, { "clip_ratio": 0.0, - "completion_length": 618.8958435058594, - "epoch": 0.733, - "grad_norm": 15.706934322980523, - "kl": 5.859375, - "learning_rate": 2.8169880626547283e-07, - "loss": 0.8039, - "reward": 2.311804175376892, - "reward_std": 0.7484097182750702, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.033682181499898434, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 421.2708435058594, + "epoch": 0.3665, + "grad_norm": 9.730085638274787, + "kl": 1.34375, + "learning_rate": 8.189313161389844e-07, + "loss": 0.4843, + "reward": 2.605613946914673, + "reward_std": 0.44479694962501526, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.035011230036616325, + "rewards/tag_count_reward": 0.953125, "step": 733 }, { "clip_ratio": 0.0, - "completion_length": 640.8541870117188, - "epoch": 0.734, - "grad_norm": 45.104210914901735, - "kl": 4.265625, - "learning_rate": 2.8043938066798645e-07, - "loss": 1.1408, - "reward": 2.3691216707229614, - "reward_std": 0.8749706745147705, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.01976742222905159, - "rewards/tag_count_reward": 0.8125, + "completion_length": 562.0000305175781, + "epoch": 0.367, + "grad_norm": 36.417464044540424, + "kl": 4.1171875, + "learning_rate": 8.183011937345271e-07, + "loss": 1.1908, + "reward": 2.5398595333099365, + "reward_std": 0.769398033618927, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.06430716440081596, + "rewards/tag_count_reward": 0.8958333432674408, "step": 734 }, { "clip_ratio": 0.0, - "completion_length": 556.1041717529297, - "epoch": 0.735, - "grad_norm": 25.06133634060821, - "kl": 4.1171875, - "learning_rate": 2.791832395815782e-07, - "loss": 0.901, - "reward": 2.317103087902069, - "reward_std": 0.6717338263988495, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.030119193717837334, - "rewards/tag_count_reward": 0.8333333730697632, + "completion_length": 439.6875, + "epoch": 0.3675, + "grad_norm": 30.83756444127917, + "kl": 2.984375, + "learning_rate": 8.176702540381036e-07, + "loss": 0.9181, + "reward": 2.7064634561538696, + "reward_std": 0.5320930778980255, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.03832832910120487, + "rewards/tag_count_reward": 0.9531250298023224, "step": 735 }, { "clip_ratio": 0.0, - "completion_length": 541.4166870117188, - "epoch": 0.736, - "grad_norm": 70.06964895356546, - "kl": 4.8828125, - "learning_rate": 2.7793039831193133e-07, - "loss": 0.6714, - "reward": 2.38431978225708, - "reward_std": 0.4595172256231308, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.016721924766898155, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 330.7916717529297, + "epoch": 0.368, + "grad_norm": 13.454778716975406, + "kl": 1.0029296875, + "learning_rate": 8.170384989716657e-07, + "loss": 0.3605, + "reward": 2.8519721031188965, + "reward_std": 0.31362488865852356, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03691680170595646, + "rewards/tag_count_reward": 0.9791666865348816, "step": 736 }, { "clip_ratio": 0.0, - "completion_length": 563.3541870117188, - "epoch": 0.737, - "grad_norm": 37.75513345672942, - "kl": 3.3359375, - "learning_rate": 2.766808721245211e-07, - "loss": 1.1112, - "reward": 2.4378621578216553, - "reward_std": 0.59513920545578, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.016999011393636465, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 395.7708435058594, + "epoch": 0.3685, + "grad_norm": 13.197505616793235, + "kl": 2.11328125, + "learning_rate": 8.164059304596488e-07, + "loss": 0.25, + "reward": 2.556104898452759, + "reward_std": 0.37108321487903595, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.04632566310465336, + "rewards/tag_count_reward": 0.9427083432674408, "step": 737 }, { "clip_ratio": 0.0, - "completion_length": 531.875, - "epoch": 0.738, - "grad_norm": 18.56617973802945, - "kl": 4.125, - "learning_rate": 2.7543467624442956e-07, - "loss": 0.8568, - "reward": 2.246809482574463, - "reward_std": 0.6611096858978271, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.017079456709325314, - "rewards/tag_count_reward": 0.875, + "completion_length": 365.2708435058594, + "epoch": 0.369, + "grad_norm": 9.749024221023307, + "kl": 2.08203125, + "learning_rate": 8.157725504289664e-07, + "loss": 0.3044, + "reward": 2.4535757303237915, + "reward_std": 0.558014452457428, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.018646665383130312, + "rewards/tag_count_reward": 0.9166666865348816, "step": 738 }, { "clip_ratio": 0.0, - "completion_length": 520.2708435058594, - "epoch": 0.739, - "grad_norm": 23.972621042263274, - "kl": 3.859375, - "learning_rate": 2.741918258561607e-07, - "loss": 1.0541, - "reward": 2.4821611642837524, - "reward_std": 0.4908117651939392, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03520001098513603, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 479.2083435058594, + "epoch": 0.3695, + "grad_norm": 11.39083083892124, + "kl": 2.03125, + "learning_rate": 8.151383608090039e-07, + "loss": 0.635, + "reward": 2.229800820350647, + "reward_std": 0.5332637131214142, + "rewards/accuracy_reward": 0.4375000223517418, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.03929641842842102, + "rewards/tag_count_reward": 0.9010416865348816, "step": 739 }, { "clip_ratio": 0.0, - "completion_length": 527.1458587646484, - "epoch": 0.74, - "grad_norm": 40.196471457608176, - "kl": 6.765625, - "learning_rate": 2.729523361034538e-07, - "loss": 1.2053, - "reward": 2.3264917135238647, - "reward_std": 0.6097930297255516, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.022466725669801235, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 464.5625305175781, + "epoch": 0.37, + "grad_norm": 16.878206896706008, + "kl": 2.859375, + "learning_rate": 8.145033635316128e-07, + "loss": 0.6706, + "reward": 2.367822289466858, + "reward_std": 0.7059852480888367, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.031483279541134834, + "rewards/tag_count_reward": 0.9062500298023224, "step": 740 }, { "clip_ratio": 0.0, - "completion_length": 493.04168701171875, - "epoch": 0.741, - "grad_norm": 29.927610585853564, - "kl": 4.03515625, - "learning_rate": 2.717162220891007e-07, - "loss": 0.9692, - "reward": 2.016554594039917, - "reward_std": 0.4730721116065979, - "rewards/accuracy_reward": 0.16666667722165585, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.023376007564365864, - "rewards/tag_count_reward": 0.921875, + "completion_length": 343.375, + "epoch": 0.3705, + "grad_norm": 4.401882005610615, + "kl": 0.7861328125, + "learning_rate": 8.138675605311051e-07, + "loss": 0.1554, + "reward": 2.771618604660034, + "reward_std": 0.2334722802042961, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0460897758603096, + "rewards/tag_count_reward": 0.984375, "step": 741 }, { "clip_ratio": 0.0, - "completion_length": 512.9583435058594, - "epoch": 0.742, - "grad_norm": 25.41926623083724, - "kl": 4.2734375, - "learning_rate": 2.7048349887476037e-07, - "loss": 1.0409, - "reward": 2.418197512626648, - "reward_std": 0.6901432275772095, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.03492756187915802, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 359.4166717529297, + "epoch": 0.371, + "grad_norm": 4.239738905969326, + "kl": 1.05078125, + "learning_rate": 8.13230953744247e-07, + "loss": 0.1471, + "reward": 2.5801262855529785, + "reward_std": 0.2986167371273041, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03619310073554516, + "rewards/tag_count_reward": 0.9635416865348816, "step": 742 }, { "clip_ratio": 0.0, - "completion_length": 648.3958435058594, - "epoch": 0.743, - "grad_norm": 28.016549982017455, - "kl": 6.734375, - "learning_rate": 2.692541814807763e-07, - "loss": 1.1588, - "reward": 2.3356810808181763, - "reward_std": 0.7242786288261414, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.021957868244498968, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 389.3333435058594, + "epoch": 0.3715, + "grad_norm": 7.057211069031311, + "kl": 0.87109375, + "learning_rate": 8.125935451102528e-07, + "loss": 0.4242, + "reward": 2.3998607397079468, + "reward_std": 0.4749663770198822, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03763941116631031, + "rewards/tag_count_reward": 0.9583333432674408, "step": 743 }, { "clip_ratio": 0.0, - "completion_length": 528.0000152587891, - "epoch": 0.744, - "grad_norm": 93.21543758888738, - "kl": 9.375, - "learning_rate": 2.6802828488599294e-07, - "loss": 1.1425, - "reward": 2.218702793121338, - "reward_std": 0.743184506893158, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.02608906291425228, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 317.6458435058594, + "epoch": 0.372, + "grad_norm": 5.779641128334165, + "kl": 0.638671875, + "learning_rate": 8.119553365707802e-07, + "loss": 0.2267, + "reward": 2.8257747888565063, + "reward_std": 0.4051903337240219, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.026655779220163822, + "rewards/tag_count_reward": 0.9635416865348816, "step": 744 }, { "clip_ratio": 0.0, - "completion_length": 522.5833587646484, - "epoch": 0.745, - "grad_norm": 29.592772740929842, - "kl": 5.4375, - "learning_rate": 2.6680582402757324e-07, - "loss": 0.8376, - "reward": 2.1133170127868652, - "reward_std": 0.4543229639530182, - "rewards/accuracy_reward": 0.27083333395421505, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.020363647490739822, - "rewards/tag_count_reward": 0.890625, + "completion_length": 410.9166717529297, + "epoch": 0.3725, + "grad_norm": 3.66973899046251, + "kl": 0.517578125, + "learning_rate": 8.113163300699228e-07, + "loss": 0.1316, + "reward": 2.464886784553528, + "reward_std": 0.3369094356894493, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.038585495203733444, + "rewards/tag_count_reward": 0.9479166865348816, "step": 745 }, { "clip_ratio": 0.0, - "completion_length": 537.7083435058594, - "epoch": 0.746, - "grad_norm": 52.43685846686619, - "kl": 7.9375, - "learning_rate": 2.655868138008171e-07, - "loss": 1.3022, - "reward": 2.560097575187683, - "reward_std": 0.6881845593452454, + "completion_length": 314.81251525878906, + "epoch": 0.373, + "grad_norm": 4.867508354023929, + "kl": 0.4296875, + "learning_rate": 8.106765275542053e-07, + "loss": 0.1155, + "reward": 2.6931064128875732, + "reward_std": 0.40478771924972534, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.01802760176360607, - "rewards/tag_count_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0412687249481678, + "rewards/tag_count_reward": 0.984375, "step": 746 }, { "clip_ratio": 0.0, - "completion_length": 491.4583435058594, - "epoch": 0.747, - "grad_norm": 42.17070883402169, - "kl": 4.8125, - "learning_rate": 2.6437126905897967e-07, - "loss": 0.9389, - "reward": 2.4418286085128784, - "reward_std": 0.479451060295105, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.023449244908988476, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 318.7291717529297, + "epoch": 0.3735, + "grad_norm": 4.225560833503795, + "kl": 0.349609375, + "learning_rate": 8.100359309725774e-07, + "loss": 0.0563, + "reward": 2.715116500854492, + "reward_std": 0.3557048738002777, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03835564851760864, + "rewards/tag_count_reward": 0.9895833432674408, "step": 747 }, { "clip_ratio": 0.0, - "completion_length": 495.89585876464844, - "epoch": 0.748, - "grad_norm": 42.23947257797948, - "kl": 3.734375, - "learning_rate": 2.631592046130896e-07, - "loss": 0.8408, - "reward": 2.2848896980285645, - "reward_std": 0.6534326076507568, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.029346639290452003, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 364.68751525878906, + "epoch": 0.374, + "grad_norm": 5.832499305507599, + "kl": 0.7421875, + "learning_rate": 8.093945422764069e-07, + "loss": 0.2438, + "reward": 2.5478862524032593, + "reward_std": 0.48231200873851776, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03891943581402302, + "rewards/tag_count_reward": 0.9479166865348816, "step": 748 }, { "clip_ratio": 0.0, - "completion_length": 520.9166717529297, - "epoch": 0.749, - "grad_norm": 28.13462040337534, - "kl": 4.9921875, - "learning_rate": 2.6195063523177e-07, - "loss": 0.6576, - "reward": 2.5075308084487915, - "reward_std": 0.4372696727514267, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.020247011445462704, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 513.2916717529297, + "epoch": 0.3745, + "grad_norm": 18.59065925947558, + "kl": 1.67578125, + "learning_rate": 8.087523634194754e-07, + "loss": 0.7216, + "reward": 2.548434257507324, + "reward_std": 0.6094189584255219, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.04010744206607342, + "rewards/tag_count_reward": 0.9010416865348816, "step": 749 }, { "clip_ratio": 0.0, - "completion_length": 550.9791870117188, - "epoch": 0.75, - "grad_norm": 51.58149030204615, - "kl": 4.734375, - "learning_rate": 2.6074557564105724e-07, - "loss": 1.2626, - "reward": 2.304089069366455, - "reward_std": 0.65184286236763, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.010147055611014366, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 322.60418701171875, + "epoch": 0.375, + "grad_norm": 7.99311552783378, + "kl": 1.29296875, + "learning_rate": 8.081093963579707e-07, + "loss": 0.2852, + "reward": 2.577797532081604, + "reward_std": 0.4589303731918335, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.05241105332970619, + "rewards/tag_count_reward": 0.9635416865348816, "step": 750 }, { "clip_ratio": 0.0, - "completion_length": 414.5208435058594, - "epoch": 0.751, - "grad_norm": 25.50420977958888, - "kl": 2.85546875, - "learning_rate": 2.595440405242222e-07, - "loss": 0.4946, - "reward": 2.3329780101776123, - "reward_std": 0.4813043922185898, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01424429938197136, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 367.8333435058594, + "epoch": 0.3755, + "grad_norm": 7.024161119025017, + "kl": 0.73779296875, + "learning_rate": 8.074656430504823e-07, + "loss": 0.1466, + "reward": 2.575978994369507, + "reward_std": 0.32069287449121475, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03339605592191219, + "rewards/tag_count_reward": 0.984375, "step": 751 }, { "clip_ratio": 0.0, - "completion_length": 549.7708587646484, - "epoch": 0.752, - "grad_norm": 30.257524874531967, - "kl": 4.296875, - "learning_rate": 2.583460445215911e-07, - "loss": 1.0795, - "reward": 2.672580599784851, - "reward_std": 0.6382189244031906, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.013183359056711197, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 360.93751525878906, + "epoch": 0.376, + "grad_norm": 13.063738129746366, + "kl": 0.5400390625, + "learning_rate": 8.068211054579943e-07, + "loss": 0.322, + "reward": 2.6451334953308105, + "reward_std": 0.4438292682170868, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.026741638779640198, + "rewards/tag_count_reward": 0.9635416865348816, "step": 752 }, { "clip_ratio": 0.0, - "completion_length": 409.50001525878906, - "epoch": 0.753, - "grad_norm": 42.99616520034882, - "kl": 3.15625, - "learning_rate": 2.571516022303671e-07, - "loss": 0.8373, - "reward": 2.759161591529846, - "reward_std": 0.7119235694408417, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.02382473973557353, - "rewards/tag_count_reward": 0.921875, + "completion_length": 420.2083435058594, + "epoch": 0.3765, + "grad_norm": 23.053449901319926, + "kl": 1.609375, + "learning_rate": 8.061757855438799e-07, + "loss": 0.4878, + "reward": 2.5872808694839478, + "reward_std": 0.6218395531177521, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.04292759019881487, + "rewards/tag_count_reward": 0.9427083432674408, "step": 753 }, { "clip_ratio": 0.0, - "completion_length": 536.5625305175781, - "epoch": 0.754, - "grad_norm": 31.379188639299347, - "kl": 5.7109375, - "learning_rate": 2.5596072820445254e-07, - "loss": 1.1123, - "reward": 2.451164484024048, - "reward_std": 0.6144693195819855, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.909722238779068, - "rewards/repetition_penalty_reward": -0.01584968389943242, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 393.9583435058594, + "epoch": 0.377, + "grad_norm": 18.602852015242384, + "kl": 1.9296875, + "learning_rate": 8.055296852738956e-07, + "loss": 0.4041, + "reward": 2.7404640913009644, + "reward_std": 0.3876974582672119, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05120253935456276, + "rewards/tag_count_reward": 0.9791666865348816, "step": 754 }, { "clip_ratio": 0.0, - "completion_length": 489.0416717529297, - "epoch": 0.755, - "grad_norm": 45.223755625235505, - "kl": 4.796875, - "learning_rate": 2.547734369542718e-07, - "loss": 1.2234, - "reward": 2.4944331645965576, - "reward_std": 0.610102117061615, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.012511319015175104, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 279.62501525878906, + "epoch": 0.3775, + "grad_norm": 3.7238210341731244, + "kl": 0.24462890625, + "learning_rate": 8.048828066161747e-07, + "loss": 0.0714, + "reward": 2.776410698890686, + "reward_std": 0.1331208571791649, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02914472296833992, + "rewards/tag_count_reward": 1.0, "step": 755 }, { "clip_ratio": 0.0, - "completion_length": 443.12501525878906, - "epoch": 0.756, - "grad_norm": 43.56858310727776, - "kl": 4.9921875, - "learning_rate": 2.5358974294659373e-07, - "loss": 1.0932, - "reward": 2.5396214723587036, - "reward_std": 0.7032516896724701, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.014198230113834143, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 321.1666717529297, + "epoch": 0.378, + "grad_norm": 67.36108059538589, + "kl": 3.19140625, + "learning_rate": 8.04235151541222e-07, + "loss": 0.2793, + "reward": 2.7950555086135864, + "reward_std": 0.21008452773094177, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03827812150120735, + "rewards/tag_count_reward": 1.0, "step": 756 }, { "clip_ratio": 0.0, - "completion_length": 380.25, - "epoch": 0.757, - "grad_norm": 42.833747394340975, - "kl": 3.90625, - "learning_rate": 2.5240966060435674e-07, - "loss": 0.4364, - "reward": 2.670251488685608, - "reward_std": 0.3203464448451996, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.013776272535324097, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 373.7291717529297, + "epoch": 0.3785, + "grad_norm": 84.54444780952632, + "kl": 2.7587890625, + "learning_rate": 8.035867220219071e-07, + "loss": 0.3899, + "reward": 2.434798002243042, + "reward_std": 0.4415217489004135, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04263266548514366, + "rewards/tag_count_reward": 0.9635416865348816, "step": 757 }, { "clip_ratio": 0.0, - "completion_length": 520.7291870117188, - "epoch": 0.758, - "grad_norm": 103.32021322129948, - "kl": 6.125, - "learning_rate": 2.512332043064913e-07, - "loss": 1.2798, - "reward": 2.463586449623108, - "reward_std": 0.6297429800033569, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.017316261306405067, - "rewards/tag_count_reward": 0.890625, + "completion_length": 342.3541717529297, + "epoch": 0.379, + "grad_norm": 119.06865323156653, + "kl": 5.875, + "learning_rate": 8.029375200334587e-07, + "loss": 0.4683, + "reward": 2.722063660621643, + "reward_std": 0.38934317231178284, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04356146976351738, + "rewards/tag_count_reward": 0.9739583432674408, "step": 758 }, { "clip_ratio": 0.0, - "completion_length": 603.5625305175781, - "epoch": 0.759, - "grad_norm": 43.65057021190212, - "kl": 6.9375, - "learning_rate": 2.5006038838774647e-07, - "loss": 0.9189, - "reward": 2.158547341823578, - "reward_std": 0.6411450803279877, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.009855579119175673, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 342.2916717529297, + "epoch": 0.3795, + "grad_norm": 27.51386663996604, + "kl": 2.546875, + "learning_rate": 8.022875475534588e-07, + "loss": 0.4781, + "reward": 2.81855309009552, + "reward_std": 0.3882335126399994, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.028669222258031368, + "rewards/tag_count_reward": 0.9583333432674408, "step": 759 }, { "clip_ratio": 0.0, - "completion_length": 617.6041870117188, - "epoch": 0.76, - "grad_norm": 19.541751124761845, - "kl": 6.046875, - "learning_rate": 2.488912271385139e-07, - "loss": 1.0373, - "reward": 2.354681611061096, - "reward_std": 0.6235771775245667, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.025526808109134436, - "rewards/tag_count_reward": 0.796875, + "completion_length": 388.0208435058594, + "epoch": 0.38, + "grad_norm": 75.44970102943357, + "kl": 3.5390625, + "learning_rate": 8.01636806561836e-07, + "loss": 0.5887, + "reward": 2.4815808534622192, + "reward_std": 0.4953022599220276, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0427247304469347, + "rewards/tag_count_reward": 0.9687500298023224, "step": 760 }, { "clip_ratio": 0.0, - "completion_length": 486.4166717529297, - "epoch": 0.761, - "grad_norm": 39.557769432826085, - "kl": 2.8046875, - "learning_rate": 2.4772573480465445e-07, - "loss": 0.8268, - "reward": 2.4871546030044556, - "reward_std": 0.6703044772148132, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.024998143315315247, - "rewards/tag_count_reward": 0.921875, + "completion_length": 292.3958435058594, + "epoch": 0.3805, + "grad_norm": 5.1567729258708015, + "kl": 0.36328125, + "learning_rate": 8.009852990408606e-07, + "loss": -0.0068, + "reward": 2.7873027324676514, + "reward_std": 0.34609653055667877, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.02346120961010456, + "rewards/tag_count_reward": 0.984375, "step": 761 }, { "clip_ratio": 0.0, - "completion_length": 558.5208435058594, - "epoch": 0.762, - "grad_norm": 37.29197408055863, - "kl": 4.4140625, - "learning_rate": 2.465639255873246e-07, - "loss": 0.9776, - "reward": 2.1103891134262085, - "reward_std": 0.7344604879617691, - "rewards/accuracy_reward": 0.3958333358168602, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.02502751164138317, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 298.2083435058594, + "epoch": 0.381, + "grad_norm": 5.782753089670167, + "kl": 0.2744140625, + "learning_rate": 8.003330269751372e-07, + "loss": 0.0454, + "reward": 2.6413458585739136, + "reward_std": 0.27150559425354004, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.05483482964336872, + "rewards/tag_count_reward": 0.9947916865348816, "step": 762 }, { "clip_ratio": 0.0, - "completion_length": 556.3333435058594, - "epoch": 0.763, - "grad_norm": 17.379113373562127, - "kl": 4.255859375, - "learning_rate": 2.454058136428027e-07, - "loss": 0.5297, - "reward": 2.3058613538742065, - "reward_std": 0.5592197477817535, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.017055293079465628, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 323.43751525878906, + "epoch": 0.3815, + "grad_norm": 8.790559038103488, + "kl": 0.712890625, + "learning_rate": 7.996799923515997e-07, + "loss": 0.1095, + "reward": 2.427235722541809, + "reward_std": 0.40729573369026184, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9791666269302368, + "rewards/repetition_penalty_reward": -0.036305982619524, + "rewards/tag_count_reward": 0.9427083730697632, "step": 763 }, { "clip_ratio": 0.0, - "completion_length": 404.56251525878906, - "epoch": 0.764, - "grad_norm": 18.161864885556103, - "kl": 1.474609375, - "learning_rate": 2.4425141308231765e-07, - "loss": 0.2815, - "reward": 2.35969877243042, - "reward_std": 0.4401056468486786, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023981940932571888, - "rewards/tag_count_reward": 0.953125, + "completion_length": 339.37501525878906, + "epoch": 0.382, + "grad_norm": 12.201161237432581, + "kl": 0.9375, + "learning_rate": 7.990261971595048e-07, + "loss": 0.2632, + "reward": 2.6326658725738525, + "reward_std": 0.4427875429391861, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02879253402352333, + "rewards/tag_count_reward": 0.9322916865348816, "step": 764 }, { "clip_ratio": 0.0, - "completion_length": 388.0, - "epoch": 0.765, - "grad_norm": 35.807830777637626, - "kl": 2.65625, - "learning_rate": 2.4310073797187573e-07, - "loss": 0.7565, - "reward": 2.547232747077942, - "reward_std": 0.6375119686126709, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9305556416511536, - "rewards/repetition_penalty_reward": -0.013531276490539312, - "rewards/tag_count_reward": 0.921875, + "completion_length": 353.75, + "epoch": 0.3825, + "grad_norm": 22.3451597944635, + "kl": 1.24609375, + "learning_rate": 7.983716433904262e-07, + "loss": 0.3512, + "reward": 2.682904601097107, + "reward_std": 0.46456609666347504, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03758169710636139, + "rewards/tag_count_reward": 0.9427083432674408, "step": 765 }, { "clip_ratio": 0.0, - "completion_length": 414.5416717529297, - "epoch": 0.766, - "grad_norm": 31.125919596419223, - "kl": 1.38671875, - "learning_rate": 2.4195380233209006e-07, - "loss": 0.481, - "reward": 2.5880796909332275, - "reward_std": 0.547327809035778, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.021295433398336172, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 302.4583435058594, + "epoch": 0.383, + "grad_norm": 20.403277618055263, + "kl": 2.078125, + "learning_rate": 7.977163330382479e-07, + "loss": 0.2761, + "reward": 2.799572229385376, + "reward_std": 0.3507230281829834, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.026816830039024353, + "rewards/tag_count_reward": 0.9791666865348816, "step": 766 }, { "clip_ratio": 0.0, - "completion_length": 451.41668701171875, - "epoch": 0.767, - "grad_norm": 36.219759589516954, - "kl": 4.25, - "learning_rate": 2.408106201380097e-07, - "loss": 0.8591, - "reward": 2.333495259284973, - "reward_std": 0.6193402707576752, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.015463168267160654, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 379.2291717529297, + "epoch": 0.3835, + "grad_norm": 83.31159486387506, + "kl": 6.796875, + "learning_rate": 7.970602680991592e-07, + "loss": 0.7361, + "reward": 2.5988014936447144, + "reward_std": 0.6159058213233948, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.026198641397058964, + "rewards/tag_count_reward": 0.8958333730697632, "step": 767 }, { "clip_ratio": 0.0, - "completion_length": 520.5833435058594, - "epoch": 0.768, - "grad_norm": 23.45639672156471, - "kl": 4.4765625, - "learning_rate": 2.3967120531894857e-07, - "loss": 0.9129, - "reward": 2.30954110622406, - "reward_std": 0.6106710433959961, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016847790218889713, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 366.5625, + "epoch": 0.384, + "grad_norm": 363.6216773317555, + "kl": 13.65625, + "learning_rate": 7.964034505716476e-07, + "loss": 1.19, + "reward": 2.539207100868225, + "reward_std": 0.5661123096942902, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03891796059906483, + "rewards/tag_count_reward": 0.9531250298023224, "step": 768 }, { "clip_ratio": 0.0, - "completion_length": 551.25, - "epoch": 0.769, - "grad_norm": 129.78473682307904, - "kl": 6.140625, - "learning_rate": 2.38535571758317e-07, - "loss": 1.3243, - "reward": 2.448180079460144, - "reward_std": 0.7538703083992004, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.02404231671243906, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 472.4791717529297, + "epoch": 0.3845, + "grad_norm": 187.4365397141768, + "kl": 16.546875, + "learning_rate": 7.957458824564931e-07, + "loss": 1.2768, + "reward": 2.338832139968872, + "reward_std": 0.5791323632001877, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.011862346436828375, + "rewards/tag_count_reward": 0.84375, "step": 769 }, { "clip_ratio": 0.0, - "completion_length": 569.8750305175781, - "epoch": 0.77, - "grad_norm": 35.971720291998714, - "kl": 7.40625, - "learning_rate": 2.374037332934512e-07, - "loss": 1.0446, - "reward": 2.1608505249023438, - "reward_std": 0.7265328466892242, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.028385651297867298, - "rewards/tag_count_reward": 0.828125, + "completion_length": 546.7916717529297, + "epoch": 0.385, + "grad_norm": 35.1440590310128, + "kl": 5.8671875, + "learning_rate": 7.950875657567621e-07, + "loss": 0.861, + "reward": 2.5058937072753906, + "reward_std": 0.5305248498916626, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.0201479597017169, + "rewards/tag_count_reward": 0.859375, "step": 770 }, { "clip_ratio": 0.0, - "completion_length": 474.7916717529297, - "epoch": 0.771, - "grad_norm": 78.42716292437864, - "kl": 4.375, - "learning_rate": 2.36275703715446e-07, - "loss": 0.6855, - "reward": 2.4102360010147095, - "reward_std": 0.627646267414093, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.016847367398440838, - "rewards/tag_count_reward": 0.9270833730697632, + "completion_length": 511.39585876464844, + "epoch": 0.3855, + "grad_norm": 78.06611028598661, + "kl": 6.6328125, + "learning_rate": 7.944285024778017e-07, + "loss": 0.8001, + "reward": 2.510785937309265, + "reward_std": 0.6201603412628174, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.027408569119870663, + "rewards/tag_count_reward": 0.9062500298023224, "step": 771 }, { "clip_ratio": 0.0, - "completion_length": 517.9166870117188, - "epoch": 0.772, - "grad_norm": 51.33839598106081, - "kl": 7.234375, - "learning_rate": 2.3515149676898552e-07, - "loss": 0.9322, - "reward": 2.0547146797180176, - "reward_std": 0.7165104746818542, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.8819445371627808, - "rewards/repetition_penalty_reward": -0.03035500179976225, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 411.1458435058594, + "epoch": 0.386, + "grad_norm": 18.27424052638579, + "kl": 2.765625, + "learning_rate": 7.93768694627233e-07, + "loss": 0.5689, + "reward": 2.3896729946136475, + "reward_std": 0.5375154912471771, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.028729761950671673, + "rewards/tag_count_reward": 0.8906250298023224, "step": 772 }, { "clip_ratio": 0.0, - "completion_length": 662.8541870117188, - "epoch": 0.773, - "grad_norm": 99.24123358538509, - "kl": 10.1875, - "learning_rate": 2.3403112615217693e-07, - "loss": 1.2164, - "reward": 2.322015166282654, - "reward_std": 0.7576204538345337, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.014790416695177555, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 495.625, + "epoch": 0.3865, + "grad_norm": 14.881333576700378, + "kl": 2.375, + "learning_rate": 7.931081442149448e-07, + "loss": 0.4987, + "reward": 2.577857255935669, + "reward_std": 0.5150826573371887, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02457323018461466, + "rewards/tag_count_reward": 0.9010416865348816, "step": 773 }, { "clip_ratio": 0.0, - "completion_length": 555.0208587646484, - "epoch": 0.774, - "grad_norm": 55.815576719277246, - "kl": 6.765625, - "learning_rate": 2.3291460551638237e-07, - "loss": 1.1251, - "reward": 2.1742671728134155, - "reward_std": 0.6895886063575745, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.020177372731268406, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 427.9375, + "epoch": 0.387, + "grad_norm": 11.469486615908298, + "kl": 1.912109375, + "learning_rate": 7.924468532530883e-07, + "loss": 0.2236, + "reward": 2.6311352252960205, + "reward_std": 0.37547628581523895, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0615730844438076, + "rewards/tag_count_reward": 0.9427083730697632, "step": 774 }, { "clip_ratio": 0.0, - "completion_length": 501.3333435058594, - "epoch": 0.775, - "grad_norm": 21.026258662130076, - "kl": 5.171875, - "learning_rate": 2.3180194846605364e-07, - "loss": 0.8011, - "reward": 2.2563339471817017, - "reward_std": 0.6224390119314194, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.011027187574654818, - "rewards/tag_count_reward": 0.8645833730697632, + "completion_length": 642.3125305175781, + "epoch": 0.3875, + "grad_norm": 47.419569435384595, + "kl": 5.8515625, + "learning_rate": 7.917848237560708e-07, + "loss": 0.7953, + "reward": 2.1859389543533325, + "reward_std": 0.7739139497280121, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02933894842863083, + "rewards/tag_count_reward": 0.7708333432674408, "step": 775 }, { "clip_ratio": 0.0, - "completion_length": 488.3958435058594, - "epoch": 0.776, - "grad_norm": 40.72839840915295, - "kl": 2.578125, - "learning_rate": 2.306931685585657e-07, - "loss": 0.791, - "reward": 2.5573089122772217, - "reward_std": 0.6792595684528351, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.04164944589138031, - "rewards/tag_count_reward": 0.9114583432674408, + "completion_length": 433.8958435058594, + "epoch": 0.388, + "grad_norm": 16.151319546185952, + "kl": 2.9453125, + "learning_rate": 7.911220577405484e-07, + "loss": 0.5875, + "reward": 2.4124279022216797, + "reward_std": 0.5835599303245544, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023336103186011314, + "rewards/tag_count_reward": 0.9010416865348816, "step": 776 }, { "clip_ratio": 0.0, - "completion_length": 478.1875, - "epoch": 0.777, - "grad_norm": 65.73064893150305, - "kl": 3.2578125, - "learning_rate": 2.2958827930405162e-07, - "loss": 1.0533, - "reward": 2.6781049966812134, - "reward_std": 0.6686598658561707, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.01633965945802629, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 493.87501525878906, + "epoch": 0.3885, + "grad_norm": 19.784079907225436, + "kl": 3.0546875, + "learning_rate": 7.904585572254218e-07, + "loss": 0.5988, + "reward": 2.564281463623047, + "reward_std": 0.6512129902839661, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02773252595216036, + "rewards/tag_count_reward": 0.8697916865348816, "step": 777 }, { "clip_ratio": 0.0, - "completion_length": 374.79168701171875, - "epoch": 0.778, - "grad_norm": 12.77197238349156, - "kl": 2.05078125, - "learning_rate": 2.2848729416523859e-07, - "loss": 0.2598, - "reward": 2.4885555505752563, - "reward_std": 0.43929314613342285, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.014916833024471998, - "rewards/tag_count_reward": 0.9687500298023224, + "completion_length": 527.8750152587891, + "epoch": 0.389, + "grad_norm": 14.102553925882571, + "kl": 3.291015625, + "learning_rate": 7.897943242318285e-07, + "loss": 0.6637, + "reward": 2.508991241455078, + "reward_std": 0.49256084859371185, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.03962016478180885, + "rewards/tag_count_reward": 0.875, "step": 778 }, { "clip_ratio": 0.0, - "completion_length": 520.2291870117188, - "epoch": 0.779, - "grad_norm": 27.96030488624158, - "kl": 6.6875, - "learning_rate": 2.2739022655728277e-07, - "loss": 0.9628, - "reward": 2.069424092769623, - "reward_std": 0.6286317706108093, - "rewards/accuracy_reward": 0.3125000111758709, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.012173243798315525, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 553.7708587646484, + "epoch": 0.3895, + "grad_norm": 21.449440079929175, + "kl": 4.6875, + "learning_rate": 7.891293607831373e-07, + "loss": 0.6951, + "reward": 2.2457956075668335, + "reward_std": 0.3153197094798088, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03719065245240927, + "rewards/tag_count_reward": 0.8593750298023224, "step": 779 }, { "clip_ratio": 0.0, - "completion_length": 430.22918701171875, - "epoch": 0.78, - "grad_norm": 29.26274656267414, - "kl": 3.59375, - "learning_rate": 2.2629708984760706e-07, - "loss": 0.9216, - "reward": 2.4278910160064697, - "reward_std": 0.5140875577926636, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.018289764411747456, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 681.0833435058594, + "epoch": 0.39, + "grad_norm": 50.20111893584829, + "kl": 5.640625, + "learning_rate": 7.884636689049422e-07, + "loss": 0.9104, + "reward": 2.192242741584778, + "reward_std": 0.5565801113843918, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04213242046535015, + "rewards/tag_count_reward": 0.7968750298023224, "step": 780 }, { "clip_ratio": 0.0, - "completion_length": 520.8750152587891, - "epoch": 0.781, - "grad_norm": 27.923966049196657, - "kl": 4.5703125, - "learning_rate": 2.2520789735573704e-07, - "loss": 1.121, - "reward": 2.480145573616028, - "reward_std": 0.6819994747638702, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.02679906040430069, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 493.3125305175781, + "epoch": 0.3905, + "grad_norm": 31.66716632745188, + "kl": 2.26953125, + "learning_rate": 7.877972506250562e-07, + "loss": 0.727, + "reward": 2.6193435192108154, + "reward_std": 0.655490517616272, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03517040517181158, + "rewards/tag_count_reward": 0.8906250298023224, "step": 781 }, { "clip_ratio": 0.0, - "completion_length": 503.1875305175781, - "epoch": 0.782, - "grad_norm": 33.004171238913635, - "kl": 2.75, - "learning_rate": 2.2412266235313973e-07, - "loss": 0.7836, - "reward": 2.555114269256592, - "reward_std": 0.6732204258441925, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.04210797697305679, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 546.0833435058594, + "epoch": 0.391, + "grad_norm": 20.081813742060923, + "kl": 2.97265625, + "learning_rate": 7.871301079735049e-07, + "loss": 0.6893, + "reward": 2.3784666061401367, + "reward_std": 0.4472994953393936, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02604720927774906, + "rewards/tag_count_reward": 0.8697916865348816, "step": 782 }, { "clip_ratio": 0.0, - "completion_length": 543.2291870117188, - "epoch": 0.783, - "grad_norm": 26.670810792645383, - "kl": 3.9765625, - "learning_rate": 2.230413980630609e-07, - "loss": 0.8157, - "reward": 2.3876798152923584, - "reward_std": 0.6481176614761353, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9305555522441864, - "rewards/repetition_penalty_reward": -0.02725088130682707, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 530.6458435058594, + "epoch": 0.3915, + "grad_norm": 18.755713225081045, + "kl": 3.94140625, + "learning_rate": 7.864622429825204e-07, + "loss": 0.5134, + "reward": 2.5227210521698, + "reward_std": 0.6135649532079697, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04672360047698021, + "rewards/tag_count_reward": 0.8958333432674408, "step": 783 }, { "clip_ratio": 0.0, - "completion_length": 454.14585876464844, - "epoch": 0.784, - "grad_norm": 20.273867549895126, - "kl": 2.8828125, - "learning_rate": 2.2196411766036487e-07, - "loss": 0.3284, - "reward": 2.4754650592803955, - "reward_std": 0.5723298937082291, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03495161887258291, - "rewards/tag_count_reward": 0.90625, + "completion_length": 393.2083435058594, + "epoch": 0.392, + "grad_norm": 44.37092698629651, + "kl": 3.7265625, + "learning_rate": 7.857936576865356e-07, + "loss": 0.5662, + "reward": 2.404683828353882, + "reward_std": 0.31213444471359253, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02413582894951105, + "rewards/tag_count_reward": 0.9427083432674408, "step": 784 }, { "clip_ratio": 0.0, - "completion_length": 726.125, - "epoch": 0.785, - "grad_norm": 29.95305237028232, - "kl": 8.2578125, - "learning_rate": 2.2089083427137329e-07, - "loss": 1.0536, - "reward": 2.2507245540618896, - "reward_std": 0.7002788186073303, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.025317177176475525, - "rewards/tag_count_reward": 0.7552083432674408, + "completion_length": 403.0208435058594, + "epoch": 0.3925, + "grad_norm": 26.04848472624046, + "kl": 1.279296875, + "learning_rate": 7.851243541221769e-07, + "loss": 0.4965, + "reward": 2.568751573562622, + "reward_std": 0.3721753219142556, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023262279108166695, + "rewards/tag_count_reward": 0.9322916865348816, "step": 785 }, { "clip_ratio": 0.0, - "completion_length": 456.6875, - "epoch": 0.786, - "grad_norm": 23.330969926542757, - "kl": 3.6953125, - "learning_rate": 2.1982156097370557e-07, - "loss": 0.851, - "reward": 2.6652588844299316, - "reward_std": 0.6001283526420593, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.015296765603125095, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 430.6458435058594, + "epoch": 0.393, + "grad_norm": 13.25665865367494, + "kl": 2.01953125, + "learning_rate": 7.844543343282595e-07, + "loss": 0.4602, + "reward": 2.5878489017486572, + "reward_std": 0.3847273141145706, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722221791744232, + "rewards/repetition_penalty_reward": -0.03541503753513098, + "rewards/tag_count_reward": 0.921875, "step": 786 }, { "clip_ratio": 0.0, - "completion_length": 439.56251525878906, - "epoch": 0.787, - "grad_norm": 41.78445904580422, - "kl": 3.1875, - "learning_rate": 2.1875631079611956e-07, - "loss": 0.9046, - "reward": 2.517007827758789, - "reward_std": 0.5808334052562714, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.019450515508651733, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 444.87501525878906, + "epoch": 0.3935, + "grad_norm": 22.772832737303286, + "kl": 4.8125, + "learning_rate": 7.837836003457793e-07, + "loss": 0.6349, + "reward": 2.510228753089905, + "reward_std": 0.5596920847892761, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04532698355615139, + "rewards/tag_count_reward": 0.8958333432674408, "step": 787 }, { "clip_ratio": 0.0, - "completion_length": 500.0208435058594, - "epoch": 0.788, - "grad_norm": 21.94070893015738, - "kl": 3.8828125, - "learning_rate": 2.1769509671835223e-07, - "loss": 0.8589, - "reward": 2.501682996749878, - "reward_std": 0.7997144758701324, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.022622703574597836, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 472.9583435058594, + "epoch": 0.394, + "grad_norm": 15.953872553666358, + "kl": 2.59375, + "learning_rate": 7.831121542179086e-07, + "loss": 0.7152, + "reward": 2.431522011756897, + "reward_std": 0.46617111563682556, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04590862803161144, + "rewards/tag_count_reward": 0.9010416865348816, "step": 788 }, { "clip_ratio": 0.0, - "completion_length": 356.10418701171875, - "epoch": 0.789, - "grad_norm": 18.917515789374068, - "kl": 4.484375, - "learning_rate": 2.166379316709625e-07, - "loss": 0.4355, - "reward": 2.2824082374572754, - "reward_std": 0.5984006226062775, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.916666716337204, - "rewards/repetition_penalty_reward": -0.019675294868648052, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 631.1041870117188, + "epoch": 0.3945, + "grad_norm": 21.209792192046844, + "kl": 3.51953125, + "learning_rate": 7.824399979899889e-07, + "loss": 0.4762, + "reward": 2.2550671100616455, + "reward_std": 0.3108735680580139, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06437743455171585, + "rewards/tag_count_reward": 0.875, "step": 789 }, { "clip_ratio": 0.0, - "completion_length": 632.2083740234375, - "epoch": 0.79, - "grad_norm": 16.91140019655358, - "kl": 6.6875, - "learning_rate": 2.1558482853517253e-07, - "loss": 1.2443, - "reward": 2.2729358673095703, - "reward_std": 0.853611171245575, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.8819445073604584, - "rewards/repetition_penalty_reward": -0.025675359182059765, - "rewards/tag_count_reward": 0.8333333432674408, + "completion_length": 340.8333435058594, + "epoch": 0.395, + "grad_norm": 33.37412557405853, + "kl": 1.37890625, + "learning_rate": 7.817671337095244e-07, + "loss": 0.4575, + "reward": 2.5797940492630005, + "reward_std": 0.6914741396903992, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.041733767837285995, + "rewards/tag_count_reward": 0.9062500298023224, "step": 790 }, { "clip_ratio": 0.0, - "completion_length": 376.31251525878906, - "epoch": 0.791, - "grad_norm": 20.048000622022776, - "kl": 1.310546875, - "learning_rate": 2.1453580014271203e-07, - "loss": 0.3778, - "reward": 2.2707479000091553, - "reward_std": 0.3341696485877037, - "rewards/accuracy_reward": 0.3125000074505806, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0191827230155468, - "rewards/tag_count_reward": 0.984375, + "completion_length": 437.12501525878906, + "epoch": 0.3955, + "grad_norm": 16.65766077241616, + "kl": 1.3662109375, + "learning_rate": 7.810935634261764e-07, + "loss": 0.3751, + "reward": 2.6942687034606934, + "reward_std": 0.47277119755744934, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.04705093614757061, + "rewards/tag_count_reward": 0.9218750298023224, "step": 791 }, { "clip_ratio": 0.0, - "completion_length": 371.6458435058594, - "epoch": 0.792, - "grad_norm": 19.6502337749718, - "kl": 1.71484375, - "learning_rate": 2.134908592756607e-07, - "loss": 0.4382, - "reward": 2.6625298261642456, - "reward_std": 0.537158876657486, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02670634165406227, - "rewards/tag_count_reward": 0.9739583730697632, + "completion_length": 486.60418701171875, + "epoch": 0.396, + "grad_norm": 9.722828730186746, + "kl": 2.5546875, + "learning_rate": 7.804192891917571e-07, + "loss": 0.4686, + "reward": 2.0610522031784058, + "reward_std": 0.35085567831993103, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.04658674821257591, + "rewards/tag_count_reward": 0.8645833432674408, "step": 792 }, { "clip_ratio": 0.0, - "completion_length": 515.7291870117188, - "epoch": 0.793, - "grad_norm": 28.434148981976, - "kl": 6.421875, - "learning_rate": 2.124500186662932e-07, - "loss": 0.8134, - "reward": 2.500126600265503, - "reward_std": 0.7610350847244263, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.013762431219220161, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 768.0, + "epoch": 0.3965, + "grad_norm": 32.12608283927452, + "kl": 5.296875, + "learning_rate": 7.797443130602226e-07, + "loss": 0.9858, + "reward": 1.9784066081047058, + "reward_std": 0.8380246162414551, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.10145452991127968, + "rewards/tag_count_reward": 0.7812500298023224, "step": 793 }, { "clip_ratio": 0.0, - "completion_length": 621.7500305175781, - "epoch": 0.794, - "grad_norm": 34.773569674299296, - "kl": 7.453125, - "learning_rate": 2.1141329099692406e-07, - "loss": 1.1273, - "reward": 2.032840847969055, - "reward_std": 0.881601870059967, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.034867677837610245, - "rewards/tag_count_reward": 0.7968750298023224, + "completion_length": 600.875, + "epoch": 0.397, + "grad_norm": 18.410765045440638, + "kl": 2.9765625, + "learning_rate": 7.79068637087667e-07, + "loss": 0.643, + "reward": 2.441210389137268, + "reward_std": 0.3568989485502243, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.08483139425516129, + "rewards/tag_count_reward": 0.8802083432674408, "step": 794 }, { "clip_ratio": 0.0, - "completion_length": 517.0000152587891, - "epoch": 0.795, - "grad_norm": 30.664837556455563, - "kl": 5.78125, - "learning_rate": 2.1038068889975259e-07, - "loss": 1.0339, - "reward": 2.5728070735931396, - "reward_std": 0.7294317185878754, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.024415286257863045, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 436.79168701171875, + "epoch": 0.3975, + "grad_norm": 10.317344532371504, + "kl": 1.23046875, + "learning_rate": 7.783922633323169e-07, + "loss": 0.2964, + "reward": 2.6988085508346558, + "reward_std": 0.543820321559906, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.07896940223872662, + "rewards/tag_count_reward": 0.9583333730697632, "step": 795 }, { "clip_ratio": 0.0, - "completion_length": 454.56251525878906, - "epoch": 0.796, - "grad_norm": 22.153680697625163, - "kl": 5.1015625, - "learning_rate": 2.0935222495670968e-07, - "loss": 1.0399, - "reward": 2.4211684465408325, - "reward_std": 0.7883006036281586, + "completion_length": 542.0416870117188, + "epoch": 0.398, + "grad_norm": 20.086888963335625, + "kl": 3.09375, + "learning_rate": 7.777151938545235e-07, + "loss": 0.6306, + "reward": 2.3561642169952393, + "reward_std": 0.7507622838020325, "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.01285938685759902, - "rewards/tag_count_reward": 0.8854166865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.10737751796841621, + "rewards/tag_count_reward": 0.9010416865348816, "step": 796 }, { "clip_ratio": 0.0, - "completion_length": 558.0000152587891, - "epoch": 0.797, - "grad_norm": 44.01821977708549, - "kl": 5.890625, - "learning_rate": 2.0832791169930363e-07, - "loss": 1.2008, - "reward": 2.6672359704971313, - "reward_std": 0.5852415859699249, + "completion_length": 419.9375, + "epoch": 0.3985, + "grad_norm": 13.37138079802378, + "kl": 0.71484375, + "learning_rate": 7.770374307167585e-07, + "loss": 0.378, + "reward": 2.7522013187408447, + "reward_std": 0.4279877096414566, "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.02894441783428192, - "rewards/tag_count_reward": 0.8697916865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05335431918501854, + "rewards/tag_count_reward": 0.9583333432674408, "step": 797 }, { "clip_ratio": 0.0, - "completion_length": 579.6458435058594, - "epoch": 0.798, - "grad_norm": 23.974305974531344, - "kl": 5.125, - "learning_rate": 2.0730776160846853e-07, - "loss": 0.9124, - "reward": 2.1921111345291138, - "reward_std": 0.7356277704238892, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.024902895092964172, - "rewards/tag_count_reward": 0.8281250298023224, + "completion_length": 587.2500305175781, + "epoch": 0.399, + "grad_norm": 27.70225468914978, + "kl": 2.796875, + "learning_rate": 7.763589759836058e-07, + "loss": 0.6997, + "reward": 2.363113760948181, + "reward_std": 0.67525914311409, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.13341398537158966, + "rewards/tag_count_reward": 0.8645833432674408, "step": 798 }, { "clip_ratio": 0.0, - "completion_length": 496.81251525878906, - "epoch": 0.799, - "grad_norm": 20.99627755388401, - "kl": 4.53125, - "learning_rate": 2.0629178711441115e-07, - "loss": 0.7604, - "reward": 2.4410648345947266, - "reward_std": 0.5636108368635178, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.010324170347303152, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 870.6666870117188, + "epoch": 0.3995, + "grad_norm": 15.870305020089033, + "kl": 3.0859375, + "learning_rate": 7.756798317217558e-07, + "loss": 0.8678, + "reward": 1.9343020915985107, + "reward_std": 0.7617403268814087, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.20458681881427765, + "rewards/tag_count_reward": 0.8125, "step": 799 }, { "clip_ratio": 0.0, - "completion_length": 608.5208740234375, - "epoch": 0.8, - "grad_norm": 26.72579720606878, - "kl": 5.25, - "learning_rate": 2.0528000059645995e-07, - "loss": 0.7956, - "reward": 2.119356393814087, - "reward_std": 0.5921457409858704, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03689361736178398, - "rewards/tag_count_reward": 0.8229166865348816, + "completion_length": 647.75, + "epoch": 0.4, + "grad_norm": 12.665956910806365, + "kl": 1.33203125, + "learning_rate": 7.75e-07, + "loss": 0.6692, + "reward": 2.246774673461914, + "reward_std": 0.6763182580471039, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.15079493820667267, + "rewards/tag_count_reward": 0.9114583730697632, "step": 800 }, { "clip_ratio": 0.0, - "completion_length": 312.0, - "epoch": 0.801, - "grad_norm": 13.404486874760053, - "kl": 1.23046875, - "learning_rate": 2.042724143829146e-07, - "loss": 0.077, - "reward": 2.3504269123077393, - "reward_std": 0.36202409863471985, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.014156644232571125, - "rewards/tag_count_reward": 0.9687500298023224, + "completion_length": 515.3750305175781, + "epoch": 0.4005, + "grad_norm": 11.953588265183154, + "kl": 0.8115234375, + "learning_rate": 7.743194828892235e-07, + "loss": 0.4331, + "reward": 2.6169031858444214, + "reward_std": 0.5906797051429749, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.10011079534888268, + "rewards/tag_count_reward": 0.9322916865348816, "step": 801 }, { "clip_ratio": 0.0, - "completion_length": 434.4583435058594, - "epoch": 0.802, - "grad_norm": 19.113136797766565, - "kl": 2.7421875, - "learning_rate": 2.032690407508949e-07, - "loss": 0.5925, - "reward": 2.4155834913253784, - "reward_std": 0.6261934638023376, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.020180439576506615, - "rewards/tag_count_reward": 0.921875, + "completion_length": 601.0625152587891, + "epoch": 0.401, + "grad_norm": 11.261392190054263, + "kl": 1.77734375, + "learning_rate": 7.736382824623999e-07, + "loss": 0.8205, + "reward": 2.2957258224487305, + "reward_std": 0.8579961359500885, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.909722238779068, + "rewards/repetition_penalty_reward": -0.15045488253235817, + "rewards/tag_count_reward": 0.8697916865348816, "step": 802 }, { "clip_ratio": 0.0, - "completion_length": 501.4583435058594, - "epoch": 0.803, - "grad_norm": 25.89659290548137, - "kl": 3.203125, - "learning_rate": 2.0226989192619204e-07, - "loss": 0.4875, - "reward": 2.301639437675476, - "reward_std": 0.555485874414444, - "rewards/accuracy_reward": 0.4791666865348816, + "completion_length": 513.75, + "epoch": 0.4015, + "grad_norm": 7.271737188658035, + "kl": 1.1640625, + "learning_rate": 7.729564007945834e-07, + "loss": 0.5417, + "reward": 2.590050458908081, + "reward_std": 0.43862822093069553, + "rewards/accuracy_reward": 0.8125, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03690226562321186, - "rewards/tag_count_reward": 0.8802083730697632, + "rewards/repetition_penalty_reward": -0.13390799798071384, + "rewards/tag_count_reward": 0.9322916865348816, "step": 803 }, { "clip_ratio": 0.0, - "completion_length": 438.1250305175781, - "epoch": 0.804, - "grad_norm": 21.314874667338287, - "kl": 2.4453125, - "learning_rate": 2.0127498008311922e-07, - "loss": 0.3104, - "reward": 2.334763288497925, - "reward_std": 0.3796348571777344, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02287570107728243, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 619.6666870117188, + "epoch": 0.402, + "grad_norm": 12.068916347239274, + "kl": 1.515625, + "learning_rate": 7.72273839962904e-07, + "loss": 0.6556, + "reward": 2.3289283514022827, + "reward_std": 0.6953560709953308, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.14503009244799614, + "rewards/tag_count_reward": 0.8697916865348816, "step": 804 }, { "clip_ratio": 0.0, - "completion_length": 421.6666717529297, - "epoch": 0.805, - "grad_norm": 14.366039627616365, - "kl": 1.8720703125, - "learning_rate": 2.0028431734436308e-07, - "loss": 0.3854, - "reward": 2.546318769454956, - "reward_std": 0.3817850574851036, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.026598164811730385, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 473.06251525878906, + "epoch": 0.4025, + "grad_norm": 7.004105215955508, + "kl": 1.224609375, + "learning_rate": 7.715906020465602e-07, + "loss": 0.51, + "reward": 2.6324926614761353, + "reward_std": 0.42844393849372864, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.10535471886396408, + "rewards/tag_count_reward": 0.9531250298023224, "step": 805 }, { "clip_ratio": 0.0, - "completion_length": 418.625, - "epoch": 0.806, - "grad_norm": 21.19577118539385, - "kl": 3.046875, - "learning_rate": 1.9929791578083655e-07, - "loss": 0.5684, - "reward": 2.6353834867477417, - "reward_std": 0.5922529548406601, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.02260266337543726, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 571.7916717529297, + "epoch": 0.403, + "grad_norm": 15.381900062308238, + "kl": 1.984375, + "learning_rate": 7.709066891268133e-07, + "loss": 0.5004, + "reward": 2.460014581680298, + "reward_std": 0.6100521683692932, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.14241605252027512, + "rewards/tag_count_reward": 0.921875, "step": 806 }, { "clip_ratio": 0.0, - "completion_length": 527.6666870117188, - "epoch": 0.807, - "grad_norm": 26.76069518759677, - "kl": 5.15625, - "learning_rate": 1.9831578741153155e-07, - "loss": 0.7895, - "reward": 2.3032405376434326, - "reward_std": 0.808287501335144, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9305556416511536, - "rewards/repetition_penalty_reward": -0.028356720693409443, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 360.31251525878906, + "epoch": 0.4035, + "grad_norm": 5.503594150237046, + "kl": 0.3388671875, + "learning_rate": 7.702221032869808e-07, + "loss": 0.1863, + "reward": 2.5507017374038696, + "reward_std": 0.32198452949523926, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.06388161890208721, + "rewards/tag_count_reward": 0.9895833432674408, "step": 807 }, { "clip_ratio": 0.0, - "completion_length": 381.8125, - "epoch": 0.808, - "grad_norm": 20.376838690762124, - "kl": 1.927734375, - "learning_rate": 1.9733794420337213e-07, - "loss": 0.268, - "reward": 2.428946018218994, - "reward_std": 0.5701454132795334, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.027651351876556873, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 360.25, + "epoch": 0.404, + "grad_norm": 11.719601907986142, + "kl": 0.39794921875, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0649, + "reward": 2.8640111684799194, + "reward_std": 0.24921771883964539, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.052655573934316635, + "rewards/tag_count_reward": 1.0, "step": 808 }, { "clip_ratio": 0.0, - "completion_length": 445.62501525878906, - "epoch": 0.809, - "grad_norm": 20.816082501770886, - "kl": 5.140625, - "learning_rate": 1.9636439807106912e-07, - "loss": 0.8017, - "reward": 2.5268882513046265, - "reward_std": 0.7573049068450928, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.023458978161215782, - "rewards/tag_count_reward": 0.8489583730697632, + "completion_length": 489.6666717529297, + "epoch": 0.4045, + "grad_norm": 16.98625822145741, + "kl": 0.5205078125, + "learning_rate": 7.688509211905707e-07, + "loss": 0.6203, + "reward": 2.476475715637207, + "reward_std": 0.7080457210540771, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.08949651196599007, + "rewards/tag_count_reward": 0.9687500298023224, "step": 809 }, { "clip_ratio": 0.0, - "completion_length": 391.81251525878906, - "epoch": 0.81, - "grad_norm": 22.694875364982547, - "kl": 2.3359375, - "learning_rate": 1.9539516087697517e-07, - "loss": 0.5417, - "reward": 2.658096432685852, - "reward_std": 0.6354215741157532, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.0172510021366179, - "rewards/tag_count_reward": 0.953125, + "completion_length": 439.5208435058594, + "epoch": 0.405, + "grad_norm": 12.748082551046092, + "kl": 0.71875, + "learning_rate": 7.681643291108517e-07, + "loss": 0.3463, + "reward": 2.565284848213196, + "reward_std": 0.4373869299888611, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.07186824828386307, + "rewards/tag_count_reward": 0.9427083432674408, "step": 810 }, { "clip_ratio": 0.0, - "completion_length": 503.35418701171875, - "epoch": 0.811, - "grad_norm": 58.019855154883885, - "kl": 6.5625, - "learning_rate": 1.944302444309393e-07, - "loss": 1.3334, - "reward": 2.501904606819153, - "reward_std": 0.7767859101295471, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.022400940768420696, - "rewards/tag_count_reward": 0.8645833730697632, + "completion_length": 732.1875305175781, + "epoch": 0.4055, + "grad_norm": 48.03197567653632, + "kl": 1.46875, + "learning_rate": 7.67477072464751e-07, + "loss": 0.6842, + "reward": 1.9572343230247498, + "reward_std": 0.734717458486557, + "rewards/accuracy_reward": 0.4375000223517418, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.20596013963222504, + "rewards/tag_count_reward": 0.8437500298023224, "step": 811 }, { "clip_ratio": 0.0, - "completion_length": 516.8125, - "epoch": 0.812, - "grad_norm": 54.33176698076773, - "kl": 7.484375, - "learning_rate": 1.934696604901642e-07, - "loss": 0.8888, - "reward": 2.2534443140029907, - "reward_std": 0.7112808525562286, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.017389055341482162, - "rewards/tag_count_reward": 0.8333333730697632, + "completion_length": 474.5, + "epoch": 0.406, + "grad_norm": 36.55159806207502, + "kl": 2.560546875, + "learning_rate": 7.667891533457718e-07, + "loss": 0.3356, + "reward": 2.531466484069824, + "reward_std": 0.40393590182065964, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.09179743006825447, + "rewards/tag_count_reward": 0.9635416865348816, "step": 812 }, { "clip_ratio": 0.0, - "completion_length": 467.00001525878906, - "epoch": 0.813, - "grad_norm": 35.200107482672536, - "kl": 4.8515625, - "learning_rate": 1.9251342075906179e-07, - "loss": 0.9493, - "reward": 2.1571664810180664, - "reward_std": 0.6603484153747559, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9236112236976624, - "rewards/repetition_penalty_reward": -0.021653023548424244, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 486.1875, + "epoch": 0.4065, + "grad_norm": 501.2023762121776, + "kl": 10.4375, + "learning_rate": 7.661005738494349e-07, + "loss": 1.8999, + "reward": 2.5981922149658203, + "reward_std": 0.7066002190113068, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.09451625496149063, + "rewards/tag_count_reward": 0.9427083432674408, "step": 813 }, { "clip_ratio": 0.0, - "completion_length": 469.1041717529297, - "epoch": 0.814, - "grad_norm": 54.98157225337433, - "kl": 5.5, - "learning_rate": 1.915615368891117e-07, - "loss": 0.9186, - "reward": 2.1737260818481445, - "reward_std": 0.6402971148490906, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9166667461395264, - "rewards/repetition_penalty_reward": -0.013774096965789795, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 753.5000305175781, + "epoch": 0.407, + "grad_norm": 356.97977721564547, + "kl": 15.0, + "learning_rate": 7.654113360732732e-07, + "loss": 1.628, + "reward": 2.165995717048645, + "reward_std": 0.8618307709693909, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.19858761131763458, + "rewards/tag_count_reward": 0.8437500298023224, "step": 814 }, { "clip_ratio": 0.0, - "completion_length": 531.7291717529297, - "epoch": 0.815, - "grad_norm": 44.84634369429172, - "kl": 6.203125, - "learning_rate": 1.9061402047871833e-07, - "loss": 1.2846, - "reward": 2.147634506225586, - "reward_std": 0.592269778251648, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.019032152369618416, - "rewards/tag_count_reward": 0.875, + "completion_length": 499.125, + "epoch": 0.4075, + "grad_norm": 123.33805411679445, + "kl": 4.54296875, + "learning_rate": 7.647214421168238e-07, + "loss": 1.0472, + "reward": 2.6111273765563965, + "reward_std": 0.6222415566444397, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.11283096671104431, + "rewards/tag_count_reward": 0.953125, "step": 815 }, { "clip_ratio": 0.0, - "completion_length": 393.7291717529297, - "epoch": 0.816, - "grad_norm": 23.81197579521365, - "kl": 4.3671875, - "learning_rate": 1.8967088307307e-07, - "loss": 0.4182, - "reward": 2.38164883852005, - "reward_std": 0.6038723587989807, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9375001192092896, - "rewards/repetition_penalty_reward": -0.019393039867281914, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 623.5625305175781, + "epoch": 0.408, + "grad_norm": 111.86522019096887, + "kl": 1.94921875, + "learning_rate": 7.640308940816239e-07, + "loss": 1.0335, + "reward": 2.26309335231781, + "reward_std": 0.8162707686424255, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.15010132268071175, + "rewards/tag_count_reward": 0.9062500298023224, "step": 816 }, { "clip_ratio": 0.0, - "completion_length": 435.91668701171875, - "epoch": 0.817, - "grad_norm": 25.95495924946456, - "kl": 3.8046875, - "learning_rate": 1.887321361639985e-07, - "loss": 1.0102, - "reward": 2.579134702682495, - "reward_std": 0.5107276141643524, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.026768138632178307, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 818.4166870117188, + "epoch": 0.4085, + "grad_norm": 53.005862638537856, + "kl": 2.6171875, + "learning_rate": 7.633396940712023e-07, + "loss": 0.7982, + "reward": 2.1612285375595093, + "reward_std": 1.055372416973114, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.21550768613815308, + "rewards/tag_count_reward": 0.8072916865348816, "step": 817 }, { "clip_ratio": 0.0, - "completion_length": 375.68751525878906, - "epoch": 0.818, - "grad_norm": 12.048598436192929, - "kl": 1.546875, - "learning_rate": 1.8779779118983867e-07, - "loss": 0.0993, - "reward": 2.702491521835327, - "reward_std": 0.4488513916730881, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.028411319479346275, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 524.6041717529297, + "epoch": 0.409, + "grad_norm": 35.60976729675571, + "kl": 2.546875, + "learning_rate": 7.626478441910744e-07, + "loss": 0.826, + "reward": 2.595088481903076, + "reward_std": 0.6593966484069824, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9374999701976776, + "rewards/repetition_penalty_reward": -0.1132449209690094, + "rewards/tag_count_reward": 0.9166666865348816, "step": 818 }, { "clip_ratio": 0.0, - "completion_length": 601.5416870117188, - "epoch": 0.819, - "grad_norm": 36.93153228705177, - "kl": 4.96875, - "learning_rate": 1.8686785953528922e-07, - "loss": 1.1533, - "reward": 2.5069422721862793, - "reward_std": 0.7914581596851349, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.02257181704044342, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 432.0833435058594, + "epoch": 0.4095, + "grad_norm": 15.736710234210651, + "kl": 1.71875, + "learning_rate": 7.619553465487344e-07, + "loss": 0.5061, + "reward": 2.743853211402893, + "reward_std": 0.557222306728363, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.08774404600262642, + "rewards/tag_count_reward": 0.9635416865348816, "step": 819 }, { "clip_ratio": 0.0, - "completion_length": 414.00001525878906, - "epoch": 0.82, - "grad_norm": 36.676997752826466, - "kl": 2.37890625, - "learning_rate": 1.8594235253127372e-07, - "loss": 0.8483, - "reward": 2.6289749145507812, - "reward_std": 0.5092899203300476, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.020330775994807482, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 654.4583435058594, + "epoch": 0.41, + "grad_norm": 55.05386297781122, + "kl": 2.859375, + "learning_rate": 7.612622032536507e-07, + "loss": 0.8643, + "reward": 2.1351473927497864, + "reward_std": 0.6679968535900116, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.161727674305439, + "rewards/tag_count_reward": 0.859375, "step": 820 }, { "clip_ratio": 0.0, - "completion_length": 424.6458435058594, - "epoch": 0.821, - "grad_norm": 25.786201228130313, - "kl": 1.57421875, - "learning_rate": 1.850212814548031e-07, - "loss": 0.4719, - "reward": 2.536083459854126, - "reward_std": 0.5136556923389435, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.02815271820873022, - "rewards/tag_count_reward": 0.953125, + "completion_length": 502.00001525878906, + "epoch": 0.4105, + "grad_norm": 13.566998114200823, + "kl": 0.576171875, + "learning_rate": 7.60568416417258e-07, + "loss": 0.2932, + "reward": 2.584618330001831, + "reward_std": 0.4659956693649292, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.13586794957518578, + "rewards/tag_count_reward": 0.9218750298023224, "step": 821 }, { "clip_ratio": 0.0, - "completion_length": 327.3333435058594, - "epoch": 0.822, - "grad_norm": 9.339930080359409, - "kl": 0.5400390625, - "learning_rate": 1.8410465752883758e-07, - "loss": 0.0109, - "reward": 2.5481141805648804, - "reward_std": 0.4344000518321991, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.026538672856986523, - "rewards/tag_count_reward": 0.984375, + "completion_length": 629.7500305175781, + "epoch": 0.411, + "grad_norm": 30.139911446994017, + "kl": 1.54296875, + "learning_rate": 7.59873988152951e-07, + "loss": 0.6395, + "reward": 2.022117495536804, + "reward_std": 0.9050773084163666, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.8750000894069672, + "rewards/repetition_penalty_reward": -0.1445491872727871, + "rewards/tag_count_reward": 0.8333333432674408, "step": 822 }, { "clip_ratio": 0.0, - "completion_length": 425.04168701171875, - "epoch": 0.823, - "grad_norm": 17.806285658594135, - "kl": 2.37109375, - "learning_rate": 1.8319249192215055e-07, - "loss": 0.6621, - "reward": 2.3081016540527344, - "reward_std": 0.5683871209621429, - "rewards/accuracy_reward": 0.4375000298023224, - "rewards/reasoning_steps_reward": 0.9444444477558136, - "rewards/repetition_penalty_reward": -0.02175955567508936, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 747.7500305175781, + "epoch": 0.4115, + "grad_norm": 33.95615169187785, + "kl": 1.16796875, + "learning_rate": 7.591789205760789e-07, + "loss": 0.7839, + "reward": 1.9424059987068176, + "reward_std": 0.8640447556972504, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.19474690407514572, + "rewards/tag_count_reward": 0.8593750298023224, "step": 823 }, { "clip_ratio": 0.0, - "completion_length": 488.16668701171875, - "epoch": 0.824, - "grad_norm": 30.757926718689315, - "kl": 4.0625, - "learning_rate": 1.822847957491922e-07, - "loss": 1.0787, - "reward": 2.3749157190322876, - "reward_std": 0.5852408409118652, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.01918159332126379, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 561.3541717529297, + "epoch": 0.412, + "grad_norm": 8.156901607192008, + "kl": 0.73828125, + "learning_rate": 7.584832158039378e-07, + "loss": 0.1677, + "reward": 2.5275344848632812, + "reward_std": 0.3939831107854843, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.12871567159891129, + "rewards/tag_count_reward": 0.90625, "step": 824 }, { "clip_ratio": 0.0, - "completion_length": 474.3125305175781, - "epoch": 0.825, - "grad_norm": 22.249676223818117, - "kl": 3.10546875, - "learning_rate": 1.8138158006995363e-07, - "loss": 0.568, - "reward": 2.3771095275878906, - "reward_std": 0.5015990436077118, - "rewards/accuracy_reward": 0.479166679084301, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015251755714416504, - "rewards/tag_count_reward": 0.9270833730697632, + "completion_length": 607.4166870117188, + "epoch": 0.4125, + "grad_norm": 17.24978480660208, + "kl": 1.37109375, + "learning_rate": 7.577868759557653e-07, + "loss": 0.6142, + "reward": 2.3871285915374756, + "reward_std": 0.7968339771032333, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.17189929634332657, + "rewards/tag_count_reward": 0.8645833730697632, "step": 825 }, { "clip_ratio": 0.0, - "completion_length": 347.37501525878906, - "epoch": 0.826, - "grad_norm": 14.195193659523204, - "kl": 2.14453125, - "learning_rate": 1.804828558898332e-07, - "loss": 0.1007, - "reward": 2.5919097661972046, - "reward_std": 0.468966469168663, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.010520970448851585, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 839.1250305175781, + "epoch": 0.413, + "grad_norm": 20.131860885223457, + "kl": 1.71875, + "learning_rate": 7.570899031527332e-07, + "loss": 0.5612, + "reward": 2.0470434427261353, + "reward_std": 0.7597078531980515, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.24635940790176392, + "rewards/tag_count_reward": 0.7864583432674408, "step": 826 }, { "clip_ratio": 0.0, - "completion_length": 450.2083435058594, - "epoch": 0.827, - "grad_norm": 32.764312557604384, - "kl": 4.703125, - "learning_rate": 1.7958863415950112e-07, - "loss": 0.8324, - "reward": 2.222867786884308, - "reward_std": 0.6226075887680054, - "rewards/accuracy_reward": 0.3958333358168602, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.02192395133897662, - "rewards/tag_count_reward": 0.9114583730697632, + "completion_length": 628.4375305175781, + "epoch": 0.4135, + "grad_norm": 21.15993434326557, + "kl": 0.99609375, + "learning_rate": 7.563922995179418e-07, + "loss": 0.5685, + "reward": 2.4131293296813965, + "reward_std": 0.7387427687644958, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.14416248351335526, + "rewards/tag_count_reward": 0.8697916865348816, "step": 827 }, { "clip_ratio": 0.0, - "completion_length": 537.3958587646484, - "epoch": 0.828, - "grad_norm": 102.68181493594267, - "kl": 7.6669921875, - "learning_rate": 1.7869892577476722e-07, - "loss": 0.8764, - "reward": 2.513993501663208, - "reward_std": 0.5042714290320873, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.930555522441864, - "rewards/repetition_penalty_reward": -0.025937245693057775, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 370.8333435058594, + "epoch": 0.414, + "grad_norm": 12.527599564815661, + "kl": 0.3623046875, + "learning_rate": 7.556940671764124e-07, + "loss": 0.1614, + "reward": 2.8723835945129395, + "reward_std": 0.15300642838701606, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.061644500121474266, + "rewards/tag_count_reward": 0.9895833432674408, "step": 828 }, { "clip_ratio": 0.0, - "completion_length": 428.5833435058594, - "epoch": 0.829, - "grad_norm": 25.253738055443158, - "kl": 4.703125, - "learning_rate": 1.7781374157644713e-07, - "loss": 0.8549, - "reward": 2.5480579137802124, - "reward_std": 0.7514591813087463, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.01444217236712575, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 981.0625305175781, + "epoch": 0.4145, + "grad_norm": 29.125329174569146, + "kl": 1.6484375, + "learning_rate": 7.54995208255082e-07, + "loss": 0.6239, + "reward": 1.5005772113800049, + "reward_std": 0.6931151449680328, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.11227016523480415, + "rewards/tag_count_reward": 0.6197916865348816, "step": 829 }, { "clip_ratio": 0.0, - "completion_length": 319.7708435058594, - "epoch": 0.83, - "grad_norm": 43.578316518329956, - "kl": 3.408203125, - "learning_rate": 1.7693309235023127e-07, - "loss": 0.2552, - "reward": 2.751266121864319, - "reward_std": 0.45656658709049225, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.014358973130583763, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 652.4375, + "epoch": 0.415, + "grad_norm": 29.203264211648943, + "kl": 0.794921875, + "learning_rate": 7.54295724882796e-07, + "loss": 0.6498, + "reward": 2.3145864009857178, + "reward_std": 0.7329890131950378, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.8819445073604584, + "rewards/repetition_penalty_reward": -0.04652494750916958, + "rewards/tag_count_reward": 0.8333333432674408, "step": 830 }, { "clip_ratio": 0.0, - "completion_length": 485.10418701171875, - "epoch": 0.831, - "grad_norm": 27.193058696951205, - "kl": 5.515625, - "learning_rate": 1.7605698882655233e-07, - "loss": 0.8536, - "reward": 2.412537455558777, - "reward_std": 0.8152902722358704, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.04058762267231941, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 405.5625, + "epoch": 0.4155, + "grad_norm": 9.688376442723252, + "kl": 0.556640625, + "learning_rate": 7.535956191903021e-07, + "loss": 0.1869, + "reward": 2.5234345197677612, + "reward_std": 0.46155112981796265, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.04080154746770859, + "rewards/tag_count_reward": 0.9114583432674408, "step": 831 }, { "clip_ratio": 0.0, - "completion_length": 331.0208435058594, - "epoch": 0.832, - "grad_norm": 19.688526945972043, - "kl": 2.2265625, - "learning_rate": 1.7518544168045524e-07, - "loss": 0.3667, - "reward": 2.815763831138611, - "reward_std": 0.3860451430082321, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.012361295986920595, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 313.81251525878906, + "epoch": 0.416, + "grad_norm": 15.677158884669252, + "kl": 0.33837890625, + "learning_rate": 7.528948933102438e-07, + "loss": 0.2505, + "reward": 2.81933856010437, + "reward_std": 0.3189728558063507, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.02441159961745143, + "rewards/tag_count_reward": 0.96875, "step": 832 }, { "clip_ratio": 0.0, - "completion_length": 346.56251525878906, - "epoch": 0.833, - "grad_norm": 22.298341306372343, - "kl": 2.765625, - "learning_rate": 1.743184615314671e-07, - "loss": 0.3607, - "reward": 2.7588515281677246, - "reward_std": 0.545660674571991, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.02066253498196602, - "rewards/tag_count_reward": 0.953125, + "completion_length": 471.9791717529297, + "epoch": 0.4165, + "grad_norm": 24.249907996018568, + "kl": 0.767578125, + "learning_rate": 7.521935493771534e-07, + "loss": 0.418, + "reward": 2.4950411319732666, + "reward_std": 0.7277962416410446, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03620896954089403, + "rewards/tag_count_reward": 0.8645833730697632, "step": 833 }, { "clip_ratio": 0.0, - "completion_length": 359.31251525878906, - "epoch": 0.834, - "grad_norm": 28.797542999117116, - "kl": 3.25390625, - "learning_rate": 1.7345605894346726e-07, - "loss": 0.5221, - "reward": 2.7526845932006836, - "reward_std": 0.674776554107666, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.01641272520646453, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 396.56251525878906, + "epoch": 0.417, + "grad_norm": 9.707239836993917, + "kl": 0.5439453125, + "learning_rate": 7.514915895274463e-07, + "loss": 0.2468, + "reward": 2.515194535255432, + "reward_std": 0.30929915606975555, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.03862500563263893, + "rewards/tag_count_reward": 0.9635416865348816, "step": 834 }, { "clip_ratio": 0.0, - "completion_length": 430.62501525878906, - "epoch": 0.835, - "grad_norm": 43.496982955042476, - "kl": 3.0859375, - "learning_rate": 1.7259824442455923e-07, - "loss": 0.6982, - "reward": 2.299278497695923, - "reward_std": 0.6011901348829269, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9027777910232544, - "rewards/repetition_penalty_reward": -0.00974938040599227, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 620.3333435058594, + "epoch": 0.4175, + "grad_norm": 17.508594529679065, + "kl": 1.17578125, + "learning_rate": 7.507890158994139e-07, + "loss": 0.5817, + "reward": 2.4108619689941406, + "reward_std": 0.8157097101211548, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.028374179266393185, + "rewards/tag_count_reward": 0.7864583432674408, "step": 835 }, { "clip_ratio": 0.0, - "completion_length": 519.9375152587891, - "epoch": 0.836, - "grad_norm": 23.29360992129975, - "kl": 3.484375, - "learning_rate": 1.7174502842694212e-07, - "loss": 1.0119, - "reward": 2.5549784898757935, - "reward_std": 0.7014127969741821, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.021410479210317135, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 676.0416870117188, + "epoch": 0.418, + "grad_norm": 10.440509283265795, + "kl": 1.5859375, + "learning_rate": 7.500858306332172e-07, + "loss": 0.7901, + "reward": 2.2338361740112305, + "reward_std": 0.7627847492694855, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.024844415485858917, + "rewards/tag_count_reward": 0.7447916865348816, "step": 836 }, { "clip_ratio": 0.0, - "completion_length": 468.81251525878906, - "epoch": 0.837, - "grad_norm": 21.45595694285051, - "kl": 4.984375, - "learning_rate": 1.7089642134678364e-07, - "loss": 0.8767, - "reward": 2.2903480529785156, - "reward_std": 0.6241036355495453, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.018679913599044085, - "rewards/tag_count_reward": 0.90625, + "completion_length": 726.2083435058594, + "epoch": 0.4185, + "grad_norm": 15.62225883232442, + "kl": 2.19921875, + "learning_rate": 7.493820358708809e-07, + "loss": 0.7195, + "reward": 2.36660635471344, + "reward_std": 0.7908368110656738, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.018810252659022808, + "rewards/tag_count_reward": 0.7812500298023224, "step": 837 }, { "clip_ratio": 0.0, - "completion_length": 736.0833435058594, - "epoch": 0.838, - "grad_norm": 62.125475017964185, - "kl": 9.53125, - "learning_rate": 1.7005243352409333e-07, - "loss": 1.49, - "reward": 2.3610810041427612, - "reward_std": 0.8324085474014282, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.020863616839051247, - "rewards/tag_count_reward": 0.7708333432674408, + "completion_length": 925.7291870117188, + "epoch": 0.419, + "grad_norm": 17.894574750635286, + "kl": 3.9296875, + "learning_rate": 7.486776337562853e-07, + "loss": 0.7237, + "reward": 1.4900822043418884, + "reward_std": 0.6835044920444489, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/reasoning_steps_reward": 0.847222238779068, + "rewards/repetition_penalty_reward": -0.023806744255125523, + "rewards/tag_count_reward": 0.5833333432674408, "step": 838 }, { "clip_ratio": 0.0, - "completion_length": 711.6041870117188, - "epoch": 0.839, - "grad_norm": 51.57175499154389, - "kl": 10.3125, - "learning_rate": 1.6921307524259625e-07, - "loss": 1.4895, - "reward": 2.234101891517639, - "reward_std": 0.9274267852306366, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.02457870915532112, - "rewards/tag_count_reward": 0.7239583730697632, + "completion_length": 497.8125305175781, + "epoch": 0.4195, + "grad_norm": 6.541740244547362, + "kl": 1.78515625, + "learning_rate": 7.479726264351618e-07, + "loss": 0.6408, + "reward": 2.4577986001968384, + "reward_std": 0.5397596657276154, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.030048648826777935, + "rewards/tag_count_reward": 0.8489583432674408, "step": 839 }, { "clip_ratio": 0.0, - "completion_length": 431.00001525878906, - "epoch": 0.84, - "grad_norm": 33.432954925810336, - "kl": 2.5546875, - "learning_rate": 1.6837835672960831e-07, - "loss": 0.8481, - "reward": 2.6463335752487183, - "reward_std": 0.5603546500205994, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015124747063964605, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 691.0416870117188, + "epoch": 0.42, + "grad_norm": 12.527206541745974, + "kl": 2.6484375, + "learning_rate": 7.472670160550848e-07, + "loss": 0.694, + "reward": 1.9736045598983765, + "reward_std": 0.6236002445220947, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.04722887650132179, + "rewards/tag_count_reward": 0.75, "step": 840 }, { "clip_ratio": 0.0, - "completion_length": 512.1666717529297, - "epoch": 0.841, - "grad_norm": 66.56725483975436, - "kl": 5.359375, - "learning_rate": 1.6754828815591131e-07, - "loss": 1.1354, - "reward": 2.3149657249450684, - "reward_std": 0.6210701763629913, - "rewards/accuracy_reward": 0.479166679084301, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01663155946880579, - "rewards/tag_count_reward": 0.8802083730697632, + "completion_length": 454.9375, + "epoch": 0.4205, + "grad_norm": 6.487598751220338, + "kl": 1.51953125, + "learning_rate": 7.46560804765466e-07, + "loss": 0.4798, + "reward": 2.6140646934509277, + "reward_std": 0.6524476110935211, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.03350475896149874, + "rewards/tag_count_reward": 0.9114583432674408, "step": 841 }, { "clip_ratio": 0.0, - "completion_length": 407.85418701171875, - "epoch": 0.842, - "grad_norm": 12.29872685471777, - "kl": 3.1630859375, - "learning_rate": 1.6672287963562852e-07, - "loss": 0.4297, - "reward": 2.510159969329834, - "reward_std": 0.602110430598259, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.01761801028624177, - "rewards/tag_count_reward": 0.9375, + "completion_length": 515.2708587646484, + "epoch": 0.421, + "grad_norm": 10.317644373992936, + "kl": 1.349609375, + "learning_rate": 7.458539947175473e-07, + "loss": 0.6031, + "reward": 2.220026969909668, + "reward_std": 0.5868920385837555, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.014348076190799475, + "rewards/tag_count_reward": 0.9010416865348816, "step": 842 }, { "clip_ratio": 0.0, - "completion_length": 541.5, - "epoch": 0.843, - "grad_norm": 27.148373245511962, - "kl": 5.609375, - "learning_rate": 1.659021412261026e-07, - "loss": 0.9403, - "reward": 2.315541386604309, - "reward_std": 0.7348792850971222, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.024736556224524975, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 379.35418701171875, + "epoch": 0.4215, + "grad_norm": 7.886525307085669, + "kl": 1.072265625, + "learning_rate": 7.45146588064395e-07, + "loss": 0.3578, + "reward": 2.5978939533233643, + "reward_std": 0.6561948955059052, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.04273105226457119, + "rewards/tag_count_reward": 0.9322916865348816, "step": 843 }, { "clip_ratio": 0.0, - "completion_length": 594.3750152587891, - "epoch": 0.844, - "grad_norm": 37.507439921203925, - "kl": 6.9140625, - "learning_rate": 1.6508608292777203e-07, - "loss": 0.4498, - "reward": 2.3864521980285645, - "reward_std": 0.3542592525482178, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.01806183159351349, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 324.1666717529297, + "epoch": 0.422, + "grad_norm": 4.37947548836921, + "kl": 0.30078125, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0354, + "reward": 2.6675782203674316, + "reward_std": 0.17353395372629166, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03381069749593735, + "rewards/tag_count_reward": 1.0, "step": 844 }, { "clip_ratio": 0.0, - "completion_length": 574.5416870117188, - "epoch": 0.845, - "grad_norm": 42.479348438013346, - "kl": 4.875, - "learning_rate": 1.6427471468404952e-07, - "loss": 1.2607, - "reward": 2.191675543785095, - "reward_std": 0.5457549095153809, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.018394021317362785, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 372.7083435058594, + "epoch": 0.4225, + "grad_norm": 18.876751039530795, + "kl": 1.12890625, + "learning_rate": 7.437299935637328e-07, + "loss": 0.45, + "reward": 2.5024070739746094, + "reward_std": 0.5223306268453598, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03231525328010321, + "rewards/tag_count_reward": 0.9375, "step": 845 }, { "clip_ratio": 0.0, - "completion_length": 447.5, - "epoch": 0.846, - "grad_norm": 32.21272755825097, - "kl": 2.6328125, - "learning_rate": 1.6346804638120098e-07, - "loss": 0.6288, - "reward": 2.359144449234009, - "reward_std": 0.7404135763645172, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9236111342906952, - "rewards/repetition_penalty_reward": -0.017591833136975765, - "rewards/tag_count_reward": 0.9114583730697632, + "completion_length": 368.4583435058594, + "epoch": 0.423, + "grad_norm": 7.735517734052216, + "kl": 0.373046875, + "learning_rate": 7.430208100314156e-07, + "loss": 0.2957, + "reward": 2.737704277038574, + "reward_std": 0.41725849360227585, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04701795056462288, + "rewards/tag_count_reward": 0.9791666865348816, "step": 846 }, { "clip_ratio": 0.0, - "completion_length": 384.18751525878906, - "epoch": 0.847, - "grad_norm": 14.416271559769205, - "kl": 2.5927734375, - "learning_rate": 1.6266608784822542e-07, - "loss": 0.393, - "reward": 2.211021304130554, - "reward_std": 0.4275623857975006, + "completion_length": 383.68751525878906, + "epoch": 0.4235, + "grad_norm": 4.3638722065625855, + "kl": 0.3779296875, + "learning_rate": 7.423110385242366e-07, + "loss": 0.0939, + "reward": 2.3006144762039185, + "reward_std": 0.29770078510046005, "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.014673332683742046, - "rewards/tag_count_reward": 0.90625, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.032718876376748085, + "rewards/tag_count_reward": 0.9791666865348816, "step": 847 }, { "clip_ratio": 0.0, - "completion_length": 436.5625, - "epoch": 0.848, - "grad_norm": 14.205493656115495, - "kl": 3.384765625, - "learning_rate": 1.6186884885673413e-07, - "loss": 0.5267, - "reward": 2.2585190534591675, - "reward_std": 0.7064023613929749, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.03314755018800497, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 337.2083435058594, + "epoch": 0.424, + "grad_norm": 9.281897298394469, + "kl": 0.65625, + "learning_rate": 7.416006812042827e-07, + "loss": 0.2155, + "reward": 2.708508014678955, + "reward_std": 0.40016093850135803, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.06406151317059994, + "rewards/tag_count_reward": 0.9739583730697632, "step": 848 }, { "clip_ratio": 0.0, - "completion_length": 496.375, - "epoch": 0.849, - "grad_norm": 47.615315210946434, - "kl": 3.21484375, - "learning_rate": 1.610763391208329e-07, - "loss": 0.7123, - "reward": 2.543179988861084, - "reward_std": 0.6331344544887543, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01584793906658888, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 348.37501525878906, + "epoch": 0.4245, + "grad_norm": 9.904867829218396, + "kl": 0.6005859375, + "learning_rate": 7.408897402354255e-07, + "loss": 0.1548, + "reward": 2.447229743003845, + "reward_std": 0.3772226721048355, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.026728657074272633, + "rewards/tag_count_reward": 0.9739583432674408, "step": 849 }, { "clip_ratio": 0.0, - "completion_length": 330.125, - "epoch": 0.85, - "grad_norm": 22.926928355727185, - "kl": 1.275390625, - "learning_rate": 1.6028856829700258e-07, - "loss": 0.2793, - "reward": 2.8741886615753174, - "reward_std": 0.27669021487236023, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011228146031498909, - "rewards/tag_count_reward": 0.9687500298023224, + "completion_length": 424.04168701171875, + "epoch": 0.425, + "grad_norm": 5.890746892326936, + "kl": 0.86328125, + "learning_rate": 7.401782177833147e-07, + "loss": 0.4768, + "reward": 2.3541191816329956, + "reward_std": 0.5978062450885773, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.05039471574127674, + "rewards/tag_count_reward": 0.9322916865348816, "step": 850 }, { "clip_ratio": 0.0, - "completion_length": 352.6875, - "epoch": 0.851, - "grad_norm": 23.63650850429641, - "kl": 1.009765625, - "learning_rate": 1.5950554598398228e-07, - "loss": 0.2369, - "reward": 2.6674481630325317, - "reward_std": 0.3812839537858963, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.014843634329736233, - "rewards/tag_count_reward": 0.9739583432674408, + "completion_length": 361.3958435058594, + "epoch": 0.4255, + "grad_norm": 19.104666432985564, + "kl": 0.65234375, + "learning_rate": 7.394661160153709e-07, + "loss": 0.4226, + "reward": 2.7522761821746826, + "reward_std": 0.5294878482818604, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.020293288864195347, + "rewards/tag_count_reward": 0.953125, "step": 851 }, { "clip_ratio": 0.0, - "completion_length": 504.22918701171875, - "epoch": 0.852, - "grad_norm": 26.46847605504948, - "kl": 6.5625, - "learning_rate": 1.5872728172265146e-07, - "loss": 0.9065, - "reward": 2.415515661239624, - "reward_std": 0.7073606848716736, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.01330387475900352, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 398.9583435058594, + "epoch": 0.426, + "grad_norm": 7.305881246017198, + "kl": 0.7138671875, + "learning_rate": 7.387534371007797e-07, + "loss": 0.2967, + "reward": 2.524535655975342, + "reward_std": 0.29497088491916656, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.05706173926591873, + "rewards/tag_count_reward": 0.9635416865348816, "step": 852 }, { "clip_ratio": 0.0, - "completion_length": 561.5416870117188, - "epoch": 0.853, - "grad_norm": 25.999454906190575, - "kl": 4.828125, - "learning_rate": 1.579537849959148e-07, - "loss": 1.113, - "reward": 2.4573510885238647, - "reward_std": 0.7351089715957642, - "rewards/accuracy_reward": 0.6458333730697632, + "completion_length": 294.7291717529297, + "epoch": 0.4265, + "grad_norm": 4.07471559844156, + "kl": 0.3271484375, + "learning_rate": 7.380401832104845e-07, + "loss": 0.0466, + "reward": 2.9143292903900146, + "reward_std": 0.11590113118290901, + "rewards/accuracy_reward": 0.9791666865348816, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.021815632469952106, - "rewards/tag_count_reward": 0.8541666865348816, + "rewards/repetition_penalty_reward": -0.044004119001328945, + "rewards/tag_count_reward": 1.0, "step": 853 }, { "clip_ratio": 0.0, - "completion_length": 511.2708435058594, - "epoch": 0.854, - "grad_norm": 32.177866129176245, - "kl": 5.859375, - "learning_rate": 1.5718506522858572e-07, - "loss": 0.9686, - "reward": 2.1545926332473755, - "reward_std": 0.6024680733680725, - "rewards/accuracy_reward": 0.3541666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.031171313486993313, - "rewards/tag_count_reward": 0.8802083730697632, + "completion_length": 460.12501525878906, + "epoch": 0.427, + "grad_norm": 14.708604251307426, + "kl": 1.111328125, + "learning_rate": 7.373263565171805e-07, + "loss": 0.5831, + "reward": 2.3425523042678833, + "reward_std": 0.6779049038887024, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.020295153371989727, + "rewards/tag_count_reward": 0.9114583432674408, "step": 854 }, { "clip_ratio": 0.0, - "completion_length": 464.5416717529297, - "epoch": 0.855, - "grad_norm": 35.98943778638736, - "kl": 2.75, - "learning_rate": 1.5642113178727193e-07, - "loss": 0.8586, - "reward": 2.234099566936493, - "reward_std": 0.4664178192615509, - "rewards/accuracy_reward": 0.3541666679084301, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.028053374961018562, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 337.7916717529297, + "epoch": 0.4275, + "grad_norm": 11.608809720420162, + "kl": 0.7373046875, + "learning_rate": 7.366119591953075e-07, + "loss": 0.1999, + "reward": 2.80188250541687, + "reward_std": 0.3770581018179655, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.03492319490760565, + "rewards/tag_count_reward": 0.96875, "step": 855 }, { "clip_ratio": 0.0, - "completion_length": 539.6250305175781, - "epoch": 0.856, - "grad_norm": 25.61587236309964, - "kl": 4.1640625, - "learning_rate": 1.5566199398026147e-07, - "loss": 1.0126, - "reward": 2.0876386165618896, - "reward_std": 0.7813932299613953, - "rewards/accuracy_reward": 0.3333333358168602, - "rewards/reasoning_steps_reward": 0.916666716337204, - "rewards/repetition_penalty_reward": -0.02173637691885233, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 514.4375, + "epoch": 0.428, + "grad_norm": 15.274727479273395, + "kl": 2.109375, + "learning_rate": 7.358969934210438e-07, + "loss": 0.4943, + "reward": 2.3344244956970215, + "reward_std": 0.38865962624549866, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.028422830160707235, + "rewards/tag_count_reward": 0.8697916865348816, "step": 856 }, { "clip_ratio": 0.0, - "completion_length": 394.68751525878906, - "epoch": 0.857, - "grad_norm": 21.20771571565506, - "kl": 2.45703125, - "learning_rate": 1.5490766105740876e-07, - "loss": 0.5834, - "reward": 2.4374274015426636, - "reward_std": 0.4593869596719742, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.019169961102306843, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 539.9166870117188, + "epoch": 0.4285, + "grad_norm": 21.14206646504488, + "kl": 2.140625, + "learning_rate": 7.35181461372299e-07, + "loss": 1.0918, + "reward": 2.400139808654785, + "reward_std": 0.6591689586639404, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9444443881511688, + "rewards/repetition_penalty_reward": -0.044304635375738144, + "rewards/tag_count_reward": 0.8750000298023224, "step": 857 }, { "clip_ratio": 0.0, - "completion_length": 338.50001525878906, - "epoch": 0.858, - "grad_norm": 11.186901565270288, - "kl": 2.41796875, - "learning_rate": 1.5415814221002265e-07, - "loss": 0.1621, - "reward": 2.769268274307251, - "reward_std": 0.46035242080688477, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.024134621024131775, - "rewards/tag_count_reward": 0.953125, + "completion_length": 492.62501525878906, + "epoch": 0.429, + "grad_norm": 17.16519433385956, + "kl": 1.72265625, + "learning_rate": 7.344653652287077e-07, + "loss": 0.7656, + "reward": 2.677896022796631, + "reward_std": 0.6244023740291595, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.03217345476150513, + "rewards/tag_count_reward": 0.8906250298023224, "step": 858 }, { "clip_ratio": 0.0, - "completion_length": 433.9166717529297, - "epoch": 0.859, - "grad_norm": 26.347652823757198, - "kl": 2.93359375, - "learning_rate": 1.5341344657075354e-07, - "loss": 0.7346, - "reward": 2.543567419052124, - "reward_std": 0.6669409871101379, + "completion_length": 538.0416717529297, + "epoch": 0.4295, + "grad_norm": 13.397818376841236, + "kl": 2.8046875, + "learning_rate": 7.337487071716232e-07, + "loss": 0.5746, + "reward": 2.45145046710968, + "reward_std": 0.6907146573066711, "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02066876133903861, - "rewards/tag_count_reward": 0.9114583730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.034660689532756805, + "rewards/tag_count_reward": 0.8541666865348816, "step": 859 }, { "clip_ratio": 0.0, - "completion_length": 308.7708435058594, - "epoch": 0.86, - "grad_norm": 24.14563880775655, - "kl": 1.58203125, - "learning_rate": 1.5267358321348285e-07, - "loss": 0.3053, - "reward": 2.8249399662017822, - "reward_std": 0.4262206554412842, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01186564564704895, - "rewards/tag_count_reward": 0.96875, + "completion_length": 389.41668701171875, + "epoch": 0.43, + "grad_norm": 10.960948707755849, + "kl": 1.181640625, + "learning_rate": 7.330314893841101e-07, + "loss": 0.2798, + "reward": 2.515633225440979, + "reward_std": 0.4169411063194275, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02603358030319214, + "rewards/tag_count_reward": 0.9583333432674408, "step": 860 }, { "clip_ratio": 0.0, - "completion_length": 364.9583435058594, - "epoch": 0.861, - "grad_norm": 30.59682645079428, - "kl": 2.7890625, - "learning_rate": 1.5193856115321224e-07, - "loss": 0.4332, - "reward": 2.4405341148376465, - "reward_std": 0.4027775228023529, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01779924053698778, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 288.7708435058594, + "epoch": 0.4305, + "grad_norm": 3.4791756020959634, + "kl": 0.291015625, + "learning_rate": 7.323137140509381e-07, + "loss": 0.0551, + "reward": 2.6766408681869507, + "reward_std": 0.10699502378702164, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03169257752597332, + "rewards/tag_count_reward": 1.0, "step": 861 }, { "clip_ratio": 0.0, - "completion_length": 432.06251525878906, - "epoch": 0.862, - "grad_norm": 36.09673783797281, - "kl": 6.0625, - "learning_rate": 1.5120838934595337e-07, - "loss": 0.8199, - "reward": 2.335049033164978, - "reward_std": 0.7307652831077576, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.01738166157156229, - "rewards/tag_count_reward": 0.8802083432674408, + "completion_length": 492.5416717529297, + "epoch": 0.431, + "grad_norm": 184.11729136527305, + "kl": 8.76171875, + "learning_rate": 7.315953833585755e-07, + "loss": 0.8431, + "reward": 2.497468113899231, + "reward_std": 0.5085512399673462, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.025101464241743088, + "rewards/tag_count_reward": 0.8697916865348816, "step": 862 }, { "clip_ratio": 0.0, - "completion_length": 502.0208435058594, - "epoch": 0.863, - "grad_norm": 53.67270636598954, - "kl": 6.25, - "learning_rate": 1.5048307668861947e-07, - "loss": 1.0301, - "reward": 2.3041592836380005, - "reward_std": 0.6167122721672058, - "rewards/accuracy_reward": 0.4375000223517418, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017021275125443935, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 547.9166870117188, + "epoch": 0.4315, + "grad_norm": 634.497752315967, + "kl": 19.375, + "learning_rate": 7.308764994951821e-07, + "loss": 2.7713, + "reward": 2.3461129665374756, + "reward_std": 0.7580728530883789, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.014998285099864006, + "rewards/tag_count_reward": 0.8750000298023224, "step": 863 }, { "clip_ratio": 0.0, - "completion_length": 502.625, - "epoch": 0.864, - "grad_norm": 40.13186559524844, - "kl": 6.4140625, - "learning_rate": 1.4976263201891613e-07, - "loss": 1.1658, - "reward": 2.634553909301758, - "reward_std": 0.6430298089981079, + "completion_length": 405.0208435058594, + "epoch": 0.432, + "grad_norm": 51.28196086116214, + "kl": 3.462890625, + "learning_rate": 7.301570646506027e-07, + "loss": 0.6456, + "reward": 2.7293022871017456, + "reward_std": 0.44555216282606125, "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9444445073604584, - "rewards/repetition_penalty_reward": -0.01822400465607643, - "rewards/tag_count_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.013753433711826801, + "rewards/tag_count_reward": 0.9583333730697632, "step": 864 }, { "clip_ratio": 0.0, - "completion_length": 602.0625152587891, - "epoch": 0.865, - "grad_norm": 67.05207348072643, - "kl": 9.6796875, - "learning_rate": 1.4904706411523448e-07, - "loss": 1.2048, - "reward": 2.128021478652954, - "reward_std": 0.7044219970703125, - "rewards/accuracy_reward": 0.416666679084301, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.0178119083866477, - "rewards/tag_count_reward": 0.7916666865348816, + "completion_length": 292.3541717529297, + "epoch": 0.4325, + "grad_norm": 24.09046766028715, + "kl": 1.6337890625, + "learning_rate": 7.294370810163607e-07, + "loss": 0.192, + "reward": 2.6840856075286865, + "reward_std": 0.17013289034366608, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.013831105083227158, + "rewards/tag_count_reward": 0.96875, "step": 865 }, { "clip_ratio": 0.0, - "completion_length": 473.0, - "epoch": 0.866, - "grad_norm": 17.39107169503413, - "kl": 5.40625, - "learning_rate": 1.483363816965435e-07, - "loss": 1.0166, - "reward": 2.366263747215271, - "reward_std": 0.7000749707221985, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.024361333809792995, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 589.7500152587891, + "epoch": 0.433, + "grad_norm": 405.26278466771953, + "kl": 12.75, + "learning_rate": 7.287165507856512e-07, + "loss": 1.1717, + "reward": 2.1014411449432373, + "reward_std": 0.6244297027587891, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.021822815760970116, + "rewards/tag_count_reward": 0.8593750298023224, "step": 866 }, { "clip_ratio": 0.0, - "completion_length": 589.4166870117188, - "epoch": 0.867, - "grad_norm": 52.2532552532266, - "kl": 8.546875, - "learning_rate": 1.4763059342228434e-07, - "loss": 1.0051, - "reward": 2.1066824197769165, - "reward_std": 0.6221250742673874, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.009637102019041777, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 429.8333435058594, + "epoch": 0.4335, + "grad_norm": 64.46281032488784, + "kl": 4.21484375, + "learning_rate": 7.279954761533342e-07, + "loss": 0.5701, + "reward": 2.4746246337890625, + "reward_std": 0.7078258693218231, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0323198651894927, + "rewards/tag_count_reward": 0.9166666865348816, "step": 867 }, { "clip_ratio": 0.0, - "completion_length": 482.3541717529297, - "epoch": 0.868, - "grad_norm": 34.11769671109303, - "kl": 4.921875, - "learning_rate": 1.469297078922642e-07, - "loss": 0.8941, - "reward": 2.2735098600387573, - "reward_std": 0.3796389400959015, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.01468472508713603, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 502.9583435058594, + "epoch": 0.434, + "grad_norm": 21.766406267926293, + "kl": 2.9140625, + "learning_rate": 7.27273859315928e-07, + "loss": 0.8102, + "reward": 2.275583267211914, + "reward_std": 0.7442755401134491, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.023027977906167507, + "rewards/tag_count_reward": 0.8541666865348816, "step": 868 }, { "clip_ratio": 0.0, - "completion_length": 419.0208435058594, - "epoch": 0.869, - "grad_norm": 28.34872735283239, - "kl": 3.4375, - "learning_rate": 1.4623373364655223e-07, - "loss": 0.9823, - "reward": 2.6034168004989624, - "reward_std": 0.6418764889240265, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.014638913795351982, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 388.0833435058594, + "epoch": 0.4345, + "grad_norm": 35.27655318443123, + "kl": 0.978515625, + "learning_rate": 7.265517024716026e-07, + "loss": 0.5909, + "reward": 2.544116973876953, + "reward_std": 0.6133610606193542, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.03574422746896744, + "rewards/tag_count_reward": 0.9479166865348816, "step": 869 }, { "clip_ratio": 0.0, - "completion_length": 400.56251525878906, - "epoch": 0.87, - "grad_norm": 17.236700769600052, - "kl": 2.61328125, - "learning_rate": 1.4554267916537495e-07, - "loss": 0.5795, - "reward": 2.361931324005127, - "reward_std": 0.5499020516872406, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.018277212977409363, - "rewards/tag_count_reward": 0.921875, + "completion_length": 370.5416717529297, + "epoch": 0.435, + "grad_norm": 12.299061579151882, + "kl": 0.912109375, + "learning_rate": 7.258290078201731e-07, + "loss": 0.2876, + "reward": 2.400050640106201, + "reward_std": 0.3678786903619766, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.025296634063124657, + "rewards/tag_count_reward": 0.9322916865348816, "step": 870 }, { "clip_ratio": 0.0, - "completion_length": 570.1875152587891, - "epoch": 0.871, - "grad_norm": 27.373915984285357, - "kl": 5.46875, - "learning_rate": 1.448565528690129e-07, - "loss": 1.2036, - "reward": 2.156299114227295, - "reward_std": 0.615426778793335, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.027728760614991188, - "rewards/tag_count_reward": 0.84375, + "completion_length": 263.6041793823242, + "epoch": 0.4355, + "grad_norm": 3.4077521243834212, + "kl": 0.22705078125, + "learning_rate": 7.251057775630927e-07, + "loss": 0.0145, + "reward": 2.8228542804718018, + "reward_std": 0.17518935352563858, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017423711717128754, + "rewards/tag_count_reward": 1.0, "step": 871 }, { "clip_ratio": 0.0, - "completion_length": 488.875, - "epoch": 0.872, - "grad_norm": 25.43206804197954, - "kl": 3.91796875, - "learning_rate": 1.4417536311769885e-07, - "loss": 0.8613, - "reward": 2.2447725534439087, - "reward_std": 0.6266459226608276, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.012171986512839794, - "rewards/tag_count_reward": 0.875, + "completion_length": 358.2083435058594, + "epoch": 0.436, + "grad_norm": 30.74690332937046, + "kl": 0.7333984375, + "learning_rate": 7.243820139034464e-07, + "loss": 0.3554, + "reward": 2.778803586959839, + "reward_std": 0.4877840280532837, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018071786500513554, + "rewards/tag_count_reward": 0.9427083432674408, "step": 872 }, { "clip_ratio": 0.0, - "completion_length": 385.1458435058594, - "epoch": 0.873, - "grad_norm": 17.40833999493999, - "kl": 2.28125, - "learning_rate": 1.4349911821151462e-07, - "loss": 0.3716, - "reward": 2.3771172761917114, - "reward_std": 0.5491065829992294, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016980044543743134, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 382.04168701171875, + "epoch": 0.4365, + "grad_norm": 39.70789630630236, + "kl": 0.978515625, + "learning_rate": 7.236577190459433e-07, + "loss": 0.5793, + "reward": 2.512961983680725, + "reward_std": 0.49974559247493744, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.02349638007581234, + "rewards/tag_count_reward": 0.9322916865348816, "step": 873 }, { "clip_ratio": 0.0, - "completion_length": 504.31251525878906, - "epoch": 0.874, - "grad_norm": 33.796267710350925, - "kl": 3.75, - "learning_rate": 1.4282782639029128e-07, - "loss": 0.8301, - "reward": 2.4175636768341064, - "reward_std": 0.5949557721614838, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02167257433757186, - "rewards/tag_count_reward": 0.890625, + "completion_length": 390.8958435058594, + "epoch": 0.437, + "grad_norm": 38.28149680062182, + "kl": 1.4296875, + "learning_rate": 7.229328951969115e-07, + "loss": 0.6429, + "reward": 2.5388529300689697, + "reward_std": 0.7759108543395996, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.03406373132020235, + "rewards/tag_count_reward": 0.8854166865348816, "step": 874 }, { "clip_ratio": 0.0, - "completion_length": 591.6041870117188, - "epoch": 0.875, - "grad_norm": 26.046786075949313, - "kl": 7.28125, - "learning_rate": 1.4216149583350755e-07, - "loss": 1.0826, - "reward": 2.029456853866577, - "reward_std": 0.6440670937299728, - "rewards/accuracy_reward": 0.3125000111758709, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.015682112891227007, - "rewards/tag_count_reward": 0.7812500298023224, + "completion_length": 387.2708435058594, + "epoch": 0.4375, + "grad_norm": 29.933419477194974, + "kl": 1.177734375, + "learning_rate": 7.222075445642904e-07, + "loss": 0.4695, + "reward": 2.6218247413635254, + "reward_std": 0.47380533814430237, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.024008896201848984, + "rewards/tag_count_reward": 0.9583333730697632, "step": 875 }, { "clip_ratio": 0.0, - "completion_length": 409.10418701171875, - "epoch": 0.876, - "grad_norm": 27.02250848782874, - "kl": 2.4375, - "learning_rate": 1.4150013466019114e-07, - "loss": 0.5262, - "reward": 2.6342684030532837, - "reward_std": 0.5182087272405624, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.023717753123492002, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 323.4791717529297, + "epoch": 0.438, + "grad_norm": 17.340458090206823, + "kl": 1.15283203125, + "learning_rate": 7.214816693576234e-07, + "loss": 0.3436, + "reward": 2.513605237007141, + "reward_std": 0.3129299432039261, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.024589229375123978, + "rewards/tag_count_reward": 0.96875, "step": 876 }, { "clip_ratio": 0.0, - "completion_length": 640.5208740234375, - "epoch": 0.877, - "grad_norm": 37.146004800834625, - "kl": 6.90625, - "learning_rate": 1.4084375092881917e-07, - "loss": 1.1224, - "reward": 2.2963308095932007, - "reward_std": 0.7272785305976868, - "rewards/accuracy_reward": 0.5833333730697632, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.024849796667695045, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 294.0208435058594, + "epoch": 0.4385, + "grad_norm": 3.2301186364234344, + "kl": 0.24853515625, + "learning_rate": 7.207552717880522e-07, + "loss": 0.04, + "reward": 2.6579480171203613, + "reward_std": 0.24716416746377945, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02955202106386423, + "rewards/tag_count_reward": 1.0, "step": 877 }, { "clip_ratio": 0.0, - "completion_length": 498.75001525878906, - "epoch": 0.878, - "grad_norm": 22.18515714015866, - "kl": 4.2578125, - "learning_rate": 1.4019235263722034e-07, - "loss": 0.8769, - "reward": 2.144127130508423, - "reward_std": 0.4630337953567505, - "rewards/accuracy_reward": 0.3125000074505806, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.015595164615660906, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 292.2291717529297, + "epoch": 0.439, + "grad_norm": 20.50363538611729, + "kl": 1.21435546875, + "learning_rate": 7.200283540683102e-07, + "loss": 0.1637, + "reward": 2.731342077255249, + "reward_std": 0.4017653465270996, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03254689462482929, + "rewards/tag_count_reward": 0.9791666865348816, "step": 878 }, { "clip_ratio": 0.0, - "completion_length": 486.8333435058594, - "epoch": 0.879, - "grad_norm": 33.22887528150052, - "kl": 4.6015625, - "learning_rate": 1.395459477224772e-07, - "loss": 0.8573, - "reward": 2.4029178619384766, - "reward_std": 0.5731684267520905, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.02763769868761301, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 319.0833435058594, + "epoch": 0.4395, + "grad_norm": 123.17102900336207, + "kl": 5.9921875, + "learning_rate": 7.193009184127145e-07, + "loss": 0.8709, + "reward": 2.4152499437332153, + "reward_std": 0.3710479885339737, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.0222501028329134, + "rewards/tag_count_reward": 0.9583333432674408, "step": 879 }, { "clip_ratio": 0.0, - "completion_length": 386.7291717529297, - "epoch": 0.88, - "grad_norm": 20.990104821906048, - "kl": 1.6796875, - "learning_rate": 1.3890454406082956e-07, - "loss": 0.5235, - "reward": 2.454037666320801, - "reward_std": 0.4934305101633072, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.01644855784252286, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 448.12501525878906, + "epoch": 0.44, + "grad_norm": 363.0002951670322, + "kl": 14.9375, + "learning_rate": 7.185729670371604e-07, + "loss": 2.2049, + "reward": 2.466444969177246, + "reward_std": 0.6788784861564636, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01966631133109331, + "rewards/tag_count_reward": 0.8750000298023224, "step": 880 }, { "clip_ratio": 0.0, - "completion_length": 502.29168701171875, - "epoch": 0.881, - "grad_norm": 36.364235182054, - "kl": 5.3828125, - "learning_rate": 1.3826814946757888e-07, - "loss": 1.4712, - "reward": 2.4933007955551147, - "reward_std": 0.7101709246635437, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.015379873104393482, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 355.625, + "epoch": 0.4405, + "grad_norm": 377.3905680062747, + "kl": 13.265625, + "learning_rate": 7.17844502159114e-07, + "loss": 1.9299, + "reward": 2.5884649753570557, + "reward_std": 0.302852138876915, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.017437931150197983, + "rewards/tag_count_reward": 0.9531250298023224, "step": 881 }, { "clip_ratio": 0.0, - "completion_length": 520.25, - "epoch": 0.882, - "grad_norm": 60.16390554825615, - "kl": 6.0546875, - "learning_rate": 1.3763677169699217e-07, - "loss": 1.0879, - "reward": 2.3464200496673584, - "reward_std": 0.5781585574150085, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.019899526610970497, - "rewards/tag_count_reward": 0.8802083730697632, + "completion_length": 299.10418701171875, + "epoch": 0.441, + "grad_norm": 81.195858576975, + "kl": 3.01171875, + "learning_rate": 7.171155259976057e-07, + "loss": 0.8336, + "reward": 2.445673108100891, + "reward_std": 0.44037002325057983, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.026549202390015125, + "rewards/tag_count_reward": 0.9583333730697632, "step": 882 }, { "clip_ratio": 0.0, - "completion_length": 499.5625, - "epoch": 0.883, - "grad_norm": 34.74857681904962, - "kl": 6.4375, - "learning_rate": 1.370104184422085e-07, - "loss": 0.5873, - "reward": 2.2807507514953613, - "reward_std": 0.5849172174930573, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9166667759418488, - "rewards/repetition_penalty_reward": -0.01091603422537446, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 420.3541717529297, + "epoch": 0.4415, + "grad_norm": 51.10985682493742, + "kl": 4.3125, + "learning_rate": 7.163860407732231e-07, + "loss": 0.9492, + "reward": 2.43084716796875, + "reward_std": 0.4608851373195648, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02575009036809206, + "rewards/tag_count_reward": 0.921875, "step": 883 }, { "clip_ratio": 0.0, - "completion_length": 448.31251525878906, - "epoch": 0.884, - "grad_norm": 24.793424614985756, - "kl": 3.4609375, - "learning_rate": 1.3638909733514452e-07, - "loss": 1.0009, - "reward": 2.423251986503601, - "reward_std": 0.6239519119262695, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01598423975519836, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 366.79168701171875, + "epoch": 0.442, + "grad_norm": 145.68633226852327, + "kl": 7.15625, + "learning_rate": 7.156560487081051e-07, + "loss": 0.8389, + "reward": 2.534042716026306, + "reward_std": 0.6869227886199951, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.019776692148298025, + "rewards/tag_count_reward": 0.8802083730697632, "step": 884 }, { "clip_ratio": 0.0, - "completion_length": 541.0, - "epoch": 0.885, - "grad_norm": 40.71564310689436, - "kl": 6.109375, - "learning_rate": 1.3577281594640182e-07, - "loss": 0.9699, - "reward": 2.234649658203125, - "reward_std": 0.5764244198799133, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.01708643836900592, - "rewards/tag_count_reward": 0.8697916865348816, + "completion_length": 351.125, + "epoch": 0.4425, + "grad_norm": 31.81128878392762, + "kl": 1.453125, + "learning_rate": 7.149255520259338e-07, + "loss": 0.4362, + "reward": 2.469942808151245, + "reward_std": 0.6845696568489075, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.017904633656144142, + "rewards/tag_count_reward": 0.9114583730697632, "step": 885 }, { "clip_ratio": 0.0, - "completion_length": 400.8541717529297, - "epoch": 0.886, - "grad_norm": 26.123391575885496, - "kl": 3.5625, - "learning_rate": 1.351615817851748e-07, - "loss": 0.8407, - "reward": 2.6832687854766846, - "reward_std": 0.4722274839878082, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9236111640930176, - "rewards/repetition_penalty_reward": -0.021592404693365097, - "rewards/tag_count_reward": 0.9479166865348816, - "step": 886 - }, - { - "clip_ratio": 0.0, - "completion_length": 527.3333587646484, - "epoch": 0.887, - "grad_norm": 20.992724819526305, - "kl": 5.24609375, - "learning_rate": 1.345554022991586e-07, - "loss": 1.196, - "reward": 2.380396842956543, - "reward_std": 0.6401159465312958, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.011964202858507633, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 276.3541717529297, + "epoch": 0.443, + "grad_norm": 4.828439869270836, + "kl": 0.30419921875, + "learning_rate": 7.141945529519288e-07, + "loss": 0.049, + "reward": 2.859205484390259, + "reward_std": 0.2999560683965683, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026211323216557503, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 886 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.25001525878906, + "epoch": 0.4435, + "grad_norm": 18.69245813013392, + "kl": 1.875, + "learning_rate": 7.134630537128403e-07, + "loss": 0.4583, + "reward": 2.5157185792922974, + "reward_std": 0.4226338863372803, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.02073993068188429, + "rewards/tag_count_reward": 0.8697916865348816, "step": 887 }, { "clip_ratio": 0.0, - "completion_length": 429.3541717529297, - "epoch": 0.888, - "grad_norm": 18.876114754116053, - "kl": 2.08984375, - "learning_rate": 1.3395428487445914e-07, - "loss": 0.5497, - "reward": 2.315842390060425, - "reward_std": 0.41791072487831116, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.014018761925399303, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 262.54168701171875, + "epoch": 0.444, + "grad_norm": 8.211191313929003, + "kl": 0.3388671875, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0778, + "reward": 2.9123250246047974, + "reward_std": 0.16250279545783997, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03559182398021221, + "rewards/tag_count_reward": 0.9895833432674408, "step": 888 }, { "clip_ratio": 0.0, - "completion_length": 433.9166717529297, - "epoch": 0.889, - "grad_norm": 20.1828507834608, - "kl": 4.0234375, - "learning_rate": 1.3335823683550237e-07, - "loss": 0.7386, - "reward": 2.623020648956299, - "reward_std": 0.6058576703071594, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.017604444175958633, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 381.1666717529297, + "epoch": 0.4445, + "grad_norm": 39.63561659258511, + "kl": 1.671875, + "learning_rate": 7.11998563654023e-07, + "loss": 0.4953, + "reward": 2.48183274269104, + "reward_std": 0.5570933073759079, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.03379237279295921, + "rewards/tag_count_reward": 0.9114583730697632, "step": 889 }, { "clip_ratio": 0.0, - "completion_length": 498.7500305175781, - "epoch": 0.89, - "grad_norm": 21.280761697718606, - "kl": 4.3125, - "learning_rate": 1.3276726544494571e-07, - "loss": 0.8813, - "reward": 2.311787486076355, - "reward_std": 0.6657915413379669, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.019809790886938572, - "rewards/tag_count_reward": 0.8593750298023224, + "completion_length": 408.1875228881836, + "epoch": 0.445, + "grad_norm": 19.45400997832264, + "kl": 2.63671875, + "learning_rate": 7.11265577295385e-07, + "loss": 0.5174, + "reward": 2.653844475746155, + "reward_std": 0.4112878441810608, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.012822255957871675, + "rewards/tag_count_reward": 0.9166666865348816, "step": 890 }, { "clip_ratio": 0.0, - "completion_length": 408.9166717529297, - "epoch": 0.891, - "grad_norm": 15.5213911574011, - "kl": 3.2626953125, - "learning_rate": 1.3218137790358892e-07, - "loss": 0.3428, - "reward": 2.777226448059082, - "reward_std": 0.3445788323879242, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.016176452860236168, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 298.5208435058594, + "epoch": 0.4455, + "grad_norm": 40.00263159392063, + "kl": 1.97265625, + "learning_rate": 7.105320996938314e-07, + "loss": 0.5189, + "reward": 2.7283371686935425, + "reward_std": 0.4932422488927841, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.030343500431627035, + "rewards/tag_count_reward": 0.953125, "step": 891 }, { "clip_ratio": 0.0, - "completion_length": 547.1458435058594, - "epoch": 0.892, - "grad_norm": 26.69864519239987, - "kl": 5.2734375, - "learning_rate": 1.316005813502869e-07, - "loss": 0.735, - "reward": 2.280866265296936, - "reward_std": 0.5884718000888824, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.017744951881468296, - "rewards/tag_count_reward": 0.875, + "completion_length": 394.2083435058594, + "epoch": 0.446, + "grad_norm": 41.93561167127058, + "kl": 4.359375, + "learning_rate": 7.097981330836616e-07, + "loss": 1.3479, + "reward": 2.7040258646011353, + "reward_std": 0.5273069739341736, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.040765900164842606, + "rewards/tag_count_reward": 0.9114583432674408, "step": 892 }, { "clip_ratio": 0.0, - "completion_length": 429.68751525878906, - "epoch": 0.893, - "grad_norm": 16.8784751889796, - "kl": 2.828125, - "learning_rate": 1.3102488286186234e-07, - "loss": 0.5288, - "reward": 2.408605933189392, - "reward_std": 0.48363907635211945, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.023685818538069725, - "rewards/tag_count_reward": 0.9322916865348816, + "completion_length": 346.25001525878906, + "epoch": 0.4465, + "grad_norm": 48.76214538177417, + "kl": 5.28125, + "learning_rate": 7.090636797006657e-07, + "loss": 0.5066, + "reward": 2.608002543449402, + "reward_std": 0.7648839950561523, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.025678012520074844, + "rewards/tag_count_reward": 0.9114583730697632, "step": 893 }, { "clip_ratio": 0.0, - "completion_length": 437.5416717529297, - "epoch": 0.894, - "grad_norm": 41.80416425701382, - "kl": 3.0625, - "learning_rate": 1.3045428945301953e-07, - "loss": 0.8179, - "reward": 2.665311813354492, - "reward_std": 0.4915819466114044, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.01177146751433611, - "rewards/tag_count_reward": 0.9270833730697632, + "completion_length": 369.04168701171875, + "epoch": 0.447, + "grad_norm": 92.64315624811375, + "kl": 5.3515625, + "learning_rate": 7.083287417821157e-07, + "loss": 0.6629, + "reward": 2.784505248069763, + "reward_std": 0.33816368877887726, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01063354080542922, + "rewards/tag_count_reward": 0.9479166865348816, "step": 894 }, { "clip_ratio": 0.0, - "completion_length": 358.5833435058594, - "epoch": 0.895, - "grad_norm": 47.78557902231892, - "kl": 2.76171875, - "learning_rate": 1.2988880807625927e-07, - "loss": 0.5157, - "reward": 2.5233113765716553, - "reward_std": 0.5640779733657837, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.025299932807683945, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 643.3958587646484, + "epoch": 0.4475, + "grad_norm": 223.85239628295713, + "kl": 16.421875, + "learning_rate": 7.075933215667604e-07, + "loss": 1.5531, + "reward": 2.1586782932281494, + "reward_std": 0.6240611672401428, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.02014122251421213, + "rewards/tag_count_reward": 0.7760416865348816, "step": 895 }, { "clip_ratio": 0.0, - "completion_length": 535.6666870117188, - "epoch": 0.896, - "grad_norm": 27.290533216671655, - "kl": 4.2578125, - "learning_rate": 1.2932844562179352e-07, - "loss": 0.6981, - "reward": 2.0957794189453125, - "reward_std": 0.5598824918270111, - "rewards/accuracy_reward": 0.25, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.015331756323575974, - "rewards/tag_count_reward": 0.8958333432674408, + "completion_length": 381.2916717529297, + "epoch": 0.448, + "grad_norm": 32.38556348235585, + "kl": 2.5, + "learning_rate": 7.068574212948169e-07, + "loss": 0.954, + "reward": 2.5282145738601685, + "reward_std": 0.39006973803043365, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.023868918418884277, + "rewards/tag_count_reward": 0.9479166865348816, "step": 896 }, { "clip_ratio": 0.0, - "completion_length": 384.68751525878906, - "epoch": 0.897, - "grad_norm": 8.938743613588342, - "kl": 1.4921875, - "learning_rate": 1.2877320891746201e-07, - "loss": 0.2616, - "reward": 2.667781949043274, - "reward_std": 0.4907715171575546, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.026662585325539112, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 445.62501525878906, + "epoch": 0.4485, + "grad_norm": 29.350505167918783, + "kl": 5.484375, + "learning_rate": 7.06121043207965e-07, + "loss": 1.1386, + "reward": 2.497186064720154, + "reward_std": 0.7929587364196777, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.027119554579257965, + "rewards/tag_count_reward": 0.8645833432674408, "step": 897 }, { "clip_ratio": 0.0, - "completion_length": 413.6041717529297, - "epoch": 0.898, - "grad_norm": 44.44288512911277, - "kl": 1.8125, - "learning_rate": 1.2822310472864885e-07, - "loss": 0.7351, - "reward": 2.583755135536194, - "reward_std": 0.557906985282898, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.01867562346160412, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 615.4166870117188, + "epoch": 0.449, + "grad_norm": 36.911352628939454, + "kl": 6.421875, + "learning_rate": 7.053841895493406e-07, + "loss": 1.61, + "reward": 2.3755075931549072, + "reward_std": 0.9200084507465363, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.016853561159223318, + "rewards/tag_count_reward": 0.8020833730697632, "step": 898 }, { "clip_ratio": 0.0, - "completion_length": 339.2708435058594, - "epoch": 0.899, - "grad_norm": 12.549931479255974, - "kl": 2.23046875, - "learning_rate": 1.2767813975819983e-07, - "loss": 0.3723, - "reward": 2.7536635398864746, - "reward_std": 0.3352846037596464, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.010225379839539528, - "rewards/tag_count_reward": 0.9375, + "completion_length": 442.1250305175781, + "epoch": 0.4495, + "grad_norm": 22.680192929837112, + "kl": 4.47265625, + "learning_rate": 7.046468625635274e-07, + "loss": 0.7625, + "reward": 2.4903249740600586, + "reward_std": 0.8195319175720215, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.01661947136744857, + "rewards/tag_count_reward": 0.875, "step": 899 }, { "clip_ratio": 0.0, - "completion_length": 467.1250305175781, - "epoch": 0.9, - "grad_norm": 22.420504928111246, - "kl": 3.375, - "learning_rate": 1.2713832064634125e-07, - "loss": 0.8531, - "reward": 2.0020930767059326, - "reward_std": 0.47625498473644257, - "rewards/accuracy_reward": 0.1875, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.020476454868912697, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 463.6041717529297, + "epoch": 0.45, + "grad_norm": 27.272842353579144, + "kl": 4.59375, + "learning_rate": 7.039090644965509e-07, + "loss": 0.6818, + "reward": 2.319231152534485, + "reward_std": 0.888009250164032, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.0227828249335289, + "rewards/tag_count_reward": 0.8072916865348816, "step": 900 }, { "clip_ratio": 0.0, - "completion_length": 603.0833587646484, - "epoch": 0.901, - "grad_norm": 32.67875227205643, - "kl": 5.296875, - "learning_rate": 1.2660365397059856e-07, - "loss": 1.2488, - "reward": 2.2659354209899902, - "reward_std": 0.6388919800519943, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02573126833885908, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 449.0833435058594, + "epoch": 0.4505, + "grad_norm": 34.13999464127485, + "kl": 2.9140625, + "learning_rate": 7.031707975958726e-07, + "loss": 0.7008, + "reward": 2.466166138648987, + "reward_std": 0.6233952641487122, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.01994516607373953, + "rewards/tag_count_reward": 0.8750000298023224, "step": 901 }, { "clip_ratio": 0.0, - "completion_length": 458.7916717529297, - "epoch": 0.902, - "grad_norm": 31.91653489837797, - "kl": 2.703125, - "learning_rate": 1.260741462457165e-07, - "loss": 1.0144, - "reward": 2.4180402755737305, - "reward_std": 0.6183659136295319, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.017723857890814543, - "rewards/tag_count_reward": 0.9218750298023224, + "completion_length": 477.9166717529297, + "epoch": 0.451, + "grad_norm": 44.299788634003804, + "kl": 3.7109375, + "learning_rate": 7.024320641103811e-07, + "loss": 0.9245, + "reward": 2.344895601272583, + "reward_std": 0.6696476340293884, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01621554931625724, + "rewards/tag_count_reward": 0.8541666865348816, "step": 902 }, { "clip_ratio": 0.0, - "completion_length": 526.8750305175781, - "epoch": 0.903, - "grad_norm": 29.282295286202803, - "kl": 6.421875, - "learning_rate": 1.2554980392357956e-07, - "loss": 1.0086, - "reward": 2.2422985434532166, - "reward_std": 0.8072032630443573, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.8888889253139496, - "rewards/repetition_penalty_reward": -0.0163820618763566, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 702.1458435058594, + "epoch": 0.4515, + "grad_norm": 140.07908899869074, + "kl": 14.5, + "learning_rate": 7.01692866290387e-07, + "loss": 1.4671, + "reward": 1.8353345394134521, + "reward_std": 0.8967846035957336, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.013623815728351474, + "rewards/tag_count_reward": 0.6197916865348816, "step": 903 }, { "clip_ratio": 0.0, - "completion_length": 478.37501525878906, - "epoch": 0.904, - "grad_norm": 33.228944520213055, - "kl": 6.609375, - "learning_rate": 1.2503063339313356e-07, - "loss": 1.1511, - "reward": 2.098447561264038, - "reward_std": 0.6330851316452026, - "rewards/accuracy_reward": 0.3541666716337204, - "rewards/reasoning_steps_reward": 0.902777761220932, - "rewards/repetition_penalty_reward": -0.012663647066801786, - "rewards/tag_count_reward": 0.8541666865348816, + "completion_length": 422.50001525878906, + "epoch": 0.452, + "grad_norm": 52.20368268795704, + "kl": 5.78125, + "learning_rate": 7.009532063876148e-07, + "loss": 0.6616, + "reward": 2.408238172531128, + "reward_std": 0.5504840314388275, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.024053473956882954, + "rewards/tag_count_reward": 0.8697916865348816, "step": 904 }, { "clip_ratio": 0.0, - "completion_length": 458.87501525878906, - "epoch": 0.905, - "grad_norm": 36.169535665899474, - "kl": 4.171875, - "learning_rate": 1.2451664098030743e-07, - "loss": 0.7981, - "reward": 2.4604378938674927, - "reward_std": 0.7671001553535461, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9375000596046448, - "rewards/repetition_penalty_reward": -0.0291455565020442, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 445.0416717529297, + "epoch": 0.4525, + "grad_norm": 33.39372260946179, + "kl": 4.5234375, + "learning_rate": 7.002130866551968e-07, + "loss": 0.5914, + "reward": 2.167671024799347, + "reward_std": 0.30702342092990875, + "rewards/accuracy_reward": 0.3333333544433117, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03198174387216568, + "rewards/tag_count_reward": 0.8802083432674408, "step": 905 }, { "clip_ratio": 0.0, - "completion_length": 404.79168701171875, - "epoch": 0.906, - "grad_norm": 17.708470010127638, - "kl": 3.55078125, - "learning_rate": 1.2400783294793668e-07, - "loss": 0.9617, - "reward": 2.5592939853668213, - "reward_std": 0.7362368702888489, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.015358910895884037, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 484.9791717529297, + "epoch": 0.453, + "grad_norm": 25.372165474058104, + "kl": 3.1796875, + "learning_rate": 6.994725093476664e-07, + "loss": 0.5834, + "reward": 2.468542218208313, + "reward_std": 0.6370213180780411, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.01930511137470603, + "rewards/tag_count_reward": 0.8697916865348816, "step": 906 }, { "clip_ratio": 0.0, - "completion_length": 607.6250305175781, - "epoch": 0.907, - "grad_norm": 52.06808756849091, - "kl": 9.578125, - "learning_rate": 1.235042154956865e-07, - "loss": 1.4479, - "reward": 2.2677348852157593, - "reward_std": 0.8661213219165802, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.03261231165379286, - "rewards/tag_count_reward": 0.8072916865348816, + "completion_length": 408.97918701171875, + "epoch": 0.4535, + "grad_norm": 9.77141277310684, + "kl": 2.5703125, + "learning_rate": 6.987314767209503e-07, + "loss": 0.3112, + "reward": 2.3773725032806396, + "reward_std": 0.49295616149902344, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.021933134645223618, + "rewards/tag_count_reward": 0.8645833432674408, "step": 907 }, { "clip_ratio": 0.0, - "completion_length": 379.8958435058594, - "epoch": 0.908, - "grad_norm": 14.881992614761662, - "kl": 2.5, - "learning_rate": 1.2300579475997657e-07, - "loss": 0.3101, - "reward": 2.3961899280548096, - "reward_std": 0.6012940406799316, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.023949017748236656, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 462.1041793823242, + "epoch": 0.454, + "grad_norm": 24.983838013132367, + "kl": 2.546875, + "learning_rate": 6.979899910323624e-07, + "loss": 0.4343, + "reward": 2.223532557487488, + "reward_std": 0.6077858209609985, + "rewards/accuracy_reward": 0.45833333395421505, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.019523008493706584, + "rewards/tag_count_reward": 0.8541666865348816, "step": 908 }, { "clip_ratio": 0.0, - "completion_length": 573.8541870117188, - "epoch": 0.909, - "grad_norm": 96.3349004340528, - "kl": 8.25, - "learning_rate": 1.2251257681390645e-07, - "loss": 1.2975, - "reward": 2.3617441654205322, - "reward_std": 0.7564932107925415, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.01151987537741661, - "rewards/tag_count_reward": 0.8177083432674408, + "completion_length": 304.0416717529297, + "epoch": 0.4545, + "grad_norm": 28.981123137793116, + "kl": 1.62109375, + "learning_rate": 6.972480545405968e-07, + "loss": 0.3188, + "reward": 2.5748839378356934, + "reward_std": 0.47933661937713623, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.013657951261848211, + "rewards/tag_count_reward": 0.9427083432674408, "step": 909 }, { "clip_ratio": 0.0, - "completion_length": 523.9791717529297, - "epoch": 0.91, - "grad_norm": 36.959308180317855, - "kl": 6.234375, - "learning_rate": 1.220245676671809e-07, - "loss": 1.2606, - "reward": 2.20367431640625, - "reward_std": 0.9045920073986053, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.039381190203130245, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 440.97918701171875, + "epoch": 0.455, + "grad_norm": 25.272268744236346, + "kl": 5.78125, + "learning_rate": 6.965056695057204e-07, + "loss": 1.0701, + "reward": 2.4917463064193726, + "reward_std": 0.769326239824295, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.04297593608498573, + "rewards/tag_count_reward": 0.875, "step": 910 }, { "clip_ratio": 0.0, - "completion_length": 556.875, - "epoch": 0.911, - "grad_norm": 16.456009421461072, - "kl": 5.59375, - "learning_rate": 1.2154177326603763e-07, - "loss": 1.3297, - "reward": 2.312328815460205, - "reward_std": 0.7721385657787323, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/reasoning_steps_reward": 0.9236111044883728, - "rewards/repetition_penalty_reward": -0.022740643471479416, - "rewards/tag_count_reward": 0.8489583432674408, + "completion_length": 287.12501525878906, + "epoch": 0.4555, + "grad_norm": 30.011200731000926, + "kl": 2.720703125, + "learning_rate": 6.957628381891673e-07, + "loss": 0.3289, + "reward": 2.6884909868240356, + "reward_std": 0.36660417914390564, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.014634124469012022, + "rewards/tag_count_reward": 0.9739583432674408, "step": 911 }, { "clip_ratio": 0.0, - "completion_length": 409.29168701171875, - "epoch": 0.912, - "grad_norm": 18.335739930280326, - "kl": 2.6767578125, - "learning_rate": 1.2106419949317388e-07, - "loss": 0.6476, - "reward": 2.470385789871216, - "reward_std": 0.5638497471809387, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01746157230809331, - "rewards/tag_count_reward": 0.953125, + "completion_length": 328.75, + "epoch": 0.456, + "grad_norm": 26.395423136562222, + "kl": 3.484375, + "learning_rate": 6.950195628537299e-07, + "loss": 0.7263, + "reward": 2.7875736951828003, + "reward_std": 0.4841386526823044, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.02319031674414873, + "rewards/tag_count_reward": 0.9427083432674408, "step": 912 }, { "clip_ratio": 0.0, - "completion_length": 632.8958435058594, - "epoch": 0.913, - "grad_norm": 23.440878703155796, - "kl": 6.265625, - "learning_rate": 1.2059185216767543e-07, - "loss": 1.0253, - "reward": 2.0628278851509094, - "reward_std": 0.9162045121192932, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.9027778506278992, - "rewards/repetition_penalty_reward": -0.04307497665286064, - "rewards/tag_count_reward": 0.7864583432674408, + "completion_length": 379.5625, + "epoch": 0.4565, + "grad_norm": 81.85144356177, + "kl": 6.125, + "learning_rate": 6.942758457635543e-07, + "loss": 0.9552, + "reward": 2.5703320503234863, + "reward_std": 0.3624073415994644, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.03209859039634466, + "rewards/tag_count_reward": 0.9218750298023224, "step": 913 }, { "clip_ratio": 0.0, - "completion_length": 622.4583435058594, - "epoch": 0.914, - "grad_norm": 42.16437419261308, - "kl": 6.59375, - "learning_rate": 1.2012473704494537e-07, - "loss": 1.123, - "reward": 1.9104456305503845, - "reward_std": 0.520677238702774, - "rewards/accuracy_reward": 0.18750000558793545, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.020110088400542736, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 364.25001525878906, + "epoch": 0.457, + "grad_norm": 45.15015133012363, + "kl": 5.046875, + "learning_rate": 6.935316891841315e-07, + "loss": 0.4927, + "reward": 2.6616228818893433, + "reward_std": 0.5104624330997467, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.018932746723294258, + "rewards/tag_count_reward": 0.9375, "step": 914 }, { "clip_ratio": 0.0, - "completion_length": 379.04168701171875, - "epoch": 0.915, - "grad_norm": 108.02105225090547, - "kl": 3.39453125, - "learning_rate": 1.1966285981663407e-07, - "loss": 0.3631, - "reward": 2.5404409170150757, - "reward_std": 0.3071303367614746, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.020322874188423157, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 392.0416717529297, + "epoch": 0.4575, + "grad_norm": 37.202986072549116, + "kl": 2.38671875, + "learning_rate": 6.927870953822915e-07, + "loss": 0.5714, + "reward": 2.446205258369446, + "reward_std": 0.593352735042572, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.027753093279898167, + "rewards/tag_count_reward": 0.9114583432674408, "step": 915 }, { "clip_ratio": 0.0, - "completion_length": 573.6041870117188, - "epoch": 0.916, - "grad_norm": 30.21063587734105, - "kl": 5.234375, - "learning_rate": 1.1920622611056974e-07, - "loss": 0.9573, - "reward": 1.9745216369628906, - "reward_std": 0.8713274002075195, - "rewards/accuracy_reward": 0.291666679084301, - "rewards/reasoning_steps_reward": 0.881944477558136, - "rewards/repetition_penalty_reward": -0.042839540168643, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 322.35418701171875, + "epoch": 0.458, + "grad_norm": 24.96453853888965, + "kl": 0.90234375, + "learning_rate": 6.920420666261961e-07, + "loss": 0.2755, + "reward": 2.790469765663147, + "reward_std": 0.28246453404426575, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.027238698676228523, + "rewards/tag_count_reward": 0.984375, "step": 916 }, { "clip_ratio": 0.0, - "completion_length": 568.0833587646484, - "epoch": 0.917, - "grad_norm": 43.80723570465828, - "kl": 4.2109375, - "learning_rate": 1.1875484149069004e-07, - "loss": 1.1291, - "reward": 2.45370614528656, - "reward_std": 0.8172959387302399, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.020252287853509188, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 345.3958435058594, + "epoch": 0.4585, + "grad_norm": 49.466728379593775, + "kl": 0.9541015625, + "learning_rate": 6.912966051853322e-07, + "loss": 0.5167, + "reward": 2.606629729270935, + "reward_std": 0.5284582823514938, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.025314772501587868, + "rewards/tag_count_reward": 0.9791666865348816, "step": 917 }, { "clip_ratio": 0.0, - "completion_length": 400.7083435058594, - "epoch": 0.918, - "grad_norm": 20.756190324375613, - "kl": 1.0966796875, - "learning_rate": 1.1830871145697412e-07, - "loss": 0.2955, - "reward": 2.608530044555664, - "reward_std": 0.4455796778202057, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.028622763231396675, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 385.18751525878906, + "epoch": 0.459, + "grad_norm": 8.401580563295221, + "kl": 2.28662109375, + "learning_rate": 6.905507133305047e-07, + "loss": 0.3083, + "reward": 2.5714030265808105, + "reward_std": 0.39095499366521835, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.017138768918812275, + "rewards/tag_count_reward": 0.921875, "step": 918 }, { "clip_ratio": 0.0, - "completion_length": 432.9791717529297, - "epoch": 0.919, - "grad_norm": 11.62915536443853, - "kl": 2.140625, - "learning_rate": 1.1786784144537563e-07, - "loss": 0.4186, - "reward": 2.5469974279403687, - "reward_std": 0.4763101041316986, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015502599067986012, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 364.4791717529297, + "epoch": 0.4595, + "grad_norm": 30.373353543333543, + "kl": 1.8203125, + "learning_rate": 6.898043933338293e-07, + "loss": 0.648, + "reward": 2.423296332359314, + "reward_std": 0.4785070866346359, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04892597161233425, + "rewards/tag_count_reward": 0.9583333432674408, "step": 919 }, { "clip_ratio": 0.0, - "completion_length": 507.95835876464844, - "epoch": 0.92, - "grad_norm": 15.992892533041532, - "kl": 3.59375, - "learning_rate": 1.1743223682775649e-07, - "loss": 0.7643, - "reward": 2.407273054122925, - "reward_std": 0.6685203611850739, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.019810385070741177, - "rewards/tag_count_reward": 0.8854166865348816, + "completion_length": 337.2708435058594, + "epoch": 0.46, + "grad_norm": 16.49521792000666, + "kl": 1.76171875, + "learning_rate": 6.890576474687263e-07, + "loss": 0.2062, + "reward": 2.577837347984314, + "reward_std": 0.36855159886181355, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03153769485652447, + "rewards/tag_count_reward": 0.9635416865348816, "step": 920 }, { "clip_ratio": 0.0, - "completion_length": 349.81251525878906, - "epoch": 0.921, - "grad_norm": 35.49834021420722, - "kl": 2.08203125, - "learning_rate": 1.1700190291182158e-07, - "loss": 0.499, - "reward": 2.5659878253936768, - "reward_std": 0.5623100101947784, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.00866501871496439, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 345.62501525878906, + "epoch": 0.4605, + "grad_norm": 15.297924858007171, + "kl": 1.8203125, + "learning_rate": 6.883104780099133e-07, + "loss": 0.4431, + "reward": 2.5769479274749756, + "reward_std": 0.4878064692020416, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.046316033229231834, + "rewards/tag_count_reward": 0.984375, "step": 921 }, { "clip_ratio": 0.0, - "completion_length": 441.7083435058594, - "epoch": 0.922, - "grad_norm": 14.606033932622575, - "kl": 2.08203125, - "learning_rate": 1.1657684494105386e-07, - "loss": 0.4224, - "reward": 2.6236190795898438, - "reward_std": 0.5365782380104065, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01526999520137906, - "rewards/tag_count_reward": 0.9375, + "completion_length": 318.3958435058594, + "epoch": 0.461, + "grad_norm": 17.663934976152945, + "kl": 1.71484375, + "learning_rate": 6.875628872333975e-07, + "loss": 0.4003, + "reward": 2.721220016479492, + "reward_std": 0.3829363286495209, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02878008782863617, + "rewards/tag_count_reward": 0.9791666865348816, "step": 922 }, { "clip_ratio": 0.0, - "completion_length": 544.25, - "epoch": 0.923, - "grad_norm": 32.63063888698561, - "kl": 5.8515625, - "learning_rate": 1.1615706809465051e-07, - "loss": 0.8629, - "reward": 2.486325263977051, - "reward_std": 0.6139254868030548, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02756360173225403, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 388.12501525878906, + "epoch": 0.4615, + "grad_norm": 24.847963074766394, + "kl": 2.52880859375, + "learning_rate": 6.868148774164706e-07, + "loss": 0.8003, + "reward": 2.7672914266586304, + "reward_std": 0.40449826419353485, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02958354353904724, + "rewards/tag_count_reward": 0.9635416865348816, "step": 923 }, { "clip_ratio": 0.0, - "completion_length": 347.3125, - "epoch": 0.924, - "grad_norm": 12.704180107443388, - "kl": 1.7890625, - "learning_rate": 1.1574257748745986e-07, - "loss": 0.2841, - "reward": 2.5728049278259277, - "reward_std": 0.43000659346580505, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024417342618107796, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 332.5208435058594, + "epoch": 0.462, + "grad_norm": 50.169714777079136, + "kl": 2.1552734375, + "learning_rate": 6.860664508377001e-07, + "loss": 0.368, + "reward": 2.5989573001861572, + "reward_std": 0.36564599722623825, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.026042713783681393, + "rewards/tag_count_reward": 0.9791666865348816, "step": 924 }, { "clip_ratio": 0.0, - "completion_length": 413.12501525878906, - "epoch": 0.925, - "grad_norm": 18.841011447480344, - "kl": 3.2578125, - "learning_rate": 1.1533337816991931e-07, - "loss": 0.7064, - "reward": 2.5212587118148804, - "reward_std": 0.42140892148017883, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.011727516539394855, - "rewards/tag_count_reward": 0.9427083730697632, + "completion_length": 390.3333435058594, + "epoch": 0.4625, + "grad_norm": 128.99931158793126, + "kl": 6.46875, + "learning_rate": 6.853176097769228e-07, + "loss": 1.1684, + "reward": 2.682253360748291, + "reward_std": 0.5880621820688248, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.03302454110234976, + "rewards/tag_count_reward": 0.9375000298023224, "step": 925 }, { "clip_ratio": 0.0, - "completion_length": 442.25, - "epoch": 0.926, - "grad_norm": 16.753619228197998, - "kl": 3.5546875, - "learning_rate": 1.1492947512799328e-07, - "loss": 0.6144, - "reward": 2.417970061302185, - "reward_std": 0.5506798624992371, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.02300213649868965, - "rewards/tag_count_reward": 0.9270833432674408, + "completion_length": 421.3958435058594, + "epoch": 0.463, + "grad_norm": 43.308678626367616, + "kl": 3.63671875, + "learning_rate": 6.84568356515239e-07, + "loss": 1.3087, + "reward": 2.6976706981658936, + "reward_std": 0.5417228192090988, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.02455159369856119, + "rewards/tag_count_reward": 0.9375, "step": 926 }, { "clip_ratio": 0.0, - "completion_length": 532.6250305175781, - "epoch": 0.927, - "grad_norm": 47.35016742807312, - "kl": 7.84375, - "learning_rate": 1.1453087328311299e-07, - "loss": 1.1422, - "reward": 2.0104929208755493, - "reward_std": 0.6963983774185181, - "rewards/accuracy_reward": 0.29166667722165585, - "rewards/reasoning_steps_reward": 0.9305555820465088, - "rewards/repetition_penalty_reward": -0.02422943152487278, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 364.6875, + "epoch": 0.4635, + "grad_norm": 19.355624028175274, + "kl": 1.4140625, + "learning_rate": 6.838186933350036e-07, + "loss": 0.5342, + "reward": 2.708867311477661, + "reward_std": 0.4020465463399887, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.03245212323963642, + "rewards/tag_count_reward": 0.9635416865348816, "step": 927 }, { "clip_ratio": 0.0, - "completion_length": 385.4791717529297, - "epoch": 0.928, - "grad_norm": 15.095495398152355, - "kl": 2.072265625, - "learning_rate": 1.1413757749211602e-07, - "loss": 0.4393, - "reward": 2.454028844833374, - "reward_std": 0.44756019115448, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01645728573203087, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 328.81251525878906, + "epoch": 0.464, + "grad_norm": 26.038208175574418, + "kl": 1.3828125, + "learning_rate": 6.83068622519821e-07, + "loss": 0.4745, + "reward": 2.8109928369522095, + "reward_std": 0.32376736029982567, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.017132141161710024, + "rewards/tag_count_reward": 0.9739583432674408, "step": 928 }, { "clip_ratio": 0.0, - "completion_length": 614.6041870117188, - "epoch": 0.929, - "grad_norm": 31.056679659798007, - "kl": 7.28125, - "learning_rate": 1.137495925471875e-07, - "loss": 1.1384, - "reward": 2.323223352432251, - "reward_std": 0.7905822396278381, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.015318467281758785, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 433.81251525878906, + "epoch": 0.4645, + "grad_norm": 31.19447558981725, + "kl": 4.1796875, + "learning_rate": 6.823181463545366e-07, + "loss": 0.8102, + "reward": 2.39223575592041, + "reward_std": 0.6007313132286072, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9374999701976776, + "rewards/repetition_penalty_reward": -0.04526432417333126, + "rewards/tag_count_reward": 0.8958333730697632, "step": 929 }, { "clip_ratio": 0.0, - "completion_length": 451.43751525878906, - "epoch": 0.93, - "grad_norm": 24.05252138370831, - "kl": 3.65625, - "learning_rate": 1.1336692317580158e-07, - "loss": 0.8455, - "reward": 2.514081120491028, - "reward_std": 0.6175527423620224, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0275856489315629, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 354.54168701171875, + "epoch": 0.465, + "grad_norm": 16.112655951237382, + "kl": 1.796875, + "learning_rate": 6.815672671252315e-07, + "loss": 0.5261, + "reward": 2.3884493112564087, + "reward_std": 0.416194885969162, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.026481250301003456, + "rewards/tag_count_reward": 0.9635416865348816, "step": 930 }, { "clip_ratio": 0.0, - "completion_length": 414.43751525878906, - "epoch": 0.931, - "grad_norm": 27.33191623105398, - "kl": 2.1875, - "learning_rate": 1.1298957404066381e-07, - "loss": 0.3232, - "reward": 2.5895215272903442, - "reward_std": 0.4988251328468323, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02506194543093443, - "rewards/tag_count_reward": 0.9479166865348816, + "completion_length": 308.68751525878906, + "epoch": 0.4655, + "grad_norm": 39.327147074285115, + "kl": 1.4365234375, + "learning_rate": 6.808159871192136e-07, + "loss": 0.3897, + "reward": 2.633025646209717, + "reward_std": 0.1877675480209291, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04579382762312889, + "rewards/tag_count_reward": 0.984375, "step": 931 }, { "clip_ratio": 0.0, - "completion_length": 502.9583435058594, - "epoch": 0.932, - "grad_norm": 30.75771927110975, - "kl": 4.39453125, - "learning_rate": 1.1261754973965422e-07, - "loss": 0.7145, - "reward": 2.1934012174606323, - "reward_std": 0.5984681844711304, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02534892037510872, - "rewards/tag_count_reward": 0.8645833432674408, + "completion_length": 435.9583435058594, + "epoch": 0.466, + "grad_norm": 22.43891344147886, + "kl": 3.75, + "learning_rate": 6.800643086250121e-07, + "loss": 1.0874, + "reward": 2.6055747270584106, + "reward_std": 0.6811087727546692, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.026369919069111347, + "rewards/tag_count_reward": 0.9166666865348816, "step": 932 }, { "clip_ratio": 0.0, - "completion_length": 458.54168701171875, - "epoch": 0.933, - "grad_norm": 26.112800067792758, - "kl": 4.203125, - "learning_rate": 1.1225085480577158e-07, - "loss": 0.8496, - "reward": 2.2411762475967407, - "reward_std": 0.7032366394996643, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.01576836034655571, - "rewards/tag_count_reward": 0.8958333730697632, + "completion_length": 489.97918701171875, + "epoch": 0.4665, + "grad_norm": 105.78114916915985, + "kl": 7.40625, + "learning_rate": 6.793122339323705e-07, + "loss": 0.9496, + "reward": 2.352501630783081, + "reward_std": 0.6076760590076447, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.020762240514159203, + "rewards/tag_count_reward": 0.8802083730697632, "step": 933 }, { "clip_ratio": 0.0, - "completion_length": 424.37501525878906, - "epoch": 0.934, - "grad_norm": 44.09196385541136, - "kl": 2.703125, - "learning_rate": 1.1188949370707787e-07, - "loss": 0.9393, - "reward": 2.4043623208999634, - "reward_std": 0.672376275062561, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.010568303987383842, - "rewards/tag_count_reward": 0.9010416865348816, + "completion_length": 440.7708435058594, + "epoch": 0.467, + "grad_norm": 28.045265825606887, + "kl": 2.93017578125, + "learning_rate": 6.78559765332238e-07, + "loss": 0.4259, + "reward": 2.5912506580352783, + "reward_std": 0.45991945266723633, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02159656397998333, + "rewards/tag_count_reward": 0.9114583432674408, "step": 934 }, { "clip_ratio": 0.0, - "completion_length": 421.3958435058594, - "epoch": 0.935, - "grad_norm": 19.863825505537683, - "kl": 2.9765625, - "learning_rate": 1.1153347084664419e-07, - "loss": 0.5384, - "reward": 2.456182837486267, - "reward_std": 0.5422267615795135, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.021247888915240765, - "rewards/tag_count_reward": 0.9427083432674408, + "completion_length": 435.00001525878906, + "epoch": 0.4675, + "grad_norm": 41.65994882506967, + "kl": 1.521484375, + "learning_rate": 6.778069051167653e-07, + "loss": 0.62, + "reward": 2.5147244930267334, + "reward_std": 0.556397408246994, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.0338868722319603, + "rewards/tag_count_reward": 0.9375, "step": 935 }, { "clip_ratio": 0.0, - "completion_length": 487.3541717529297, - "epoch": 0.936, - "grad_norm": 49.22541738740071, - "kl": 3.1875, - "learning_rate": 1.1118279056249653e-07, - "loss": 1.1499, - "reward": 2.2859641313552856, - "reward_std": 0.4807792901992798, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.01785538112744689, - "rewards/tag_count_reward": 0.9010416865348816, - "step": 936 - }, - { - "clip_ratio": 0.0, - "completion_length": 476.3958435058594, - "epoch": 0.937, - "grad_norm": 50.529982193331755, - "kl": 3.078125, - "learning_rate": 1.1083745712756364e-07, - "loss": 0.9369, - "reward": 2.6942487955093384, - "reward_std": 0.6124294996261597, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.019293093122541904, - "rewards/tag_count_reward": 0.9218750298023224, - "step": 937 + "completion_length": 691.4166870117188, + "epoch": 0.468, + "grad_norm": 37.144816051148496, + "kl": 4.2109375, + "learning_rate": 6.770536555792944e-07, + "loss": 0.8037, + "reward": 2.3306804895401, + "reward_std": 0.7895675301551819, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.02001405507326126, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 936 }, { "clip_ratio": 0.0, - "completion_length": 583.0625, - "epoch": 0.938, - "grad_norm": 23.005166702704713, - "kl": 5.2421875, - "learning_rate": 1.1049747474962444e-07, - "loss": 0.8491, - "reward": 2.30727219581604, - "reward_std": 0.6647425889968872, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.022589108906686306, + "completion_length": 526.0625305175781, + "epoch": 0.4685, + "grad_norm": 48.4930120879613, + "kl": 2.5390625, + "learning_rate": 6.763000190143545e-07, + "loss": 0.7913, + "reward": 2.2242666482925415, + "reward_std": 0.5582642555236816, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.036150138825178146, "rewards/tag_count_reward": 0.8854166865348816, - "step": 938 + "step": 937 }, { "clip_ratio": 0.0, - "completion_length": 426.4583435058594, - "epoch": 0.939, - "grad_norm": 27.108706168312672, - "kl": 3.7734375, - "learning_rate": 1.1016284757125685e-07, - "loss": 0.644, - "reward": 2.582595705986023, - "reward_std": 0.6667671203613281, + "completion_length": 472.9583435058594, + "epoch": 0.469, + "grad_norm": 28.377979881418177, + "kl": 2.875, + "learning_rate": 6.755459977176532e-07, + "loss": 0.7778, + "reward": 2.534548759460449, + "reward_std": 0.6151235550642014, "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.023307234048843384, - "rewards/tag_count_reward": 0.9114583432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02100683329626918, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 938 + }, + { + "clip_ratio": 0.0, + "completion_length": 506.08335876464844, + "epoch": 0.4695, + "grad_norm": 51.22661172315456, + "kl": 5.640625, + "learning_rate": 6.747915939860701e-07, + "loss": 1.0882, + "reward": 2.568353295326233, + "reward_std": 0.7697390615940094, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.028869030997157097, + "rewards/tag_count_reward": 0.8750000298023224, "step": 939 }, { "clip_ratio": 0.0, - "completion_length": 606.9791870117188, - "epoch": 0.94, - "grad_norm": 35.88000566575318, - "kl": 7.203125, - "learning_rate": 1.0983357966978745e-07, - "loss": 1.2855, - "reward": 2.064675807952881, - "reward_std": 0.6436410248279572, - "rewards/accuracy_reward": 0.3125000149011612, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.02039373479783535, - "rewards/tag_count_reward": 0.828125, + "completion_length": 490.87501525878906, + "epoch": 0.47, + "grad_norm": 90.34107970778037, + "kl": 5.359375, + "learning_rate": 6.740368101176495e-07, + "loss": 1.3803, + "reward": 2.270451307296753, + "reward_std": 0.7119334787130356, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03510438837110996, + "rewards/tag_count_reward": 0.8958333432674408, "step": 940 }, { "clip_ratio": 0.0, - "completion_length": 449.1041717529297, - "epoch": 0.941, - "grad_norm": 26.56932442747977, - "kl": 2.328125, - "learning_rate": 1.0950967505724175e-07, - "loss": 0.7233, - "reward": 2.446874976158142, - "reward_std": 0.4883129894733429, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.03229183237999678, - "rewards/tag_count_reward": 0.9583333730697632, + "completion_length": 668.5416870117188, + "epoch": 0.4705, + "grad_norm": 149.87469687137508, + "kl": 9.921875, + "learning_rate": 6.732816484115946e-07, + "loss": 1.7406, + "reward": 2.2741907835006714, + "reward_std": 0.8710194230079651, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.03136484418064356, + "rewards/tag_count_reward": 0.8125, "step": 941 }, { "clip_ratio": 0.0, - "completion_length": 334.3541717529297, - "epoch": 0.942, - "grad_norm": 10.161335311358066, - "kl": 1.244140625, - "learning_rate": 1.0919113768029517e-07, - "loss": 0.0637, - "reward": 2.7936389446258545, - "reward_std": 0.438838854432106, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.02580568566918373, - "rewards/tag_count_reward": 0.9791666865348816, + "completion_length": 346.2291717529297, + "epoch": 0.471, + "grad_norm": 28.86324528704515, + "kl": 3.076171875, + "learning_rate": 6.725261111682584e-07, + "loss": 0.739, + "reward": 2.4970829486846924, + "reward_std": 0.5250428169965744, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01506982184946537, + "rewards/tag_count_reward": 0.9218750298023224, "step": 942 }, { "clip_ratio": 0.0, - "completion_length": 391.22918701171875, - "epoch": 0.943, - "grad_norm": 16.415006349713707, - "kl": 2.619140625, - "learning_rate": 1.0887797142022521e-07, - "loss": 0.2111, - "reward": 2.3882477283477783, - "reward_std": 0.3951050788164139, - "rewards/accuracy_reward": 0.458333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.018002384342253208, + "completion_length": 487.75001525878906, + "epoch": 0.4715, + "grad_norm": 56.50232838741636, + "kl": 3.095703125, + "learning_rate": 6.717702006891386e-07, + "loss": 0.723, + "reward": 2.7474167346954346, + "reward_std": 0.4104699492454529, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02688878308981657, "rewards/tag_count_reward": 0.9479166865348816, "step": 943 }, { "clip_ratio": 0.0, - "completion_length": 540.5208435058594, - "epoch": 0.944, - "grad_norm": 30.14237606691049, - "kl": 5.453125, - "learning_rate": 1.0857018009286381e-07, - "loss": 1.0364, - "reward": 2.2670759558677673, - "reward_std": 0.7407488822937012, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.022854735143482685, - "rewards/tag_count_reward": 0.8385416865348816, + "completion_length": 494.35418701171875, + "epoch": 0.472, + "grad_norm": 29.170486989553982, + "kl": 2.484375, + "learning_rate": 6.710139192768694e-07, + "loss": 0.8221, + "reward": 2.548454523086548, + "reward_std": 0.6629728078842163, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.02446222584694624, + "rewards/tag_count_reward": 0.8854166865348816, "step": 944 }, { "clip_ratio": 0.0, - "completion_length": 409.10418701171875, - "epoch": 0.945, - "grad_norm": 28.58154188768852, - "kl": 3.96484375, - "learning_rate": 1.0826776744855121e-07, - "loss": 0.5507, - "reward": 2.29867160320282, - "reward_std": 0.5912670195102692, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.01556460838764906, + "completion_length": 438.97918701171875, + "epoch": 0.4725, + "grad_norm": 32.67810625226338, + "kl": 1.666015625, + "learning_rate": 6.702572692352155e-07, + "loss": 0.43, + "reward": 2.627410054206848, + "reward_std": 0.5064197182655334, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.027104116044938564, "rewards/tag_count_reward": 0.9322916865348816, "step": 945 }, { "clip_ratio": 0.0, - "completion_length": 407.93751525878906, - "epoch": 0.946, - "grad_norm": 19.249242221629466, - "kl": 2.23828125, - "learning_rate": 1.0797073717209013e-07, - "loss": 0.555, - "reward": 2.5011874437332153, - "reward_std": 0.40425705909729004, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9444444179534912, - "rewards/repetition_penalty_reward": -0.02659040503203869, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 395.1458435058594, + "epoch": 0.473, + "grad_norm": 17.27648804391711, + "kl": 1.1796875, + "learning_rate": 6.695002528690639e-07, + "loss": 0.4697, + "reward": 2.5221011638641357, + "reward_std": 0.5778738856315613, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.024773948825895786, + "rewards/tag_count_reward": 0.9218750298023224, "step": 946 }, { "clip_ratio": 0.0, - "completion_length": 505.66668701171875, - "epoch": 0.947, - "grad_norm": 73.25982352915128, - "kl": 6.1875, - "learning_rate": 1.0767909288270063e-07, - "loss": 1.2229, - "reward": 2.4074935913085938, - "reward_std": 0.7739560008049011, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.024798161815851927, - "rewards/tag_count_reward": 0.8906250298023224, + "completion_length": 393.2708435058594, + "epoch": 0.4735, + "grad_norm": 29.4628975430113, + "kl": 1.16015625, + "learning_rate": 6.687428724844179e-07, + "loss": 0.7055, + "reward": 2.7500462532043457, + "reward_std": 0.556076854467392, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.02078736387193203, + "rewards/tag_count_reward": 0.9583333432674408, "step": 947 }, { "clip_ratio": 0.0, - "completion_length": 349.8333435058594, - "epoch": 0.948, - "grad_norm": 24.365433217819515, - "kl": 2.26953125, - "learning_rate": 1.0739283813397639e-07, - "loss": 0.1764, - "reward": 2.4521753787994385, - "reward_std": 0.42572128772735596, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.025255252607166767, - "rewards/tag_count_reward": 0.984375, + "completion_length": 440.75, + "epoch": 0.474, + "grad_norm": 20.698807383719775, + "kl": 2.576171875, + "learning_rate": 6.679851303883891e-07, + "loss": 0.4381, + "reward": 2.62761914730072, + "reward_std": 0.5112544745206833, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.05293664522469044, + "rewards/tag_count_reward": 0.8958333730697632, "step": 948 }, { "clip_ratio": 0.0, - "completion_length": 552.0625, - "epoch": 0.949, - "grad_norm": 46.55689685946224, - "kl": 7.375, - "learning_rate": 1.0711197641384115e-07, - "loss": 1.0546, - "reward": 2.4559671878814697, - "reward_std": 0.6184609234333038, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.012783022597432137, - "rewards/tag_count_reward": 0.8437500298023224, + "completion_length": 693.7500305175781, + "epoch": 0.4745, + "grad_norm": 96.72626140777622, + "kl": 5.59375, + "learning_rate": 6.672270288891918e-07, + "loss": 1.1839, + "reward": 2.393503427505493, + "reward_std": 0.6808468699455261, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.847222238779068, + "rewards/repetition_penalty_reward": -0.01621889090165496, + "rewards/tag_count_reward": 0.8541666865348816, "step": 949 }, { "clip_ratio": 0.0, - "completion_length": 614.4166870117188, - "epoch": 0.95, - "grad_norm": 35.5385607782762, - "kl": 9.46875, - "learning_rate": 1.068365111445064e-07, - "loss": 1.4734, - "reward": 2.2201212644577026, - "reward_std": 0.7953130602836609, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.03508709650486708, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 373.5208435058594, + "epoch": 0.475, + "grad_norm": 19.558125629971823, + "kl": 1.59765625, + "learning_rate": 6.664685702961344e-07, + "loss": 0.5762, + "reward": 2.7208783626556396, + "reward_std": 0.4175289124250412, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03780231811106205, + "rewards/tag_count_reward": 0.9739583432674408, "step": 950 }, { "clip_ratio": 0.0, - "completion_length": 607.0416870117188, - "epoch": 0.951, - "grad_norm": 44.814962262317266, - "kl": 10.71875, - "learning_rate": 1.0656644568242946e-07, - "loss": 1.4213, - "reward": 2.336472511291504, - "reward_std": 0.7723419368267059, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.01595810428261757, - "rewards/tag_count_reward": 0.7760416865348816, + "completion_length": 386.3541717529297, + "epoch": 0.4755, + "grad_norm": 10.796574951729502, + "kl": 1.2109375, + "learning_rate": 6.657097569196133e-07, + "loss": 0.6176, + "reward": 2.8126251697540283, + "reward_std": 0.4413589537143707, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03633330576121807, + "rewards/tag_count_reward": 0.953125, "step": 951 }, { "clip_ratio": 0.0, - "completion_length": 468.31251525878906, - "epoch": 0.952, - "grad_norm": 25.864907881139306, - "kl": 3.4375, - "learning_rate": 1.063017833182728e-07, - "loss": 1.2212, - "reward": 2.3151695728302, - "reward_std": 0.5797133445739746, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.014691710472106934, - "rewards/tag_count_reward": 0.9062500298023224, + "completion_length": 511.18751525878906, + "epoch": 0.476, + "grad_norm": 19.892775367759974, + "kl": 1.998046875, + "learning_rate": 6.649505910711058e-07, + "loss": 0.8213, + "reward": 2.4712870121002197, + "reward_std": 0.6184780597686768, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9722221791744232, + "rewards/repetition_penalty_reward": -0.04781022481620312, + "rewards/tag_count_reward": 0.9218750298023224, "step": 952 }, { "clip_ratio": 0.0, - "completion_length": 622.8125305175781, - "epoch": 0.953, - "grad_norm": 38.335448681356254, - "kl": 9.015625, - "learning_rate": 1.0604252727686379e-07, - "loss": 1.011, - "reward": 2.2985631823539734, - "reward_std": 0.8530721068382263, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9166666865348816, - "rewards/repetition_penalty_reward": -0.024353576824069023, - "rewards/tag_count_reward": 0.8020833432674408, + "completion_length": 392.4791717529297, + "epoch": 0.4765, + "grad_norm": 4.848138327898023, + "kl": 0.654296875, + "learning_rate": 6.641910750631626e-07, + "loss": 0.4091, + "reward": 2.516338586807251, + "reward_std": 0.4349832981824875, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02011991571635008, + "rewards/tag_count_reward": 0.9739583432674408, "step": 953 }, { "clip_ratio": 0.0, - "completion_length": 426.62501525878906, - "epoch": 0.954, - "grad_norm": 44.054481520111416, - "kl": 3.671875, - "learning_rate": 1.0578868071715544e-07, - "loss": 1.0606, - "reward": 2.355110287666321, - "reward_std": 0.5248502939939499, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.019889703020453453, - "rewards/tag_count_reward": 0.9375, + "completion_length": 345.2291717529297, + "epoch": 0.477, + "grad_norm": 7.336625031482144, + "kl": 0.3828125, + "learning_rate": 6.634312112094013e-07, + "loss": 0.2705, + "reward": 2.9284573793411255, + "reward_std": 0.1592683894559741, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0281402375549078, + "rewards/tag_count_reward": 0.984375, "step": 954 }, { "clip_ratio": 0.0, - "completion_length": 394.50001525878906, - "epoch": 0.955, - "grad_norm": 17.701509278594894, - "kl": 2.38671875, - "learning_rate": 1.0554024673218806e-07, - "loss": 0.4845, - "reward": 2.6396692991256714, - "reward_std": 0.647905558347702, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.020053054206073284, - "rewards/tag_count_reward": 0.9166666865348816, + "completion_length": 437.06251525878906, + "epoch": 0.4775, + "grad_norm": 7.131878067757179, + "kl": 0.966796875, + "learning_rate": 6.626710018244987e-07, + "loss": 0.475, + "reward": 2.6469991207122803, + "reward_std": 0.4648020267486572, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.03876485675573349, + "rewards/tag_count_reward": 0.9427083432674408, "step": 955 }, { "clip_ratio": 0.0, - "completion_length": 387.5208435058594, - "epoch": 0.956, - "grad_norm": 22.09702269576327, - "kl": 1.2958984375, - "learning_rate": 1.0529722834905125e-07, - "loss": 0.3797, - "reward": 2.641054391860962, - "reward_std": 0.46035097539424896, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.016931693069636822, - "rewards/tag_count_reward": 0.9635416865348816, + "completion_length": 548.125, + "epoch": 0.478, + "grad_norm": 6.425684042725763, + "kl": 1.1748046875, + "learning_rate": 6.619104492241847e-07, + "loss": 0.6925, + "reward": 2.6355875730514526, + "reward_std": 0.6411095261573792, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.02239851839840412, + "rewards/tag_count_reward": 0.921875, "step": 956 }, { "clip_ratio": 0.0, - "completion_length": 429.41668701171875, - "epoch": 0.957, - "grad_norm": 34.01762179013479, - "kl": 1.96875, - "learning_rate": 1.0505962852884739e-07, - "loss": 0.6316, - "reward": 2.5664961338043213, - "reward_std": 0.6158457398414612, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.02378181181848049, - "rewards/tag_count_reward": 0.9375000298023224, + "completion_length": 626.2916870117188, + "epoch": 0.4785, + "grad_norm": 16.967190075717454, + "kl": 1.76171875, + "learning_rate": 6.611495557252344e-07, + "loss": 0.5574, + "reward": 2.226294755935669, + "reward_std": 0.6637153029441833, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.03585825115442276, + "rewards/tag_count_reward": 0.9010416865348816, "step": 957 }, { "clip_ratio": 0.0, - "completion_length": 401.2916717529297, - "epoch": 0.958, - "grad_norm": 27.1346809126327, - "kl": 2.7109375, - "learning_rate": 1.0482745016665526e-07, - "loss": 0.7956, - "reward": 2.382103443145752, - "reward_std": 0.40817123651504517, - "rewards/accuracy_reward": 0.5000000298023224, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.013729824218899012, - "rewards/tag_count_reward": 0.9375, + "completion_length": 404.2291717529297, + "epoch": 0.479, + "grad_norm": 10.223600950988965, + "kl": 0.904296875, + "learning_rate": 6.603883236454612e-07, + "loss": 0.3123, + "reward": 2.5562199354171753, + "reward_std": 0.36893555521965027, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02884970884770155, + "rewards/tag_count_reward": 0.9531250298023224, "step": 958 }, { "clip_ratio": 0.0, - "completion_length": 464.29168701171875, - "epoch": 0.959, - "grad_norm": 27.609867845288534, - "kl": 4.34375, - "learning_rate": 1.0460069609149496e-07, - "loss": 1.1173, - "reward": 2.4054399728775024, - "reward_std": 0.4486614912748337, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.011226738337427378, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 564.0000305175781, + "epoch": 0.4795, + "grad_norm": 13.545049398841398, + "kl": 1.447265625, + "learning_rate": 6.596267553037102e-07, + "loss": 0.6647, + "reward": 2.6013318300247192, + "reward_std": 0.5714467167854309, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.021932163275778294, + "rewards/tag_count_reward": 0.9010416865348816, "step": 959 }, { "clip_ratio": 0.0, - "completion_length": 582.2500305175781, - "epoch": 0.96, - "grad_norm": 34.491132802450544, - "kl": 5.03515625, - "learning_rate": 1.0437936906629334e-07, - "loss": 0.922, - "reward": 2.4019153118133545, - "reward_std": 0.6779120862483978, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.021695845760405064, - "rewards/tag_count_reward": 0.8750000298023224, + "completion_length": 281.9375, + "epoch": 0.48, + "grad_norm": 3.6393715149294783, + "kl": 0.318359375, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0146, + "reward": 2.820993185043335, + "reward_std": 0.29440733790397644, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033173730596899986, + "rewards/tag_count_reward": 1.0, "step": 960 }, { "clip_ratio": 0.0, - "completion_length": 404.8125, - "epoch": 0.961, - "grad_norm": 23.15463113511402, - "kl": 4.71875, - "learning_rate": 1.0416347178785039e-07, - "loss": 0.7217, - "reward": 2.643435001373291, - "reward_std": 0.5447419583797455, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.021495724096894264, - "rewards/tag_count_reward": 0.921875, + "completion_length": 403.41668701171875, + "epoch": 0.4805, + "grad_norm": 12.12449463950798, + "kl": 0.63671875, + "learning_rate": 6.581026191147687e-07, + "loss": 0.2886, + "reward": 2.8694658279418945, + "reward_std": 0.1928266827017069, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03157583996653557, + "rewards/tag_count_reward": 0.9635416865348816, "step": 961 }, { "clip_ratio": 0.0, - "completion_length": 386.6458435058594, - "epoch": 0.962, - "grad_norm": 14.191588671125146, - "kl": 2.1640625, - "learning_rate": 1.0395300688680625e-07, - "loss": 0.459, - "reward": 2.233402729034424, - "reward_std": 0.43939197063446045, - "rewards/accuracy_reward": 0.3333333544433117, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.0165973836556077, - "rewards/tag_count_reward": 0.9583333432674408, + "completion_length": 473.35418701171875, + "epoch": 0.481, + "grad_norm": 11.78122309788569, + "kl": 0.736328125, + "learning_rate": 6.573400559103613e-07, + "loss": 0.5582, + "reward": 2.6566274166107178, + "reward_std": 0.5360835194587708, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023928127251565456, + "rewards/tag_count_reward": 0.9375000298023224, "step": 962 }, { "clip_ratio": 0.0, - "completion_length": 684.1458435058594, - "epoch": 0.963, - "grad_norm": 66.27238140783334, - "kl": 8.921875, - "learning_rate": 1.0374797692760933e-07, - "loss": 1.2039, - "reward": 2.404576539993286, - "reward_std": 0.6691045165061951, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.025979137048125267, - "rewards/tag_count_reward": 0.8125000298023224, + "completion_length": 446.41668701171875, + "epoch": 0.4815, + "grad_norm": 5.271282050317202, + "kl": 0.7451171875, + "learning_rate": 6.565771657295285e-07, + "loss": 0.5522, + "reward": 2.6375869512557983, + "reward_std": 0.5502025783061981, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.030815905891358852, + "rewards/tag_count_reward": 0.9322916865348816, "step": 963 }, { "clip_ratio": 0.0, - "completion_length": 517.4375, + "completion_length": 652.5416717529297, + "epoch": 0.482, + "grad_norm": 15.413225325982117, + "kl": 2.1796875, + "learning_rate": 6.558139508961654e-07, + "loss": 0.3435, + "reward": 2.3981354236602783, + "reward_std": 0.5044302940368652, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.030684133991599083, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 964 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.5625152587891, + "epoch": 0.4825, + "grad_norm": 12.446301073299496, + "kl": 2.546875, + "learning_rate": 6.550504137351575e-07, + "loss": 0.6114, + "reward": 2.6402413845062256, + "reward_std": 0.6303743720054626, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.035105928778648376, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 965 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.625, + "epoch": 0.483, + "grad_norm": 18.969040400669726, + "kl": 1.03125, + "learning_rate": 6.542865565723707e-07, + "loss": 0.2782, + "reward": 2.5593295097351074, + "reward_std": 0.25267046224325895, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0135872233659029, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 966 + }, + { + "clip_ratio": 0.0, + "completion_length": 586.7291870117188, + "epoch": 0.4835, + "grad_norm": 16.445189525113065, + "kl": 1.46875, + "learning_rate": 6.53522381734647e-07, + "loss": 0.8025, + "reward": 2.4304505586624146, + "reward_std": 0.6917058825492859, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.029619052074849606, + "rewards/tag_count_reward": 0.890625, + "step": 967 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.0416717529297, + "epoch": 0.484, + "grad_norm": 12.008822684584926, + "kl": 0.662109375, + "learning_rate": 6.527578915497951e-07, + "loss": 0.3397, + "reward": 2.639287829399109, + "reward_std": 0.19891707226634026, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.025642716325819492, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 968 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.8333435058594, + "epoch": 0.4845, + "grad_norm": 20.326619395037802, + "kl": 0.962890625, + "learning_rate": 6.519930883465847e-07, + "loss": 0.7163, + "reward": 2.6135048866271973, + "reward_std": 0.5543282628059387, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.02538419794291258, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 969 + }, + { + "clip_ratio": 0.0, + "completion_length": 655.375, + "epoch": 0.485, + "grad_norm": 19.44522032453373, + "kl": 1.9609375, + "learning_rate": 6.512279744547392e-07, + "loss": 0.3818, + "reward": 2.0434839129447937, + "reward_std": 0.49595144391059875, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.022488368675112724, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 970 + }, + { + "clip_ratio": 0.0, + "completion_length": 469.16668701171875, + "epoch": 0.4855, + "grad_norm": 24.245196285909877, + "kl": 1.53515625, + "learning_rate": 6.50462552204928e-07, + "loss": 0.2542, + "reward": 2.400037169456482, + "reward_std": 0.32993632555007935, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03225455339998007, + "rewards/tag_count_reward": 0.953125, + "step": 971 + }, + { + "clip_ratio": 0.0, + "completion_length": 513.3333435058594, + "epoch": 0.486, + "grad_norm": 8.729059180440508, + "kl": 1.6484375, + "learning_rate": 6.496968239287603e-07, + "loss": 0.6719, + "reward": 2.4571033716201782, + "reward_std": 0.5960609018802643, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01859105657786131, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 972 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.9583435058594, + "epoch": 0.4865, + "grad_norm": 11.187904638989847, + "kl": 1.177734375, + "learning_rate": 6.489307919587769e-07, + "loss": 0.3269, + "reward": 2.578757405281067, + "reward_std": 0.5115742385387421, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.0358259379863739, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 973 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.37501525878906, + "epoch": 0.487, + "grad_norm": 5.4073047177912175, + "kl": 0.423828125, + "learning_rate": 6.481644586284442e-07, + "loss": 0.2367, + "reward": 2.902934432029724, + "reward_std": 0.275345578789711, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018940678797662258, + "rewards/tag_count_reward": 0.984375, + "step": 974 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.4791870117188, + "epoch": 0.4875, + "grad_norm": 6.345217828877253, + "kl": 1.3828125, + "learning_rate": 6.473978262721463e-07, + "loss": 0.5127, + "reward": 2.6163452863693237, + "reward_std": 0.4908182621002197, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.039904940873384476, + "rewards/tag_count_reward": 0.90625, + "step": 975 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.47918701171875, + "epoch": 0.488, + "grad_norm": 20.969372614612894, + "kl": 0.859375, + "learning_rate": 6.466308972251785e-07, + "loss": 0.3447, + "reward": 2.4979482889175415, + "reward_std": 0.43351081013679504, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.01767673483118415, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 976 + }, + { + "clip_ratio": 0.0, + "completion_length": 553.3541870117188, + "epoch": 0.4885, + "grad_norm": 22.992172626012245, + "kl": 2.705078125, + "learning_rate": 6.458636738237395e-07, + "loss": 0.2706, + "reward": 1.9824464321136475, + "reward_std": 0.34522050246596336, + "rewards/accuracy_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.01928970357403159, + "rewards/tag_count_reward": 0.890625, + "step": 977 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.9791717529297, + "epoch": 0.489, + "grad_norm": 13.361424348905297, + "kl": 1.169921875, + "learning_rate": 6.45096158404925e-07, + "loss": 0.2465, + "reward": 2.648337244987488, + "reward_std": 0.3259388506412506, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.0461073312908411, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 978 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.31251525878906, + "epoch": 0.4895, + "grad_norm": 7.444601060796791, + "kl": 0.79296875, + "learning_rate": 6.443283533067198e-07, + "loss": 0.1929, + "reward": 2.6383087635040283, + "reward_std": 0.48684653639793396, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.03703854978084564, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 979 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.81251525878906, + "epoch": 0.49, + "grad_norm": 16.978947124748732, + "kl": 2.134765625, + "learning_rate": 6.435602608679916e-07, + "loss": 0.54, + "reward": 2.573986530303955, + "reward_std": 0.4741198271512985, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.014555106405168772, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 980 + }, + { + "clip_ratio": 0.0, + "completion_length": 397.5, + "epoch": 0.4905, + "grad_norm": 7.762331640955408, + "kl": 1.609375, + "learning_rate": 6.427918834284834e-07, + "loss": 0.3917, + "reward": 2.5270793437957764, + "reward_std": 0.3988271802663803, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.02326801000162959, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 981 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.5625, + "epoch": 0.491, + "grad_norm": 15.572448453302112, + "kl": 1.1484375, + "learning_rate": 6.420232233288055e-07, + "loss": 0.2343, + "reward": 2.2758413553237915, + "reward_std": 0.42710670828819275, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.036658719182014465, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 982 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.1458435058594, + "epoch": 0.4915, + "grad_norm": 8.40361032839979, + "kl": 0.9599609375, + "learning_rate": 6.412542829104306e-07, + "loss": 0.0975, + "reward": 2.634483575820923, + "reward_std": 0.15141713619232178, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02350257895886898, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 983 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.06251525878906, + "epoch": 0.492, + "grad_norm": 27.667281989688025, + "kl": 0.796875, + "learning_rate": 6.404850645156841e-07, + "loss": 0.3823, + "reward": 2.5004520416259766, + "reward_std": 0.42026159167289734, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.025589827448129654, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 984 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.50001525878906, + "epoch": 0.4925, + "grad_norm": 5.158618170324173, + "kl": 0.271484375, + "learning_rate": 6.397155704877388e-07, + "loss": -0.0319, + "reward": 2.6765854358673096, + "reward_std": 0.21046356856822968, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024803485721349716, + "rewards/tag_count_reward": 1.0, + "step": 985 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.375, + "epoch": 0.493, + "grad_norm": 21.884347593037273, + "kl": 0.80859375, + "learning_rate": 6.389458031706068e-07, + "loss": 0.431, + "reward": 2.621227979660034, + "reward_std": 0.43661460280418396, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03154988866299391, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 986 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.87501525878906, + "epoch": 0.4935, + "grad_norm": 4.514766065731832, + "kl": 0.408203125, + "learning_rate": 6.381757649091329e-07, + "loss": 0.1863, + "reward": 2.54175066947937, + "reward_std": 0.15631027286872268, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.025957662612199783, + "rewards/tag_count_reward": 0.984375, + "step": 987 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.47918701171875, + "epoch": 0.494, + "grad_norm": 8.57007282438186, + "kl": 1.37109375, + "learning_rate": 6.374054580489873e-07, + "loss": 0.5563, + "reward": 2.7405987977981567, + "reward_std": 0.40447968570515513, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.016345561482012272, + "rewards/tag_count_reward": 0.9375, + "step": 988 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.77085876464844, + "epoch": 0.4945, + "grad_norm": 7.660866604622285, + "kl": 1.337890625, + "learning_rate": 6.366348849366583e-07, + "loss": 0.1801, + "reward": 2.6990854740142822, + "reward_std": 0.36674173176288605, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.04570627398788929, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 989 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.97918701171875, + "epoch": 0.495, + "grad_norm": 21.838061572005632, + "kl": 1.4375, + "learning_rate": 6.358640479194451e-07, + "loss": 0.4576, + "reward": 2.5701619386672974, + "reward_std": 0.5304689109325409, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01664359588176012, + "rewards/tag_count_reward": 0.96875, + "step": 990 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.43751525878906, + "epoch": 0.4955, + "grad_norm": 24.17528666080015, + "kl": 0.6298828125, + "learning_rate": 6.35092949345451e-07, + "loss": 0.3975, + "reward": 2.8759747743606567, + "reward_std": 0.2522295266389847, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03201141953468323, + "rewards/tag_count_reward": 0.984375, + "step": 991 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.31251525878906, + "epoch": 0.496, + "grad_norm": 32.1142545684353, + "kl": 3.203125, + "learning_rate": 6.343215915635761e-07, + "loss": 0.7948, + "reward": 2.630898594856262, + "reward_std": 0.6931562125682831, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.027087541297078133, + "rewards/tag_count_reward": 0.921875, + "step": 992 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.10418701171875, + "epoch": 0.4965, + "grad_norm": 48.892823948801954, + "kl": 3.1953125, + "learning_rate": 6.335499769235098e-07, + "loss": 0.6139, + "reward": 2.738042116165161, + "reward_std": 0.37622474133968353, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.05015224777162075, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 993 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.87500762939453, + "epoch": 0.497, + "grad_norm": 28.390795641858606, + "kl": 1.5107421875, + "learning_rate": 6.327781077757241e-07, + "loss": 0.2853, + "reward": 2.7957078218460083, + "reward_std": 0.19093896262347698, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.025472811423242092, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 994 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.6666717529297, + "epoch": 0.4975, + "grad_norm": 10.891904018933554, + "kl": 1.03515625, + "learning_rate": 6.320059864714664e-07, + "loss": 0.1864, + "reward": 2.720821738243103, + "reward_std": 0.32492922246456146, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02570601273328066, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 995 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.1041717529297, + "epoch": 0.498, + "grad_norm": 27.389591437302162, + "kl": 1.552734375, + "learning_rate": 6.31233615362752e-07, + "loss": 0.7395, + "reward": 2.655308961868286, + "reward_std": 0.42288788408041, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03392717707902193, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 996 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.4583435058594, + "epoch": 0.4985, + "grad_norm": 37.45628290835572, + "kl": 1.1953125, + "learning_rate": 6.304609968023572e-07, + "loss": 0.5472, + "reward": 2.78287935256958, + "reward_std": 0.4373747333884239, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.031356871128082275, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 997 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.3333435058594, + "epoch": 0.499, + "grad_norm": 8.83893360388916, + "kl": 1.12890625, + "learning_rate": 6.296881331438126e-07, + "loss": 0.3209, + "reward": 2.574609398841858, + "reward_std": 0.4172719120979309, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.019140728749334812, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 998 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.4166717529297, + "epoch": 0.4995, + "grad_norm": 25.468071616698786, + "kl": 1.109375, + "learning_rate": 6.289150267413942e-07, + "loss": 0.5873, + "reward": 2.646699547767639, + "reward_std": 0.38845836371183395, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.023439443670213223, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 999 + }, + { + "clip_ratio": 0.0, + "completion_length": 591.1041717529297, + "epoch": 0.5, + "grad_norm": 9.084576194579185, + "kl": 2.25, + "learning_rate": 6.281416799501187e-07, + "loss": 0.5736, + "reward": 2.3330485820770264, + "reward_std": 0.5498259365558624, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.019382060505449772, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1000 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.8333435058594, + "epoch": 0.5005, + "grad_norm": 9.95609254967102, + "kl": 1.60546875, + "learning_rate": 6.273680951257342e-07, + "loss": 0.5362, + "reward": 2.549267292022705, + "reward_std": 0.6142518520355225, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.03580236993730068, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1001 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.0833435058594, + "epoch": 0.501, + "grad_norm": 14.049667830028957, + "kl": 2.2265625, + "learning_rate": 6.265942746247146e-07, + "loss": 0.7722, + "reward": 2.731945276260376, + "reward_std": 0.5192966759204865, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.021526905708014965, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1002 + }, + { + "clip_ratio": 0.0, + "completion_length": 623.5625305175781, + "epoch": 0.5015, + "grad_norm": 31.177249787933718, + "kl": 2.55859375, + "learning_rate": 6.258202208042511e-07, + "loss": 1.1778, + "reward": 2.524030923843384, + "reward_std": 0.7691468000411987, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.02110826689749956, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1003 + }, + { + "clip_ratio": 0.0, + "completion_length": 584.2083435058594, + "epoch": 0.502, + "grad_norm": 39.78085883662432, + "kl": 3.6015625, + "learning_rate": 6.25045936022246e-07, + "loss": 1.1624, + "reward": 2.3681070804595947, + "reward_std": 0.57652947306633, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.015573485288769007, + "rewards/tag_count_reward": 0.9114583730697632, + "step": 1004 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.2083587646484, + "epoch": 0.5025, + "grad_norm": 15.400304918011507, + "kl": 2.7578125, + "learning_rate": 6.242714226373049e-07, + "loss": 0.8093, + "reward": 2.1841408014297485, + "reward_std": 0.5138780176639557, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015512102749198675, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1005 + }, + { + "clip_ratio": 0.0, + "completion_length": 517.3541870117188, + "epoch": 0.503, + "grad_norm": 6.931951953481591, + "kl": 2.15234375, + "learning_rate": 6.2349668300873e-07, + "loss": 0.6114, + "reward": 2.5530234575271606, + "reward_std": 0.6553203761577606, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.03030997794121504, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1006 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.6666870117188, + "epoch": 0.5035, + "grad_norm": 22.648929704749897, + "kl": 1.70703125, + "learning_rate": 6.227217194965125e-07, + "loss": 0.8695, + "reward": 2.318639397621155, + "reward_std": 0.6079018115997314, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.014693964272737503, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1007 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.8333435058594, + "epoch": 0.504, + "grad_norm": 28.81093113427859, + "kl": 0.859375, + "learning_rate": 6.219465344613258e-07, + "loss": 0.4981, + "reward": 2.6072503328323364, + "reward_std": 0.45103733241558075, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026430224999785423, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1008 + }, + { + "clip_ratio": 0.0, + "completion_length": 579.2500305175781, + "epoch": 0.5045, + "grad_norm": 18.90641611996879, + "kl": 1.79296875, + "learning_rate": 6.211711302645177e-07, + "loss": 0.7654, + "reward": 2.513100743293762, + "reward_std": 0.6586687564849854, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.025093771517276764, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1009 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.3333435058594, + "epoch": 0.505, + "grad_norm": 10.521243740499507, + "kl": 0.7080078125, + "learning_rate": 6.203955092681039e-07, + "loss": 0.2849, + "reward": 2.616108775138855, + "reward_std": 0.26797990035265684, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.031460804864764214, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1010 + }, + { + "clip_ratio": 0.0, + "completion_length": 762.6875457763672, + "epoch": 0.5055, + "grad_norm": 17.712711157663584, + "kl": 2.359375, + "learning_rate": 6.196196738347607e-07, + "loss": 0.5658, + "reward": 2.252875328063965, + "reward_std": 0.44946494698524475, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0370553620159626, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1011 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.9791717529297, + "epoch": 0.506, + "grad_norm": 6.724680601267686, + "kl": 0.7841796875, + "learning_rate": 6.188436263278172e-07, + "loss": 0.371, + "reward": 2.8078267574310303, + "reward_std": 0.3918469473719597, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.018562145065516233, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1012 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.6666717529297, + "epoch": 0.5065, + "grad_norm": 7.753482099144785, + "kl": 1.005859375, + "learning_rate": 6.180673691112486e-07, + "loss": 0.7111, + "reward": 2.758282780647278, + "reward_std": 0.42837944626808167, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02643944276496768, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1013 + }, + { + "clip_ratio": 0.0, + "completion_length": 618.2500305175781, + "epoch": 0.507, + "grad_norm": 14.72862436999275, + "kl": 1.84765625, + "learning_rate": 6.172909045496694e-07, + "loss": 1.0001, + "reward": 2.36434543132782, + "reward_std": 0.6381807327270508, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.0245436392724514, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1014 + }, + { + "clip_ratio": 0.0, + "completion_length": 824.8750457763672, + "epoch": 0.5075, + "grad_norm": 16.05316707574026, + "kl": 1.919921875, + "learning_rate": 6.165142350083249e-07, + "loss": 0.9197, + "reward": 2.327476441860199, + "reward_std": 0.8264204859733582, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.016273547895252705, + "rewards/tag_count_reward": 0.78125, + "step": 1015 + }, + { + "clip_ratio": 0.0, + "completion_length": 662.8333435058594, + "epoch": 0.508, + "grad_norm": 14.986219495301736, + "kl": 1.734375, + "learning_rate": 6.157373628530852e-07, + "loss": 0.6306, + "reward": 2.3806835412979126, + "reward_std": 0.4617319107055664, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.018622069619596004, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1016 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.97918701171875, + "epoch": 0.5085, + "grad_norm": 10.603280718054956, + "kl": 1.19921875, + "learning_rate": 6.149602904504378e-07, + "loss": 0.6167, + "reward": 2.702357053756714, + "reward_std": 0.46163134276866913, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.026809771545231342, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1017 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.5, + "epoch": 0.509, + "grad_norm": 12.432373079567979, + "kl": 1.107421875, + "learning_rate": 6.141830201674802e-07, + "loss": 0.7902, + "reward": 2.693175435066223, + "reward_std": 0.5161421597003937, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03425509575754404, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1018 + }, + { + "clip_ratio": 0.0, + "completion_length": 632.8750305175781, + "epoch": 0.5095, + "grad_norm": 7.175100274761201, + "kl": 1.376953125, + "learning_rate": 6.134055543719121e-07, + "loss": 0.4146, + "reward": 2.4220484495162964, + "reward_std": 0.5273237824440002, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.027604437433183193, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1019 + }, + { + "clip_ratio": 0.0, + "completion_length": 456.6250305175781, + "epoch": 0.51, + "grad_norm": 16.588343745029032, + "kl": 0.994140625, + "learning_rate": 6.126278954320294e-07, + "loss": 0.7095, + "reward": 2.6970959901809692, + "reward_std": 0.5613195151090622, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.023390086367726326, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1020 + }, + { + "clip_ratio": 0.0, + "completion_length": 734.9375, + "epoch": 0.5105, + "grad_norm": 9.757877385128149, + "kl": 1.38671875, + "learning_rate": 6.118500457167159e-07, + "loss": 0.8497, + "reward": 2.0375831723213196, + "reward_std": 0.6933330148458481, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.8402778208255768, + "rewards/repetition_penalty_reward": -0.021444641053676605, + "rewards/tag_count_reward": 0.8437500298023224, + "step": 1021 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.6250305175781, + "epoch": 0.511, + "grad_norm": 3.581679020231462, + "kl": 0.5986328125, + "learning_rate": 6.11072007595437e-07, + "loss": 0.2243, + "reward": 2.635998249053955, + "reward_std": 0.3600511699914932, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.020251845009624958, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1022 + }, + { + "clip_ratio": 0.0, + "completion_length": 532.3125305175781, + "epoch": 0.5115, + "grad_norm": 17.57086871577716, + "kl": 0.810546875, + "learning_rate": 6.102937834382315e-07, + "loss": 0.7239, + "reward": 2.707748532295227, + "reward_std": 0.5111279785633087, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.021418226417154074, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1023 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.0000305175781, + "epoch": 0.512, + "grad_norm": 5.091174343110264, + "kl": 0.666015625, + "learning_rate": 6.095153756157051e-07, + "loss": 0.4922, + "reward": 2.5856151580810547, + "reward_std": 0.5574827194213867, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.027232157066464424, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1024 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.56251525878906, + "epoch": 0.5125, + "grad_norm": 7.051286670184644, + "kl": 0.54296875, + "learning_rate": 6.087367864990232e-07, + "loss": 0.4512, + "reward": 2.401527166366577, + "reward_std": 0.45996415615081787, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.027292468585073948, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1025 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.41668701171875, + "epoch": 0.513, + "grad_norm": 6.804881212406539, + "kl": 0.7265625, + "learning_rate": 6.079580184599032e-07, + "loss": 0.752, + "reward": 2.53912889957428, + "reward_std": 0.4916386604309082, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.03205180913209915, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1026 + }, + { + "clip_ratio": 0.0, + "completion_length": 428.25001525878906, + "epoch": 0.5135, + "grad_norm": 4.0838002502110955, + "kl": 0.689453125, + "learning_rate": 6.071790738706078e-07, + "loss": 0.7677, + "reward": 2.633172869682312, + "reward_std": 0.5927000939846039, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.030021720565855503, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1027 + }, + { + "clip_ratio": 0.0, + "completion_length": 445.8750305175781, + "epoch": 0.514, + "grad_norm": 6.82558184595021, + "kl": 0.6650390625, + "learning_rate": 6.06399955103937e-07, + "loss": 0.3746, + "reward": 2.3708232641220093, + "reward_std": 0.47719016671180725, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.021537834778428078, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1028 + }, + { + "clip_ratio": 0.0, + "completion_length": 612.4375305175781, + "epoch": 0.5145, + "grad_norm": 29.593011906377633, + "kl": 1.89453125, + "learning_rate": 6.056206645332217e-07, + "loss": 0.6587, + "reward": 2.507744789123535, + "reward_std": 0.5372722446918488, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03218586929142475, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1029 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.0208435058594, + "epoch": 0.515, + "grad_norm": 14.40828703378696, + "kl": 1.123046875, + "learning_rate": 6.048412045323164e-07, + "loss": 0.3732, + "reward": 2.6217926740646362, + "reward_std": 0.3849295526742935, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03966595232486725, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1030 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.4583435058594, + "epoch": 0.5155, + "grad_norm": 14.1549537273764, + "kl": 1.20703125, + "learning_rate": 6.040615774755911e-07, + "loss": 0.8186, + "reward": 2.3135095834732056, + "reward_std": 0.4170246571302414, + "rewards/accuracy_reward": 0.45833333395421505, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.01461556926369667, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1031 + }, + { + "clip_ratio": 0.0, + "completion_length": 668.5625, + "epoch": 0.516, + "grad_norm": 16.64560517730956, + "kl": 1.71875, + "learning_rate": 6.032817857379256e-07, + "loss": 0.5198, + "reward": 2.264461040496826, + "reward_std": 0.5811220407485962, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9166666567325592, + "rewards/repetition_penalty_reward": -0.02720571681857109, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1032 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.93751525878906, + "epoch": 0.5165, + "grad_norm": 4.122201207331607, + "kl": 0.443359375, + "learning_rate": 6.025018316946999e-07, + "loss": 0.2138, + "reward": 2.7509995698928833, + "reward_std": 0.42070019245147705, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026778437197208405, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1033 + }, + { + "clip_ratio": 0.0, + "completion_length": 630.0833740234375, + "epoch": 0.517, + "grad_norm": 5.176497247797343, + "kl": 1.1875, + "learning_rate": 6.017217177217899e-07, + "loss": 0.7832, + "reward": 2.3645520210266113, + "reward_std": 0.8221311867237091, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.02954527549445629, + "rewards/tag_count_reward": 0.859375, + "step": 1034 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.08335876464844, + "epoch": 0.5175, + "grad_norm": 4.289877365631067, + "kl": 0.6337890625, + "learning_rate": 6.009414461955581e-07, + "loss": 0.3456, + "reward": 2.8156780004501343, + "reward_std": 0.25007878383621573, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.012447083368897438, + "rewards/tag_count_reward": 0.953125, + "step": 1035 + }, + { + "clip_ratio": 0.0, + "completion_length": 511.6666717529297, + "epoch": 0.518, + "grad_norm": 4.534984555827674, + "kl": 0.7265625, + "learning_rate": 6.001610194928464e-07, + "loss": 0.6627, + "reward": 2.6687086820602417, + "reward_std": 0.5741814076900482, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02052734326571226, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1036 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.4166717529297, + "epoch": 0.5185, + "grad_norm": 9.036303143255743, + "kl": 0.3173828125, + "learning_rate": 5.993804399909703e-07, + "loss": 0.2845, + "reward": 2.868683695793152, + "reward_std": 0.25010958313941956, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.023677408695220947, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1037 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.3125305175781, + "epoch": 0.519, + "grad_norm": 4.473097097190504, + "kl": 1.04296875, + "learning_rate": 5.985997100677103e-07, + "loss": 0.3827, + "reward": 2.478750705718994, + "reward_std": 0.30521145928651094, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.01083279075101018, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1038 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.91668701171875, + "epoch": 0.5195, + "grad_norm": 4.330270457188464, + "kl": 0.677734375, + "learning_rate": 5.97818832101305e-07, + "loss": 0.4467, + "reward": 2.632758140563965, + "reward_std": 0.37518632411956787, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.025228080339729786, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1039 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.4166717529297, + "epoch": 0.52, + "grad_norm": 5.413314396002232, + "kl": 0.7490234375, + "learning_rate": 5.97037808470444e-07, + "loss": 0.4434, + "reward": 2.658997416496277, + "reward_std": 0.548340767621994, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.021558281034231186, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1040 + }, + { + "clip_ratio": 0.0, + "completion_length": 462.7083435058594, + "epoch": 0.5205, + "grad_norm": 13.173521990092542, + "kl": 0.990234375, + "learning_rate": 5.96256641554261e-07, + "loss": 0.9078, + "reward": 2.477555751800537, + "reward_std": 0.6211674511432648, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.025916431099176407, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1041 + }, + { + "clip_ratio": 0.0, + "completion_length": 747.7500305175781, + "epoch": 0.521, + "grad_norm": 5.994356817443346, + "kl": 1.3681640625, + "learning_rate": 5.954753337323259e-07, + "loss": 0.3688, + "reward": 2.3351893424987793, + "reward_std": 0.5483206920325756, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.024185655638575554, + "rewards/tag_count_reward": 0.796875, + "step": 1042 + }, + { + "clip_ratio": 0.0, + "completion_length": 641.5833587646484, + "epoch": 0.5215, + "grad_norm": 6.65678571854715, + "kl": 1.65234375, + "learning_rate": 5.946938873846375e-07, + "loss": 0.6598, + "reward": 2.1638104915618896, + "reward_std": 0.6675990223884583, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.8541667461395264, + "rewards/repetition_penalty_reward": -0.02889794297516346, + "rewards/tag_count_reward": 0.8177083432674408, + "step": 1043 + }, + { + "clip_ratio": 0.0, + "completion_length": 581.5625, + "epoch": 0.522, + "grad_norm": 7.592842509620014, + "kl": 1.189453125, + "learning_rate": 5.939123048916173e-07, + "loss": 0.836, + "reward": 2.385079026222229, + "reward_std": 0.90406933426857, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.8680556118488312, + "rewards/repetition_penalty_reward": -0.019434815272688866, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1044 + }, + { + "clip_ratio": 0.0, + "completion_length": 786.4583740234375, + "epoch": 0.5225, + "grad_norm": 4.691007195698727, + "kl": 1.4140625, + "learning_rate": 5.931305886341008e-07, + "loss": 0.7366, + "reward": 2.2028392553329468, + "reward_std": 0.9715997576713562, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.8541666865348816, + "rewards/repetition_penalty_reward": -0.015910686925053596, + "rewards/tag_count_reward": 0.7604166865348816, + "step": 1045 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.85418701171875, + "epoch": 0.523, + "grad_norm": 11.332963488425568, + "kl": 0.888671875, + "learning_rate": 5.923487409933315e-07, + "loss": 0.4905, + "reward": 2.6106256246566772, + "reward_std": 0.4605254530906677, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.016110544558614492, + "rewards/tag_count_reward": 0.9114583730697632, + "step": 1046 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.1250152587891, + "epoch": 0.5235, + "grad_norm": 7.355316334467517, + "kl": 1.28125, + "learning_rate": 5.915667643509528e-07, + "loss": 0.5121, + "reward": 2.435990571975708, + "reward_std": 0.720505028963089, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.03449564054608345, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1047 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.79168701171875, + "epoch": 0.524, + "grad_norm": 10.744727961689891, + "kl": 0.791015625, + "learning_rate": 5.907846610890011e-07, + "loss": 0.5435, + "reward": 2.488598942756653, + "reward_std": 0.5222236812114716, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.028762370347976685, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1048 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.3958435058594, + "epoch": 0.5245, + "grad_norm": 10.473792543164542, + "kl": 1.25, + "learning_rate": 5.900024335898987e-07, + "loss": 0.7268, + "reward": 2.29390549659729, + "reward_std": 0.8730533719062805, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03769184276461601, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1049 + }, + { + "clip_ratio": 0.0, + "completion_length": 499.35418701171875, + "epoch": 0.525, + "grad_norm": 12.588187330698355, + "kl": 1.1171875, + "learning_rate": 5.892200842364462e-07, + "loss": 0.5724, + "reward": 2.6492995023727417, + "reward_std": 0.7308410704135895, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.03299206867814064, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1050 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.375, + "epoch": 0.5255, + "grad_norm": 10.44138233428014, + "kl": 0.4931640625, + "learning_rate": 5.884376154118154e-07, + "loss": 0.2827, + "reward": 2.626540184020996, + "reward_std": 0.44100892543792725, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.017557154409587383, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1051 + }, + { + "clip_ratio": 0.0, + "completion_length": 692.6458435058594, + "epoch": 0.526, + "grad_norm": 18.556893989812647, + "kl": 2.2890625, + "learning_rate": 5.87655029499542e-07, + "loss": 0.7235, + "reward": 2.1944429874420166, + "reward_std": 0.7475170493125916, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02604318968951702, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1052 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.8125, + "epoch": 0.5265, + "grad_norm": 23.386680578226347, + "kl": 2.37890625, + "learning_rate": 5.868723288835184e-07, + "loss": 0.6783, + "reward": 2.347724437713623, + "reward_std": 0.5882080346345901, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.013386863050982356, + "rewards/tag_count_reward": 0.875, + "step": 1053 + }, + { + "clip_ratio": 0.0, + "completion_length": 507.18751525878906, + "epoch": 0.527, + "grad_norm": 25.81517984790388, + "kl": 1.8828125, + "learning_rate": 5.860895159479864e-07, + "loss": 0.7704, + "reward": 2.459168314933777, + "reward_std": 0.5893406569957733, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.030415082350373268, + "rewards/tag_count_reward": 0.90625, + "step": 1054 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.00001525878906, + "epoch": 0.5275, + "grad_norm": 5.939357848487736, + "kl": 0.48828125, + "learning_rate": 5.853065930775303e-07, + "loss": 0.2008, + "reward": 2.8095940351486206, + "reward_std": 0.29858143627643585, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.023739230819046497, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1055 + }, + { + "clip_ratio": 0.0, + "completion_length": 587.3125, + "epoch": 0.528, + "grad_norm": 29.053870250832556, + "kl": 2.244140625, + "learning_rate": 5.845235626570683e-07, + "loss": 0.4827, + "reward": 2.430117607116699, + "reward_std": 0.5931105017662048, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03689653240144253, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1056 + }, + { + "clip_ratio": 0.0, + "completion_length": 464.2916717529297, + "epoch": 0.5285, + "grad_norm": 19.31179372615611, + "kl": 1.26953125, + "learning_rate": 5.837404270718475e-07, + "loss": 0.7061, + "reward": 2.5217255353927612, + "reward_std": 0.4243352711200714, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04424683563411236, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1057 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.8541717529297, + "epoch": 0.529, + "grad_norm": 6.273677914137758, + "kl": 0.4892578125, + "learning_rate": 5.829571887074343e-07, + "loss": 0.1964, + "reward": 2.713360071182251, + "reward_std": 0.12399672530591488, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03490386623889208, + "rewards/tag_count_reward": 0.984375, + "step": 1058 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.7291717529297, + "epoch": 0.5295, + "grad_norm": 4.222827898707743, + "kl": 0.4443359375, + "learning_rate": 5.821738499497086e-07, + "loss": 0.1775, + "reward": 2.596918225288391, + "reward_std": 0.3194064572453499, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03329028841108084, + "rewards/tag_count_reward": 0.984375, + "step": 1059 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.06251525878906, + "epoch": 0.53, + "grad_norm": 3.4491270319460914, + "kl": 0.31640625, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0394, + "reward": 2.959268808364868, + "reward_std": 0.0850947042927146, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.019898047670722008, + "rewards/tag_count_reward": 1.0, + "step": 1060 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.60418701171875, + "epoch": 0.5305, + "grad_norm": 7.953501592426025, + "kl": 0.4990234375, + "learning_rate": 5.806068807993617e-07, + "loss": 0.4339, + "reward": 2.659141421318054, + "reward_std": 0.45532485842704773, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024886406026780605, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1061 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.18751525878906, + "epoch": 0.531, + "grad_norm": 3.524239803768179, + "kl": 0.40625, + "learning_rate": 5.798232551800002e-07, + "loss": 0.0785, + "reward": 2.477108120918274, + "reward_std": 0.38651110231876373, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03330867923796177, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1062 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.3541717529297, + "epoch": 0.5315, + "grad_norm": 3.4813021990603783, + "kl": 0.353515625, + "learning_rate": 5.790395387138311e-07, + "loss": 0.0409, + "reward": 2.969285249710083, + "reward_std": 0.014413285069167614, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030714819207787514, + "rewards/tag_count_reward": 1.0, + "step": 1063 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.9791717529297, + "epoch": 0.532, + "grad_norm": 7.496016496118182, + "kl": 0.8828125, + "learning_rate": 5.78255733788191e-07, + "loss": 0.6521, + "reward": 2.826792359352112, + "reward_std": 0.36261652410030365, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.029110469855368137, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1064 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.5833435058594, + "epoch": 0.5325, + "grad_norm": 5.973629081248443, + "kl": 0.701171875, + "learning_rate": 5.774718427906856e-07, + "loss": 0.3884, + "reward": 2.569111704826355, + "reward_std": 0.5015820562839508, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.01769387163221836, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1065 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.5416717529297, + "epoch": 0.533, + "grad_norm": 3.6653468809368026, + "kl": 0.345703125, + "learning_rate": 5.766878681091828e-07, + "loss": 0.0043, + "reward": 2.7937185764312744, + "reward_std": 0.12711532320827246, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.039614940993487835, + "rewards/tag_count_reward": 1.0, + "step": 1066 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.7916717529297, + "epoch": 0.5335, + "grad_norm": 5.676230222355891, + "kl": 0.3671875, + "learning_rate": 5.759038121318052e-07, + "loss": 0.1888, + "reward": 2.7420976161956787, + "reward_std": 0.3609135150909424, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.026999651454389095, + "rewards/tag_count_reward": 0.984375, + "step": 1067 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.00001525878906, + "epoch": 0.534, + "grad_norm": 2.9464749451257464, + "kl": 0.33203125, + "learning_rate": 5.751196772469237e-07, + "loss": 0.035, + "reward": 2.555404543876648, + "reward_std": 0.19347204267978668, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027928968891501427, + "rewards/tag_count_reward": 1.0, + "step": 1068 + }, + { + "clip_ratio": 0.0, + "completion_length": 346.6041717529297, + "epoch": 0.5345, + "grad_norm": 4.718857747074456, + "kl": 0.4462890625, + "learning_rate": 5.743354658431489e-07, + "loss": 0.2027, + "reward": 2.7974963188171387, + "reward_std": 0.351472407579422, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.015003710053861141, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1069 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.68751525878906, + "epoch": 0.535, + "grad_norm": 7.916520582997023, + "kl": 0.783203125, + "learning_rate": 5.735511803093248e-07, + "loss": 0.7907, + "reward": 2.7259095907211304, + "reward_std": 0.5795675814151764, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018882008269429207, + "rewards/tag_count_reward": 0.953125, + "step": 1070 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.5416717529297, + "epoch": 0.5355, + "grad_norm": 89.56603759912137, + "kl": 1.296875, + "learning_rate": 5.727668230345209e-07, + "loss": 0.6934, + "reward": 2.24530291557312, + "reward_std": 0.495767742395401, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023794405162334442, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1071 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.4791717529297, + "epoch": 0.536, + "grad_norm": 9.256848299391667, + "kl": 1.056640625, + "learning_rate": 5.71982396408026e-07, + "loss": 0.3638, + "reward": 2.5147180557250977, + "reward_std": 0.4126999229192734, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.016532148234546185, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1072 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.6250305175781, + "epoch": 0.5365, + "grad_norm": 9.704892056750113, + "kl": 0.923828125, + "learning_rate": 5.711979028193391e-07, + "loss": 0.7874, + "reward": 2.534700036048889, + "reward_std": 0.4814092218875885, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.022591806016862392, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1073 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.4583435058594, + "epoch": 0.537, + "grad_norm": 4.434868047648692, + "kl": 0.734375, + "learning_rate": 5.704133446581642e-07, + "loss": 0.594, + "reward": 2.7428154945373535, + "reward_std": 0.5054127871990204, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.014129179995507002, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1074 + }, + { + "clip_ratio": 0.0, + "completion_length": 739.2708740234375, + "epoch": 0.5375, + "grad_norm": 10.643100205082053, + "kl": 1.72265625, + "learning_rate": 5.696287243144012e-07, + "loss": 0.8503, + "reward": 2.1527023315429688, + "reward_std": 0.8699755072593689, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.8680556118488312, + "rewards/repetition_penalty_reward": -0.01222835062071681, + "rewards/tag_count_reward": 0.7760416865348816, + "step": 1075 + }, + { + "clip_ratio": 0.0, + "completion_length": 386.875, + "epoch": 0.538, + "grad_norm": 5.50232302994543, + "kl": 0.69921875, + "learning_rate": 5.688440441781398e-07, + "loss": 0.3794, + "reward": 2.6137609481811523, + "reward_std": 0.37844765186309814, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.026864062063395977, + "rewards/tag_count_reward": 0.953125, + "step": 1076 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.2083435058594, + "epoch": 0.5385, + "grad_norm": 4.831237438916453, + "kl": 0.5751953125, + "learning_rate": 5.680593066396518e-07, + "loss": 0.4982, + "reward": 2.895869016647339, + "reward_std": 0.267215795814991, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.027742099948227406, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1077 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.45835876464844, + "epoch": 0.539, + "grad_norm": 4.089673654340983, + "kl": 0.5966796875, + "learning_rate": 5.672745140893839e-07, + "loss": 0.449, + "reward": 2.5935864448547363, + "reward_std": 0.31501081585884094, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.022733049467206, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1078 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.3541717529297, + "epoch": 0.5395, + "grad_norm": 3.580216142280184, + "kl": 0.658203125, + "learning_rate": 5.664896689179504e-07, + "loss": 0.6341, + "reward": 2.669227361679077, + "reward_std": 0.7126676142215729, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.028689204715192318, + "rewards/tag_count_reward": 0.9062500298023224, + "step": 1079 + }, + { + "clip_ratio": 0.0, + "completion_length": 781.6667175292969, + "epoch": 0.54, + "grad_norm": 4.955650799366805, + "kl": 1.166015625, + "learning_rate": 5.657047735161255e-07, + "loss": 0.8425, + "reward": 2.199775218963623, + "reward_std": 0.9051741063594818, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.017238642554730177, + "rewards/tag_count_reward": 0.7864583730697632, + "step": 1080 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.72918701171875, + "epoch": 0.5405, + "grad_norm": 4.90026140945702, + "kl": 0.5576171875, + "learning_rate": 5.649198302748368e-07, + "loss": 0.4486, + "reward": 2.7975977659225464, + "reward_std": 0.45839452743530273, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.021846801973879337, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1081 + }, + { + "clip_ratio": 0.0, + "completion_length": 757.9791870117188, + "epoch": 0.541, + "grad_norm": 8.049018307327215, + "kl": 1.220703125, + "learning_rate": 5.641348415851577e-07, + "loss": 0.7273, + "reward": 2.0966323614120483, + "reward_std": 0.7242147624492645, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.01968714315444231, + "rewards/tag_count_reward": 0.8177083730697632, + "step": 1082 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.2916717529297, + "epoch": 0.5415, + "grad_norm": 8.835700341793663, + "kl": 0.77734375, + "learning_rate": 5.633498098382998e-07, + "loss": 0.4341, + "reward": 2.6963536739349365, + "reward_std": 0.3628556430339813, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022396229207515717, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1083 + }, + { + "clip_ratio": 0.0, + "completion_length": 802.6875, + "epoch": 0.542, + "grad_norm": 9.101773741644875, + "kl": 1.55078125, + "learning_rate": 5.625647374256061e-07, + "loss": 0.741, + "reward": 2.3281177282333374, + "reward_std": 0.6684702336788177, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.909722238779068, + "rewards/repetition_penalty_reward": -0.013896321877837181, + "rewards/tag_count_reward": 0.7656250298023224, + "step": 1084 + }, + { + "clip_ratio": 0.0, + "completion_length": 559.9166870117188, + "epoch": 0.5425, + "grad_norm": 3.17667785466319, + "kl": 0.849609375, + "learning_rate": 5.617796267385429e-07, + "loss": 0.5913, + "reward": 2.5073158740997314, + "reward_std": 0.6131802946329117, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03261485882103443, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1085 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.54168701171875, + "epoch": 0.543, + "grad_norm": 5.388043302367293, + "kl": 0.642578125, + "learning_rate": 5.60994480168694e-07, + "loss": 0.369, + "reward": 2.5772793292999268, + "reward_std": 0.5051662921905518, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01820683665573597, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1086 + }, + { + "clip_ratio": 0.0, + "completion_length": 585.6250305175781, + "epoch": 0.5435, + "grad_norm": 9.467820919919676, + "kl": 0.974609375, + "learning_rate": 5.602093001077517e-07, + "loss": 0.667, + "reward": 2.562462568283081, + "reward_std": 0.6790414154529572, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.010454241652041674, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1087 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.81251525878906, + "epoch": 0.544, + "grad_norm": 10.151144754971412, + "kl": 0.7001953125, + "learning_rate": 5.594240889475106e-07, + "loss": 0.615, + "reward": 2.8339508771896362, + "reward_std": 0.4007309675216675, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.023687981069087982, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1088 + }, + { + "clip_ratio": 0.0, + "completion_length": 653.4583435058594, + "epoch": 0.5445, + "grad_norm": 5.455861003165772, + "kl": 0.9765625, + "learning_rate": 5.586388490798604e-07, + "loss": 0.6132, + "reward": 2.212650716304779, + "reward_std": 0.6732420325279236, + "rewards/accuracy_reward": 0.4375000223517418, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.018252158537507057, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1089 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.2291870117188, + "epoch": 0.545, + "grad_norm": 7.571198091437875, + "kl": 1.234375, + "learning_rate": 5.578535828967777e-07, + "loss": 0.8813, + "reward": 2.416181802749634, + "reward_std": 0.9853273630142212, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.01090173190459609, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 1090 + }, + { + "clip_ratio": 0.0, + "completion_length": 752.1458435058594, + "epoch": 0.5455, + "grad_norm": 6.561919666240053, + "kl": 1.169921875, + "learning_rate": 5.570682927903193e-07, + "loss": 0.4967, + "reward": 2.3344783782958984, + "reward_std": 0.5779477655887604, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.01621603313833475, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 1091 + }, + { + "clip_ratio": 0.0, + "completion_length": 620.2708435058594, + "epoch": 0.546, + "grad_norm": 8.795113051800255, + "kl": 1.126953125, + "learning_rate": 5.562829811526154e-07, + "loss": 0.6867, + "reward": 2.5719066858291626, + "reward_std": 0.6096862554550171, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02184334397315979, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1092 + }, + { + "clip_ratio": 0.0, + "completion_length": 448.1458435058594, + "epoch": 0.5465, + "grad_norm": 5.044503857903457, + "kl": 0.6328125, + "learning_rate": 5.554976503758612e-07, + "loss": 0.7076, + "reward": 2.7934606075286865, + "reward_std": 0.3946908265352249, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.017303182743489742, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1093 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.8541870117188, + "epoch": 0.547, + "grad_norm": 11.91339929157061, + "kl": 0.794921875, + "learning_rate": 5.547123028523106e-07, + "loss": 0.4694, + "reward": 2.475094199180603, + "reward_std": 0.4313882440328598, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.028378095477819443, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1094 + }, + { + "clip_ratio": 0.0, + "completion_length": 608.2291717529297, + "epoch": 0.5475, + "grad_norm": 21.561567034341035, + "kl": 0.744140625, + "learning_rate": 5.539269409742683e-07, + "loss": 0.8278, + "reward": 2.4341979026794434, + "reward_std": 0.6945553719997406, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.024135553278028965, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1095 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.8750305175781, + "epoch": 0.548, + "grad_norm": 25.643096112271426, + "kl": 1.119140625, + "learning_rate": 5.531415671340826e-07, + "loss": 0.5556, + "reward": 2.2709327936172485, + "reward_std": 0.5148254334926605, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.02594222454354167, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1096 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.12501525878906, + "epoch": 0.5485, + "grad_norm": 15.402452878812943, + "kl": 0.4892578125, + "learning_rate": 5.523561837241387e-07, + "loss": 0.2395, + "reward": 2.4139556884765625, + "reward_std": 0.25268905609846115, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018336026929318905, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1097 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.93751525878906, + "epoch": 0.549, + "grad_norm": 20.68700015444551, + "kl": 0.609375, + "learning_rate": 5.515707931368507e-07, + "loss": 0.3136, + "reward": 2.6157283782958984, + "reward_std": 0.503062829375267, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.01968833664432168, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1098 + }, + { + "clip_ratio": 0.0, + "completion_length": 549.8333435058594, + "epoch": 0.5495, + "grad_norm": 41.64035805095098, + "kl": 0.728515625, + "learning_rate": 5.507853977646543e-07, + "loss": 0.6883, + "reward": 2.3382461071014404, + "reward_std": 0.7803103625774384, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.017656567506492138, + "rewards/tag_count_reward": 0.8489583432674408, + "step": 1099 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.16668701171875, + "epoch": 0.55, + "grad_norm": 42.17393716054865, + "kl": 0.666015625, + "learning_rate": 5.5e-07, + "loss": 0.5129, + "reward": 2.6097629070281982, + "reward_std": 0.6973889470100403, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.023917713202536106, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1100 + }, + { + "clip_ratio": 0.0, + "completion_length": 546.1875152587891, + "epoch": 0.5505, + "grad_norm": 19.4698112016211, + "kl": 0.84375, + "learning_rate": 5.492146022353459e-07, + "loss": 0.3103, + "reward": 2.297290325164795, + "reward_std": 0.6591008305549622, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.02041808795183897, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1101 + }, + { + "clip_ratio": 0.0, + "completion_length": 500.8958435058594, + "epoch": 0.551, + "grad_norm": 27.46742900229019, + "kl": 0.751953125, + "learning_rate": 5.484292068631494e-07, + "loss": 0.9024, + "reward": 2.3236552476882935, + "reward_std": 0.5587631464004517, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01662263460457325, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1102 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.1041717529297, + "epoch": 0.5515, + "grad_norm": 6.678322908726758, + "kl": 0.634765625, + "learning_rate": 5.476438162758611e-07, + "loss": 0.3346, + "reward": 2.5052073001861572, + "reward_std": 0.44907619804143906, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01909833773970604, + "rewards/tag_count_reward": 0.90625, + "step": 1103 + }, + { + "clip_ratio": 0.0, + "completion_length": 395.1458435058594, + "epoch": 0.552, + "grad_norm": 7.97609070470728, + "kl": 0.7421875, + "learning_rate": 5.468584328659172e-07, + "loss": 0.3472, + "reward": 2.4966301918029785, + "reward_std": 0.3519609868526459, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018995044752955437, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1104 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.5000305175781, + "epoch": 0.5525, + "grad_norm": 8.65361833333178, + "kl": 0.7255859375, + "learning_rate": 5.460730590257317e-07, + "loss": 0.4795, + "reward": 2.138282537460327, + "reward_std": 0.5641656816005707, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.017967475578188896, + "rewards/tag_count_reward": 0.90625, + "step": 1105 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.9583435058594, + "epoch": 0.553, + "grad_norm": 12.3661318307847, + "kl": 0.677734375, + "learning_rate": 5.452876971476896e-07, + "loss": 0.7296, + "reward": 2.612514019012451, + "reward_std": 0.6336883902549744, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.021166563034057617, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1106 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.6250305175781, + "epoch": 0.5535, + "grad_norm": 10.944643674304801, + "kl": 0.74609375, + "learning_rate": 5.445023496241388e-07, + "loss": 0.8027, + "reward": 2.3295055627822876, + "reward_std": 0.5938288271427155, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.01598059432581067, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1107 + }, + { + "clip_ratio": 0.0, + "completion_length": 544.2708435058594, + "epoch": 0.554, + "grad_norm": 6.272584778246144, + "kl": 1.17578125, + "learning_rate": 5.437170188473847e-07, + "loss": 0.5575, + "reward": 2.0059564113616943, + "reward_std": 0.6833735406398773, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.01661309227347374, + "rewards/tag_count_reward": 0.8489583432674408, + "step": 1108 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.2916717529297, + "epoch": 0.5545, + "grad_norm": 6.425896412413649, + "kl": 0.81640625, + "learning_rate": 5.429317072096807e-07, + "loss": 0.3538, + "reward": 2.409471869468689, + "reward_std": 0.5708480477333069, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.017611466348171234, + "rewards/tag_count_reward": 0.90625, + "step": 1109 + }, + { + "clip_ratio": 0.0, + "completion_length": 555.4375, + "epoch": 0.555, + "grad_norm": 8.904743072445507, + "kl": 1.025390625, + "learning_rate": 5.421464171032224e-07, + "loss": 0.8839, + "reward": 2.551916480064392, + "reward_std": 0.8339214324951172, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.012319804634898901, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1110 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.37501525878906, + "epoch": 0.5555, + "grad_norm": 9.390143696400399, + "kl": 0.7890625, + "learning_rate": 5.413611509201396e-07, + "loss": 0.6891, + "reward": 2.6331262588500977, + "reward_std": 0.6708503365516663, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.04395711608231068, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1111 + }, + { + "clip_ratio": 0.0, + "completion_length": 566.3333740234375, + "epoch": 0.556, + "grad_norm": 7.055072585098017, + "kl": 1.05078125, + "learning_rate": 5.405759110524894e-07, + "loss": 0.688, + "reward": 2.20969557762146, + "reward_std": 0.6133010685443878, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.029887686483561993, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1112 + }, + { + "clip_ratio": 0.0, + "completion_length": 460.81251525878906, + "epoch": 0.5565, + "grad_norm": 6.681017604333264, + "kl": 0.806640625, + "learning_rate": 5.397906998922483e-07, + "loss": 0.4867, + "reward": 2.5056605339050293, + "reward_std": 0.42460909485816956, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.022117381915450096, + "rewards/tag_count_reward": 0.9375, + "step": 1113 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.1666717529297, + "epoch": 0.557, + "grad_norm": 7.9919915253075136, + "kl": 0.951171875, + "learning_rate": 5.390055198313061e-07, + "loss": 0.8675, + "reward": 2.357274293899536, + "reward_std": 0.6162551641464233, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.015989708248525858, + "rewards/tag_count_reward": 0.921875, + "step": 1114 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.6250305175781, + "epoch": 0.5575, + "grad_norm": 6.025174035853018, + "kl": 0.744140625, + "learning_rate": 5.382203732614571e-07, + "loss": 0.5821, + "reward": 2.6463170051574707, + "reward_std": 0.5895664393901825, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.015141477808356285, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1115 + }, + { + "clip_ratio": 0.0, + "completion_length": 648.9791870117188, + "epoch": 0.558, + "grad_norm": 4.919326916823641, + "kl": 1.15234375, + "learning_rate": 5.37435262574394e-07, + "loss": 0.6887, + "reward": 2.228627920150757, + "reward_std": 0.6992431879043579, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.017899959348142147, + "rewards/tag_count_reward": 0.8437500298023224, + "step": 1116 + }, + { + "clip_ratio": 0.0, + "completion_length": 543.4375, + "epoch": 0.5585, + "grad_norm": 6.777349325411678, + "kl": 0.990234375, + "learning_rate": 5.366501901617001e-07, + "loss": 0.7701, + "reward": 2.5158661603927612, + "reward_std": 0.6982110738754272, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.011911679524928331, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1117 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.2708435058594, + "epoch": 0.559, + "grad_norm": 4.625628969961506, + "kl": 0.611328125, + "learning_rate": 5.358651584148423e-07, + "loss": 0.4419, + "reward": 2.7811131477355957, + "reward_std": 0.3881167322397232, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04006761498749256, + "rewards/tag_count_reward": 0.953125, + "step": 1118 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.58335876464844, + "epoch": 0.5595, + "grad_norm": 5.8630513324216755, + "kl": 0.779296875, + "learning_rate": 5.350801697251633e-07, + "loss": 0.5938, + "reward": 2.6719554662704468, + "reward_std": 0.5751761198043823, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.03116954304277897, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1119 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.0833435058594, + "epoch": 0.56, + "grad_norm": 5.093420961702017, + "kl": 0.5625, + "learning_rate": 5.342952264838747e-07, + "loss": 0.4989, + "reward": 2.689083218574524, + "reward_std": 0.388533353805542, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.01230586925521493, + "rewards/tag_count_reward": 0.9375, + "step": 1120 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.7083435058594, + "epoch": 0.5605, + "grad_norm": 5.605297281726827, + "kl": 0.5947265625, + "learning_rate": 5.335103310820496e-07, + "loss": 0.5772, + "reward": 2.8223936557769775, + "reward_std": 0.38263121247291565, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.017884175293147564, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1121 + }, + { + "clip_ratio": 0.0, + "completion_length": 601.6666870117188, + "epoch": 0.561, + "grad_norm": 6.832349813903946, + "kl": 0.763671875, + "learning_rate": 5.32725485910616e-07, + "loss": 0.8745, + "reward": 2.4064905643463135, + "reward_std": 0.7045398056507111, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.01885672379285097, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1122 + }, + { + "clip_ratio": 0.0, + "completion_length": 450.0833435058594, + "epoch": 0.5615, + "grad_norm": 4.514039507778649, + "kl": 0.666015625, + "learning_rate": 5.319406933603482e-07, + "loss": 0.5151, + "reward": 2.757925033569336, + "reward_std": 0.3729214509949088, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.0250612860545516, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1123 + }, + { + "clip_ratio": 0.0, + "completion_length": 649.3541870117188, + "epoch": 0.562, + "grad_norm": 7.659959107724474, + "kl": 0.943359375, + "learning_rate": 5.311559558218603e-07, + "loss": 0.6994, + "reward": 2.274022102355957, + "reward_std": 0.7414794862270355, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.03153356537222862, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1124 + }, + { + "clip_ratio": 0.0, + "completion_length": 671.4166870117188, + "epoch": 0.5625, + "grad_norm": 6.923021149158682, + "kl": 0.923828125, + "learning_rate": 5.303712756855988e-07, + "loss": 0.8844, + "reward": 2.3472338914871216, + "reward_std": 0.7413772344589233, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.015613417141139507, + "rewards/tag_count_reward": 0.828125, + "step": 1125 + }, + { + "clip_ratio": 0.0, + "completion_length": 338.54168701171875, + "epoch": 0.563, + "grad_norm": 6.103986634370641, + "kl": 0.55078125, + "learning_rate": 5.295866553418358e-07, + "loss": 0.5005, + "reward": 2.7973004579544067, + "reward_std": 0.39559805393218994, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.03256077412515879, + "rewards/tag_count_reward": 0.96875, + "step": 1126 + }, + { + "clip_ratio": 0.0, + "completion_length": 624.5000152587891, + "epoch": 0.5635, + "grad_norm": 6.343512742022832, + "kl": 0.99609375, + "learning_rate": 5.288020971806608e-07, + "loss": 0.4152, + "reward": 2.39252108335495, + "reward_std": 0.40336638130247593, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015465098433196545, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1127 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.5833435058594, + "epoch": 0.564, + "grad_norm": 24.603696739447937, + "kl": 0.9208984375, + "learning_rate": 5.28017603591974e-07, + "loss": 0.3151, + "reward": 2.6542476415634155, + "reward_std": 0.28428603522479534, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.012418974190950394, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1128 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.2291717529297, + "epoch": 0.5645, + "grad_norm": 5.473040679290024, + "kl": 0.490234375, + "learning_rate": 5.27233176965479e-07, + "loss": 0.0469, + "reward": 2.9089274406433105, + "reward_std": 0.12796843331307173, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028572553768754005, + "rewards/tag_count_reward": 1.0, + "step": 1129 + }, + { + "clip_ratio": 0.0, + "completion_length": 626.6458435058594, + "epoch": 0.565, + "grad_norm": 13.844398470198444, + "kl": 0.90625, + "learning_rate": 5.264488196906752e-07, + "loss": 0.7776, + "reward": 2.4067180156707764, + "reward_std": 0.6049718260765076, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.020365470554679632, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1130 + }, + { + "clip_ratio": 0.0, + "completion_length": 575.9583435058594, + "epoch": 0.5655, + "grad_norm": 47.74850593269307, + "kl": 1.37890625, + "learning_rate": 5.256645341568511e-07, + "loss": 0.3055, + "reward": 2.4448657035827637, + "reward_std": 0.2919907867908478, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.015203722519800067, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1131 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.9375305175781, + "epoch": 0.566, + "grad_norm": 16.058299463435173, + "kl": 0.744140625, + "learning_rate": 5.248803227530763e-07, + "loss": 0.7605, + "reward": 2.6384165287017822, + "reward_std": 0.5586017668247223, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0195696409791708, + "rewards/tag_count_reward": 0.921875, + "step": 1132 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.8333435058594, + "epoch": 0.5665, + "grad_norm": 4.518324512537823, + "kl": 0.4765625, + "learning_rate": 5.240961878681947e-07, + "loss": 0.0132, + "reward": 2.93279492855072, + "reward_std": 0.15127281844615936, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.025538288988173008, + "rewards/tag_count_reward": 1.0, + "step": 1133 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.04168701171875, + "epoch": 0.567, + "grad_norm": 11.667992967616525, + "kl": 0.814453125, + "learning_rate": 5.233121318908173e-07, + "loss": 0.4719, + "reward": 2.2194743156433105, + "reward_std": 0.4004889130592346, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.018372977152466774, + "rewards/tag_count_reward": 0.890625, + "step": 1134 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.4583435058594, + "epoch": 0.5675, + "grad_norm": 6.73610126947889, + "kl": 0.7626953125, + "learning_rate": 5.225281572093143e-07, + "loss": 0.2514, + "reward": 2.7708280086517334, + "reward_std": 0.23756166687235236, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03299156203866005, + "rewards/tag_count_reward": 0.921875, + "step": 1135 + }, + { + "clip_ratio": 0.0, + "completion_length": 791.6875305175781, + "epoch": 0.568, + "grad_norm": 25.279730322882415, + "kl": 1.59765625, + "learning_rate": 5.21744266211809e-07, + "loss": 0.8005, + "reward": 2.3063154816627502, + "reward_std": 0.6179798245429993, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.016601404175162315, + "rewards/tag_count_reward": 0.78125, + "step": 1136 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.16668701171875, + "epoch": 0.5685, + "grad_norm": 18.939649860781724, + "kl": 0.869140625, + "learning_rate": 5.20960461286169e-07, + "loss": 0.6348, + "reward": 2.7132285833358765, + "reward_std": 0.41872841119766235, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02635475154966116, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1137 + }, + { + "clip_ratio": 0.0, + "completion_length": 704.2500305175781, + "epoch": 0.569, + "grad_norm": 23.295755447219996, + "kl": 1.234375, + "learning_rate": 5.2017674482e-07, + "loss": 0.7717, + "reward": 2.3411978483200073, + "reward_std": 0.7033064663410187, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.019913168624043465, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1138 + }, + { + "clip_ratio": 0.0, + "completion_length": 512.9166717529297, + "epoch": 0.5695, + "grad_norm": 14.21734574883039, + "kl": 1.03515625, + "learning_rate": 5.193931192006385e-07, + "loss": 0.5358, + "reward": 2.592092990875244, + "reward_std": 0.6145272552967072, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02249028254300356, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1139 + }, + { + "clip_ratio": 0.0, + "completion_length": 394.45835876464844, + "epoch": 0.57, + "grad_norm": 6.405748026127363, + "kl": 0.642578125, + "learning_rate": 5.186095868151436e-07, + "loss": 0.3287, + "reward": 2.8148388862609863, + "reward_std": 0.3306480962783098, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03238342609256506, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1140 + }, + { + "clip_ratio": 0.0, + "completion_length": 643.0208435058594, + "epoch": 0.5705, + "grad_norm": 11.8102138105351, + "kl": 1.228515625, + "learning_rate": 5.178261500502912e-07, + "loss": 0.5072, + "reward": 2.575770854949951, + "reward_std": 0.3936486691236496, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014507037587463856, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1141 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.1875, + "epoch": 0.571, + "grad_norm": 19.62442109664474, + "kl": 1.203125, + "learning_rate": 5.170428112925659e-07, + "loss": 0.6059, + "reward": 2.3984756469726562, + "reward_std": 0.5576977431774139, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02860766276717186, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1142 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.9583435058594, + "epoch": 0.5715, + "grad_norm": 15.224539776880427, + "kl": 0.6611328125, + "learning_rate": 5.162595729281526e-07, + "loss": 0.3318, + "reward": 2.6164965629577637, + "reward_std": 0.2758345529437065, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018920221365988255, + "rewards/tag_count_reward": 0.96875, + "step": 1143 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.4583435058594, + "epoch": 0.572, + "grad_norm": 16.037796811858144, + "kl": 1.94921875, + "learning_rate": 5.154764373429315e-07, + "loss": 0.5664, + "reward": 2.7207865715026855, + "reward_std": 0.4415482133626938, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.024005182087421417, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1144 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.3750305175781, + "epoch": 0.5725, + "grad_norm": 89.73106922514725, + "kl": 7.046875, + "learning_rate": 5.146934069224698e-07, + "loss": 0.8408, + "reward": 2.387607216835022, + "reward_std": 0.6802188009023666, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.016906835604459047, + "rewards/tag_count_reward": 0.7864583432674408, + "step": 1145 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.6041717529297, + "epoch": 0.573, + "grad_norm": 32.93197336588902, + "kl": 2.5234375, + "learning_rate": 5.139104840520135e-07, + "loss": 0.4021, + "reward": 2.706830859184265, + "reward_std": 0.45073381066322327, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017127559520304203, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1146 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.18751525878906, + "epoch": 0.5735, + "grad_norm": 16.862260635413147, + "kl": 1.87109375, + "learning_rate": 5.131276711164815e-07, + "loss": 0.2396, + "reward": 2.566492795944214, + "reward_std": 0.4739261567592621, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.028993389569222927, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1147 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.3333435058594, + "epoch": 0.574, + "grad_norm": 34.79221151751134, + "kl": 3.4921875, + "learning_rate": 5.123449705004581e-07, + "loss": 0.3237, + "reward": 2.6664516925811768, + "reward_std": 0.6325987875461578, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02625663299113512, + "rewards/tag_count_reward": 0.921875, + "step": 1148 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.3541717529297, + "epoch": 0.5745, + "grad_norm": 12.048802514250214, + "kl": 0.947265625, + "learning_rate": 5.115623845881847e-07, + "loss": 0.0768, + "reward": 2.8428802490234375, + "reward_std": 0.40082596242427826, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.019967169500887394, + "rewards/tag_count_reward": 0.953125, + "step": 1149 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.7708435058594, + "epoch": 0.575, + "grad_norm": 10.82498445900699, + "kl": 0.498046875, + "learning_rate": 5.107799157635538e-07, + "loss": 0.0567, + "reward": 2.6700661182403564, + "reward_std": 0.20545261912047863, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02264238614588976, + "rewards/tag_count_reward": 0.984375, + "step": 1150 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.1041717529297, + "epoch": 0.5755, + "grad_norm": 47.5496817156261, + "kl": 4.015625, + "learning_rate": 5.099975664101014e-07, + "loss": 0.1526, + "reward": 1.90774405002594, + "reward_std": 0.7336077690124512, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9375000894069672, + "rewards/repetition_penalty_reward": -0.014130960684269667, + "rewards/tag_count_reward": 0.7135416865348816, + "step": 1151 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.0833435058594, + "epoch": 0.576, + "grad_norm": 11.1836621188091, + "kl": 1.27734375, + "learning_rate": 5.09215338910999e-07, + "loss": 0.062, + "reward": 2.778138041496277, + "reward_std": 0.30686675012111664, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02915370836853981, + "rewards/tag_count_reward": 0.953125, + "step": 1152 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.47918701171875, + "epoch": 0.5765, + "grad_norm": 8.241389663503151, + "kl": 0.83984375, + "learning_rate": 5.084332356490472e-07, + "loss": -0.0037, + "reward": 2.4805880784988403, + "reward_std": 0.40117160230875015, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.043717604130506516, + "rewards/tag_count_reward": 0.90625, + "step": 1153 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.1666717529297, + "epoch": 0.577, + "grad_norm": 65.55942523736746, + "kl": 6.75, + "learning_rate": 5.076512590066685e-07, + "loss": 0.0028, + "reward": 2.0602548122406006, + "reward_std": 0.6848001033067703, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.01960639003664255, + "rewards/tag_count_reward": 0.6979166865348816, + "step": 1154 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.9583435058594, + "epoch": 0.5775, + "grad_norm": 57.13616985785417, + "kl": 4.9140625, + "learning_rate": 5.068694113658992e-07, + "loss": -0.0814, + "reward": 2.022095203399658, + "reward_std": 0.7005333751440048, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.023043738678097725, + "rewards/tag_count_reward": 0.6979166865348816, + "step": 1155 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.75000762939453, + "epoch": 0.578, + "grad_norm": 5.38110971277495, + "kl": 1.228515625, + "learning_rate": 5.060876951083828e-07, + "loss": -0.0316, + "reward": 2.7976086139678955, + "reward_std": 0.3995959609746933, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01662743976339698, + "rewards/tag_count_reward": 0.953125, + "step": 1156 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.50001525878906, + "epoch": 0.5785, + "grad_norm": 50.70936605802299, + "kl": 3.1552734375, + "learning_rate": 5.053061126153624e-07, + "loss": 0.0544, + "reward": 2.074872672557831, + "reward_std": 0.20399940758943558, + "rewards/accuracy_reward": 0.25, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.017141404328867793, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1157 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.4791717529297, + "epoch": 0.579, + "grad_norm": 64.52110252352004, + "kl": 2.74609375, + "learning_rate": 5.045246662676741e-07, + "loss": 0.0535, + "reward": 2.5658187866210938, + "reward_std": 0.34509243071079254, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.036611984483897686, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1158 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.00000762939453, + "epoch": 0.5795, + "grad_norm": 15.605596683642418, + "kl": 1.49609375, + "learning_rate": 5.037433584457389e-07, + "loss": -0.0383, + "reward": 2.314841389656067, + "reward_std": 0.7232391238212585, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015019847080111504, + "rewards/tag_count_reward": 0.78125, + "step": 1159 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.02083587646484, + "epoch": 0.58, + "grad_norm": 12.22918579768311, + "kl": 0.7314453125, + "learning_rate": 5.02962191529556e-07, + "loss": -0.0099, + "reward": 2.601512908935547, + "reward_std": 0.5186595022678375, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.013070614542812109, + "rewards/tag_count_reward": 0.90625, + "step": 1160 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.2708435058594, + "epoch": 0.5805, + "grad_norm": 21.869574669814327, + "kl": 2.5107421875, + "learning_rate": 5.021811678986951e-07, + "loss": 0.0476, + "reward": 2.564257025718689, + "reward_std": 0.4466948760673404, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.015604046639055014, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1161 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.12501525878906, + "epoch": 0.581, + "grad_norm": 8.780241833303409, + "kl": 1.43359375, + "learning_rate": 5.014002899322896e-07, + "loss": -0.0048, + "reward": 2.296339511871338, + "reward_std": 0.45754382014274597, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026577199809253216, + "rewards/tag_count_reward": 0.8020833432674408, + "step": 1162 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.7708435058594, + "epoch": 0.5815, + "grad_norm": 21.9309323286652, + "kl": 2.3203125, + "learning_rate": 5.006195600090296e-07, + "loss": -0.0216, + "reward": 2.414031744003296, + "reward_std": 0.7338520288467407, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02520434372127056, + "rewards/tag_count_reward": 0.828125, + "step": 1163 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.08333587646484, + "epoch": 0.582, + "grad_norm": 28.587099800302813, + "kl": 3.21875, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0103, + "reward": 2.427722692489624, + "reward_std": 0.7513796091079712, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.006305098533630371, + "rewards/tag_count_reward": 0.7812500298023224, + "step": 1164 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.7708435058594, + "epoch": 0.5825, + "grad_norm": 19.590917470188238, + "kl": 2.234375, + "learning_rate": 4.990585538044419e-07, + "loss": 0.0747, + "reward": 2.6480218172073364, + "reward_std": 0.6686672568321228, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.013436626642942429, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1165 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.2916717529297, + "epoch": 0.583, + "grad_norm": 20.553956833130847, + "kl": 2.9140625, + "learning_rate": 4.982782822782101e-07, + "loss": 0.0401, + "reward": 1.9601789712905884, + "reward_std": 0.4777047038078308, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.013779422268271446, + "rewards/tag_count_reward": 0.6614583432674408, + "step": 1166 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.50001525878906, + "epoch": 0.5835, + "grad_norm": 89.06303811158742, + "kl": 3.669921875, + "learning_rate": 4.974981683053001e-07, + "loss": 0.1299, + "reward": 2.885384678840637, + "reward_std": 0.34253598749637604, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.015657078940421343, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1167 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.3958435058594, + "epoch": 0.584, + "grad_norm": 13.515172514998415, + "kl": 0.8046875, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0289, + "reward": 2.4830256700515747, + "reward_std": 0.39654435217380524, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.022182647138834, + "rewards/tag_count_reward": 0.859375, + "step": 1168 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.4583435058594, + "epoch": 0.5845, + "grad_norm": 16.061739704037873, + "kl": 1.3359375, + "learning_rate": 4.959384225244087e-07, + "loss": 0.0793, + "reward": 1.9616607427597046, + "reward_std": 0.44488397240638733, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.017505998257547617, + "rewards/tag_count_reward": 0.6458333432674408, + "step": 1169 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.1666717529297, + "epoch": 0.585, + "grad_norm": 7.397678949657141, + "kl": 0.7978515625, + "learning_rate": 4.951587954676837e-07, + "loss": 0.0111, + "reward": 2.502622127532959, + "reward_std": 0.2668640099000186, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01126689650118351, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1170 + }, + { + "clip_ratio": 0.0, + "completion_length": 226.93750762939453, + "epoch": 0.5855, + "grad_norm": 16.086138152776787, + "kl": 1.734375, + "learning_rate": 4.943793354667783e-07, + "loss": 0.0168, + "reward": 2.2989531755447388, + "reward_std": 0.43994690477848053, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9236112236976624, + "rewards/repetition_penalty_reward": -0.015283002983778715, + "rewards/tag_count_reward": 0.7864583432674408, + "step": 1171 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.8333435058594, + "epoch": 0.586, + "grad_norm": 11.885572960330673, + "kl": 1.953125, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0119, + "reward": 2.477262020111084, + "reward_std": 0.5187919661402702, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022738128900527954, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1172 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.91668701171875, + "epoch": 0.5865, + "grad_norm": 188.62778027268504, + "kl": 9.859375, + "learning_rate": 4.928209261293923e-07, + "loss": 0.3053, + "reward": 2.367971897125244, + "reward_std": 0.6133864223957062, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.013972645625472069, + "rewards/tag_count_reward": 0.7916666865348816, + "step": 1173 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.85417938232422, + "epoch": 0.587, + "grad_norm": 541.9716559112582, + "kl": 18.40625, + "learning_rate": 4.920419815400968e-07, + "loss": 0.6135, + "reward": 2.119057536125183, + "reward_std": 0.6997096538543701, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01983139617368579, + "rewards/tag_count_reward": 0.7291666865348816, + "step": 1174 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.4166717529297, + "epoch": 0.5875, + "grad_norm": 47.221216815875486, + "kl": 4.671875, + "learning_rate": 4.912632135009769e-07, + "loss": 0.0897, + "reward": 1.7002249956130981, + "reward_std": 0.7154510319232941, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.008108395617455244, + "rewards/tag_count_reward": 0.5416666865348816, + "step": 1175 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.64583587646484, + "epoch": 0.588, + "grad_norm": 620.752818719277, + "kl": 5.140625, + "learning_rate": 4.904846243842949e-07, + "loss": 0.109, + "reward": 1.9387726187705994, + "reward_std": 0.8643776774406433, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.017824689392000437, + "rewards/tag_count_reward": 0.6093750149011612, + "step": 1176 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.43750762939453, + "epoch": 0.5885, + "grad_norm": 12.885096090616047, + "kl": 1.40234375, + "learning_rate": 4.897062165617686e-07, + "loss": -0.0543, + "reward": 1.8845123052597046, + "reward_std": 0.6029966920614243, + "rewards/accuracy_reward": 0.33333333395421505, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.016529476270079613, + "rewards/tag_count_reward": 0.609375, + "step": 1177 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.1666717529297, + "epoch": 0.589, + "grad_norm": 10.235870138091864, + "kl": 1.046875, + "learning_rate": 4.88927992404563e-07, + "loss": -0.055, + "reward": 2.0582520961761475, + "reward_std": 0.688510000705719, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.011192373465746641, + "rewards/tag_count_reward": 0.6875000298023224, + "step": 1178 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.9166717529297, + "epoch": 0.5895, + "grad_norm": 15.676254056984863, + "kl": 1.47265625, + "learning_rate": 4.881499542832841e-07, + "loss": -0.0441, + "reward": 1.829641878604889, + "reward_std": 0.8753339052200317, + "rewards/accuracy_reward": 0.3541666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.01237204298377037, + "rewards/tag_count_reward": 0.5781250298023224, + "step": 1179 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.35417938232422, + "epoch": 0.59, + "grad_norm": 15.872841179430333, + "kl": 1.21484375, + "learning_rate": 4.873721045679706e-07, + "loss": -0.0526, + "reward": 1.8742225170135498, + "reward_std": 0.628957211971283, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.009458072949200869, + "rewards/tag_count_reward": 0.7447916865348816, + "step": 1180 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.5208435058594, + "epoch": 0.5905, + "grad_norm": 17.36437993847142, + "kl": 2.4609375, + "learning_rate": 4.865944456280878e-07, + "loss": 0.0072, + "reward": 1.6619837880134583, + "reward_std": 0.573122188448906, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.00989127904176712, + "rewards/tag_count_reward": 0.6510416865348816, + "step": 1181 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.0625, + "epoch": 0.591, + "grad_norm": 18.143037794685533, + "kl": 3.7890625, + "learning_rate": 4.858169798325198e-07, + "loss": 0.0575, + "reward": 1.7578097581863403, + "reward_std": 0.9271320402622223, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.75, + "rewards/repetition_penalty_reward": -0.013023747596889734, + "rewards/tag_count_reward": 0.6250000149011612, + "step": 1182 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.2916717529297, + "epoch": 0.5915, + "grad_norm": 12.26718270823858, + "kl": 1.2421875, + "learning_rate": 4.850397095495621e-07, + "loss": -0.0167, + "reward": 2.673969864845276, + "reward_std": 0.5497699528932571, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.025683030486106873, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1183 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.2291717529297, + "epoch": 0.592, + "grad_norm": 10.94115924929318, + "kl": 2.15234375, + "learning_rate": 4.842626371469149e-07, + "loss": -0.0423, + "reward": 2.5248043537139893, + "reward_std": 0.6554135978221893, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02033464703708887, + "rewards/tag_count_reward": 0.8645833730697632, + "step": 1184 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.81251525878906, + "epoch": 0.5925, + "grad_norm": 19.120132180785248, + "kl": 2.685546875, + "learning_rate": 4.834857649916752e-07, + "loss": 0.0412, + "reward": 2.3415642976760864, + "reward_std": 0.4712478220462799, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033435771241784096, + "rewards/tag_count_reward": 0.875, + "step": 1185 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.60418701171875, + "epoch": 0.593, + "grad_norm": 29.326687780779196, + "kl": 2.50390625, + "learning_rate": 4.827090954503308e-07, + "loss": -0.0046, + "reward": 2.505213499069214, + "reward_std": 0.5307941734790802, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02777272555977106, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1186 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.37500762939453, + "epoch": 0.5935, + "grad_norm": 89.56476050571975, + "kl": 8.875, + "learning_rate": 4.819326308887513e-07, + "loss": 0.2113, + "reward": 1.9479205012321472, + "reward_std": 0.5512201189994812, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.01735728792846203, + "rewards/tag_count_reward": 0.7500000298023224, + "step": 1187 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.0833435058594, + "epoch": 0.594, + "grad_norm": 15.951627791356907, + "kl": 3.03125, + "learning_rate": 4.811563736721829e-07, + "loss": -0.0638, + "reward": 2.4049184322357178, + "reward_std": 0.6997295022010803, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.018692869693040848, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1188 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.9583435058594, + "epoch": 0.5945, + "grad_norm": 20.534655927942946, + "kl": 1.666015625, + "learning_rate": 4.803803261652395e-07, + "loss": 0.0088, + "reward": 2.855802297592163, + "reward_std": 0.3537350296974182, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.043503398075699806, + "rewards/tag_count_reward": 0.96875, + "step": 1189 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.6041717529297, + "epoch": 0.595, + "grad_norm": 6.707455759169878, + "kl": 0.607421875, + "learning_rate": 4.79604490731896e-07, + "loss": 0.0076, + "reward": 2.3484212160110474, + "reward_std": 0.3202339052222669, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.012690091505646706, + "rewards/tag_count_reward": 0.9375, + "step": 1190 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.3541717529297, + "epoch": 0.5955, + "grad_norm": 9.733669029070196, + "kl": 1.18359375, + "learning_rate": 4.788288697354824e-07, + "loss": -0.0888, + "reward": 2.4767041206359863, + "reward_std": 0.5791968405246735, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026768106035888195, + "rewards/tag_count_reward": 0.8437500298023224, + "step": 1191 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.3125, + "epoch": 0.596, + "grad_norm": 5.518435395583406, + "kl": 0.6328125, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0373, + "reward": 2.48011314868927, + "reward_std": 0.3792032450437546, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030303513631224632, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1192 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.3333435058594, + "epoch": 0.5965, + "grad_norm": 10.222004650555824, + "kl": 0.916015625, + "learning_rate": 4.772782805034876e-07, + "loss": -0.0505, + "reward": 2.466397762298584, + "reward_std": 0.637960284948349, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03533835709095001, + "rewards/tag_count_reward": 0.8489583432674408, + "step": 1193 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.20834350585938, + "epoch": 0.597, + "grad_norm": 83.67864904271147, + "kl": 4.06640625, + "learning_rate": 4.7650331699127013e-07, + "loss": 0.021, + "reward": 2.561871290206909, + "reward_std": 0.4868089556694031, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.0197259820997715, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1194 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.2708435058594, + "epoch": 0.5975, + "grad_norm": 1778.754909364379, + "kl": 56.125, + "learning_rate": 4.75728577362695e-07, + "loss": 1.2038, + "reward": 2.206901729106903, + "reward_std": 0.6084506213665009, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.02747342176735401, + "rewards/tag_count_reward": 0.734375, + "step": 1195 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.1458435058594, + "epoch": 0.598, + "grad_norm": 43.303276292552525, + "kl": 3.14453125, + "learning_rate": 4.749540639777539e-07, + "loss": 0.2523, + "reward": 2.6388306617736816, + "reward_std": 0.47564953565597534, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.013947171624749899, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1196 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.4375, + "epoch": 0.5985, + "grad_norm": 6.55423415450962, + "kl": 0.69921875, + "learning_rate": 4.741797791957489e-07, + "loss": -0.0043, + "reward": 2.5631258487701416, + "reward_std": 0.4409189820289612, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0254158116877079, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1197 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.5, + "epoch": 0.599, + "grad_norm": 79.15687028509065, + "kl": 5.171875, + "learning_rate": 4.7340572537528547e-07, + "loss": 0.1315, + "reward": 2.3954086303710938, + "reward_std": 0.5703263282775879, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.022994363214820623, + "rewards/tag_count_reward": 0.890625, + "step": 1198 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.0208435058594, + "epoch": 0.5995, + "grad_norm": 24.952831112368063, + "kl": 3.373046875, + "learning_rate": 4.7263190487426563e-07, + "loss": -0.0475, + "reward": 2.256582021713257, + "reward_std": 0.5435906499624252, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.028140274807810783, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1199 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.5208435058594, + "epoch": 0.6, + "grad_norm": 5.061254957850967, + "kl": 0.705078125, + "learning_rate": 4.7185832004988133e-07, + "loss": -0.0041, + "reward": 2.658898711204529, + "reward_std": 0.46046267449855804, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03207360953092575, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1200 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.5416717529297, + "epoch": 0.6005, + "grad_norm": 35.65864208267965, + "kl": 4.21484375, + "learning_rate": 4.710849732586059e-07, + "loss": 0.0959, + "reward": 2.372064232826233, + "reward_std": 0.5162666738033295, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.015088722575455904, + "rewards/tag_count_reward": 0.8177083432674408, + "step": 1201 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.2708435058594, + "epoch": 0.601, + "grad_norm": 14.890784763265065, + "kl": 1.40234375, + "learning_rate": 4.703118668561875e-07, + "loss": 0.0295, + "reward": 2.602800130844116, + "reward_std": 0.5039637088775635, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03088043723255396, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1202 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.93751525878906, + "epoch": 0.6015, + "grad_norm": 24.01630825109177, + "kl": 2.609375, + "learning_rate": 4.6953900319764274e-07, + "loss": 0.0056, + "reward": 2.8198323249816895, + "reward_std": 0.4068307876586914, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.025654025375843048, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1203 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.0416717529297, + "epoch": 0.602, + "grad_norm": 48.98626731878835, + "kl": 4.4609375, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0732, + "reward": 1.9985000491142273, + "reward_std": 0.7038697004318237, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.022333373315632343, + "rewards/tag_count_reward": 0.75, + "step": 1204 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.41668701171875, + "epoch": 0.6025, + "grad_norm": 3.501918254202678, + "kl": 0.4033203125, + "learning_rate": 4.679940135285336e-07, + "loss": 0.0275, + "reward": 2.9462958574295044, + "reward_std": 0.0838532904163003, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03287102282047272, + "rewards/tag_count_reward": 1.0, + "step": 1205 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.06251525878906, + "epoch": 0.603, + "grad_norm": 11.808559352326427, + "kl": 0.861328125, + "learning_rate": 4.672218922242759e-07, + "loss": 0.0187, + "reward": 2.636156678199768, + "reward_std": 0.512239083647728, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.011412853142246604, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1206 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.18751525878906, + "epoch": 0.6035, + "grad_norm": 9.023520837002595, + "kl": 0.73046875, + "learning_rate": 4.664500230764903e-07, + "loss": -0.1363, + "reward": 2.22309547662735, + "reward_std": 0.5313438028097153, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.026904682628810406, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1207 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.2916717529297, + "epoch": 0.604, + "grad_norm": 10.414640951029634, + "kl": 0.888671875, + "learning_rate": 4.656784084364238e-07, + "loss": -0.0566, + "reward": 2.740461230278015, + "reward_std": 0.5690730065107346, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02516377530992031, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1208 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.6458435058594, + "epoch": 0.6045, + "grad_norm": 5.915397925387995, + "kl": 1.1640625, + "learning_rate": 4.6490705065454883e-07, + "loss": -0.099, + "reward": 2.537236452102661, + "reward_std": 0.5170433670282364, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.02179141901433468, + "rewards/tag_count_reward": 0.90625, + "step": 1209 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.18751525878906, + "epoch": 0.605, + "grad_norm": 18.20904524508889, + "kl": 2.44140625, + "learning_rate": 4.641359520805548e-07, + "loss": 0.0451, + "reward": 2.4042060375213623, + "reward_std": 0.44842204451560974, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.041974639520049095, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1210 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.29168701171875, + "epoch": 0.6055, + "grad_norm": 43.49583663170755, + "kl": 4.55859375, + "learning_rate": 4.6336511506334177e-07, + "loss": 0.0108, + "reward": 2.3035465478897095, + "reward_std": 0.5203736424446106, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.038467422127723694, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1211 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9375, + "epoch": 0.606, + "grad_norm": 42.363711059687056, + "kl": 4.953125, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.1363, + "reward": 2.4941481351852417, + "reward_std": 0.5663126707077026, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.023213034495711327, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1212 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8541717529297, + "epoch": 0.6065, + "grad_norm": 53.23769285154482, + "kl": 6.1328125, + "learning_rate": 4.61824235090867e-07, + "loss": 0.2492, + "reward": 2.665413498878479, + "reward_std": 0.4333008825778961, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.018614262342453003, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1213 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.75001525878906, + "epoch": 0.607, + "grad_norm": 23.490826994445097, + "kl": 3.400390625, + "learning_rate": 4.6105419682939316e-07, + "loss": 0.0216, + "reward": 2.4767391681671143, + "reward_std": 0.47361528873443604, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.030205383896827698, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1214 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.75001525878906, + "epoch": 0.6075, + "grad_norm": 7.699965204481391, + "kl": 0.775390625, + "learning_rate": 4.602844295122613e-07, + "loss": 0.0431, + "reward": 2.93735134601593, + "reward_std": 0.11865681782364845, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03139885421842337, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1215 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.6666717529297, + "epoch": 0.608, + "grad_norm": 8.096084671606365, + "kl": 1.009765625, + "learning_rate": 4.59514935484316e-07, + "loss": 0.049, + "reward": 2.79358172416687, + "reward_std": 0.4377841055393219, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.024126553907990456, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1216 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.0416717529297, + "epoch": 0.6085, + "grad_norm": 14.138163667060162, + "kl": 1.8046875, + "learning_rate": 4.5874571708956953e-07, + "loss": -0.0943, + "reward": 2.6621501445770264, + "reward_std": 0.6885767579078674, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.037502871826291084, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1217 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.6041717529297, + "epoch": 0.609, + "grad_norm": 10.580306326635657, + "kl": 1.7734375, + "learning_rate": 4.579767766711944e-07, + "loss": 0.085, + "reward": 2.5517923831939697, + "reward_std": 0.4928157329559326, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.019388345535844564, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1218 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.25001525878906, + "epoch": 0.6095, + "grad_norm": 23.76275938679832, + "kl": 4.3046875, + "learning_rate": 4.572081165715167e-07, + "loss": 0.0931, + "reward": 2.224815607070923, + "reward_std": 0.5812746435403824, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.030392706394195557, + "rewards/tag_count_reward": 0.796875, + "step": 1219 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.5416717529297, + "epoch": 0.61, + "grad_norm": 16.029210941264648, + "kl": 3.0, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0726, + "reward": 2.567992091178894, + "reward_std": 0.6492457389831543, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03791070729494095, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1220 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.0, + "epoch": 0.6105, + "grad_norm": 14.40075724881573, + "kl": 2.84375, + "learning_rate": 4.556716466932803e-07, + "loss": 0.0603, + "reward": 2.1511141061782837, + "reward_std": 0.6159583330154419, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017288665752857924, + "rewards/tag_count_reward": 0.8489583730697632, + "step": 1221 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.87501525878906, + "epoch": 0.611, + "grad_norm": 7.252852057442951, + "kl": 0.806640625, + "learning_rate": 4.549038415950751e-07, + "loss": -0.048, + "reward": 2.8554985523223877, + "reward_std": 0.38053126633167267, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03339047962799668, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1222 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.3333435058594, + "epoch": 0.6115, + "grad_norm": 9.575941886870105, + "kl": 2.7734375, + "learning_rate": 4.5413632617626054e-07, + "loss": -0.1175, + "reward": 2.2987377047538757, + "reward_std": 0.7605935335159302, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.02765120565891266, + "rewards/tag_count_reward": 0.8125000298023224, + "step": 1223 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.8333435058594, + "epoch": 0.612, + "grad_norm": 7.684972316087842, + "kl": 1.193359375, + "learning_rate": 4.5336910277482155e-07, + "loss": -0.0379, + "reward": 2.6804885864257812, + "reward_std": 0.4239862561225891, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02437271736562252, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1224 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.9166717529297, + "epoch": 0.6125, + "grad_norm": 9.7169462842633, + "kl": 2.3359375, + "learning_rate": 4.526021737278537e-07, + "loss": -0.0673, + "reward": 2.2160778045654297, + "reward_std": 0.7273769080638885, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.8819445371627808, + "rewards/repetition_penalty_reward": -0.02524164505302906, + "rewards/tag_count_reward": 0.8177083432674408, + "step": 1225 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.37501525878906, + "epoch": 0.613, + "grad_norm": 23.176995418112103, + "kl": 3.3359375, + "learning_rate": 4.51835541371556e-07, + "loss": 0.0103, + "reward": 2.078250527381897, + "reward_std": 0.5842549949884415, + "rewards/accuracy_reward": 0.31250002048909664, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.020707917399704456, + "rewards/tag_count_reward": 0.8281250298023224, + "step": 1226 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.18750762939453, + "epoch": 0.6135, + "grad_norm": 105.22916350823351, + "kl": 10.765625, + "learning_rate": 4.5106920804122304e-07, + "loss": 0.1645, + "reward": 2.2080377340316772, + "reward_std": 0.8133403062820435, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.8749999701976776, + "rewards/repetition_penalty_reward": -0.015920499339699745, + "rewards/tag_count_reward": 0.7656250298023224, + "step": 1227 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.1875, + "epoch": 0.614, + "grad_norm": 34.43707466221288, + "kl": 2.984375, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0916, + "reward": 2.6236408948898315, + "reward_std": 0.5556021928787231, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02219252847135067, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1228 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.6041717529297, + "epoch": 0.6145, + "grad_norm": 47.071050622736806, + "kl": 7.921875, + "learning_rate": 4.4953744779507197e-07, + "loss": 0.0256, + "reward": 2.315521240234375, + "reward_std": 0.8524576425552368, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.021284347400069237, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 1229 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0, + "epoch": 0.615, + "grad_norm": 10.5407270563147, + "kl": 2.294921875, + "learning_rate": 4.4877202554526084e-07, + "loss": -0.0323, + "reward": 2.4727187156677246, + "reward_std": 0.6452691853046417, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03422584570944309, + "rewards/tag_count_reward": 0.875, + "step": 1230 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.8333435058594, + "epoch": 0.6155, + "grad_norm": 8.990594800844864, + "kl": 0.833984375, + "learning_rate": 4.480069116534151e-07, + "loss": -0.0071, + "reward": 2.5792577266693115, + "reward_std": 0.3115484416484833, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03011729847639799, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1231 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.8333435058594, + "epoch": 0.616, + "grad_norm": 8.186829347268167, + "kl": 1.681640625, + "learning_rate": 4.4724210845020494e-07, + "loss": -0.0651, + "reward": 2.419158935546875, + "reward_std": 0.5638627856969833, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.033966176211833954, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1232 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5416717529297, + "epoch": 0.6165, + "grad_norm": 16.90957738932618, + "kl": 2.40625, + "learning_rate": 4.4647761826535303e-07, + "loss": -0.0256, + "reward": 2.2396740913391113, + "reward_std": 0.492450088262558, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.03810371086001396, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1233 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.3541717529297, + "epoch": 0.617, + "grad_norm": 11.105722560883672, + "kl": 1.4140625, + "learning_rate": 4.457134434276293e-07, + "loss": -0.1259, + "reward": 2.2069406509399414, + "reward_std": 0.5749087035655975, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.03437890112400055, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1234 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.7083435058594, + "epoch": 0.6175, + "grad_norm": 9.032805027425631, + "kl": 1.0859375, + "learning_rate": 4.449495862648427e-07, + "loss": 0.0574, + "reward": 2.7222955226898193, + "reward_std": 0.23863910883665085, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.022496161051094532, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1235 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.2083435058594, + "epoch": 0.618, + "grad_norm": 17.78102072531694, + "kl": 2.375, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0578, + "reward": 2.586129903793335, + "reward_std": 0.6346015930175781, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02324526757001877, + "rewards/tag_count_reward": 0.921875, + "step": 1236 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.7708435058594, + "epoch": 0.6185, + "grad_norm": 9.03437362621798, + "kl": 1.70703125, + "learning_rate": 4.4342283427047164e-07, + "loss": -0.0525, + "reward": 2.5365703105926514, + "reward_std": 0.5162703096866608, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.024193717166781425, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1237 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4375, + "epoch": 0.619, + "grad_norm": 15.517046311400755, + "kl": 2.40234375, + "learning_rate": 4.4265994408963867e-07, + "loss": 0.063, + "reward": 2.5961966514587402, + "reward_std": 0.6436916887760162, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02880349662154913, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1238 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.25001525878906, + "epoch": 0.6195, + "grad_norm": 24.87465794144074, + "kl": 3.734375, + "learning_rate": 4.418973808852313e-07, + "loss": 0.0546, + "reward": 2.469591498374939, + "reward_std": 0.5368989706039429, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.018255963921546936, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1239 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.12501525878906, + "epoch": 0.62, + "grad_norm": 12.84411480237786, + "kl": 2.25390625, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.024, + "reward": 2.184650182723999, + "reward_std": 0.47897736728191376, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023683225736021996, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1240 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.5208435058594, + "epoch": 0.6205, + "grad_norm": 8.658225737289808, + "kl": 1.1953125, + "learning_rate": 4.403732446962899e-07, + "loss": -0.0936, + "reward": 2.3716037273406982, + "reward_std": 0.558637946844101, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01554908649995923, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1241 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.60418701171875, + "epoch": 0.621, + "grad_norm": 7.919211211849244, + "kl": 1.11328125, + "learning_rate": 4.3961167635453876e-07, + "loss": 0.0181, + "reward": 2.5375410318374634, + "reward_std": 0.37939758598804474, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02148678805679083, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1242 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.50001525878906, + "epoch": 0.6215, + "grad_norm": 5.99197945202883, + "kl": 0.673828125, + "learning_rate": 4.388504442747657e-07, + "loss": -0.0048, + "reward": 2.649431347846985, + "reward_std": 0.3595951795578003, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.034596722573041916, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1243 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.56251525878906, + "epoch": 0.622, + "grad_norm": 7.039763497814729, + "kl": 1.1123046875, + "learning_rate": 4.3808955077581546e-07, + "loss": -0.0806, + "reward": 2.560014247894287, + "reward_std": 0.45148639380931854, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.025055398233234882, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1244 + }, + { + "clip_ratio": 0.0, + "completion_length": 334.18751525878906, + "epoch": 0.6225, + "grad_norm": 9.138240023491278, + "kl": 0.78515625, + "learning_rate": 4.373289981755013e-07, + "loss": -0.0215, + "reward": 2.5294055938720703, + "reward_std": 0.3349990248680115, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.050455573946237564, + "rewards/tag_count_reward": 0.96875, + "step": 1245 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.5, + "epoch": 0.623, + "grad_norm": 8.492208011646788, + "kl": 1.853515625, + "learning_rate": 4.365687887905988e-07, + "loss": -0.0616, + "reward": 2.7429254055023193, + "reward_std": 0.5102400928735733, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01575501961633563, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1246 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.93751525878906, + "epoch": 0.6235, + "grad_norm": 57.99567662946909, + "kl": 7.73828125, + "learning_rate": 4.358089249368375e-07, + "loss": 0.086, + "reward": 2.5771158933639526, + "reward_std": 0.5258086919784546, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.021842443384230137, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1247 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.8333435058594, + "epoch": 0.624, + "grad_norm": 341.16700521948934, + "kl": 29.625, + "learning_rate": 4.350494089288943e-07, + "loss": 0.676, + "reward": 2.2864397764205933, + "reward_std": 0.4129794090986252, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.027796387672424316, + "rewards/tag_count_reward": 0.890625, + "step": 1248 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.68751525878906, + "epoch": 0.6245, + "grad_norm": 181.81088614690876, + "kl": 15.53125, + "learning_rate": 4.3429024308038686e-07, + "loss": 0.2868, + "reward": 2.834537148475647, + "reward_std": 0.33811767399311066, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.03178227413445711, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1249 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.0625, + "epoch": 0.625, + "grad_norm": 11.08135265703862, + "kl": 1.1259765625, + "learning_rate": 4.3353142970386557e-07, + "loss": 0.1305, + "reward": 2.7022109031677246, + "reward_std": 0.10300289653241634, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.01653942931443453, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1250 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.9583435058594, + "epoch": 0.6255, + "grad_norm": 124.39669515969251, + "kl": 14.46875, + "learning_rate": 4.327729711108082e-07, + "loss": 0.2354, + "reward": 2.619844675064087, + "reward_std": 0.6426686346530914, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.019044365733861923, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1251 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.0208435058594, + "epoch": 0.626, + "grad_norm": 4.466923459475626, + "kl": 0.685546875, + "learning_rate": 4.3201486961161093e-07, + "loss": -0.0116, + "reward": 2.894577383995056, + "reward_std": 0.2078223153948784, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.029034032486379147, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1252 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.9791717529297, + "epoch": 0.6265, + "grad_norm": 26.605555594559796, + "kl": 1.48828125, + "learning_rate": 4.312571275155823e-07, + "loss": -0.038, + "reward": 2.5493518114089966, + "reward_std": 0.5423057973384857, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.040926164016127586, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1253 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.18751525878906, + "epoch": 0.627, + "grad_norm": 8.302364671310045, + "kl": 0.7890625, + "learning_rate": 4.304997471309361e-07, + "loss": -0.0157, + "reward": 2.6195199489593506, + "reward_std": 0.5493273586034775, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.045410582795739174, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1254 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.4166717529297, + "epoch": 0.6275, + "grad_norm": 13.946595632340095, + "kl": 1.984375, + "learning_rate": 4.297427307647844e-07, + "loss": -0.0215, + "reward": 2.4039559364318848, + "reward_std": 0.4896374046802521, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02312744501978159, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1255 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.3958435058594, + "epoch": 0.628, + "grad_norm": 11.365303476859737, + "kl": 0.7919921875, + "learning_rate": 4.2898608072313045e-07, + "loss": -0.0273, + "reward": 2.542173385620117, + "reward_std": 0.4263547882437706, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.027271111495792866, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1256 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.06251525878906, + "epoch": 0.6285, + "grad_norm": 6.488072817521581, + "kl": 1.6943359375, + "learning_rate": 4.2822979931086144e-07, + "loss": 0.064, + "reward": 2.6413581371307373, + "reward_std": 0.32615266740322113, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.025308695621788502, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1257 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.4166717529297, + "epoch": 0.629, + "grad_norm": 5.571259374449054, + "kl": 0.7109375, + "learning_rate": 4.2747388883174154e-07, + "loss": -0.0249, + "reward": 2.5305432081222534, + "reward_std": 0.34117555618286133, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.03716518171131611, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1258 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.0208435058594, + "epoch": 0.6295, + "grad_norm": 10.818868109831325, + "kl": 1.11328125, + "learning_rate": 4.267183515884054e-07, + "loss": -0.0971, + "reward": 2.6202439069747925, + "reward_std": 0.6800469756126404, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02732554590329528, + "rewards/tag_count_reward": 0.9114583730697632, + "step": 1259 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8333435058594, + "epoch": 0.63, + "grad_norm": 5.109825873578351, + "kl": 1.12890625, + "learning_rate": 4.2596318988235037e-07, + "loss": -0.0602, + "reward": 2.7829357385635376, + "reward_std": 0.4666026383638382, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.024355984292924404, + "rewards/tag_count_reward": 0.953125, + "step": 1260 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.4166717529297, + "epoch": 0.6305, + "grad_norm": 113.69520754686019, + "kl": 6.984375, + "learning_rate": 4.2520840601392996e-07, + "loss": 0.0285, + "reward": 2.4763646125793457, + "reward_std": 0.5876338928937912, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02189927827566862, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1261 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.20833587646484, + "epoch": 0.631, + "grad_norm": 4.878770353368535, + "kl": 0.8046875, + "learning_rate": 4.2445400228234687e-07, + "loss": -0.0089, + "reward": 2.2267041206359863, + "reward_std": 0.21407258417457342, + "rewards/accuracy_reward": 0.3125, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023295993451029062, + "rewards/tag_count_reward": 0.9375, + "step": 1262 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.00000762939453, + "epoch": 0.6315, + "grad_norm": 91.61805536571175, + "kl": 14.0, + "learning_rate": 4.2369998098564554e-07, + "loss": 0.1059, + "reward": 2.389165997505188, + "reward_std": 0.8639044761657715, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.8819444179534912, + "rewards/repetition_penalty_reward": -0.024028603918850422, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1263 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.4791793823242, + "epoch": 0.632, + "grad_norm": 61.3227518293924, + "kl": 7.921875, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0403, + "reward": 2.4555550813674927, + "reward_std": 0.7910381853580475, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.02361162193119526, + "rewards/tag_count_reward": 0.875, + "step": 1264 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.97918701171875, + "epoch": 0.6325, + "grad_norm": 11.219358419697596, + "kl": 2.75390625, + "learning_rate": 4.2219309488323487e-07, + "loss": 0.0209, + "reward": 2.7085201740264893, + "reward_std": 0.5442517399787903, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.029327161610126495, + "rewards/tag_count_reward": 0.953125, + "step": 1265 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.7708435058594, + "epoch": 0.633, + "grad_norm": 398.00956099056515, + "kl": 23.125, + "learning_rate": 4.214402346677619e-07, + "loss": 0.2082, + "reward": 2.259987235069275, + "reward_std": 0.9177981615066528, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.8263889253139496, + "rewards/repetition_penalty_reward": -0.024735040962696075, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1266 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.9583435058594, + "epoch": 0.6335, + "grad_norm": 24.983686575360647, + "kl": 3.240234375, + "learning_rate": 4.206877660676297e-07, + "loss": 0.0538, + "reward": 2.548527956008911, + "reward_std": 0.6168785095214844, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01570827839896083, + "rewards/tag_count_reward": 0.890625, + "step": 1267 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.6875, + "epoch": 0.634, + "grad_norm": 29.77346744252865, + "kl": 3.23046875, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0976, + "reward": 2.5990532636642456, + "reward_std": 0.3927089273929596, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01726605836302042, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1268 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.7708435058594, + "epoch": 0.6345, + "grad_norm": 6.809776019842994, + "kl": 1.5078125, + "learning_rate": 4.1918401288078633e-07, + "loss": -0.0669, + "reward": 2.4295482635498047, + "reward_std": 0.6562110781669617, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.014896340668201447, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1269 + }, + { + "clip_ratio": 0.0, + "completion_length": 242.54167938232422, + "epoch": 0.635, + "grad_norm": 13.637945833280545, + "kl": 2.11328125, + "learning_rate": 4.1843273287476854e-07, + "loss": -0.2244, + "reward": 2.2396379709243774, + "reward_std": 0.7876444458961487, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.024251021444797516, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1270 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.18750762939453, + "epoch": 0.6355, + "grad_norm": 9.401287077014372, + "kl": 2.37109375, + "learning_rate": 4.1768185364546326e-07, + "loss": -0.2551, + "reward": 1.9714569449424744, + "reward_std": 0.8513112664222717, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.7500000596046448, + "rewards/repetition_penalty_reward": -0.018126386683434248, + "rewards/tag_count_reward": 0.7395833432674408, + "step": 1271 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.7083435058594, + "epoch": 0.636, + "grad_norm": 5.439379439237209, + "kl": 0.857421875, + "learning_rate": 4.1693137748017915e-07, + "loss": -0.0795, + "reward": 2.6746604442596436, + "reward_std": 0.536898672580719, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02672865055501461, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1272 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.56250762939453, + "epoch": 0.6365, + "grad_norm": 6.202209412890006, + "kl": 0.8515625, + "learning_rate": 4.161813066649963e-07, + "loss": -0.0982, + "reward": 2.706597089767456, + "reward_std": 0.5873250961303711, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.013889186084270477, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1273 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.87500762939453, + "epoch": 0.637, + "grad_norm": 8.311119282622155, + "kl": 1.412109375, + "learning_rate": 4.15431643484761e-07, + "loss": -0.0722, + "reward": 2.4473708868026733, + "reward_std": 0.5833633840084076, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.02832351950928569, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1274 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.5416717529297, + "epoch": 0.6375, + "grad_norm": 5.198591187578019, + "kl": 1.04296875, + "learning_rate": 4.146823902230772e-07, + "loss": -0.0484, + "reward": 2.8356668949127197, + "reward_std": 0.44375722110271454, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.020235823933035135, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1275 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.58334350585938, + "epoch": 0.638, + "grad_norm": 6.926096135008214, + "kl": 1.96484375, + "learning_rate": 4.1393354916230005e-07, + "loss": -0.0609, + "reward": 2.677065134048462, + "reward_std": 0.594237208366394, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.01043500192463398, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1276 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.68750762939453, + "epoch": 0.6385, + "grad_norm": 48.40360467461647, + "kl": 3.640625, + "learning_rate": 4.1318512258352936e-07, + "loss": 0.0027, + "reward": 2.614649772644043, + "reward_std": 0.4902600198984146, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.008614218328148127, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1277 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.50001525878906, + "epoch": 0.639, + "grad_norm": 17.05514052018502, + "kl": 3.43359375, + "learning_rate": 4.124371127666024e-07, + "loss": -0.045, + "reward": 2.4537200927734375, + "reward_std": 0.6568253189325333, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.013293889816850424, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1278 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.0833435058594, + "epoch": 0.6395, + "grad_norm": 12.772767939709558, + "kl": 3.58984375, + "learning_rate": 4.1168952199008677e-07, + "loss": -0.074, + "reward": 2.3198667764663696, + "reward_std": 0.6525312066078186, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.02561924420297146, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1279 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.2916717529297, + "epoch": 0.64, + "grad_norm": 13.074456042005389, + "kl": 2.43359375, + "learning_rate": 4.1094235253127374e-07, + "loss": -0.0559, + "reward": 2.621940493583679, + "reward_std": 0.6388831436634064, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03430960513651371, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1280 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.3541717529297, + "epoch": 0.6405, + "grad_norm": 20.83070325460604, + "kl": 3.6875, + "learning_rate": 4.101956066661708e-07, + "loss": -0.0974, + "reward": 2.285021424293518, + "reward_std": 0.5357790589332581, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.018798216711729765, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1281 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.04168701171875, + "epoch": 0.641, + "grad_norm": 12.971542858607965, + "kl": 2.666015625, + "learning_rate": 4.0944928666949527e-07, + "loss": -0.0943, + "reward": 2.2288068532943726, + "reward_std": 0.6288715898990631, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.014248816296458244, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1282 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.25, + "epoch": 0.6415, + "grad_norm": 11.653407474012347, + "kl": 2.9140625, + "learning_rate": 4.0870339481466774e-07, + "loss": -0.0501, + "reward": 1.9373607635498047, + "reward_std": 0.5385548174381256, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.02444479614496231, + "rewards/tag_count_reward": 0.8020833432674408, + "step": 1283 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.56250762939453, + "epoch": 0.642, + "grad_norm": 8.453230476877637, + "kl": 0.779296875, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0324, + "reward": 2.838531732559204, + "reward_std": 0.28950972855091095, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.036468397825956345, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1284 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.7916717529297, + "epoch": 0.6425, + "grad_norm": 8.516876725525995, + "kl": 0.984375, + "learning_rate": 4.0721290461770863e-07, + "loss": -0.0514, + "reward": 2.528114676475525, + "reward_std": 0.6328976899385452, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.018760375678539276, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1285 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.66667938232422, + "epoch": 0.643, + "grad_norm": 11.245200868491317, + "kl": 1.1171875, + "learning_rate": 4.064683108158685e-07, + "loss": -0.1401, + "reward": 2.3839638233184814, + "reward_std": 0.6318697333335876, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.022286144085228443, + "rewards/tag_count_reward": 0.90625, + "step": 1286 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.9583435058594, + "epoch": 0.6435, + "grad_norm": 6.37277111754768, + "kl": 1.4609375, + "learning_rate": 4.057241542364457e-07, + "loss": -0.1055, + "reward": 2.32645583152771, + "reward_std": 0.5588487088680267, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.03465544432401657, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1287 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.1666717529297, + "epoch": 0.644, + "grad_norm": 16.043998757477272, + "kl": 4.4296875, + "learning_rate": 4.0498043714627006e-07, + "loss": -0.1519, + "reward": 2.1210697889328003, + "reward_std": 0.9449804127216339, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.8333333730697632, + "rewards/repetition_penalty_reward": -0.024763552471995354, + "rewards/tag_count_reward": 0.7916666865348816, + "step": 1288 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.4791717529297, + "epoch": 0.6445, + "grad_norm": 8.66946655884018, + "kl": 1.9375, + "learning_rate": 4.042371618108329e-07, + "loss": -0.0738, + "reward": 2.5021121501922607, + "reward_std": 0.7234326303005219, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.029137907549738884, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1289 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.39583587646484, + "epoch": 0.645, + "grad_norm": 6.507177763175018, + "kl": 2.046875, + "learning_rate": 4.034943304942796e-07, + "loss": -0.1071, + "reward": 2.659383535385132, + "reward_std": 0.7204856872558594, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.021172089502215385, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1290 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.87500762939453, + "epoch": 0.6455, + "grad_norm": 8.84789787566044, + "kl": 1.96484375, + "learning_rate": 4.027519454594033e-07, + "loss": -0.0137, + "reward": 2.724217653274536, + "reward_std": 0.6305935382843018, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.022310104221105576, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1291 + }, + { + "clip_ratio": 0.0, + "completion_length": 228.14584350585938, + "epoch": 0.646, + "grad_norm": 54.434135625320714, + "kl": 7.625, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0351, + "reward": 2.350663661956787, + "reward_std": 0.5377750098705292, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.015656011179089546, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1292 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.37500762939453, + "epoch": 0.6465, + "grad_norm": 36.29934720854378, + "kl": 9.203125, + "learning_rate": 4.012685232790497e-07, + "loss": -0.1163, + "reward": 2.269049644470215, + "reward_std": 0.7809967696666718, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.017408739775419235, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1293 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.56251525878906, + "epoch": 0.647, + "grad_norm": 13.353341754245465, + "kl": 1.373046875, + "learning_rate": 4.005274906523336e-07, + "loss": -0.0158, + "reward": 2.862304449081421, + "reward_std": 0.2681258460506797, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03700113669037819, + "rewards/tag_count_reward": 0.96875, + "step": 1294 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.875, + "epoch": 0.6475, + "grad_norm": 17.584019029906294, + "kl": 4.6484375, + "learning_rate": 3.9978691334480306e-07, + "loss": -0.026, + "reward": 2.621469259262085, + "reward_std": 0.6262362897396088, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.022628027945756912, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1295 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.25000762939453, + "epoch": 0.648, + "grad_norm": 20.539771262644624, + "kl": 2.46484375, + "learning_rate": 3.9904679361238526e-07, + "loss": -0.0646, + "reward": 2.5812262296676636, + "reward_std": 0.40000835061073303, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.019468236714601517, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1296 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.3125, + "epoch": 0.6485, + "grad_norm": 12.020081562329604, + "kl": 1.96875, + "learning_rate": 3.9830713370961313e-07, + "loss": 0.0126, + "reward": 2.7614296674728394, + "reward_std": 0.3552953898906708, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01287601888179779, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1297 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.25001525878906, + "epoch": 0.649, + "grad_norm": 7.055794372513674, + "kl": 1.11328125, + "learning_rate": 3.975679358896189e-07, + "loss": -0.0177, + "reward": 2.7628384828567505, + "reward_std": 0.35927814757451415, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.027092115953564644, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1298 + }, + { + "clip_ratio": 0.0, + "completion_length": 224.70834350585938, + "epoch": 0.6495, + "grad_norm": 10.528185417792633, + "kl": 1.76953125, + "learning_rate": 3.968292024041275e-07, + "loss": -0.1546, + "reward": 2.3756613731384277, + "reward_std": 0.7093529403209686, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.8472222089767456, + "rewards/repetition_penalty_reward": -0.018436014652252197, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1299 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.25, + "epoch": 0.65, + "grad_norm": 13.557597077977892, + "kl": 2.1328125, + "learning_rate": 3.9609093550344907e-07, + "loss": -0.0656, + "reward": 2.1020314693450928, + "reward_std": 0.7527401447296143, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.8680556118488312, + "rewards/repetition_penalty_reward": -0.02644097339361906, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1300 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.0208435058594, + "epoch": 0.6505, + "grad_norm": 5.083335577912237, + "kl": 0.4794921875, + "learning_rate": 3.953531374364728e-07, + "loss": 0.0102, + "reward": 2.6702791452407837, + "reward_std": 0.36818480491638184, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03284592926502228, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1301 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.6875, + "epoch": 0.651, + "grad_norm": 7.37275238994372, + "kl": 1.07421875, + "learning_rate": 3.946158104506594e-07, + "loss": -0.0573, + "reward": 2.6404601335525513, + "reward_std": 0.5752403140068054, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.029678759165108204, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1302 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.7291717529297, + "epoch": 0.6515, + "grad_norm": 6.525040125531069, + "kl": 2.5, + "learning_rate": 3.938789567920349e-07, + "loss": -0.0352, + "reward": 2.368511199951172, + "reward_std": 0.5849394798278809, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.02211400307714939, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1303 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.2916717529297, + "epoch": 0.652, + "grad_norm": 13.454959697482677, + "kl": 2.1875, + "learning_rate": 3.931425787051832e-07, + "loss": -0.0314, + "reward": 2.6546378135681152, + "reward_std": 0.6842877864837646, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.012028906028717756, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1304 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.8541793823242, + "epoch": 0.6525, + "grad_norm": 28.026859752832458, + "kl": 5.671875, + "learning_rate": 3.924066784332396e-07, + "loss": -0.1091, + "reward": 2.2767953872680664, + "reward_std": 0.6093401312828064, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.025287946220487356, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1305 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.56250762939453, + "epoch": 0.653, + "grad_norm": 18.361877129436802, + "kl": 2.220703125, + "learning_rate": 3.9167125821788416e-07, + "loss": -0.0351, + "reward": 2.4435291290283203, + "reward_std": 0.3841914087533951, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.011332074645906687, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1306 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.87501525878906, + "epoch": 0.6535, + "grad_norm": 9.18937969115552, + "kl": 2.7890625, + "learning_rate": 3.909363202993343e-07, + "loss": -0.053, + "reward": 2.573843240737915, + "reward_std": 0.5460654944181442, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.02685134019702673, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1307 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.64584350585938, + "epoch": 0.654, + "grad_norm": 9.701372830225981, + "kl": 2.421875, + "learning_rate": 3.902018669163384e-07, + "loss": -0.0271, + "reward": 2.419216573238373, + "reward_std": 0.44797763228416443, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.007866791682317853, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1308 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.66668701171875, + "epoch": 0.6545, + "grad_norm": 6.341395491310268, + "kl": 1.55078125, + "learning_rate": 3.894679003061686e-07, + "loss": -0.0434, + "reward": 2.7170095443725586, + "reward_std": 0.5187918990850449, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.027782058343291283, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1309 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.10418701171875, + "epoch": 0.655, + "grad_norm": 4.622903893254928, + "kl": 0.580078125, + "learning_rate": 3.8873442270461485e-07, + "loss": -0.0032, + "reward": 2.8831721544265747, + "reward_std": 0.2194080576300621, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01613345229998231, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1310 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.56250762939453, + "epoch": 0.6555, + "grad_norm": 6.403895926939174, + "kl": 1.96875, + "learning_rate": 3.88001436345977e-07, + "loss": -0.0914, + "reward": 2.64253830909729, + "reward_std": 0.5459905564785004, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015447880607098341, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1311 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.02083587646484, + "epoch": 0.656, + "grad_norm": 4.624188058528878, + "kl": 1.1171875, + "learning_rate": 3.872689434630585e-07, + "loss": -0.0328, + "reward": 2.666394829750061, + "reward_std": 0.4922281354665756, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02110534254461527, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1312 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.8541717529297, + "epoch": 0.6565, + "grad_norm": 10.421845265889829, + "kl": 0.916015625, + "learning_rate": 3.8653694628715984e-07, + "loss": -0.0879, + "reward": 2.8016003370285034, + "reward_std": 0.5981872081756592, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.010899642948061228, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1313 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.02084350585938, + "epoch": 0.657, + "grad_norm": 10.523509449540256, + "kl": 2.55859375, + "learning_rate": 3.8580544704807117e-07, + "loss": -0.0991, + "reward": 2.440296769142151, + "reward_std": 0.6269799470901489, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.012828361243009567, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1314 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.89583587646484, + "epoch": 0.6575, + "grad_norm": 21.647956275310747, + "kl": 3.4375, + "learning_rate": 3.850744479740663e-07, + "loss": -0.0654, + "reward": 2.577293634414673, + "reward_std": 0.5918062180280685, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.018192541785538197, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1315 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.00000762939453, + "epoch": 0.658, + "grad_norm": 63.82832688411775, + "kl": 7.328125, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0124, + "reward": 1.5498919486999512, + "reward_std": 0.5158031135797501, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.7638889253139496, + "rewards/repetition_penalty_reward": -0.016080408822745085, + "rewards/tag_count_reward": 0.7604166865348816, + "step": 1316 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.0625, + "epoch": 0.6585, + "grad_norm": 6.870792569824665, + "kl": 2.2265625, + "learning_rate": 3.8361395922677687e-07, + "loss": -0.1222, + "reward": 2.7788249254226685, + "reward_std": 0.597758024930954, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.012841662392020226, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1317 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.00001525878906, + "epoch": 0.659, + "grad_norm": 12.883962513964198, + "kl": 2.453125, + "learning_rate": 3.8288447400239443e-07, + "loss": -0.0232, + "reward": 2.208429753780365, + "reward_std": 0.39593251049518585, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.025945200584828854, + "rewards/tag_count_reward": 0.921875, + "step": 1318 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.18751525878906, + "epoch": 0.6595, + "grad_norm": 11.437028269960171, + "kl": 2.12109375, + "learning_rate": 3.82155497840886e-07, + "loss": -0.0909, + "reward": 2.5074195861816406, + "reward_std": 0.697433352470398, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015149833634495735, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1319 + }, + { + "clip_ratio": 0.0, + "completion_length": 244.72917938232422, + "epoch": 0.66, + "grad_norm": 15.92814404052159, + "kl": 3.4296875, + "learning_rate": 3.8142703296283953e-07, + "loss": -0.0835, + "reward": 2.2199000120162964, + "reward_std": 0.7194198369979858, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.02836394216865301, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1320 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.8541717529297, + "epoch": 0.6605, + "grad_norm": 21.139528290510924, + "kl": 4.59375, + "learning_rate": 3.806990815872855e-07, + "loss": -0.0918, + "reward": 2.3190531730651855, + "reward_std": 0.38901272416114807, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.010808073915541172, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1321 + }, + { + "clip_ratio": 0.0, + "completion_length": 237.95834350585938, + "epoch": 0.661, + "grad_norm": 8.548692602029842, + "kl": 2.4921875, + "learning_rate": 3.7997164593168983e-07, + "loss": -0.062, + "reward": 2.5779298543930054, + "reward_std": 0.7175216972827911, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.022764576133340597, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1322 + }, + { + "clip_ratio": 0.0, + "completion_length": 238.06250762939453, + "epoch": 0.6615, + "grad_norm": 8.538300938371009, + "kl": 1.42578125, + "learning_rate": 3.7924472821194765e-07, + "loss": -0.1421, + "reward": 2.6461488008499146, + "reward_std": 0.7617769837379456, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.018781788181513548, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1323 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.62500762939453, + "epoch": 0.662, + "grad_norm": 10.10993358277435, + "kl": 1.47265625, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0362, + "reward": 2.4081956148147583, + "reward_std": 0.4368290901184082, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.010207333602011204, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1324 + }, + { + "clip_ratio": 0.0, + "completion_length": 243.58334350585938, + "epoch": 0.6625, + "grad_norm": 6.92812761375714, + "kl": 1.0556640625, + "learning_rate": 3.777924554357096e-07, + "loss": -0.0024, + "reward": 2.8137227296829224, + "reward_std": 0.41092177480459213, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02308299019932747, + "rewards/tag_count_reward": 0.96875, + "step": 1325 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8333435058594, + "epoch": 0.663, + "grad_norm": 7.315502843369173, + "kl": 2.02734375, + "learning_rate": 3.7706710480308835e-07, + "loss": -0.1371, + "reward": 2.6259608268737793, + "reward_std": 0.6581228971481323, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.03549747634679079, + "rewards/tag_count_reward": 0.953125, + "step": 1326 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.2291717529297, + "epoch": 0.6635, + "grad_norm": 6.63436769667506, + "kl": 1.81640625, + "learning_rate": 3.7634228095405673e-07, + "loss": -0.0385, + "reward": 2.8240283727645874, + "reward_std": 0.34141383320093155, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01972164958715439, + "rewards/tag_count_reward": 0.96875, + "step": 1327 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5208435058594, + "epoch": 0.664, + "grad_norm": 20.757623767950136, + "kl": 3.8828125, + "learning_rate": 3.7561798609655373e-07, + "loss": -0.0051, + "reward": 2.598018765449524, + "reward_std": 0.724356472492218, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.018300842493772507, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1328 + }, + { + "clip_ratio": 0.0, + "completion_length": 222.7291717529297, + "epoch": 0.6645, + "grad_norm": 21.24530948252979, + "kl": 4.796875, + "learning_rate": 3.748942224369073e-07, + "loss": -0.0478, + "reward": 2.4989744424819946, + "reward_std": 0.6526926159858704, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.016650736331939697, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1329 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.75001525878906, + "epoch": 0.665, + "grad_norm": 35.64687506708033, + "kl": 6.21875, + "learning_rate": 3.7417099217982686e-07, + "loss": -0.0819, + "reward": 2.379560112953186, + "reward_std": 0.7591554820537567, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.02321775909513235, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1330 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.9583435058594, + "epoch": 0.6655, + "grad_norm": 5.687701052224813, + "kl": 2.15234375, + "learning_rate": 3.734482975283975e-07, + "loss": -0.0797, + "reward": 2.6574783325195312, + "reward_std": 0.4438095688819885, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.026549477130174637, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1331 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.4791717529297, + "epoch": 0.666, + "grad_norm": 5.722398391516816, + "kl": 1.5, + "learning_rate": 3.72726140684072e-07, + "loss": -0.0211, + "reward": 2.485495686531067, + "reward_std": 0.32544514536857605, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03186549246311188, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1332 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.3333435058594, + "epoch": 0.6665, + "grad_norm": 4.960361026761474, + "kl": 0.828125, + "learning_rate": 3.720045238466658e-07, + "loss": -0.0164, + "reward": 2.5775978565216064, + "reward_std": 0.36035288870334625, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.021360510494560003, + "rewards/tag_count_reward": 0.953125, + "step": 1333 + }, + { + "clip_ratio": 0.0, + "completion_length": 246.4166717529297, + "epoch": 0.667, + "grad_norm": 6.897070121071065, + "kl": 2.5703125, + "learning_rate": 3.712834492143487e-07, + "loss": -0.088, + "reward": 2.4616823196411133, + "reward_std": 0.5930516719818115, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.022692805156111717, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1334 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.16667938232422, + "epoch": 0.6675, + "grad_norm": 4.566188182751088, + "kl": 1.150390625, + "learning_rate": 3.7056291898363925e-07, + "loss": -0.0822, + "reward": 2.779423713684082, + "reward_std": 0.4814029037952423, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.013978923205286264, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1335 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.1458435058594, + "epoch": 0.668, + "grad_norm": 8.6793215256614, + "kl": 1.9140625, + "learning_rate": 3.6984293534939737e-07, + "loss": -0.1695, + "reward": 2.448217272758484, + "reward_std": 0.7062622308731079, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.02053278312087059, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1336 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.2708435058594, + "epoch": 0.6685, + "grad_norm": 6.644806226411769, + "kl": 1.1875, + "learning_rate": 3.69123500504818e-07, + "loss": -0.0284, + "reward": 2.466250419616699, + "reward_std": 0.5085738003253937, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.028541546314954758, + "rewards/tag_count_reward": 0.953125, + "step": 1337 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.5416717529297, + "epoch": 0.669, + "grad_norm": 8.436837507583917, + "kl": 1.466796875, + "learning_rate": 3.6840461664142444e-07, + "loss": -0.0728, + "reward": 2.5604257583618164, + "reward_std": 0.6860225200653076, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.029852177016437054, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1338 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.06251525878906, + "epoch": 0.6695, + "grad_norm": 10.964332836550879, + "kl": 1.158203125, + "learning_rate": 3.6768628594906193e-07, + "loss": -0.0204, + "reward": 2.8616209030151367, + "reward_std": 0.35454705357551575, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.023795696906745434, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1339 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.00000762939453, + "epoch": 0.67, + "grad_norm": 6.098760743160745, + "kl": 1.83984375, + "learning_rate": 3.6696851061588994e-07, + "loss": -0.044, + "reward": 2.4193036556243896, + "reward_std": 0.4224669486284256, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.014724194537848234, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1340 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.6458435058594, + "epoch": 0.6705, + "grad_norm": 14.11730319397243, + "kl": 3.6484375, + "learning_rate": 3.6625129282837685e-07, + "loss": -0.0756, + "reward": 2.5297341346740723, + "reward_std": 0.5198619514703751, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.031029692851006985, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1341 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.2708435058594, + "epoch": 0.671, + "grad_norm": 5.630265152909872, + "kl": 1.7578125, + "learning_rate": 3.655346347712922e-07, + "loss": -0.0712, + "reward": 2.448781371116638, + "reward_std": 0.4181585758924484, + "rewards/accuracy_reward": 0.5625000223517418, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.025177019648253918, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1342 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.9166717529297, + "epoch": 0.6715, + "grad_norm": 13.132812973478561, + "kl": 4.578125, + "learning_rate": 3.6481853862770107e-07, + "loss": -0.1407, + "reward": 2.560381054878235, + "reward_std": 0.7369968295097351, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.028160739690065384, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1343 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.12501525878906, + "epoch": 0.672, + "grad_norm": 16.153270842061996, + "kl": 3.3125, + "learning_rate": 3.641030065789562e-07, + "loss": -0.173, + "reward": 2.4971349239349365, + "reward_std": 0.7496102154254913, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.01675412245094776, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1344 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.72917938232422, + "epoch": 0.6725, + "grad_norm": 13.478278088046087, + "kl": 2.83203125, + "learning_rate": 3.6338804080469253e-07, + "loss": -0.1632, + "reward": 2.4378998279571533, + "reward_std": 0.7053618729114532, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.013489224947988987, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1345 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.9375, + "epoch": 0.673, + "grad_norm": 11.718823360410598, + "kl": 0.984375, + "learning_rate": 3.6267364348281946e-07, + "loss": -0.0235, + "reward": 2.578608751296997, + "reward_std": 0.4408767372369766, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02208577375859022, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1346 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.33333587646484, + "epoch": 0.6735, + "grad_norm": 6.659825601151128, + "kl": 1.31640625, + "learning_rate": 3.6195981678951535e-07, + "loss": -0.0251, + "reward": 2.802467465400696, + "reward_std": 0.32479756511747837, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02739358227699995, + "rewards/tag_count_reward": 0.96875, + "step": 1347 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.2916793823242, + "epoch": 0.674, + "grad_norm": 5.94749137911606, + "kl": 1.763671875, + "learning_rate": 3.612465628992203e-07, + "loss": -0.1275, + "reward": 2.4290517568588257, + "reward_std": 0.6968129873275757, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.020601162686944008, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1348 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.43751525878906, + "epoch": 0.6745, + "grad_norm": 5.403140530841979, + "kl": 1.240234375, + "learning_rate": 3.60533883984629e-07, + "loss": -0.0099, + "reward": 2.505295157432556, + "reward_std": 0.556377038359642, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.020746564492583275, + "rewards/tag_count_reward": 0.921875, + "step": 1349 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.25, + "epoch": 0.675, + "grad_norm": 24.75284339330913, + "kl": 0.8349609375, + "learning_rate": 3.5982178221668533e-07, + "loss": -0.0314, + "reward": 2.6801000833511353, + "reward_std": 0.3536926209926605, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02476109704002738, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1350 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.06251525878906, + "epoch": 0.6755, + "grad_norm": 7.877722164133154, + "kl": 0.8818359375, + "learning_rate": 3.591102597645743e-07, + "loss": 0.0142, + "reward": 2.4543099403381348, + "reward_std": 0.3884861320257187, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.014439986553043127, + "rewards/tag_count_reward": 0.90625, + "step": 1351 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.60418701171875, + "epoch": 0.676, + "grad_norm": 9.392382547183344, + "kl": 0.9609375, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0275, + "reward": 2.4401715993881226, + "reward_std": 0.32545991241931915, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0337868882343173, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1352 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.87501525878906, + "epoch": 0.6765, + "grad_norm": 7.560942470949891, + "kl": 1.6328125, + "learning_rate": 3.5768896147576344e-07, + "loss": -0.0042, + "reward": 2.3383556604385376, + "reward_std": 0.6181593537330627, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.02796376869082451, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1353 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.2708435058594, + "epoch": 0.677, + "grad_norm": 6.137807981682768, + "kl": 1.51171875, + "learning_rate": 3.5697918996858443e-07, + "loss": 0.0373, + "reward": 2.459334135055542, + "reward_std": 0.5140793472528458, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02156880311667919, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1354 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.35418701171875, + "epoch": 0.6775, + "grad_norm": 18.4925775719971, + "kl": 1.98046875, + "learning_rate": 3.5627000643626704e-07, + "loss": 0.0317, + "reward": 2.6040374040603638, + "reward_std": 0.40690936520695686, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02096256148070097, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1355 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.25, + "epoch": 0.678, + "grad_norm": 32.05231705546735, + "kl": 2.2890625, + "learning_rate": 3.555614130391079e-07, + "loss": -0.1021, + "reward": 2.4450795650482178, + "reward_std": 0.5791552066802979, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02887882199138403, + "rewards/tag_count_reward": 0.890625, + "step": 1356 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.81251525878906, + "epoch": 0.6785, + "grad_norm": 11.77786900698309, + "kl": 1.3203125, + "learning_rate": 3.5485341193560503e-07, + "loss": -0.0302, + "reward": 2.523303985595703, + "reward_std": 0.5757817924022675, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04093226231634617, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1357 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.8541717529297, + "epoch": 0.679, + "grad_norm": 5.649915735065612, + "kl": 0.642578125, + "learning_rate": 3.5414600528245266e-07, + "loss": 0.0445, + "reward": 2.7024723291397095, + "reward_std": 0.05420066323131323, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03711113706231117, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1358 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.7916717529297, + "epoch": 0.6795, + "grad_norm": 5.052148500574189, + "kl": 1.94140625, + "learning_rate": 3.534391952345341e-07, + "loss": -0.2133, + "reward": 2.543895125389099, + "reward_std": 0.7587614357471466, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.023813419975340366, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1359 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.64584350585938, + "epoch": 0.68, + "grad_norm": 19.199009856910283, + "kl": 3.46875, + "learning_rate": 3.5273298394491515e-07, + "loss": -0.0754, + "reward": 1.7231401801109314, + "reward_std": 0.5748586058616638, + "rewards/accuracy_reward": 0.125, + "rewards/reasoning_steps_reward": 0.8611111342906952, + "rewards/repetition_penalty_reward": -0.023387585766613483, + "rewards/tag_count_reward": 0.7604166865348816, + "step": 1360 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.62500762939453, + "epoch": 0.6805, + "grad_norm": 6.3532586709420995, + "kl": 2.1875, + "learning_rate": 3.5202737356483816e-07, + "loss": -0.1683, + "reward": 2.469906449317932, + "reward_std": 0.6009674370288849, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.02314911223948002, + "rewards/tag_count_reward": 0.875, + "step": 1361 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1666717529297, + "epoch": 0.681, + "grad_norm": 6.5401180752318036, + "kl": 2.05078125, + "learning_rate": 3.513223662437147e-07, + "loss": -0.1042, + "reward": 2.417715549468994, + "reward_std": 0.5224019438028336, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.028464973904192448, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1362 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.5833435058594, + "epoch": 0.6815, + "grad_norm": 6.66636418634202, + "kl": 1.2109375, + "learning_rate": 3.5061796412911913e-07, + "loss": -0.0175, + "reward": 2.8467352390289307, + "reward_std": 0.4228966236114502, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.016112081240862608, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1363 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.7083435058594, + "epoch": 0.682, + "grad_norm": 5.548234698164087, + "kl": 0.7255859375, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0183, + "reward": 2.5735777616500854, + "reward_std": 0.2825750932097435, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0288529209792614, + "rewards/tag_count_reward": 0.984375, + "step": 1364 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.5208435058594, + "epoch": 0.6825, + "grad_norm": 7.01523336912401, + "kl": 1.494140625, + "learning_rate": 3.49210984100586e-07, + "loss": -0.0757, + "reward": 2.2636520862579346, + "reward_std": 0.5402050614356995, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02627858892083168, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1365 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.4375, + "epoch": 0.683, + "grad_norm": 10.223491193197818, + "kl": 2.58203125, + "learning_rate": 3.4850841047255364e-07, + "loss": 0.0061, + "reward": 2.299806833267212, + "reward_std": 0.5171876847743988, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01963762380182743, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1366 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.25, + "epoch": 0.6835, + "grad_norm": 3.9777954578862067, + "kl": 0.681640625, + "learning_rate": 3.4780645062284665e-07, + "loss": 0.049, + "reward": 2.467653274536133, + "reward_std": 0.14734814129769802, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.025402450934052467, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1367 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.4791717529297, + "epoch": 0.684, + "grad_norm": 7.507696165222269, + "kl": 1.6875, + "learning_rate": 3.471051066897562e-07, + "loss": -0.0996, + "reward": 2.167901575565338, + "reward_std": 0.4075617045164108, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.024806811474263668, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1368 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.7083435058594, + "epoch": 0.6845, + "grad_norm": 6.142733918004118, + "kl": 2.0546875, + "learning_rate": 3.4640438080969773e-07, + "loss": -0.1285, + "reward": 2.334149956703186, + "reward_std": 0.4152573347091675, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02175288973376155, + "rewards/tag_count_reward": 0.953125, + "step": 1369 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.2708435058594, + "epoch": 0.685, + "grad_norm": 19.072228574321905, + "kl": 2.3271484375, + "learning_rate": 3.45704275117204e-07, + "loss": 0.064, + "reward": 2.5171972513198853, + "reward_std": 0.44150757789611816, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.03141411580145359, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1370 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.72918701171875, + "epoch": 0.6855, + "grad_norm": 3.7189442550634975, + "kl": 0.521484375, + "learning_rate": 3.450047917449181e-07, + "loss": 0.0285, + "reward": 2.536484122276306, + "reward_std": 0.20072168111801147, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02601606771349907, + "rewards/tag_count_reward": 1.0, + "step": 1371 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.20833587646484, + "epoch": 0.686, + "grad_norm": 7.372964659791776, + "kl": 1.83984375, + "learning_rate": 3.4430593282358777e-07, + "loss": -0.0345, + "reward": 2.7801530361175537, + "reward_std": 0.5550233423709869, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02540257666260004, + "rewards/tag_count_reward": 0.9583333730697632, + "step": 1372 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.66668701171875, + "epoch": 0.6865, + "grad_norm": 11.180474899161313, + "kl": 1.767578125, + "learning_rate": 3.4360770048205843e-07, + "loss": 0.0396, + "reward": 2.6890029907226562, + "reward_std": 0.49107812345027924, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.024538762867450714, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1373 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.4791717529297, + "epoch": 0.687, + "grad_norm": 7.7693979337518275, + "kl": 0.7421875, + "learning_rate": 3.429100968472668e-07, + "loss": 0.0575, + "reward": 2.973837971687317, + "reward_std": 0.017366621643304825, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02616224344819784, + "rewards/tag_count_reward": 1.0, + "step": 1374 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.4166717529297, + "epoch": 0.6875, + "grad_norm": 3.5163971736947826, + "kl": 1.734375, + "learning_rate": 3.4221312404423486e-07, + "loss": -0.0705, + "reward": 2.387254238128662, + "reward_std": 0.3396844193339348, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.025940215215086937, + "rewards/tag_count_reward": 0.96875, + "step": 1375 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.5208435058594, + "epoch": 0.688, + "grad_norm": 4.361183296818996, + "kl": 0.716796875, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.007, + "reward": 2.502146601676941, + "reward_std": 0.3024323433637619, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.02215902181342244, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1376 + }, + { + "clip_ratio": 0.0, + "completion_length": 254.18750762939453, + "epoch": 0.6885, + "grad_norm": 5.83748552522261, + "kl": 2.4140625, + "learning_rate": 3.4082107942392136e-07, + "loss": -0.0837, + "reward": 2.4153069257736206, + "reward_std": 0.5053312480449677, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.03955421969294548, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1377 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0416717529297, + "epoch": 0.689, + "grad_norm": 5.928807362280472, + "kl": 0.73046875, + "learning_rate": 3.4012601184704904e-07, + "loss": 0.0042, + "reward": 2.7761462926864624, + "reward_std": 0.2563507854938507, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.020728619303554296, + "rewards/tag_count_reward": 0.984375, + "step": 1378 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.58334350585938, + "epoch": 0.6895, + "grad_norm": 6.1951381664884595, + "kl": 1.62890625, + "learning_rate": 3.3943158358274203e-07, + "loss": -0.1059, + "reward": 2.466705560684204, + "reward_std": 0.49967700242996216, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.029822363518178463, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1379 + }, + { + "clip_ratio": 0.0, + "completion_length": 300.8333435058594, + "epoch": 0.69, + "grad_norm": 6.849165033673041, + "kl": 0.73828125, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0257, + "reward": 2.7919795513153076, + "reward_std": 0.2739688716828823, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030937272123992443, + "rewards/tag_count_reward": 0.96875, + "step": 1380 + }, + { + "clip_ratio": 0.0, + "completion_length": 253.33334350585938, + "epoch": 0.6905, + "grad_norm": 4.414135015976259, + "kl": 0.666015625, + "learning_rate": 3.3804465345126545e-07, + "loss": 0.0039, + "reward": 2.940259575843811, + "reward_std": 0.16437675757333636, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014601573813706636, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1381 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.1458435058594, + "epoch": 0.691, + "grad_norm": 4.724734711788012, + "kl": 0.767578125, + "learning_rate": 3.3735215580892575e-07, + "loss": -0.1102, + "reward": 2.497297525405884, + "reward_std": 0.42764711380004883, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.02179985586553812, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1382 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.125, + "epoch": 0.6915, + "grad_norm": 6.326351779698143, + "kl": 0.69921875, + "learning_rate": 3.366603059287977e-07, + "loss": -0.0165, + "reward": 2.3360326290130615, + "reward_std": 0.4005787819623947, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02334247063845396, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1383 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.3541793823242, + "epoch": 0.692, + "grad_norm": 7.84619977481907, + "kl": 1.3828125, + "learning_rate": 3.359691059183761e-07, + "loss": -0.0385, + "reward": 2.366807222366333, + "reward_std": 0.606653481721878, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.025554182939231396, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1384 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.5625, + "epoch": 0.6925, + "grad_norm": 6.807191768545, + "kl": 0.876953125, + "learning_rate": 3.3527855788317614e-07, + "loss": 0.0225, + "reward": 2.5943243503570557, + "reward_std": 0.487922340631485, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0341478455811739, + "rewards/tag_count_reward": 0.96875, + "step": 1385 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.04168701171875, + "epoch": 0.693, + "grad_norm": 9.428973328086897, + "kl": 0.693359375, + "learning_rate": 3.3458866392672694e-07, + "loss": -0.0436, + "reward": 2.774644374847412, + "reward_std": 0.39781980216503143, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01875837054103613, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1386 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.7708435058594, + "epoch": 0.6935, + "grad_norm": 6.33506224802435, + "kl": 0.98046875, + "learning_rate": 3.338994261505649e-07, + "loss": -0.0637, + "reward": 2.1281405091285706, + "reward_std": 0.37489962577819824, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.026373423635959625, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1387 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.1666793823242, + "epoch": 0.694, + "grad_norm": 3.780779728016683, + "kl": 0.5712890625, + "learning_rate": 3.3321084665422803e-07, + "loss": -0.0363, + "reward": 2.5015201568603516, + "reward_std": 0.32130593061447144, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026257574558258057, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1388 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.0416717529297, + "epoch": 0.6945, + "grad_norm": 4.691021479308191, + "kl": 1.3203125, + "learning_rate": 3.325229275352489e-07, + "loss": -0.0273, + "reward": 2.5804378986358643, + "reward_std": 0.4816815108060837, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.0237288074567914, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1389 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.50001525878906, + "epoch": 0.695, + "grad_norm": 5.62934560927229, + "kl": 0.6484375, + "learning_rate": 3.3183567088914833e-07, + "loss": 0.0106, + "reward": 2.7332091331481934, + "reward_std": 0.08886073343455791, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02720770798623562, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1390 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.4375, + "epoch": 0.6955, + "grad_norm": 6.2652786345135985, + "kl": 0.77734375, + "learning_rate": 3.3114907880942933e-07, + "loss": 0.0052, + "reward": 2.7369773387908936, + "reward_std": 0.24390956666320562, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023439462296664715, + "rewards/tag_count_reward": 0.96875, + "step": 1391 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.5208435058594, + "epoch": 0.696, + "grad_norm": 4.763664401378893, + "kl": 0.587890625, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0138, + "reward": 2.908858060836792, + "reward_std": 0.17314723134040833, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028641941025853157, + "rewards/tag_count_reward": 1.0, + "step": 1392 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.62500762939453, + "epoch": 0.6965, + "grad_norm": 32.37138858122824, + "kl": 3.26953125, + "learning_rate": 3.297778967130191e-07, + "loss": 0.0044, + "reward": 2.3406665325164795, + "reward_std": 0.43019232153892517, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.0187085154466331, + "rewards/tag_count_reward": 0.984375, + "step": 1393 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.6458435058594, + "epoch": 0.697, + "grad_norm": 3.700595698109454, + "kl": 0.462890625, + "learning_rate": 3.290933108731866e-07, + "loss": 0.03, + "reward": 2.709470510482788, + "reward_std": 0.254656121134758, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04052966833114624, + "rewards/tag_count_reward": 1.0, + "step": 1394 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.93750762939453, + "epoch": 0.6975, + "grad_norm": 17.617668970781107, + "kl": 3.083984375, + "learning_rate": 3.2840939795343987e-07, + "loss": -0.0307, + "reward": 2.570359468460083, + "reward_std": 0.3866235390305519, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.025126777589321136, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1395 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.7916717529297, + "epoch": 0.698, + "grad_norm": 5.286876953487483, + "kl": 1.009765625, + "learning_rate": 3.2772616003709616e-07, + "loss": -0.0073, + "reward": 2.618911862373352, + "reward_std": 0.3726077973842621, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.016504944302141666, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1396 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.6875, + "epoch": 0.6985, + "grad_norm": 11.84313036080623, + "kl": 2.0234375, + "learning_rate": 3.270435992054166e-07, + "loss": -0.0171, + "reward": 2.6750162839889526, + "reward_std": 0.4495895653963089, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.009011534042656422, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1397 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.5208435058594, + "epoch": 0.699, + "grad_norm": 4.6172662372737525, + "kl": 0.720703125, + "learning_rate": 3.263617175376001e-07, + "loss": -0.0352, + "reward": 2.491300940513611, + "reward_std": 0.3568393215537071, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.019115855917334557, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1398 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.0208435058594, + "epoch": 0.6995, + "grad_norm": 6.661016856595855, + "kl": 1.9296875, + "learning_rate": 3.2568051711077636e-07, + "loss": 0.0014, + "reward": 2.4803801774978638, + "reward_std": 0.2543382793664932, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.024828164838254452, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1399 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.64584350585938, + "epoch": 0.7, + "grad_norm": 7.280485227211617, + "kl": 1.5078125, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0254, + "reward": 2.686321496963501, + "reward_std": 0.4712058752775192, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.011595276184380054, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1400 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.22917938232422, + "epoch": 0.7005, + "grad_norm": 5.073476939105119, + "kl": 0.724609375, + "learning_rate": 3.2432016827824414e-07, + "loss": -0.0513, + "reward": 2.620236396789551, + "reward_std": 0.33665385842323303, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02038868237286806, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1401 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.4791717529297, + "epoch": 0.701, + "grad_norm": 14.591352187407846, + "kl": 1.109375, + "learning_rate": 3.2364102401639423e-07, + "loss": 0.0143, + "reward": 2.708950996398926, + "reward_std": 0.34444018453359604, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01674342295154929, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1402 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.79167938232422, + "epoch": 0.7015, + "grad_norm": 4.900616214888936, + "kl": 1.1083984375, + "learning_rate": 3.229625692832414e-07, + "loss": -0.008, + "reward": 2.674125075340271, + "reward_std": 0.4905224144458771, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.020319399423897266, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1403 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.2708435058594, + "epoch": 0.702, + "grad_norm": 3.8548797240419503, + "kl": 0.87109375, + "learning_rate": 3.222848061454764e-07, + "loss": -0.0311, + "reward": 2.7337580919265747, + "reward_std": 0.4215731620788574, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017978041898459196, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1404 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.8333435058594, + "epoch": 0.7025, + "grad_norm": 7.245780935291608, + "kl": 0.900390625, + "learning_rate": 3.216077366676833e-07, + "loss": -0.0148, + "reward": 2.862648129463196, + "reward_std": 0.3005357086658478, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0331853311508894, + "rewards/tag_count_reward": 1.0, + "step": 1405 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.7083435058594, + "epoch": 0.703, + "grad_norm": 8.179242595228185, + "kl": 1.078125, + "learning_rate": 3.209313629123329e-07, + "loss": 0.0065, + "reward": 2.753788113594055, + "reward_std": 0.3614218980073929, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0378786064684391, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1406 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.5208435058594, + "epoch": 0.7035, + "grad_norm": 8.893778669993276, + "kl": 2.21875, + "learning_rate": 3.2025568693977745e-07, + "loss": -0.0732, + "reward": 2.5992143154144287, + "reward_std": 0.45963357388973236, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.03446643240749836, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1407 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.4791717529297, + "epoch": 0.704, + "grad_norm": 19.76315212110327, + "kl": 4.63671875, + "learning_rate": 3.195807108082429e-07, + "loss": -0.1105, + "reward": 2.3212958574295044, + "reward_std": 0.6959238648414612, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.018981934990733862, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1408 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.7708435058594, + "epoch": 0.7045, + "grad_norm": 7.783421626644704, + "kl": 2.55859375, + "learning_rate": 3.1890643657382356e-07, + "loss": -0.1032, + "reward": 2.7160589694976807, + "reward_std": 0.5336126536130905, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.028732833918184042, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1409 + }, + { + "clip_ratio": 0.0, + "completion_length": 212.7291717529297, + "epoch": 0.705, + "grad_norm": 13.071825269979621, + "kl": 3.9296875, + "learning_rate": 3.182328662904756e-07, + "loss": -0.0981, + "reward": 2.5850327014923096, + "reward_std": 0.6792955249547958, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.02087017334997654, + "rewards/tag_count_reward": 0.890625, + "step": 1410 + }, + { + "clip_ratio": 0.0, + "completion_length": 227.12500762939453, + "epoch": 0.7055, + "grad_norm": 16.647908802505288, + "kl": 3.201171875, + "learning_rate": 3.175600020100112e-07, + "loss": -0.0142, + "reward": 2.422129988670349, + "reward_std": 0.3781207883730531, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.011897902470082045, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1411 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.8541717529297, + "epoch": 0.706, + "grad_norm": 4.246659669908856, + "kl": 2.57421875, + "learning_rate": 3.168878457820915e-07, + "loss": -0.193, + "reward": 2.498375415802002, + "reward_std": 0.7053702175617218, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.024194156285375357, + "rewards/tag_count_reward": 0.953125, + "step": 1412 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.00001525878906, + "epoch": 0.7065, + "grad_norm": 4.186288561909542, + "kl": 0.861328125, + "learning_rate": 3.162163996542209e-07, + "loss": -0.1177, + "reward": 2.7583051919937134, + "reward_std": 0.5227586776018143, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04204211104661226, + "rewards/tag_count_reward": 0.9739583730697632, + "step": 1413 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.8541717529297, + "epoch": 0.707, + "grad_norm": 4.18530097815396, + "kl": 0.830078125, + "learning_rate": 3.155456656717408e-07, + "loss": -0.0055, + "reward": 2.917997360229492, + "reward_std": 0.1906261881813407, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.029919438064098358, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1414 + }, + { + "clip_ratio": 0.0, + "completion_length": 229.52083587646484, + "epoch": 0.7075, + "grad_norm": 7.3471907992432515, + "kl": 1.296875, + "learning_rate": 3.14875645877823e-07, + "loss": -0.1006, + "reward": 2.7960588932037354, + "reward_std": 0.5089210569858551, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.028593890368938446, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1415 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.8333435058594, + "epoch": 0.708, + "grad_norm": 5.66147418702201, + "kl": 1.16015625, + "learning_rate": 3.142063423134644e-07, + "loss": -0.0736, + "reward": 2.4612691402435303, + "reward_std": 0.4342493712902069, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026578163262456656, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1416 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.2708435058594, + "epoch": 0.7085, + "grad_norm": 4.258559874095392, + "kl": 0.826171875, + "learning_rate": 3.135377570174796e-07, + "loss": 0.0009, + "reward": 2.634892225265503, + "reward_std": 0.32757639279589057, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02830233983695507, + "rewards/tag_count_reward": 0.96875, + "step": 1417 + }, + { + "clip_ratio": 0.0, + "completion_length": 258.3333435058594, + "epoch": 0.709, + "grad_norm": 5.069921662523948, + "kl": 2.03515625, + "learning_rate": 3.1286989202649503e-07, + "loss": -0.0317, + "reward": 2.490046262741089, + "reward_std": 0.43885305523872375, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.027314997278153896, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1418 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.12500762939453, + "epoch": 0.7095, + "grad_norm": 6.797930142073019, + "kl": 3.453125, + "learning_rate": 3.122027493749438e-07, + "loss": -0.1656, + "reward": 1.9841939210891724, + "reward_std": 0.6511839628219604, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.8611111044883728, + "rewards/repetition_penalty_reward": -0.02275061421096325, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1419 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.3125, + "epoch": 0.71, + "grad_norm": 7.202911096329466, + "kl": 3.12109375, + "learning_rate": 3.115363310950578e-07, + "loss": -0.1218, + "reward": 2.4370113611221313, + "reward_std": 0.6423384845256805, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.016113675199449062, + "rewards/tag_count_reward": 0.890625, + "step": 1420 + }, + { + "clip_ratio": 0.0, + "completion_length": 248.43750762939453, + "epoch": 0.7105, + "grad_norm": 7.687028342637116, + "kl": 2.15625, + "learning_rate": 3.1087063921686263e-07, + "loss": -0.0065, + "reward": 2.5198620557785034, + "reward_std": 0.6163989156484604, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.023540794849395752, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1421 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.33333587646484, + "epoch": 0.711, + "grad_norm": 6.503537489968444, + "kl": 1.65625, + "learning_rate": 3.102056757681715e-07, + "loss": -0.0039, + "reward": 2.3564590215682983, + "reward_std": 0.3768700957298279, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03243005648255348, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1422 + }, + { + "clip_ratio": 0.0, + "completion_length": 218.125, + "epoch": 0.7115, + "grad_norm": 13.267307709264951, + "kl": 2.6328125, + "learning_rate": 3.0954144277457817e-07, + "loss": -0.0765, + "reward": 2.4606250524520874, + "reward_std": 0.48642072081565857, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.009861073223873973, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1423 + }, + { + "clip_ratio": 0.0, + "completion_length": 265.93750762939453, + "epoch": 0.712, + "grad_norm": 7.851695017414299, + "kl": 1.34765625, + "learning_rate": 3.0887794225945143e-07, + "loss": -0.0492, + "reward": 2.350769519805908, + "reward_std": 0.5201582908630371, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.036383312195539474, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1424 + }, + { + "clip_ratio": 0.0, + "completion_length": 234.89584350585938, + "epoch": 0.7125, + "grad_norm": 8.683593317223446, + "kl": 2.890625, + "learning_rate": 3.0821517624392925e-07, + "loss": -0.069, + "reward": 2.6100372076034546, + "reward_std": 0.6106880903244019, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.030587902292609215, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1425 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.77084350585938, + "epoch": 0.713, + "grad_norm": 16.77464036064384, + "kl": 3.6015625, + "learning_rate": 3.075531467469116e-07, + "loss": -0.0936, + "reward": 2.576740264892578, + "reward_std": 0.7488699555397034, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.013537466991692781, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1426 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.06251525878906, + "epoch": 0.7135, + "grad_norm": 7.518467200296574, + "kl": 1.0859375, + "learning_rate": 3.0689185578505525e-07, + "loss": -0.0554, + "reward": 2.6223039627075195, + "reward_std": 0.5526255667209625, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.027001574635505676, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1427 + }, + { + "clip_ratio": 0.0, + "completion_length": 276.62501525878906, + "epoch": 0.714, + "grad_norm": 8.336081909558045, + "kl": 1.48828125, + "learning_rate": 3.062313053727671e-07, + "loss": -0.0128, + "reward": 2.6898101568222046, + "reward_std": 0.5765750408172607, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.016787134110927582, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1428 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.14583587646484, + "epoch": 0.7145, + "grad_norm": 13.511892818726327, + "kl": 2.55859375, + "learning_rate": 3.055714975221981e-07, + "loss": 0.0218, + "reward": 2.5956475734710693, + "reward_std": 0.34704746305942535, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.022408071905374527, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1429 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.8541793823242, + "epoch": 0.715, + "grad_norm": 9.586287180266988, + "kl": 1.05078125, + "learning_rate": 3.0491243424323783e-07, + "loss": 0.0298, + "reward": 2.318286895751953, + "reward_std": 0.4003664702177048, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03414386324584484, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1430 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.4583435058594, + "epoch": 0.7155, + "grad_norm": 4.201243088427591, + "kl": 1.953125, + "learning_rate": 3.0425411754350694e-07, + "loss": -0.0551, + "reward": 2.253325581550598, + "reward_std": 0.4799940884113312, + "rewards/accuracy_reward": 0.39583333395421505, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.03660505823791027, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1431 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.1458435058594, + "epoch": 0.716, + "grad_norm": 11.754368643179218, + "kl": 2.6796875, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0551, + "reward": 2.5765037536621094, + "reward_std": 0.49608848989009857, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02766304276883602, + "rewards/tag_count_reward": 0.9375, + "step": 1432 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.6458435058594, + "epoch": 0.7165, + "grad_norm": 4.101040359968373, + "kl": 0.55078125, + "learning_rate": 3.029397319008407e-07, + "loss": -0.0074, + "reward": 2.9076608419418335, + "reward_std": 0.1762450411915779, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.029839315451681614, + "rewards/tag_count_reward": 1.0, + "step": 1433 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.2083435058594, + "epoch": 0.717, + "grad_norm": 5.6015729375673065, + "kl": 0.94140625, + "learning_rate": 3.02283666961752e-07, + "loss": -0.0133, + "reward": 2.6073429584503174, + "reward_std": 0.20822132378816605, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.028073765337467194, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1434 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.7708435058594, + "epoch": 0.7175, + "grad_norm": 13.758583049621224, + "kl": 3.21875, + "learning_rate": 3.016283566095739e-07, + "loss": -0.0096, + "reward": 2.5433467626571655, + "reward_std": 0.5455273687839508, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.024361703544855118, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1435 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.7708435058594, + "epoch": 0.718, + "grad_norm": 6.637973784958379, + "kl": 1.078125, + "learning_rate": 3.0097380284049523e-07, + "loss": -0.0344, + "reward": 2.5512558221817017, + "reward_std": 0.4321069046854973, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.021661010570824146, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1436 + }, + { + "clip_ratio": 0.0, + "completion_length": 256.9166793823242, + "epoch": 0.7185, + "grad_norm": 9.5020255723838, + "kl": 2.4609375, + "learning_rate": 3.003200076484004e-07, + "loss": -0.0598, + "reward": 2.710210919380188, + "reward_std": 0.5970990657806396, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.018955985084176064, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1437 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.14583587646484, + "epoch": 0.719, + "grad_norm": 6.729758300453091, + "kl": 1.2109375, + "learning_rate": 2.996669730248628e-07, + "loss": 0.0368, + "reward": 2.680380702018738, + "reward_std": 0.4900428205728531, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022744507528841496, + "rewards/tag_count_reward": 0.953125, + "step": 1438 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.37501525878906, + "epoch": 0.7195, + "grad_norm": 6.100933401861528, + "kl": 1.5703125, + "learning_rate": 2.9901470095913943e-07, + "loss": -0.139, + "reward": 2.433695912361145, + "reward_std": 0.47993409633636475, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.033318111672997475, + "rewards/tag_count_reward": 0.953125, + "step": 1439 + }, + { + "clip_ratio": 0.0, + "completion_length": 255.10417938232422, + "epoch": 0.72, + "grad_norm": 17.639971258796862, + "kl": 4.703125, + "learning_rate": 2.9836319343816397e-07, + "loss": -0.0614, + "reward": 2.4460188150405884, + "reward_std": 0.6759764552116394, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.01925905141979456, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1440 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.8541717529297, + "epoch": 0.7205, + "grad_norm": 6.145673583189092, + "kl": 1.65625, + "learning_rate": 2.977124524465413e-07, + "loss": -0.0369, + "reward": 2.5621336698532104, + "reward_std": 0.4305399991571903, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.029880317859351635, + "rewards/tag_count_reward": 0.953125, + "step": 1441 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.8333435058594, + "epoch": 0.721, + "grad_norm": 22.556717516314336, + "kl": 3.953125, + "learning_rate": 2.9706247996654134e-07, + "loss": 0.046, + "reward": 2.0625728368759155, + "reward_std": 0.7029457688331604, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.02076055482029915, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1442 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.5833435058594, + "epoch": 0.7215, + "grad_norm": 9.337104428836039, + "kl": 1.0546875, + "learning_rate": 2.964132779780929e-07, + "loss": 0.0703, + "reward": 2.6360541582107544, + "reward_std": 0.4235610216856003, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0358209777623415, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1443 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.29168701171875, + "epoch": 0.722, + "grad_norm": 14.451017089953675, + "kl": 1.880859375, + "learning_rate": 2.9576484845877793e-07, + "loss": -0.0315, + "reward": 2.36261785030365, + "reward_std": 0.4776066839694977, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.029743347316980362, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1444 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.62501525878906, + "epoch": 0.7225, + "grad_norm": 9.176318494305416, + "kl": 1.43359375, + "learning_rate": 2.9511719338382535e-07, + "loss": -0.0615, + "reward": 2.627773642539978, + "reward_std": 0.6298104226589203, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.019795984961092472, + "rewards/tag_count_reward": 0.953125, + "step": 1445 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5416717529297, + "epoch": 0.723, + "grad_norm": 10.455365880612927, + "kl": 2.8125, + "learning_rate": 2.944703147261046e-07, + "loss": -0.1501, + "reward": 2.3291326761245728, + "reward_std": 0.7611142992973328, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.902777910232544, + "rewards/repetition_penalty_reward": -0.03197855316102505, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1446 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.89583587646484, + "epoch": 0.7235, + "grad_norm": 5.3123582713501944, + "kl": 1.1015625, + "learning_rate": 2.938242144561201e-07, + "loss": 0.055, + "reward": 2.4350517988204956, + "reward_std": 0.3026663661003113, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.016337126959115267, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1447 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.6041717529297, + "epoch": 0.724, + "grad_norm": 10.893245730461343, + "kl": 1.38671875, + "learning_rate": 2.931788945420058e-07, + "loss": -0.043, + "reward": 2.7075858116149902, + "reward_std": 0.4429877921938896, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.021580896340310574, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1448 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.8958435058594, + "epoch": 0.7245, + "grad_norm": 7.744283708478767, + "kl": 1.791015625, + "learning_rate": 2.925343569495178e-07, + "loss": -0.053, + "reward": 2.6358524560928345, + "reward_std": 0.5802061557769775, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03602285124361515, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1449 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.2291717529297, + "epoch": 0.725, + "grad_norm": 6.348495492774551, + "kl": 1.56640625, + "learning_rate": 2.918906036420294e-07, + "loss": -0.0218, + "reward": 2.6777302026748657, + "reward_std": 0.554046094417572, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023658874444663525, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1450 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.7291717529297, + "epoch": 0.7255, + "grad_norm": 12.67301139932153, + "kl": 2.2890625, + "learning_rate": 2.9124763658052474e-07, + "loss": 0.0824, + "reward": 2.542311429977417, + "reward_std": 0.4715491533279419, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.03234157059341669, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1451 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.2708435058594, + "epoch": 0.726, + "grad_norm": 6.540507187939509, + "kl": 1.77734375, + "learning_rate": 2.9060545772359305e-07, + "loss": -0.0138, + "reward": 2.2951096296310425, + "reward_std": 0.7114408314228058, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02259882539510727, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1452 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.4583435058594, + "epoch": 0.7265, + "grad_norm": 3.9719963516737162, + "kl": 1.443359375, + "learning_rate": 2.8996406902742267e-07, + "loss": 0.0444, + "reward": 2.827430486679077, + "reward_std": 0.2230637795291841, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.018055669963359833, + "rewards/tag_count_reward": 0.984375, + "step": 1453 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.6666717529297, + "epoch": 0.727, + "grad_norm": 7.2887085683958945, + "kl": 1.142578125, + "learning_rate": 2.893234724457946e-07, + "loss": 0.0405, + "reward": 2.776036858558655, + "reward_std": 0.2234293557703495, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.024310494773089886, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1454 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.56251525878906, + "epoch": 0.7275, + "grad_norm": 7.258259475742343, + "kl": 1.33203125, + "learning_rate": 2.886836699300771e-07, + "loss": 0.0238, + "reward": 2.5538902282714844, + "reward_std": 0.48146432638168335, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.029443158768117428, + "rewards/tag_count_reward": 0.9375, + "step": 1455 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.2083435058594, + "epoch": 0.728, + "grad_norm": 4.296720231595159, + "kl": 0.537109375, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0545, + "reward": 2.8775835037231445, + "reward_std": 0.20204564975574613, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018249690299853683, + "rewards/tag_count_reward": 1.0, + "step": 1456 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.5625, + "epoch": 0.7285, + "grad_norm": 14.229710817664285, + "kl": 1.59375, + "learning_rate": 2.874064548897472e-07, + "loss": 0.0002, + "reward": 2.417072057723999, + "reward_std": 0.6614536046981812, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.027372430078685284, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1457 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.0833435058594, + "epoch": 0.729, + "grad_norm": 6.574513295769042, + "kl": 3.15625, + "learning_rate": 2.86769046255753e-07, + "loss": -0.0074, + "reward": 2.3303749561309814, + "reward_std": 0.8391402065753937, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.030736176297068596, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1458 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.5833435058594, + "epoch": 0.7295, + "grad_norm": 5.801082971170413, + "kl": 0.763671875, + "learning_rate": 2.8613243946889477e-07, + "loss": 0.0801, + "reward": 2.7063515186309814, + "reward_std": 0.49810338020324707, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.034967850893735886, + "rewards/tag_count_reward": 0.984375, + "step": 1459 + }, + { + "clip_ratio": 0.0, + "completion_length": 236.06250762939453, + "epoch": 0.73, + "grad_norm": 6.729899483661514, + "kl": 1.302734375, + "learning_rate": 2.854966364683872e-07, + "loss": -0.0084, + "reward": 2.4949501752853394, + "reward_std": 0.4440324157476425, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.015466715674847364, + "rewards/tag_count_reward": 0.96875, + "step": 1460 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.18750762939453, + "epoch": 0.7305, + "grad_norm": 25.447643773212334, + "kl": 4.2109375, + "learning_rate": 2.848616391909959e-07, + "loss": 0.0692, + "reward": 2.4095431566238403, + "reward_std": 0.4361160099506378, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.024484614841639996, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1461 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.1666793823242, + "epoch": 0.731, + "grad_norm": 20.974946344266833, + "kl": 3.1328125, + "learning_rate": 2.842274495710335e-07, + "loss": 0.1305, + "reward": 2.630603075027466, + "reward_std": 0.5745793282985687, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.03953584283590317, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1462 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.4791717529297, + "epoch": 0.7315, + "grad_norm": 50.08951516994972, + "kl": 3.041015625, + "learning_rate": 2.835940695403512e-07, + "loss": 0.1238, + "reward": 2.7686582803726196, + "reward_std": 0.3026948422193527, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023008490912616253, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1463 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.18751525878906, + "epoch": 0.732, + "grad_norm": 9.460276151267854, + "kl": 2.908203125, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0303, + "reward": 2.6455318927764893, + "reward_std": 0.5820550620555878, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.03502368927001953, + "rewards/tag_count_reward": 0.9375, + "step": 1464 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.1041717529297, + "epoch": 0.7325, + "grad_norm": 10.796804080704874, + "kl": 2.96875, + "learning_rate": 2.8232974596189653e-07, + "loss": 0.0181, + "reward": 2.4227362871170044, + "reward_std": 0.4896738827228546, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.019972197711467743, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1465 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.0416793823242, + "epoch": 0.733, + "grad_norm": 18.4213130469492, + "kl": 3.0, + "learning_rate": 2.8169880626547283e-07, + "loss": 0.1161, + "reward": 2.2570921182632446, + "reward_std": 0.6915050446987152, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.022421800531446934, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1466 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1875, + "epoch": 0.7335, + "grad_norm": 7.219616999713985, + "kl": 1.9921875, + "learning_rate": 2.8106868386101545e-07, + "loss": -0.0582, + "reward": 2.4741777181625366, + "reward_std": 0.46304862946271896, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.025822297669947147, + "rewards/tag_count_reward": 0.9375, + "step": 1467 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.2291717529297, + "epoch": 0.734, + "grad_norm": 8.310583448002093, + "kl": 1.1015625, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.0088, + "reward": 2.6622068881988525, + "reward_std": 0.3658689558506012, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.021820951253175735, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1468 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.3333435058594, + "epoch": 0.7345, + "grad_norm": 10.495545513429683, + "kl": 0.80078125, + "learning_rate": 2.7981089860335225e-07, + "loss": 0.0209, + "reward": 2.71765673160553, + "reward_std": 0.37030835449695587, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.030607173219323158, + "rewards/tag_count_reward": 0.984375, + "step": 1469 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.3333435058594, + "epoch": 0.735, + "grad_norm": 3.6146465901165348, + "kl": 0.54296875, + "learning_rate": 2.791832395815782e-07, + "loss": 0.0526, + "reward": 2.7022202014923096, + "reward_std": 0.08511605486273766, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026946650817990303, + "rewards/tag_count_reward": 1.0, + "step": 1470 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8541717529297, + "epoch": 0.7355, + "grad_norm": 12.76797627033702, + "kl": 0.978515625, + "learning_rate": 2.7855640551462287e-07, + "loss": 0.0287, + "reward": 2.633518099784851, + "reward_std": 0.5054084360599518, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02446805965155363, + "rewards/tag_count_reward": 0.921875, + "step": 1471 + }, + { + "clip_ratio": 0.0, + "completion_length": 286.06251525878906, + "epoch": 0.736, + "grad_norm": 11.77020718227818, + "kl": 1.6171875, + "learning_rate": 2.7793039831193133e-07, + "loss": -0.0021, + "reward": 2.617877721786499, + "reward_std": 0.5727947354316711, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.04531669802963734, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1472 + }, + { + "clip_ratio": 0.0, + "completion_length": 280.1458435058594, + "epoch": 0.7365, + "grad_norm": 6.357148681677889, + "kl": 0.94140625, + "learning_rate": 2.773052198804301e-07, + "loss": -0.0548, + "reward": 2.5771846771240234, + "reward_std": 0.6074622422456741, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.01830137614160776, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1473 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.4166717529297, + "epoch": 0.737, + "grad_norm": 6.670239361314333, + "kl": 1.5390625, + "learning_rate": 2.766808721245211e-07, + "loss": 0.0276, + "reward": 2.690428137779236, + "reward_std": 0.5497512817382812, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.023113532923161983, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1474 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.18751525878906, + "epoch": 0.7375, + "grad_norm": 6.355162766928286, + "kl": 2.09765625, + "learning_rate": 2.760573569460757e-07, + "loss": 0.0137, + "reward": 2.8231576681137085, + "reward_std": 0.3577008843421936, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.041425829753279686, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1475 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7708435058594, + "epoch": 0.738, + "grad_norm": 4.4722454967487995, + "kl": 1.279296875, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0344, + "reward": 2.824985980987549, + "reward_std": 0.3037413991987705, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.017028134781867266, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1476 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.89583587646484, + "epoch": 0.7385, + "grad_norm": 16.986839644883492, + "kl": 4.03125, + "learning_rate": 2.7481283191637605e-07, + "loss": 0.0652, + "reward": 2.479672074317932, + "reward_std": 0.8198486566543579, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.009911454282701015, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1477 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.2708435058594, + "epoch": 0.739, + "grad_norm": 9.368512168345658, + "kl": 2.2265625, + "learning_rate": 2.741918258561607e-07, + "loss": -0.0376, + "reward": 2.04381787776947, + "reward_std": 0.49914440512657166, + "rewards/accuracy_reward": 0.1875000074505806, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023890579119324684, + "rewards/tag_count_reward": 0.921875, + "step": 1478 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.625, + "epoch": 0.7395, + "grad_norm": 13.277381628251995, + "kl": 2.6484375, + "learning_rate": 2.7357165995547547e-07, + "loss": 0.1547, + "reward": 2.4987998008728027, + "reward_std": 0.6838244497776031, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.0272420234978199, + "rewards/tag_count_reward": 0.921875, + "step": 1479 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.8333435058594, + "epoch": 0.74, + "grad_norm": 7.347881942486295, + "kl": 1.5859375, + "learning_rate": 2.729523361034538e-07, + "loss": -0.0831, + "reward": 2.309889793395996, + "reward_std": 0.5420120805501938, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.025179607793688774, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1480 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.7291717529297, + "epoch": 0.7405, + "grad_norm": 14.604720055826698, + "kl": 1.775390625, + "learning_rate": 2.7233385618666315e-07, + "loss": 0.1195, + "reward": 2.609587788581848, + "reward_std": 0.3773094117641449, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03103726916015148, + "rewards/tag_count_reward": 0.953125, + "step": 1481 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.4375, + "epoch": 0.741, + "grad_norm": 34.38249331144065, + "kl": 4.140625, + "learning_rate": 2.717162220891007e-07, + "loss": 0.0792, + "reward": 1.9346249103546143, + "reward_std": 0.3914206847548485, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.01676415279507637, + "rewards/tag_count_reward": 0.875, + "step": 1482 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.0416717529297, + "epoch": 0.7415, + "grad_norm": 4.498402010991118, + "kl": 1.48046875, + "learning_rate": 2.7109943569218707e-07, + "loss": -0.0545, + "reward": 2.460047483444214, + "reward_std": 0.543542668223381, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026063593104481697, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1483 + }, + { + "clip_ratio": 0.0, + "completion_length": 225.5416717529297, + "epoch": 0.742, + "grad_norm": 9.59444129515133, + "kl": 1.5859375, + "learning_rate": 2.7048349887476037e-07, + "loss": -0.0225, + "reward": 2.367347240447998, + "reward_std": 0.5063965022563934, + "rewards/accuracy_reward": 0.47916667722165585, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.014597164001315832, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1484 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.1666717529297, + "epoch": 0.7425, + "grad_norm": 6.833454160186346, + "kl": 1.1015625, + "learning_rate": 2.698684135130713e-07, + "loss": -0.0404, + "reward": 2.429524064064026, + "reward_std": 0.6519744396209717, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.04096201993525028, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1485 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.3958435058594, + "epoch": 0.743, + "grad_norm": 5.515959993557466, + "kl": 0.701171875, + "learning_rate": 2.692541814807763e-07, + "loss": 0.0611, + "reward": 2.8888471126556396, + "reward_std": 0.18290941882878542, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0174028305336833, + "rewards/tag_count_reward": 0.96875, + "step": 1486 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.5208435058594, + "epoch": 0.7435, + "grad_norm": 6.265052966360687, + "kl": 1.017578125, + "learning_rate": 2.686408046489328e-07, + "loss": -0.0232, + "reward": 2.5907429456710815, + "reward_std": 0.3499850779771805, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.020368190482258797, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1487 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.0208435058594, + "epoch": 0.744, + "grad_norm": 6.5782759908928945, + "kl": 0.80859375, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0469, + "reward": 2.6091192960739136, + "reward_std": 0.4149770438671112, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02282518707215786, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1488 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.68751525878906, + "epoch": 0.7445, + "grad_norm": 6.142749229443921, + "kl": 0.759765625, + "learning_rate": 2.6741662405779796e-07, + "loss": 0.0677, + "reward": 2.660447597503662, + "reward_std": 0.3648398518562317, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027052627876400948, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1489 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.0833435058594, + "epoch": 0.745, + "grad_norm": 11.211102602501663, + "kl": 1.7421875, + "learning_rate": 2.6680582402757324e-07, + "loss": -0.006, + "reward": 2.27160906791687, + "reward_std": 0.5131259560585022, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0322106322273612, + "rewards/tag_count_reward": 0.921875, + "step": 1490 + }, + { + "clip_ratio": 0.0, + "completion_length": 310.7916717529297, + "epoch": 0.7455, + "grad_norm": 7.269196197513535, + "kl": 1.5234375, + "learning_rate": 2.661958866559213e-07, + "loss": 0.0242, + "reward": 2.570857882499695, + "reward_std": 0.2515888065099716, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.035044897347688675, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1491 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.81250762939453, + "epoch": 0.746, + "grad_norm": 16.92104888480676, + "kl": 4.099609375, + "learning_rate": 2.655868138008171e-07, + "loss": -0.074, + "reward": 2.5825823545455933, + "reward_std": 0.6137717366218567, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.032001130282878876, + "rewards/tag_count_reward": 0.90625, + "step": 1492 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.7916717529297, + "epoch": 0.7465, + "grad_norm": 6.220590527633046, + "kl": 1.08984375, + "learning_rate": 2.649786073176025e-07, + "loss": -0.0146, + "reward": 2.8706815242767334, + "reward_std": 0.18304454255849123, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.019943561404943466, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1493 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.0416717529297, + "epoch": 0.747, + "grad_norm": 8.80186866611191, + "kl": 1.95703125, + "learning_rate": 2.6437126905897967e-07, + "loss": 0.0754, + "reward": 2.5168925523757935, + "reward_std": 0.46842535585165024, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.029982510022819042, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1494 + }, + { + "clip_ratio": 0.0, + "completion_length": 250.89584350585938, + "epoch": 0.7475, + "grad_norm": 11.509437222020193, + "kl": 3.453125, + "learning_rate": 2.637648008750062e-07, + "loss": -0.0703, + "reward": 2.5987823009490967, + "reward_std": 0.7667766213417053, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0227454686537385, + "rewards/tag_count_reward": 0.90625, + "step": 1495 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5208435058594, + "epoch": 0.748, + "grad_norm": 11.744253799204682, + "kl": 2.890625, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0104, + "reward": 2.1920148134231567, + "reward_std": 0.5672837495803833, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02847137115895748, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1496 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.6458435058594, + "epoch": 0.7485, + "grad_norm": 14.637527744293745, + "kl": 3.61328125, + "learning_rate": 2.6255448211798103e-07, + "loss": -0.0551, + "reward": 2.6001073122024536, + "reward_std": 0.43944015353918076, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03878166899085045, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1497 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.4583435058594, + "epoch": 0.749, + "grad_norm": 4.858094415340834, + "kl": 0.740234375, + "learning_rate": 2.6195063523177e-07, + "loss": 0.0534, + "reward": 2.713899612426758, + "reward_std": 0.0610094303265214, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.020475570112466812, + "rewards/tag_count_reward": 0.984375, + "step": 1498 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.0833435058594, + "epoch": 0.7495, + "grad_norm": 5.449268107276577, + "kl": 1.134765625, + "learning_rate": 2.613476657938789e-07, + "loss": 0.0716, + "reward": 2.6610913276672363, + "reward_std": 0.27378853410482407, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.021200459450483322, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1499 + }, + { + "clip_ratio": 0.0, + "completion_length": 272.5416717529297, + "epoch": 0.75, + "grad_norm": 5.9488407024205685, + "kl": 1.52734375, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0979, + "reward": 2.62813401222229, + "reward_std": 0.37979737704154104, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.015963357756845653, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1500 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.95833587646484, + "epoch": 0.7505, + "grad_norm": 4.3632751002774315, + "kl": 1.6484375, + "learning_rate": 2.6014436660737605e-07, + "loss": -0.0364, + "reward": 2.6962149143218994, + "reward_std": 0.5003593862056732, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02253533434122801, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1501 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.2708435058594, + "epoch": 0.751, + "grad_norm": 8.236462804572298, + "kl": 1.58984375, + "learning_rate": 2.595440405242222e-07, + "loss": -0.0203, + "reward": 2.7835288047790527, + "reward_std": 0.4721106141805649, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.013346214778721333, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1502 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.625, + "epoch": 0.7515, + "grad_norm": 6.981131847692658, + "kl": 1.91015625, + "learning_rate": 2.589445992202931e-07, + "loss": -0.0142, + "reward": 2.6148020029067993, + "reward_std": 0.5384411364793777, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.020614707842469215, + "rewards/tag_count_reward": 0.9062500298023224, + "step": 1503 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.31251525878906, + "epoch": 0.752, + "grad_norm": 6.52821102623756, + "kl": 1.8125, + "learning_rate": 2.583460445215911e-07, + "loss": -0.0262, + "reward": 2.4926270246505737, + "reward_std": 0.7499818503856659, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.0334148071706295, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1504 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.93751525878906, + "epoch": 0.7525, + "grad_norm": 33.49372224861189, + "kl": 1.78125, + "learning_rate": 2.5774837825141736e-07, + "loss": 0.1068, + "reward": 2.8040874004364014, + "reward_std": 0.3838850110769272, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.034454355016350746, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1505 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.62501525878906, + "epoch": 0.753, + "grad_norm": 4.115301737132047, + "kl": 0.611328125, + "learning_rate": 2.571516022303671e-07, + "loss": 0.0313, + "reward": 2.837872266769409, + "reward_std": 0.33879856765270233, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02150270715355873, + "rewards/tag_count_reward": 0.984375, + "step": 1506 + }, + { + "clip_ratio": 0.0, + "completion_length": 322.7708435058594, + "epoch": 0.7535, + "grad_norm": 6.925696299115607, + "kl": 1.39453125, + "learning_rate": 2.565557182763235e-07, + "loss": 0.0259, + "reward": 2.422218918800354, + "reward_std": 0.5452222675085068, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.027433859184384346, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1507 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.5, + "epoch": 0.754, + "grad_norm": 5.78599950453997, + "kl": 1.595703125, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0677, + "reward": 2.514102339744568, + "reward_std": 0.51523557305336, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.01714779995381832, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1508 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.6875, + "epoch": 0.7545, + "grad_norm": 23.247679880499817, + "kl": 1.98828125, + "learning_rate": 2.5536663382719713e-07, + "loss": 0.1553, + "reward": 2.4761857986450195, + "reward_std": 0.5169162601232529, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.034231009893119335, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1509 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.41668701171875, + "epoch": 0.755, + "grad_norm": 4.780358683601452, + "kl": 1.453125, + "learning_rate": 2.547734369542718e-07, + "loss": -0.006, + "reward": 2.5231523513793945, + "reward_std": 0.5849803388118744, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023722639307379723, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1510 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.54168701171875, + "epoch": 0.7555, + "grad_norm": 54.12276638774576, + "kl": 2.9609375, + "learning_rate": 2.5418113939265686e-07, + "loss": 0.0877, + "reward": 2.6192400455474854, + "reward_std": 0.6453405022621155, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.023121179081499577, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1511 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.85418701171875, + "epoch": 0.756, + "grad_norm": 19.449042044921896, + "kl": 2.671875, + "learning_rate": 2.5358974294659373e-07, + "loss": -0.0146, + "reward": 2.5455033779144287, + "reward_std": 0.7476102709770203, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.020468920469284058, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1512 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.97918701171875, + "epoch": 0.7565, + "grad_norm": 10.92392466120309, + "kl": 1.876953125, + "learning_rate": 2.5299924941757843e-07, + "loss": 0.1007, + "reward": 2.4055851697921753, + "reward_std": 0.5663338005542755, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04233153909444809, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1513 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5416793823242, + "epoch": 0.757, + "grad_norm": 44.00142117460777, + "kl": 4.20703125, + "learning_rate": 2.5240966060435674e-07, + "loss": -0.0122, + "reward": 2.163232743740082, + "reward_std": 0.4055490791797638, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.7986111044883728, + "rewards/repetition_penalty_reward": -0.020795070566236973, + "rewards/tag_count_reward": 0.8229166865348816, + "step": 1514 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.5625, + "epoch": 0.7575, + "grad_norm": 6.548140605154211, + "kl": 2.34375, + "learning_rate": 2.5182097830291824e-07, + "loss": -0.1129, + "reward": 2.405154585838318, + "reward_std": 0.7896367907524109, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.020192706026136875, + "rewards/tag_count_reward": 0.890625, + "step": 1515 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.4791717529297, + "epoch": 0.758, + "grad_norm": 16.21577117974573, + "kl": 1.146484375, + "learning_rate": 2.512332043064913e-07, + "loss": 0.1167, + "reward": 2.509866714477539, + "reward_std": 0.5259620547294617, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03527211956679821, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1516 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.1875, + "epoch": 0.7585, + "grad_norm": 6.825195538822801, + "kl": 1.1484375, + "learning_rate": 2.5064634040553767e-07, + "loss": 0.0764, + "reward": 2.596480369567871, + "reward_std": 0.37036192417144775, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.023311403580009937, + "rewards/tag_count_reward": 0.953125, + "step": 1517 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.39583587646484, + "epoch": 0.759, + "grad_norm": 7.531211105112099, + "kl": 2.37890625, + "learning_rate": 2.5006038838774647e-07, + "loss": 0.0148, + "reward": 2.2257198095321655, + "reward_std": 0.731947273015976, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.022544228471815586, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1518 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9791717529297, + "epoch": 0.7595, + "grad_norm": 12.630414631027104, + "kl": 1.7578125, + "learning_rate": 2.494753500380291e-07, + "loss": 0.0332, + "reward": 2.614049792289734, + "reward_std": 0.6869710385799408, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.05608919635415077, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1519 + }, + { + "clip_ratio": 0.0, + "completion_length": 288.54168701171875, + "epoch": 0.76, + "grad_norm": 5.390351956211983, + "kl": 1.11328125, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0027, + "reward": 2.871764063835144, + "reward_std": 0.32710327208042145, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02059698849916458, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1520 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.1041717529297, + "epoch": 0.7605, + "grad_norm": 15.294086423040786, + "kl": 2.3125, + "learning_rate": 2.483080214685404e-07, + "loss": 0.0703, + "reward": 2.6500085592269897, + "reward_std": 0.6067156195640564, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.021866421215236187, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1521 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.5833435058594, + "epoch": 0.761, + "grad_norm": 7.991198731135982, + "kl": 1.4140625, + "learning_rate": 2.4772573480465445e-07, + "loss": 0.0322, + "reward": 2.7214081287384033, + "reward_std": 0.42735913395881653, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.021647341549396515, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1522 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.81251525878906, + "epoch": 0.7615, + "grad_norm": 7.133883392763306, + "kl": 1.2734375, + "learning_rate": 2.471443689206021e-07, + "loss": 0.0274, + "reward": 2.5906002521514893, + "reward_std": 0.45796915888786316, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.030927601736038923, + "rewards/tag_count_reward": 0.96875, + "step": 1523 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.87501525878906, + "epoch": 0.762, + "grad_norm": 9.686910468630904, + "kl": 2.16015625, + "learning_rate": 2.465639255873246e-07, + "loss": 0.3267, + "reward": 2.679943323135376, + "reward_std": 0.5434834957122803, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.030126115307211876, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1524 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.2708435058594, + "epoch": 0.7625, + "grad_norm": 5.1490011118154655, + "kl": 2.33203125, + "learning_rate": 2.4598440657295286e-07, + "loss": 0.0067, + "reward": 2.389386534690857, + "reward_std": 0.5596683621406555, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.01686352863907814, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1525 + }, + { + "clip_ratio": 0.0, + "completion_length": 252.52083587646484, + "epoch": 0.763, + "grad_norm": 34.19051071296814, + "kl": 3.17578125, + "learning_rate": 2.454058136428027e-07, + "loss": 0.2352, + "reward": 2.393932580947876, + "reward_std": 0.526284646242857, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.017525773961097002, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1526 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.8125, + "epoch": 0.7635, + "grad_norm": 17.54640194645924, + "kl": 1.44140625, + "learning_rate": 2.4482814855936834e-07, + "loss": 0.0779, + "reward": 2.7223398685455322, + "reward_std": 0.46589484065771103, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.018979622051119804, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1527 + }, + { + "clip_ratio": 0.0, + "completion_length": 263.0208435058594, + "epoch": 0.764, + "grad_norm": 9.08850475587058, + "kl": 2.18359375, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0748, + "reward": 2.8009958267211914, + "reward_std": 0.44475598144344985, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.009768242482095957, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1528 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.81251525878906, + "epoch": 0.7645, + "grad_norm": 19.330244578947458, + "kl": 2.6796875, + "learning_rate": 2.43675608968487e-07, + "loss": 0.1724, + "reward": 1.9503087401390076, + "reward_std": 0.43879881501197815, + "rewards/accuracy_reward": 0.1666666679084301, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03580245561897755, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1529 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.8125, + "epoch": 0.765, + "grad_norm": 10.153421840080243, + "kl": 1.87890625, + "learning_rate": 2.4310073797187573e-07, + "loss": 0.0312, + "reward": 2.5163590908050537, + "reward_std": 0.5512074381113052, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0322520462796092, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1530 + }, + { + "clip_ratio": 0.0, + "completion_length": 239.02083587646484, + "epoch": 0.7655, + "grad_norm": 6.7813462328053635, + "kl": 1.6484375, + "learning_rate": 2.4252680184364045e-07, + "loss": -0.043, + "reward": 2.6130350828170776, + "reward_std": 0.6324526071548462, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.017173412023112178, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1531 + }, + { + "clip_ratio": 0.0, + "completion_length": 266.4166717529297, + "epoch": 0.766, + "grad_norm": 8.265844118560226, + "kl": 1.4765625, + "learning_rate": 2.4195380233209006e-07, + "loss": -0.0266, + "reward": 2.7328121662139893, + "reward_std": 0.6724408268928528, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02413230948150158, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1532 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.9375, + "epoch": 0.7665, + "grad_norm": 3.9331602535210513, + "kl": 1.0859375, + "learning_rate": 2.413817411826807e-07, + "loss": -0.1036, + "reward": 2.5074063539505005, + "reward_std": 0.5442797392606735, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.029052074998617172, + "rewards/tag_count_reward": 0.953125, + "step": 1533 + }, + { + "clip_ratio": 0.0, + "completion_length": 240.77084350585938, + "epoch": 0.767, + "grad_norm": 9.973948039444034, + "kl": 2.41796875, + "learning_rate": 2.408106201380097e-07, + "loss": 0.0, + "reward": 2.4841666221618652, + "reward_std": 0.5229880660772324, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.021041802130639553, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1534 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.625, + "epoch": 0.7675, + "grad_norm": 11.813541271263112, + "kl": 2.3046875, + "learning_rate": 2.4024044093781063e-07, + "loss": 0.1092, + "reward": 2.418421983718872, + "reward_std": 0.761476993560791, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.031230845488607883, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1535 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.9583435058594, + "epoch": 0.768, + "grad_norm": 13.907263883986914, + "kl": 1.25390625, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0569, + "reward": 2.579371452331543, + "reward_std": 0.3498530462384224, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02132294327020645, + "rewards/tag_count_reward": 0.96875, + "step": 1536 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.4166717529297, + "epoch": 0.7685, + "grad_norm": 12.302399407816415, + "kl": 3.2890625, + "learning_rate": 2.391029150154137e-07, + "loss": 0.0624, + "reward": 2.3881059885025024, + "reward_std": 0.6679995059967041, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.014672036748379469, + "rewards/tag_count_reward": 0.875, + "step": 1537 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.5833435058594, + "epoch": 0.769, + "grad_norm": 12.935252444247103, + "kl": 2.51953125, + "learning_rate": 2.38535571758317e-07, + "loss": 0.0609, + "reward": 2.621963858604431, + "reward_std": 0.6098971962928772, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.027341697365045547, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1538 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.5833435058594, + "epoch": 0.7695, + "grad_norm": 6.135311130568607, + "kl": 1.037109375, + "learning_rate": 2.3796917727588412e-07, + "loss": 0.0857, + "reward": 2.657423496246338, + "reward_std": 0.2529868111014366, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03007663320749998, + "rewards/tag_count_reward": 1.0, + "step": 1539 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.8333435058594, + "epoch": 0.77, + "grad_norm": 10.519026184776623, + "kl": 2.1875, + "learning_rate": 2.374037332934512e-07, + "loss": 0.037, + "reward": 2.3503577709198, + "reward_std": 0.5039149820804596, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.029850583523511887, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1540 + }, + { + "clip_ratio": 0.0, + "completion_length": 233.3541717529297, + "epoch": 0.7705, + "grad_norm": 7.260626069337147, + "kl": 1.4296875, + "learning_rate": 2.3683924153345854e-07, + "loss": 0.0345, + "reward": 2.9000203609466553, + "reward_std": 0.28021588921546936, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.020118530839681625, + "rewards/tag_count_reward": 0.9895833730697632, + "step": 1541 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.72918701171875, + "epoch": 0.771, + "grad_norm": 4.314407100492501, + "kl": 1.2578125, + "learning_rate": 2.36275703715446e-07, + "loss": -0.0121, + "reward": 2.534322142601013, + "reward_std": 0.3907851278781891, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.022969634272158146, + "rewards/tag_count_reward": 0.953125, + "step": 1542 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.81251525878906, + "epoch": 0.7715, + "grad_norm": 5.670151769824431, + "kl": 1.0703125, + "learning_rate": 2.357131215560474e-07, + "loss": 0.0037, + "reward": 2.828339695930481, + "reward_std": 0.2872830927371979, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03277149237692356, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1543 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.5, + "epoch": 0.772, + "grad_norm": 7.6282657037651695, + "kl": 2.09375, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0448, + "reward": 2.5357162952423096, + "reward_std": 0.5322327762842178, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.037200456485152245, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1544 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.93751525878906, + "epoch": 0.7725, + "grad_norm": 4.352945225717926, + "kl": 0.837890625, + "learning_rate": 2.3459083106506712e-07, + "loss": 0.221, + "reward": 2.649806499481201, + "reward_std": 0.2627423256635666, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.027276871260255575, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1545 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.8541717529297, + "epoch": 0.773, + "grad_norm": 5.842490680004781, + "kl": 1.6875, + "learning_rate": 2.3403112615217693e-07, + "loss": 0.1197, + "reward": 2.5485793352127075, + "reward_std": 0.4063766598701477, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.024337463080883026, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1546 + }, + { + "clip_ratio": 0.0, + "completion_length": 279.43751525878906, + "epoch": 0.7735, + "grad_norm": 6.003285903115502, + "kl": 1.828125, + "learning_rate": 2.334723837352733e-07, + "loss": 0.1055, + "reward": 2.6460882425308228, + "reward_std": 0.5506278276443481, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.013634048402309418, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1547 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.5833435058594, + "epoch": 0.774, + "grad_norm": 11.249985053869272, + "kl": 3.328125, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.2142, + "reward": 2.2939001321792603, + "reward_std": 0.8971810340881348, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.875, + "rewards/repetition_penalty_reward": -0.018599930219352245, + "rewards/tag_count_reward": 0.875, + "step": 1548 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.37501525878906, + "epoch": 0.7745, + "grad_norm": 11.075594302485063, + "kl": 1.55859375, + "learning_rate": 2.3235779319459355e-07, + "loss": 0.0602, + "reward": 2.5785093307495117, + "reward_std": 0.29725973308086395, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.037810200825333595, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1549 + }, + { + "clip_ratio": 0.0, + "completion_length": 317.93751525878906, + "epoch": 0.775, + "grad_norm": 11.673776782665986, + "kl": 1.67578125, + "learning_rate": 2.3180194846605364e-07, + "loss": 0.1151, + "reward": 2.7235286235809326, + "reward_std": 0.585949033498764, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9375000894069672, + "rewards/repetition_penalty_reward": -0.026471680030226707, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1550 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.4583435058594, + "epoch": 0.7755, + "grad_norm": 6.3252296705969115, + "kl": 1.35546875, + "learning_rate": 2.312470730239621e-07, + "loss": -0.0488, + "reward": 2.4448201656341553, + "reward_std": 0.48708635568618774, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.016985515132546425, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1551 + }, + { + "clip_ratio": 0.0, + "completion_length": 294.2083435058594, + "epoch": 0.776, + "grad_norm": 12.954209363640562, + "kl": 2.06640625, + "learning_rate": 2.306931685585657e-07, + "loss": 0.1214, + "reward": 2.228946566581726, + "reward_std": 0.35232171416282654, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.026261983439326286, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1552 + }, + { + "clip_ratio": 0.0, + "completion_length": 259.8125, + "epoch": 0.7765, + "grad_norm": 3.9761424833250136, + "kl": 1.26953125, + "learning_rate": 2.3014023675715339e-07, + "loss": 0.0292, + "reward": 2.886070728302002, + "reward_std": 0.23379188776016235, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.018443217035382986, + "rewards/tag_count_reward": 0.9739583730697632, + "step": 1553 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.18751525878906, + "epoch": 0.777, + "grad_norm": 9.05056417179947, + "kl": 1.35546875, + "learning_rate": 2.2958827930405162e-07, + "loss": 0.1063, + "reward": 2.5847833156585693, + "reward_std": 0.4528532326221466, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.035008576698601246, + "rewards/tag_count_reward": 0.9114583730697632, + "step": 1554 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.9791793823242, + "epoch": 0.7775, + "grad_norm": 104.85246338097696, + "kl": 2.3125, + "learning_rate": 2.2903729788061834e-07, + "loss": 0.1352, + "reward": 2.5495121479034424, + "reward_std": 0.7037906050682068, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.01993226632475853, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1555 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.1458435058594, + "epoch": 0.778, + "grad_norm": 6.890435145460005, + "kl": 1.11328125, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.0421, + "reward": 2.3158843517303467, + "reward_std": 0.47459036111831665, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.024393508210778236, + "rewards/tag_count_reward": 0.9375, + "step": 1556 + }, + { + "clip_ratio": 0.0, + "completion_length": 302.3541717529297, + "epoch": 0.7785, + "grad_norm": 9.282545995378012, + "kl": 1.044921875, + "learning_rate": 2.2793826983331886e-07, + "loss": 0.1611, + "reward": 2.3017712831497192, + "reward_std": 0.3790929764509201, + "rewards/accuracy_reward": 0.3958333544433117, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03156219515949488, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1557 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.87501525878906, + "epoch": 0.779, + "grad_norm": 3.6904386519177583, + "kl": 0.564453125, + "learning_rate": 2.2739022655728277e-07, + "loss": 0.0184, + "reward": 2.7911198139190674, + "reward_std": 0.22485119104385376, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04915817081928253, + "rewards/tag_count_reward": 1.0, + "step": 1558 + }, + { + "clip_ratio": 0.0, + "completion_length": 257.875, + "epoch": 0.7795, + "grad_norm": 9.948120856303264, + "kl": 1.69921875, + "learning_rate": 2.268431660065651e-07, + "loss": 0.1686, + "reward": 2.5424914360046387, + "reward_std": 0.6018918454647064, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.021744604222476482, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1559 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.29168701171875, + "epoch": 0.78, + "grad_norm": 5.310065668788173, + "kl": 2.4140625, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.1289, + "reward": 2.24657142162323, + "reward_std": 0.9247779548168182, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.8263889253139496, + "rewards/repetition_penalty_reward": -0.02252592984586954, + "rewards/tag_count_reward": 0.859375, + "step": 1560 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.75001525878906, + "epoch": 0.7805, + "grad_norm": 8.564129818034443, + "kl": 1.359375, + "learning_rate": 2.2575199974385144e-07, + "loss": 0.1631, + "reward": 2.4449636936187744, + "reward_std": 0.49174782633781433, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.022050250321626663, + "rewards/tag_count_reward": 0.9739583730697632, + "step": 1561 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.7291717529297, + "epoch": 0.781, + "grad_norm": 5.561119420119422, + "kl": 2.189453125, + "learning_rate": 2.2520789735573704e-07, + "loss": 0.0542, + "reward": 2.534723997116089, + "reward_std": 0.5092030912637711, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.012151051312685013, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1562 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.93751525878906, + "epoch": 0.7815, + "grad_norm": 5.117268929695284, + "kl": 0.931640625, + "learning_rate": 2.2466478434069435e-07, + "loss": 0.0107, + "reward": 2.3754416704177856, + "reward_std": 0.39969532936811447, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.025599990040063858, + "rewards/tag_count_reward": 0.984375, + "step": 1563 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.6041717529297, + "epoch": 0.782, + "grad_norm": 9.440710428522506, + "kl": 1.81640625, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.3979, + "reward": 2.50728178024292, + "reward_std": 0.6064448654651642, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03959329519420862, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1564 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.7083435058594, + "epoch": 0.7825, + "grad_norm": 24.815520443713027, + "kl": 2.8515625, + "learning_rate": 2.2358153304447066e-07, + "loss": 0.4005, + "reward": 2.2530555725097656, + "reward_std": 0.5099890530109406, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.02298619970679283, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1565 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.00001525878906, + "epoch": 0.783, + "grad_norm": 12.635922668896523, + "kl": 2.2734375, + "learning_rate": 2.230413980630609e-07, + "loss": 0.0445, + "reward": 2.488055467605591, + "reward_std": 0.5981208235025406, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.032778057269752026, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1566 + }, + { + "clip_ratio": 0.0, + "completion_length": 370.56251525878906, + "epoch": 0.7835, + "grad_norm": 14.898019687710494, + "kl": 2.625, + "learning_rate": 2.2250225905425532e-07, + "loss": 0.2279, + "reward": 2.41832172870636, + "reward_std": 0.6433684527873993, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8680556118488312, + "rewards/repetition_penalty_reward": -0.0226507056504488, + "rewards/tag_count_reward": 0.8437500298023224, + "step": 1567 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.1458435058594, + "epoch": 0.784, + "grad_norm": 8.069928988207886, + "kl": 2.375, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.2497, + "reward": 2.5703223943710327, + "reward_std": 0.686181902885437, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.026900025084614754, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1568 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.8958435058594, + "epoch": 0.7845, + "grad_norm": 10.489749526563086, + "kl": 2.8671875, + "learning_rate": 2.2142697552066142e-07, + "loss": 0.2871, + "reward": 2.377824902534485, + "reward_std": 0.7266697287559509, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.019744650460779667, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1569 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.3958435058594, + "epoch": 0.785, + "grad_norm": 4.268659681870845, + "kl": 1.625, + "learning_rate": 2.2089083427137329e-07, + "loss": -0.0771, + "reward": 2.36256742477417, + "reward_std": 0.4559687077999115, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.031529780477285385, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1570 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.5416717529297, + "epoch": 0.7855, + "grad_norm": 8.822752169087817, + "kl": 1.3984375, + "learning_rate": 2.203556955456796e-07, + "loss": -0.0131, + "reward": 2.594992756843567, + "reward_std": 0.6085972636938095, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.030007286928594112, + "rewards/tag_count_reward": 0.9375, + "step": 1571 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.95833587646484, + "epoch": 0.786, + "grad_norm": 31.096559251399594, + "kl": 1.474609375, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.2107, + "reward": 2.1280053853988647, + "reward_std": 0.2961831293068826, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.012619667453691363, + "rewards/tag_count_reward": 0.953125, + "step": 1572 + }, + { + "clip_ratio": 0.0, + "completion_length": 303.5208435058594, + "epoch": 0.7865, + "grad_norm": 7.590109325873461, + "kl": 2.75390625, + "learning_rate": 2.1928843218251803e-07, + "loss": 0.0717, + "reward": 2.455071449279785, + "reward_std": 0.6115701645612717, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.826388955116272, + "rewards/repetition_penalty_reward": -0.02756764553487301, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1573 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.1458435058594, + "epoch": 0.787, + "grad_norm": 10.349493504338458, + "kl": 1.09765625, + "learning_rate": 2.1875631079611956e-07, + "loss": 0.1317, + "reward": 2.6623623371124268, + "reward_std": 0.5247603058815002, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.026873953640460968, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1574 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.43751525878906, + "epoch": 0.7875, + "grad_norm": 10.846195351334975, + "kl": 1.388671875, + "learning_rate": 2.1822519843544422e-07, + "loss": 0.0464, + "reward": 2.253710389137268, + "reward_std": 0.6600132286548615, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9097222983837128, + "rewards/repetition_penalty_reward": -0.025803642347455025, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1575 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.5416717529297, + "epoch": 0.788, + "grad_norm": 11.7016761594706, + "kl": 1.296875, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0149, + "reward": 2.3044776916503906, + "reward_std": 0.6875050067901611, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.041008614003658295, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1576 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.0, + "epoch": 0.7885, + "grad_norm": 23.225955314196877, + "kl": 1.875, + "learning_rate": 2.1716600725962558e-07, + "loss": 0.3445, + "reward": 2.3711761236190796, + "reward_std": 0.7729770541191101, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.02118516620248556, + "rewards/tag_count_reward": 0.90625, + "step": 1577 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.04168701171875, + "epoch": 0.789, + "grad_norm": 9.258843562543557, + "kl": 1.7578125, + "learning_rate": 2.166379316709625e-07, + "loss": -0.0326, + "reward": 2.26195228099823, + "reward_std": 0.7921708822250366, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.02797840256243944, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1578 + }, + { + "clip_ratio": 0.0, + "completion_length": 301.75, + "epoch": 0.7895, + "grad_norm": 10.626888168484715, + "kl": 1.875, + "learning_rate": 2.1611087156097267e-07, + "loss": -0.0186, + "reward": 2.7739405632019043, + "reward_std": 0.3001384465023875, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.029878957197070122, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1579 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.72918701171875, + "epoch": 0.79, + "grad_norm": 7.09473650785355, + "kl": 1.38671875, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.1454, + "reward": 2.6300599575042725, + "reward_std": 0.4111658036708832, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.024454043712466955, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1580 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.8333435058594, + "epoch": 0.7905, + "grad_norm": 5.85391538917019, + "kl": 1.34375, + "learning_rate": 2.1505980419598063e-07, + "loss": 0.0935, + "reward": 2.8416703939437866, + "reward_std": 0.30897435545921326, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.029857425950467587, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1581 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.6666717529297, + "epoch": 0.791, + "grad_norm": 5.122296677365158, + "kl": 1.146484375, + "learning_rate": 2.1453580014271203e-07, + "loss": 0.0385, + "reward": 2.530571699142456, + "reward_std": 0.24665961414575577, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02324793115258217, + "rewards/tag_count_reward": 0.984375, + "step": 1582 + }, + { + "clip_ratio": 0.0, + "completion_length": 271.31251525878906, + "epoch": 0.7915, + "grad_norm": 19.824116779052915, + "kl": 3.8125, + "learning_rate": 2.1401281797157395e-07, + "loss": 0.1603, + "reward": 2.473771572113037, + "reward_std": 0.672837495803833, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.8333334028720856, + "rewards/repetition_penalty_reward": -0.015811904333531857, + "rewards/tag_count_reward": 0.9062500298023224, + "step": 1583 + }, + { + "clip_ratio": 0.0, + "completion_length": 274.91668701171875, + "epoch": 0.792, + "grad_norm": 6.585486937770688, + "kl": 1.6484375, + "learning_rate": 2.134908592756607e-07, + "loss": 0.1439, + "reward": 2.618052840232849, + "reward_std": 0.6666180491447449, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.015627761371433735, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1584 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.12501525878906, + "epoch": 0.7925, + "grad_norm": 18.241947160047147, + "kl": 2.865234375, + "learning_rate": 2.1296992564494904e-07, + "loss": 0.1482, + "reward": 2.4161217212677, + "reward_std": 0.7258022725582123, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.028322923928499222, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1585 + }, + { + "clip_ratio": 0.0, + "completion_length": 262.2291793823242, + "epoch": 0.793, + "grad_norm": 4.017024733526017, + "kl": 0.64453125, + "learning_rate": 2.124500186662932e-07, + "loss": 0.0263, + "reward": 2.8568403720855713, + "reward_std": 0.22998722037300467, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.01815977320075035, + "rewards/tag_count_reward": 1.0, + "step": 1586 + }, + { + "clip_ratio": 0.0, + "completion_length": 261.6458435058594, + "epoch": 0.7935, + "grad_norm": 5.17341645646987, + "kl": 1.130859375, + "learning_rate": 2.1193113992342001e-07, + "loss": 0.0359, + "reward": 2.874253273010254, + "reward_std": 0.3271254301071167, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02505233883857727, + "rewards/tag_count_reward": 0.96875, + "step": 1587 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.6041717529297, + "epoch": 0.794, + "grad_norm": 15.607330298676484, + "kl": 2.05078125, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.2062, + "reward": 2.5750577449798584, + "reward_std": 0.32292182743549347, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.042997824028134346, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1588 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.93751525878906, + "epoch": 0.7945, + "grad_norm": 10.81146440795989, + "kl": 1.345703125, + "learning_rate": 2.1089647346426303e-07, + "loss": 0.3202, + "reward": 2.6132642030715942, + "reward_std": 0.2669622115790844, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022152533754706383, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1589 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.9583435058594, + "epoch": 0.795, + "grad_norm": 5.637731152589917, + "kl": 1.8984375, + "learning_rate": 2.1038068889975259e-07, + "loss": 0.3067, + "reward": 2.5194292068481445, + "reward_std": 0.6721850037574768, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.029181976802647114, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1590 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.0833435058594, + "epoch": 0.7955, + "grad_norm": 12.21283214309097, + "kl": 0.978515625, + "learning_rate": 2.0986593887456223e-07, + "loss": 0.233, + "reward": 2.7278926372528076, + "reward_std": 0.5767909586429596, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.023843545466661453, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1591 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.22918701171875, + "epoch": 0.796, + "grad_norm": 5.466559112755202, + "kl": 1.630859375, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.1362, + "reward": 2.65623676776886, + "reward_std": 0.6166912019252777, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.02779117412865162, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1592 + }, + { + "clip_ratio": 0.0, + "completion_length": 297.2916717529297, + "epoch": 0.7965, + "grad_norm": 6.094615509828546, + "kl": 1.22265625, + "learning_rate": 2.088395487110566e-07, + "loss": 0.078, + "reward": 2.733526110649109, + "reward_std": 0.44847723841667175, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02515467908233404, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1593 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.56251525878906, + "epoch": 0.797, + "grad_norm": 4.218497044118198, + "kl": 1.359375, + "learning_rate": 2.0832791169930363e-07, + "loss": 0.2248, + "reward": 2.6115695238113403, + "reward_std": 0.34022286534309387, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.018638767302036285, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1594 + }, + { + "clip_ratio": 0.0, + "completion_length": 353.7916717529297, + "epoch": 0.7975, + "grad_norm": 4.25463358775561, + "kl": 0.748046875, + "learning_rate": 2.078173154799861e-07, + "loss": 0.1418, + "reward": 2.7314101457595825, + "reward_std": 0.33814239501953125, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03768732026219368, + "rewards/tag_count_reward": 0.984375, + "step": 1595 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.1666717529297, + "epoch": 0.798, + "grad_norm": 7.987189771627374, + "kl": 1.658203125, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.1429, + "reward": 2.6225337982177734, + "reward_std": 0.3525502234697342, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023299389518797398, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1596 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.0833435058594, + "epoch": 0.7985, + "grad_norm": 4.859278250656306, + "kl": 1.57421875, + "learning_rate": 2.0679925163694033e-07, + "loss": 0.3676, + "reward": 2.552080273628235, + "reward_std": 0.8139557242393494, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.031253148801624775, + "rewards/tag_count_reward": 0.9375, + "step": 1597 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.68751525878906, + "epoch": 0.799, + "grad_norm": 7.356529004359635, + "kl": 0.640625, + "learning_rate": 2.0629178711441115e-07, + "loss": 0.0221, + "reward": 2.4863110780715942, + "reward_std": 0.23228544741868973, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.013688962906599045, + "rewards/tag_count_reward": 1.0, + "step": 1598 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.2083435058594, + "epoch": 0.7995, + "grad_norm": 3.7867120654273236, + "kl": 0.576171875, + "learning_rate": 2.0578536958670574e-07, + "loss": 0.0753, + "reward": 2.936859369277954, + "reward_std": 0.15612836927175522, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.021473943255841732, + "rewards/tag_count_reward": 1.0, + "step": 1599 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.41668701171875, + "epoch": 0.8, + "grad_norm": 6.667486304605809, + "kl": 0.890625, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.0294, + "reward": 2.498704433441162, + "reward_std": 0.3463284894824028, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.023865243420004845, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1600 + }, + { + "clip_ratio": 0.0, + "completion_length": 283.0208435058594, + "epoch": 0.8005, + "grad_norm": 5.392405663450308, + "kl": 1.0859375, + "learning_rate": 2.0477568168311525e-07, + "loss": 0.1415, + "reward": 2.3850942850112915, + "reward_std": 0.3655809760093689, + "rewards/accuracy_reward": 0.4583333358168602, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.015947438776493073, + "rewards/tag_count_reward": 0.984375, + "step": 1601 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.0, + "epoch": 0.801, + "grad_norm": 10.683966440840944, + "kl": 0.97265625, + "learning_rate": 2.042724143829146e-07, + "loss": 0.1412, + "reward": 2.4589797258377075, + "reward_std": 0.5436213612556458, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.016714807134121656, + "rewards/tag_count_reward": 0.96875, + "step": 1602 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.875, + "epoch": 0.8015, + "grad_norm": 3.118222730823267, + "kl": 0.625, + "learning_rate": 2.037702002288973e-07, + "loss": 0.0022, + "reward": 2.7855957746505737, + "reward_std": 0.193808451294899, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026904198341071606, + "rewards/tag_count_reward": 1.0, + "step": 1603 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.50001525878906, + "epoch": 0.802, + "grad_norm": 4.604018186494893, + "kl": 0.7734375, + "learning_rate": 2.032690407508949e-07, + "loss": 0.1366, + "reward": 2.8414154052734375, + "reward_std": 0.21933773159980774, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04400132969021797, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1604 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.93751525878906, + "epoch": 0.8025, + "grad_norm": 21.004216450205973, + "kl": 1.06640625, + "learning_rate": 2.027689374755261e-07, + "loss": 0.0028, + "reward": 2.575451135635376, + "reward_std": 0.37137471139431, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028715766966342926, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1605 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.37501525878906, + "epoch": 0.803, + "grad_norm": 6.2354379721667526, + "kl": 0.53125, + "learning_rate": 2.0226989192619204e-07, + "loss": 0.0291, + "reward": 2.5753469467163086, + "reward_std": 0.19728727638721466, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02881983108818531, + "rewards/tag_count_reward": 1.0, + "step": 1606 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.0833435058594, + "epoch": 0.8035, + "grad_norm": 8.229728899859898, + "kl": 0.8642578125, + "learning_rate": 2.0177190562307224e-07, + "loss": 0.0185, + "reward": 2.852303981781006, + "reward_std": 0.3710853382945061, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024431976955384016, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1607 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.3125, + "epoch": 0.804, + "grad_norm": 10.300857336577062, + "kl": 1.041015625, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.2301, + "reward": 2.592196464538574, + "reward_std": 0.41349270567297935, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.018914744723588228, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1608 + }, + { + "clip_ratio": 0.0, + "completion_length": 316.75, + "epoch": 0.8045, + "grad_norm": 7.563778647893601, + "kl": 1.94921875, + "learning_rate": 2.0077911682005428e-07, + "loss": 0.1949, + "reward": 2.3974047899246216, + "reward_std": 0.6470229029655457, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.03141475468873978, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1609 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.9166717529297, + "epoch": 0.805, + "grad_norm": 7.811338730077243, + "kl": 1.5078125, + "learning_rate": 2.0028431734436308e-07, + "loss": 0.2798, + "reward": 2.4768539667129517, + "reward_std": 0.4787053167819977, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.028354503214359283, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1610 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.25001525878906, + "epoch": 0.8055, + "grad_norm": 8.594070426030529, + "kl": 2.0, + "learning_rate": 1.9979058316329055e-07, + "loss": 0.3485, + "reward": 2.784187912940979, + "reward_std": 0.45950163900852203, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.026576073840260506, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1611 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.1875, + "epoch": 0.806, + "grad_norm": 4.2199364202164, + "kl": 0.599609375, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0667, + "reward": 2.9232022762298584, + "reward_std": 0.15263231098651886, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03513127192854881, + "rewards/tag_count_reward": 1.0, + "step": 1612 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.875, + "epoch": 0.8065, + "grad_norm": 8.990639470337845, + "kl": 0.84765625, + "learning_rate": 1.9880631669775162e-07, + "loss": 0.2945, + "reward": 2.668937563896179, + "reward_std": 0.4244493693113327, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.023770801723003387, + "rewards/tag_count_reward": 0.984375, + "step": 1613 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.00001525878906, + "epoch": 0.807, + "grad_norm": 8.09121179675446, + "kl": 0.935546875, + "learning_rate": 1.9831578741153155e-07, + "loss": 0.0507, + "reward": 2.8520257472991943, + "reward_std": 0.32762178778648376, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.03512696735560894, + "rewards/tag_count_reward": 0.984375, + "step": 1614 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.1458435058594, + "epoch": 0.8075, + "grad_norm": 9.454399169013804, + "kl": 0.595703125, + "learning_rate": 1.9782632941641375e-07, + "loss": 0.0606, + "reward": 2.6736570596694946, + "reward_std": 0.13973642978817225, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.019051472656428814, + "rewards/tag_count_reward": 0.984375, + "step": 1615 + }, + { + "clip_ratio": 0.0, + "completion_length": 251.06250762939453, + "epoch": 0.808, + "grad_norm": 4.329960507797107, + "kl": 0.650390625, + "learning_rate": 1.9733794420337213e-07, + "loss": -0.0227, + "reward": 2.630091071128845, + "reward_std": 0.29190193116664886, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.015742299146950245, + "rewards/tag_count_reward": 1.0, + "step": 1616 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.7291717529297, + "epoch": 0.8085, + "grad_norm": 6.82490142880819, + "kl": 1.55078125, + "learning_rate": 1.9685063326011263e-07, + "loss": 0.3148, + "reward": 2.5862770080566406, + "reward_std": 0.7006584405899048, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.04219539649784565, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1617 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.9166717529297, + "epoch": 0.809, + "grad_norm": 7.151058516859466, + "kl": 1.71484375, + "learning_rate": 1.9636439807106912e-07, + "loss": 0.2832, + "reward": 2.503158688545227, + "reward_std": 0.4477211833000183, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01941095246002078, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1618 + }, + { + "clip_ratio": 0.0, + "completion_length": 245.93750762939453, + "epoch": 0.8095, + "grad_norm": 4.443081763622589, + "kl": 0.625, + "learning_rate": 1.9587924011739826e-07, + "loss": 0.0627, + "reward": 2.9678823947906494, + "reward_std": 0.01614804659038782, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03211780823767185, + "rewards/tag_count_reward": 1.0, + "step": 1619 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.87500762939453, + "epoch": 0.81, + "grad_norm": 9.980771833809246, + "kl": 1.87109375, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.2491, + "reward": 2.570975422859192, + "reward_std": 0.36428187415003777, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.022774542681872845, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1620 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.4375, + "epoch": 0.8105, + "grad_norm": 5.299585054431229, + "kl": 1.01171875, + "learning_rate": 1.9491216182438926e-07, + "loss": 0.0517, + "reward": 2.593162417411804, + "reward_std": 0.40712933242321014, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.04399042949080467, + "rewards/tag_count_reward": 0.984375, + "step": 1621 + }, + { + "clip_ratio": 0.0, + "completion_length": 284.9791717529297, + "epoch": 0.811, + "grad_norm": 6.996029293693946, + "kl": 0.671875, + "learning_rate": 1.944302444309393e-07, + "loss": 0.0078, + "reward": 2.6557544469833374, + "reward_std": 0.2550486624240875, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024801287800073624, + "rewards/tag_count_reward": 1.0, + "step": 1622 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.66668701171875, + "epoch": 0.8115, + "grad_norm": 3.696905547483718, + "kl": 1.087890625, + "learning_rate": 1.9394941016462947e-07, + "loss": 0.0938, + "reward": 2.5640480518341064, + "reward_std": 0.3755566477775574, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.036646610125899315, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1623 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.22918701171875, + "epoch": 0.812, + "grad_norm": 10.073124148755838, + "kl": 1.7578125, + "learning_rate": 1.934696604901642e-07, + "loss": 0.2146, + "reward": 2.8040868043899536, + "reward_std": 0.5022929012775421, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.03271871618926525, + "rewards/tag_count_reward": 0.96875, + "step": 1624 + }, + { + "clip_ratio": 0.0, + "completion_length": 281.5, + "epoch": 0.8125, + "grad_norm": 4.120406566625248, + "kl": 0.9375, + "learning_rate": 1.929909968689442e-07, + "loss": 0.057, + "reward": 2.469716787338257, + "reward_std": 0.394562803208828, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.035491637885570526, + "rewards/tag_count_reward": 0.984375, + "step": 1625 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.5625, + "epoch": 0.813, + "grad_norm": 6.662465778476507, + "kl": 1.498046875, + "learning_rate": 1.9251342075906179e-07, + "loss": 0.1782, + "reward": 2.5808849334716797, + "reward_std": 0.3251846134662628, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.033698457293212414, + "rewards/tag_count_reward": 0.96875, + "step": 1626 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.1458435058594, + "epoch": 0.8135, + "grad_norm": 18.710995596257547, + "kl": 2.18359375, + "learning_rate": 1.9203693361529687e-07, + "loss": 0.2733, + "reward": 2.6579350233078003, + "reward_std": 0.6114742159843445, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.01914846431463957, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1627 + }, + { + "clip_ratio": 0.0, + "completion_length": 269.29168701171875, + "epoch": 0.814, + "grad_norm": 4.508553972215575, + "kl": 0.888671875, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0428, + "reward": 2.79397451877594, + "reward_std": 0.3388333395123482, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023733829148113728, + "rewards/tag_count_reward": 0.984375, + "step": 1628 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.5416717529297, + "epoch": 0.8145, + "grad_norm": 5.521089193451119, + "kl": 0.595703125, + "learning_rate": 1.9108723202864723e-07, + "loss": 0.0554, + "reward": 2.9251835346221924, + "reward_std": 0.1714300513267517, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.012316623236984015, + "rewards/tag_count_reward": 1.0, + "step": 1629 + }, + { + "clip_ratio": 0.0, + "completion_length": 371.39583587646484, + "epoch": 0.815, + "grad_norm": 7.336042303384628, + "kl": 1.06640625, + "learning_rate": 1.9061402047871833e-07, + "loss": 0.2051, + "reward": 2.7813527584075928, + "reward_std": 0.33912873174995184, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02420270536094904, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1630 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.54168701171875, + "epoch": 0.8155, + "grad_norm": 9.604124750722942, + "kl": 1.875, + "learning_rate": 1.9014190368080924e-07, + "loss": 0.5587, + "reward": 2.37904691696167, + "reward_std": 0.7319284677505493, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.023730785585939884, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1631 + }, + { + "clip_ratio": 0.0, + "completion_length": 319.25, + "epoch": 0.816, + "grad_norm": 9.391423961724705, + "kl": 1.513671875, + "learning_rate": 1.8967088307307e-07, + "loss": 0.2808, + "reward": 2.566171407699585, + "reward_std": 0.5563443601131439, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.029314829036593437, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1632 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.5416717529297, + "epoch": 0.8165, + "grad_norm": 6.896452210575316, + "kl": 1.6875, + "learning_rate": 1.8920096009031072e-07, + "loss": 0.2943, + "reward": 2.5484179258346558, + "reward_std": 0.5188319385051727, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02449881285429001, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1633 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.00001525878906, + "epoch": 0.817, + "grad_norm": 6.919405126450731, + "kl": 0.716796875, + "learning_rate": 1.887321361639985e-07, + "loss": 0.0629, + "reward": 2.8598101139068604, + "reward_std": 0.2988024652004242, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03428714908659458, + "rewards/tag_count_reward": 0.984375, + "step": 1634 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.60418701171875, + "epoch": 0.8175, + "grad_norm": 10.231296261630002, + "kl": 1.177734375, + "learning_rate": 1.8826441272225225e-07, + "loss": 0.2059, + "reward": 2.720053195953369, + "reward_std": 0.5625387728214264, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03168301936239004, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1635 + }, + { + "clip_ratio": 0.0, + "completion_length": 336.7083435058594, + "epoch": 0.818, + "grad_norm": 10.632802931373286, + "kl": 1.51171875, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.5578, + "reward": 2.639328956604004, + "reward_std": 0.6874139904975891, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.01865724567323923, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1636 + }, + { + "clip_ratio": 0.0, + "completion_length": 299.6458435058594, + "epoch": 0.8185, + "grad_norm": 10.048893605843471, + "kl": 1.33203125, + "learning_rate": 1.8733227298816794e-07, + "loss": 0.3385, + "reward": 2.6121253967285156, + "reward_std": 0.5836938470602036, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.025027431547641754, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1637 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.7708435058594, + "epoch": 0.819, + "grad_norm": 6.465062212154395, + "kl": 0.65625, + "learning_rate": 1.8686785953528922e-07, + "loss": 0.1366, + "reward": 2.602375626564026, + "reward_std": 0.4529639333486557, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.026096642017364502, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1638 + }, + { + "clip_ratio": 0.0, + "completion_length": 466.66668701171875, + "epoch": 0.8195, + "grad_norm": 30.96092236002016, + "kl": 3.671875, + "learning_rate": 1.8640455224588636e-07, + "loss": 0.4841, + "reward": 2.4283162355422974, + "reward_std": 0.6781846880912781, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.017864545807242393, + "rewards/tag_count_reward": 0.8489583730697632, + "step": 1639 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.00001525878906, + "epoch": 0.82, + "grad_norm": 11.249771941826626, + "kl": 2.7109375, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.2243, + "reward": 2.520586609840393, + "reward_std": 0.6222244799137115, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.019344151951372623, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1640 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.81251525878906, + "epoch": 0.8205, + "grad_norm": 4.567943396166058, + "kl": 0.7119140625, + "learning_rate": 1.8548126179939188e-07, + "loss": 0.0507, + "reward": 2.6988391876220703, + "reward_std": 0.5334462970495224, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.037272000685334206, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1641 + }, + { + "clip_ratio": 0.0, + "completion_length": 260.87500762939453, + "epoch": 0.821, + "grad_norm": 5.377391931963283, + "kl": 0.59375, + "learning_rate": 1.850212814548031e-07, + "loss": 0.0612, + "reward": 2.5510483980178833, + "reward_std": 0.32321809232234955, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.021868368610739708, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1642 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.87501525878906, + "epoch": 0.8215, + "grad_norm": 10.395746122624809, + "kl": 2.015625, + "learning_rate": 1.8456241289868718e-07, + "loss": 0.3281, + "reward": 2.2994015216827393, + "reward_std": 0.4463878870010376, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.023515181615948677, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1643 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.7083435058594, + "epoch": 0.822, + "grad_norm": 9.658762972884702, + "kl": 0.984375, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.3969, + "reward": 2.853973150253296, + "reward_std": 0.33096832036972046, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017554799327626824, + "rewards/tag_count_reward": 0.96875, + "step": 1644 + }, + { + "clip_ratio": 0.0, + "completion_length": 295.3958435058594, + "epoch": 0.8225, + "grad_norm": 40.51819002173804, + "kl": 1.42578125, + "learning_rate": 1.8364801673965642e-07, + "loss": 0.287, + "reward": 2.553430914878845, + "reward_std": 0.580101728439331, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.022958120796829462, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1645 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.06251525878906, + "epoch": 0.823, + "grad_norm": 7.09616651885635, + "kl": 1.55859375, + "learning_rate": 1.8319249192215055e-07, + "loss": 0.5539, + "reward": 2.7683907747268677, + "reward_std": 0.487443208694458, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.035428643226623535, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1646 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.1875305175781, + "epoch": 0.8235, + "grad_norm": 22.444784133823088, + "kl": 3.859375, + "learning_rate": 1.8273808446392785e-07, + "loss": 0.5689, + "reward": 2.390328526496887, + "reward_std": 0.5453556627035141, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.8611111640930176, + "rewards/repetition_penalty_reward": -0.0072408574633300304, + "rewards/tag_count_reward": 0.8281250298023224, + "step": 1647 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.1875, + "epoch": 0.824, + "grad_norm": 7.4531870630991515, + "kl": 2.71875, + "learning_rate": 1.822847957491922e-07, + "loss": 0.65, + "reward": 2.406116247177124, + "reward_std": 0.7253749072551727, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9097221791744232, + "rewards/repetition_penalty_reward": -0.019231081008911133, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1648 + }, + { + "clip_ratio": 0.0, + "completion_length": 375.25001525878906, + "epoch": 0.8245, + "grad_norm": 4.1340736903663595, + "kl": 1.33984375, + "learning_rate": 1.8183262715873938e-07, + "loss": 0.3177, + "reward": 2.4100518226623535, + "reward_std": 0.6681110262870789, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.023975889198482037, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1649 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.4583435058594, + "epoch": 0.825, + "grad_norm": 7.486120901127916, + "kl": 0.978515625, + "learning_rate": 1.8138158006995363e-07, + "loss": 0.1757, + "reward": 2.8634815216064453, + "reward_std": 0.3373248726129532, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.042768435552716255, + "rewards/tag_count_reward": 0.96875, + "step": 1650 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.66668701171875, + "epoch": 0.8255, + "grad_norm": 9.259229368359508, + "kl": 1.505859375, + "learning_rate": 1.8093165585680253e-07, + "loss": 0.3075, + "reward": 2.573695659637451, + "reward_std": 0.47858020663261414, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.021790657192468643, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1651 + }, + { + "clip_ratio": 0.0, + "completion_length": 308.62501525878906, + "epoch": 0.826, + "grad_norm": 5.948718639971257, + "kl": 0.59765625, + "learning_rate": 1.804828558898332e-07, + "loss": -0.0333, + "reward": 2.6328206062316895, + "reward_std": 0.3835385888814926, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.032110003754496574, + "rewards/tag_count_reward": 0.984375, + "step": 1652 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.81251525878906, + "epoch": 0.8265, + "grad_norm": 5.733286623959369, + "kl": 1.5546875, + "learning_rate": 1.800351815361682e-07, + "loss": 0.4485, + "reward": 2.4559801816940308, + "reward_std": 0.6161761581897736, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.021450520493090153, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1653 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.1458435058594, + "epoch": 0.827, + "grad_norm": 5.107287414386508, + "kl": 1.21484375, + "learning_rate": 1.7958863415950112e-07, + "loss": 0.1107, + "reward": 2.421965479850769, + "reward_std": 0.3980814069509506, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.029423545114696026, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1654 + }, + { + "clip_ratio": 0.0, + "completion_length": 273.18751525878906, + "epoch": 0.8275, + "grad_norm": 6.952303501359962, + "kl": 0.76171875, + "learning_rate": 1.7914321512009296e-07, + "loss": 0.0656, + "reward": 2.8843663930892944, + "reward_std": 0.22652451507747173, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02014744747430086, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1655 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.1666717529297, + "epoch": 0.828, + "grad_norm": 10.341849236868242, + "kl": 2.90234375, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.4727, + "reward": 2.3343173265457153, + "reward_std": 0.7950380742549896, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8472222983837128, + "rewards/repetition_penalty_reward": -0.033738273195922375, + "rewards/tag_count_reward": 0.8333333432674408, + "step": 1656 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.7083435058594, + "epoch": 0.8285, + "grad_norm": 7.640794732443528, + "kl": 1.568359375, + "learning_rate": 1.782557674769063e-07, + "loss": 0.2939, + "reward": 2.689374566078186, + "reward_std": 0.424824059009552, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022430949844419956, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1657 + }, + { + "clip_ratio": 0.0, + "completion_length": 525.875, + "epoch": 0.829, + "grad_norm": 11.914470370020899, + "kl": 2.82421875, + "learning_rate": 1.7781374157644713e-07, + "loss": 0.6357, + "reward": 2.147131085395813, + "reward_std": 0.5802744626998901, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.016063490882515907, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1658 + }, + { + "clip_ratio": 0.0, + "completion_length": 406.5416717529297, + "epoch": 0.8295, + "grad_norm": 10.946757964721602, + "kl": 3.1015625, + "learning_rate": 1.773728494198775e-07, + "loss": 0.4053, + "reward": 2.3653860092163086, + "reward_std": 0.5210855007171631, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.021766817197203636, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1659 + }, + { + "clip_ratio": 0.0, + "completion_length": 405.5208435058594, + "epoch": 0.83, + "grad_norm": 15.491914941420152, + "kl": 0.91796875, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.2427, + "reward": 2.8351590633392334, + "reward_std": 0.33579762279987335, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03463265113532543, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1660 + }, + { + "clip_ratio": 0.0, + "completion_length": 342.60418701171875, + "epoch": 0.8305, + "grad_norm": 13.255170819740858, + "kl": 1.369140625, + "learning_rate": 1.7649447170708466e-07, + "loss": 0.5412, + "reward": 2.8557145595550537, + "reward_std": 0.31544718984514475, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02796613797545433, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1661 + }, + { + "clip_ratio": 0.0, + "completion_length": 392.9166717529297, + "epoch": 0.831, + "grad_norm": 10.898791772241301, + "kl": 2.2890625, + "learning_rate": 1.7605698882655233e-07, + "loss": 0.4671, + "reward": 2.5021458864212036, + "reward_std": 0.6049044132232666, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.027368071489036083, + "rewards/tag_count_reward": 0.890625, + "step": 1662 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.0833435058594, + "epoch": 0.8315, + "grad_norm": 7.104099769758903, + "kl": 1.33203125, + "learning_rate": 1.7562064504128281e-07, + "loss": 0.4717, + "reward": 2.5585243701934814, + "reward_std": 0.5537194460630417, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.017864540684968233, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1663 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.4583435058594, + "epoch": 0.832, + "grad_norm": 7.006000014838191, + "kl": 1.06640625, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.3078, + "reward": 2.647615075111389, + "reward_std": 0.2714964300394058, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.025996237061917782, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1664 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.5625, + "epoch": 0.8325, + "grad_norm": 25.862084169657674, + "kl": 2.109375, + "learning_rate": 1.7475138006977437e-07, + "loss": 0.6256, + "reward": 2.650961399078369, + "reward_std": 0.5018740892410278, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.012233282905071974, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1665 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.9583435058594, + "epoch": 0.833, + "grad_norm": 7.442304533677901, + "kl": 1.189453125, + "learning_rate": 1.743184615314671e-07, + "loss": 0.1916, + "reward": 2.9172674417495728, + "reward_std": 0.1536797545850277, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023704865016043186, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1666 + }, + { + "clip_ratio": 0.0, + "completion_length": 488.93751525878906, + "epoch": 0.8335, + "grad_norm": 17.015693320793893, + "kl": 2.3828125, + "learning_rate": 1.7388668738427847e-07, + "loss": 0.5714, + "reward": 2.4601612091064453, + "reward_std": 0.6804801672697067, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.020741629414260387, + "rewards/tag_count_reward": 0.890625, + "step": 1667 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.0833435058594, + "epoch": 0.834, + "grad_norm": 6.622283982214126, + "kl": 1.80078125, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.2748, + "reward": 2.1607359647750854, + "reward_std": 0.5849722027778625, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.02676413208246231, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1668 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.8125, + "epoch": 0.8345, + "grad_norm": 10.366061275777671, + "kl": 1.87890625, + "learning_rate": 1.7302657752080258e-07, + "loss": 0.5968, + "reward": 2.4938762187957764, + "reward_std": 0.6266940236091614, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.023485003039240837, + "rewards/tag_count_reward": 0.9062500298023224, + "step": 1669 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.9791717529297, + "epoch": 0.835, + "grad_norm": 6.471286087528944, + "kl": 0.912109375, + "learning_rate": 1.7259824442455923e-07, + "loss": 0.1973, + "reward": 2.6409337520599365, + "reward_std": 0.41041404008865356, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023996802046895027, + "rewards/tag_count_reward": 0.984375, + "step": 1670 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.3958435058594, + "epoch": 0.8355, + "grad_norm": 8.18949050435243, + "kl": 2.07421875, + "learning_rate": 1.7217106095951412e-07, + "loss": 0.4581, + "reward": 2.5663615465164185, + "reward_std": 0.6968798041343689, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.015235766302794218, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1671 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.8958435058594, + "epoch": 0.836, + "grad_norm": 11.296052099828927, + "kl": 2.265625, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.5144, + "reward": 2.2743382453918457, + "reward_std": 0.6736903786659241, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.01038410421460867, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1672 + }, + { + "clip_ratio": 0.0, + "completion_length": 432.4583435058594, + "epoch": 0.8365, + "grad_norm": 11.962173628260595, + "kl": 2.888671875, + "learning_rate": 1.7132014812461227e-07, + "loss": 0.3413, + "reward": 2.4873476028442383, + "reward_std": 0.47372131049633026, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.021332964301109314, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1673 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.37501525878906, + "epoch": 0.837, + "grad_norm": 4.56941687729545, + "kl": 1.314453125, + "learning_rate": 1.7089642134678364e-07, + "loss": 0.165, + "reward": 2.6062146425247192, + "reward_std": 0.4681692570447922, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0344105139374733, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1674 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.12501525878906, + "epoch": 0.8375, + "grad_norm": 16.236737665323922, + "kl": 1.578125, + "learning_rate": 1.704738493842015e-07, + "loss": 0.5058, + "reward": 2.3385233879089355, + "reward_std": 0.7678199410438538, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9097222983837128, + "rewards/repetition_penalty_reward": -0.013907157350331545, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1675 + }, + { + "clip_ratio": 0.0, + "completion_length": 344.7916717529297, + "epoch": 0.838, + "grad_norm": 8.549333095195593, + "kl": 2.1328125, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.175, + "reward": 2.302179217338562, + "reward_std": 0.8448854684829712, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.8819444477558136, + "rewards/repetition_penalty_reward": -0.03289027698338032, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1676 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.9791717529297, + "epoch": 0.8385, + "grad_norm": 4.575278870826187, + "kl": 1.41015625, + "learning_rate": 1.6963217505016475e-07, + "loss": 0.1386, + "reward": 2.2787574529647827, + "reward_std": 0.3729303479194641, + "rewards/accuracy_reward": 0.3750000111758709, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.016381369438022375, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1677 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.5208435058594, + "epoch": 0.839, + "grad_norm": 4.195422017395303, + "kl": 1.482421875, + "learning_rate": 1.6921307524259625e-07, + "loss": 0.1819, + "reward": 2.7099480628967285, + "reward_std": 0.41559091210365295, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022690830752253532, + "rewards/tag_count_reward": 0.90625, + "step": 1678 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.0208435058594, + "epoch": 0.8395, + "grad_norm": 11.950641286729567, + "kl": 0.943359375, + "learning_rate": 1.6879513537803839e-07, + "loss": 0.453, + "reward": 2.5649707317352295, + "reward_std": 0.39247044920921326, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.02183501608669758, + "rewards/tag_count_reward": 0.96875, + "step": 1679 + }, + { + "clip_ratio": 0.0, + "completion_length": 447.4166717529297, + "epoch": 0.84, + "grad_norm": 8.687575451435944, + "kl": 1.419921875, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.5576, + "reward": 2.619246244430542, + "reward_std": 0.6550075113773346, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.028323073871433735, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1680 + }, + { + "clip_ratio": 0.0, + "completion_length": 277.83333587646484, + "epoch": 0.8405, + "grad_norm": 7.023805921675465, + "kl": 1.224609375, + "learning_rate": 1.6796274056688637e-07, + "loss": 0.0823, + "reward": 2.508470058441162, + "reward_std": 0.3732000142335892, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014099352061748505, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1681 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.25001525878906, + "epoch": 0.841, + "grad_norm": 6.786127817511959, + "kl": 1.1328125, + "learning_rate": 1.6754828815591131e-07, + "loss": 0.29, + "reward": 2.645016670227051, + "reward_std": 0.5615668296813965, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.018177741672843695, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1682 + }, + { + "clip_ratio": 0.0, + "completion_length": 533.6458435058594, + "epoch": 0.8415, + "grad_norm": 37.07630929388199, + "kl": 5.0625, + "learning_rate": 1.6713500075917694e-07, + "loss": 0.5224, + "reward": 1.9782096147537231, + "reward_std": 0.7190957963466644, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/reasoning_steps_reward": 0.8750000596046448, + "rewards/repetition_penalty_reward": -0.026998871937394142, + "rewards/tag_count_reward": 0.8385416865348816, + "step": 1683 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.54168701171875, + "epoch": 0.842, + "grad_norm": 25.945658463095125, + "kl": 2.25390625, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.5761, + "reward": 2.5873239040374756, + "reward_std": 0.6155489385128021, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.8888890147209167, + "rewards/repetition_penalty_reward": -0.030731705483049154, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1684 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.7916717529297, + "epoch": 0.8425, + "grad_norm": 12.106839765407578, + "kl": 1.5078125, + "learning_rate": 1.6631192604065852e-07, + "loss": 0.4876, + "reward": 2.326740026473999, + "reward_std": 0.4968672841787338, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.02395441848784685, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1685 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.0833435058594, + "epoch": 0.843, + "grad_norm": 40.28917535782033, + "kl": 1.04296875, + "learning_rate": 1.659021412261026e-07, + "loss": 0.3504, + "reward": 2.5606080293655396, + "reward_std": 0.3592444807291031, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014044871088117361, + "rewards/tag_count_reward": 0.9843750298023224, + "step": 1686 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.2708435058594, + "epoch": 0.8435, + "grad_norm": 6.73864611491594, + "kl": 1.017578125, + "learning_rate": 1.6549352644023668e-07, + "loss": 0.2952, + "reward": 2.494922637939453, + "reward_std": 0.2953641563653946, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02764677256345749, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1687 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.3333435058594, + "epoch": 0.844, + "grad_norm": 8.47333082669667, + "kl": 1.14453125, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.3733, + "reward": 2.648282527923584, + "reward_std": 0.3749905973672867, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.025328767485916615, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1688 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.3125, + "epoch": 0.8445, + "grad_norm": 10.455138386711887, + "kl": 1.40234375, + "learning_rate": 1.646798119298523e-07, + "loss": 0.2977, + "reward": 2.5954939126968384, + "reward_std": 0.37971626222133636, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.026033984497189522, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1689 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.25001525878906, + "epoch": 0.845, + "grad_norm": 10.80867448506658, + "kl": 1.30078125, + "learning_rate": 1.6427471468404952e-07, + "loss": 0.3745, + "reward": 2.5844664573669434, + "reward_std": 0.5553542077541351, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03532520309090614, + "rewards/tag_count_reward": 0.953125, + "step": 1690 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.75, + "epoch": 0.8455, + "grad_norm": 10.515772073918267, + "kl": 1.103515625, + "learning_rate": 1.6387079242435995e-07, + "loss": 0.4825, + "reward": 2.847057342529297, + "reward_std": 0.3490803837776184, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.010581327602267265, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1691 + }, + { + "clip_ratio": 0.0, + "completion_length": 380.50001525878906, + "epoch": 0.846, + "grad_norm": 9.354849527674562, + "kl": 1.859375, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.3717, + "reward": 2.734411835670471, + "reward_std": 0.4372542053461075, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.010379912098869681, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1692 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.54168701171875, + "epoch": 0.8465, + "grad_norm": 10.81389100794866, + "kl": 1.71875, + "learning_rate": 1.6306647778140697e-07, + "loss": 0.8252, + "reward": 2.553888440132141, + "reward_std": 0.7343409359455109, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.022500536404550076, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1693 + }, + { + "clip_ratio": 0.0, + "completion_length": 650.8333435058594, + "epoch": 0.847, + "grad_norm": 18.03694586453546, + "kl": 2.8359375, + "learning_rate": 1.6266608784822542e-07, + "loss": 0.5358, + "reward": 1.9123651385307312, + "reward_std": 0.5860169529914856, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.02166277915239334, + "rewards/tag_count_reward": 0.8437500298023224, + "step": 1694 + }, + { + "clip_ratio": 0.0, + "completion_length": 634.6041870117188, + "epoch": 0.8475, + "grad_norm": 13.570448263572654, + "kl": 3.578125, + "learning_rate": 1.6226687780131337e-07, + "loss": 0.9188, + "reward": 2.1532905101776123, + "reward_std": 0.8254519104957581, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.01684834063053131, + "rewards/tag_count_reward": 0.8020833432674408, + "step": 1695 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.7708435058594, + "epoch": 0.848, + "grad_norm": 9.145183407264888, + "kl": 0.953125, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.443, + "reward": 2.3256582021713257, + "reward_std": 0.47854430973529816, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.016355796717107296, + "rewards/tag_count_reward": 0.953125, + "step": 1696 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.1458435058594, + "epoch": 0.8485, + "grad_norm": 13.420921446394319, + "kl": 1.810546875, + "learning_rate": 1.6147200222695275e-07, + "loss": 0.2714, + "reward": 2.748153805732727, + "reward_std": 0.5222579091787338, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.026151607744395733, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1697 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.4791717529297, + "epoch": 0.849, + "grad_norm": 16.712319193882028, + "kl": 1.3515625, + "learning_rate": 1.610763391208329e-07, + "loss": 0.4492, + "reward": 2.480373501777649, + "reward_std": 0.4376864656805992, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.019626670517027378, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1698 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.68751525878906, + "epoch": 0.8495, + "grad_norm": 8.460670702699383, + "kl": 1.15234375, + "learning_rate": 1.6068186074363307e-07, + "loss": 0.4626, + "reward": 2.559247851371765, + "reward_std": 0.5322316884994507, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01887742755934596, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1699 + }, + { + "clip_ratio": 0.0, + "completion_length": 600.6875305175781, + "epoch": 0.85, + "grad_norm": 15.150759771270096, + "kl": 2.6953125, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.5681, + "reward": 2.267993927001953, + "reward_std": 0.7472598850727081, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.018464550375938416, + "rewards/tag_count_reward": 0.890625, + "step": 1700 + }, + { + "clip_ratio": 0.0, + "completion_length": 429.2916717529297, + "epoch": 0.8505, + "grad_norm": 13.06168321828285, + "kl": 1.9921875, + "learning_rate": 1.5989646297897876e-07, + "loss": 0.3076, + "reward": 2.5890194177627563, + "reward_std": 0.5929211974143982, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.030772419180721045, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1701 + }, + { + "clip_ratio": 0.0, + "completion_length": 471.2916717529297, + "epoch": 0.851, + "grad_norm": 5.600339328654864, + "kl": 1.7421875, + "learning_rate": 1.5950554598398228e-07, + "loss": 0.523, + "reward": 2.2353298664093018, + "reward_std": 0.478582501411438, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.026822968386113644, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1702 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.70835876464844, + "epoch": 0.8515, + "grad_norm": 10.128996657974916, + "kl": 1.298828125, + "learning_rate": 1.59115818502814e-07, + "loss": 0.6875, + "reward": 2.5388582944869995, + "reward_std": 0.3540804013609886, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04273904860019684, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1703 + }, + { + "clip_ratio": 0.0, + "completion_length": 291.5833435058594, + "epoch": 0.852, + "grad_norm": 8.164831996051532, + "kl": 1.609375, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0751, + "reward": 2.435483932495117, + "reward_std": 0.7406750321388245, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.02111327089369297, + "rewards/tag_count_reward": 0.921875, + "step": 1704 + }, + { + "clip_ratio": 0.0, + "completion_length": 278.9375, + "epoch": 0.8525, + "grad_norm": 10.858353319143736, + "kl": 0.80859375, + "learning_rate": 1.5833993682704515e-07, + "loss": 0.2179, + "reward": 2.5048502683639526, + "reward_std": 0.344282865524292, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.02119144331663847, + "rewards/tag_count_reward": 0.984375, + "step": 1705 + }, + { + "clip_ratio": 0.0, + "completion_length": 446.3541717529297, + "epoch": 0.853, + "grad_norm": 5.175917884900346, + "kl": 1.5703125, + "learning_rate": 1.579537849959148e-07, + "loss": 0.3352, + "reward": 2.296080231666565, + "reward_std": 0.6603887677192688, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03378090541809797, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1706 + }, + { + "clip_ratio": 0.0, + "completion_length": 358.0625, + "epoch": 0.8535, + "grad_norm": 5.15758064164511, + "kl": 1.333984375, + "learning_rate": 1.5756882740554578e-07, + "loss": 0.3919, + "reward": 2.4720072746276855, + "reward_std": 0.41646583657711744, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02972903847694397, + "rewards/tag_count_reward": 0.953125, + "step": 1707 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.00001525878906, + "epoch": 0.854, + "grad_norm": 17.280373579635196, + "kl": 1.3203125, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.5052, + "reward": 2.531672954559326, + "reward_std": 0.6426400542259216, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.029090996831655502, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1708 + }, + { + "clip_ratio": 0.0, + "completion_length": 551.5833435058594, + "epoch": 0.8545, + "grad_norm": 7.826892273231986, + "kl": 2.5625, + "learning_rate": 1.5680249963404065e-07, + "loss": 0.8511, + "reward": 2.2978310585021973, + "reward_std": 0.8033435940742493, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.03897454775869846, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1709 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.68751525878906, + "epoch": 0.855, + "grad_norm": 5.338178479303831, + "kl": 0.853515625, + "learning_rate": 1.5642113178727193e-07, + "loss": 0.2184, + "reward": 2.568590521812439, + "reward_std": 0.4426009804010391, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.01821515103802085, + "rewards/tag_count_reward": 0.96875, + "step": 1710 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.75001525878906, + "epoch": 0.8555, + "grad_norm": 5.124987048089, + "kl": 1.8359375, + "learning_rate": 1.56040962849992e-07, + "loss": 0.4555, + "reward": 2.6776022911071777, + "reward_std": 0.5898395925760269, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01684217620640993, + "rewards/tag_count_reward": 0.9375, + "step": 1711 + }, + { + "clip_ratio": 0.0, + "completion_length": 439.25001525878906, + "epoch": 0.856, + "grad_norm": 7.502268383392246, + "kl": 2.271484375, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.2842, + "reward": 2.5188140869140625, + "reward_std": 0.5468065068125725, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.02285270020365715, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1712 + }, + { + "clip_ratio": 0.0, + "completion_length": 527.1458587646484, + "epoch": 0.8565, + "grad_norm": 8.654666055300138, + "kl": 1.9453125, + "learning_rate": 1.5528422633248516e-07, + "loss": 0.3654, + "reward": 2.467902421951294, + "reward_std": 0.7317368686199188, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.0407780222594738, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1713 + }, + { + "clip_ratio": 0.0, + "completion_length": 520.5208587646484, + "epoch": 0.857, + "grad_norm": 5.651385244412516, + "kl": 2.1875, + "learning_rate": 1.5490766105740876e-07, + "loss": 0.6816, + "reward": 2.492395281791687, + "reward_std": 0.4476168677210808, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.01454922091215849, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1714 + }, + { + "clip_ratio": 0.0, + "completion_length": 411.875, + "epoch": 0.8575, + "grad_norm": 9.849249493770895, + "kl": 1.8671875, + "learning_rate": 1.5453229930211563e-07, + "loss": 0.6325, + "reward": 2.4678452014923096, + "reward_std": 0.4792132079601288, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02173815481364727, + "rewards/tag_count_reward": 0.90625, + "step": 1715 + }, + { + "clip_ratio": 0.0, + "completion_length": 421.7708435058594, + "epoch": 0.858, + "grad_norm": 15.301189903888302, + "kl": 2.1796875, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.2572, + "reward": 2.3374699354171753, + "reward_std": 0.4645020067691803, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.023641261272132397, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1716 + }, + { + "clip_ratio": 0.0, + "completion_length": 325.18750762939453, + "epoch": 0.8585, + "grad_norm": 6.930860696826815, + "kl": 1.078125, + "learning_rate": 1.5378519092087712e-07, + "loss": 0.246, + "reward": 2.599347233772278, + "reward_std": 0.4288046956062317, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.013500066474080086, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1717 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.0208435058594, + "epoch": 0.859, + "grad_norm": 6.893289627917303, + "kl": 0.708984375, + "learning_rate": 1.5341344657075354e-07, + "loss": 0.0453, + "reward": 2.7252657413482666, + "reward_std": 0.38875049352645874, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.0212620310485363, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1718 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.62501525878906, + "epoch": 0.8595, + "grad_norm": 5.587920983816194, + "kl": 1.4921875, + "learning_rate": 1.5304291029204954e-07, + "loss": 0.4535, + "reward": 2.5673869848251343, + "reward_std": 0.7304165959358215, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.022890993393957615, + "rewards/tag_count_reward": 0.9375, + "step": 1719 + }, + { + "clip_ratio": 0.0, + "completion_length": 476.1041717529297, + "epoch": 0.86, + "grad_norm": 5.569410661498516, + "kl": 1.2109375, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.4086, + "reward": 2.5152324438095093, + "reward_std": 0.5040201544761658, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.035114867612719536, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1720 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.18751525878906, + "epoch": 0.8605, + "grad_norm": 9.547003786726727, + "kl": 2.05078125, + "learning_rate": 1.5230546646008792e-07, + "loss": 0.3872, + "reward": 2.3030202388763428, + "reward_std": 0.302436888217926, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.030313138850033283, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1721 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.2708435058594, + "epoch": 0.861, + "grad_norm": 4.655955530469267, + "kl": 1.0625, + "learning_rate": 1.5193856115321224e-07, + "loss": 0.2487, + "reward": 2.71955668926239, + "reward_std": 0.43974626809358597, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02349890023469925, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1722 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7083435058594, + "epoch": 0.8615, + "grad_norm": 3.532375552852986, + "kl": 0.6015625, + "learning_rate": 1.5157286841051285e-07, + "loss": 0.0244, + "reward": 2.9468058347702026, + "reward_std": 0.07718131458386779, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.032360827550292015, + "rewards/tag_count_reward": 1.0, + "step": 1723 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.9166717529297, + "epoch": 0.862, + "grad_norm": 11.354635521002548, + "kl": 0.9140625, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.2791, + "reward": 2.9219456911087036, + "reward_std": 0.1730342721566558, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01381842652335763, + "rewards/tag_count_reward": 0.984375, + "step": 1724 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.5833435058594, + "epoch": 0.8625, + "grad_norm": 6.092509316447617, + "kl": 1.162109375, + "learning_rate": 1.5084512506980023e-07, + "loss": 0.1386, + "reward": 2.519277572631836, + "reward_std": 0.3455023765563965, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027597556822001934, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1725 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.3125305175781, + "epoch": 0.863, + "grad_norm": 6.529119686425871, + "kl": 1.9296875, + "learning_rate": 1.5048307668861947e-07, + "loss": 0.451, + "reward": 2.341726541519165, + "reward_std": 0.5323204696178436, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.02632923796772957, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1726 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.6458435058594, + "epoch": 0.8635, + "grad_norm": 11.398809580135172, + "kl": 1.892578125, + "learning_rate": 1.5012224530527297e-07, + "loss": 0.2973, + "reward": 2.4390887022018433, + "reward_std": 0.264788331463933, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.029661373235285282, + "rewards/tag_count_reward": 0.90625, + "step": 1727 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.5208435058594, + "epoch": 0.864, + "grad_norm": 7.130945265677619, + "kl": 1.052734375, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.2046, + "reward": 2.761821746826172, + "reward_std": 0.5065120309591293, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.017692207358777523, + "rewards/tag_count_reward": 0.953125, + "step": 1728 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.8333435058594, + "epoch": 0.8645, + "grad_norm": 7.459954968059489, + "kl": 1.54296875, + "learning_rate": 1.4940423792499306e-07, + "loss": 0.4104, + "reward": 2.648692011833191, + "reward_std": 0.5437482595443726, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02318317536264658, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1729 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.7083435058594, + "epoch": 0.865, + "grad_norm": 8.31639308568495, + "kl": 1.419921875, + "learning_rate": 1.4904706411523448e-07, + "loss": 0.277, + "reward": 2.6593313217163086, + "reward_std": 0.50356225669384, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02122433064505458, + "rewards/tag_count_reward": 0.9375, + "step": 1730 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.2708435058594, + "epoch": 0.8655, + "grad_norm": 6.80725176888497, + "kl": 1.158203125, + "learning_rate": 1.4869111167765372e-07, + "loss": 0.4757, + "reward": 2.588240623474121, + "reward_std": 0.5882539600133896, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.019398383796215057, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1731 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.8541717529297, + "epoch": 0.866, + "grad_norm": 10.635761448282858, + "kl": 1.3984375, + "learning_rate": 1.483363816965435e-07, + "loss": 0.5591, + "reward": 2.674522638320923, + "reward_std": 0.6385620087385178, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.016449684277176857, + "rewards/tag_count_reward": 0.9270833730697632, + "step": 1732 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.35418701171875, + "epoch": 0.8665, + "grad_norm": 14.223874714304245, + "kl": 1.41015625, + "learning_rate": 1.479828752524731e-07, + "loss": 0.3242, + "reward": 2.7146769762039185, + "reward_std": 0.4512895792722702, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.021434213500469923, + "rewards/tag_count_reward": 0.9375, + "step": 1733 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.75001525878906, + "epoch": 0.867, + "grad_norm": 8.449702624326806, + "kl": 1.3359375, + "learning_rate": 1.4763059342228434e-07, + "loss": 0.4984, + "reward": 2.7380974292755127, + "reward_std": 0.6073452234268188, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.030999962240457535, + "rewards/tag_count_reward": 0.921875, + "step": 1734 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.4166717529297, + "epoch": 0.8675, + "grad_norm": 4.452898713963573, + "kl": 0.64453125, + "learning_rate": 1.4727953727908877e-07, + "loss": 0.1877, + "reward": 2.59316086769104, + "reward_std": 0.3150123804807663, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.030102994292974472, + "rewards/tag_count_reward": 0.984375, + "step": 1735 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.7291717529297, + "epoch": 0.868, + "grad_norm": 6.27878798387463, + "kl": 1.08203125, + "learning_rate": 1.469297078922642e-07, + "loss": 0.4336, + "reward": 2.635421872138977, + "reward_std": 0.31346043944358826, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04166135564446449, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1736 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.8541717529297, + "epoch": 0.8685, + "grad_norm": 4.719740434148159, + "kl": 1.3046875, + "learning_rate": 1.4658110632745174e-07, + "loss": 0.2813, + "reward": 2.764721632003784, + "reward_std": 0.43931836541742086, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.025209171697497368, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1737 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.62501525878906, + "epoch": 0.869, + "grad_norm": 5.45166053562493, + "kl": 1.02734375, + "learning_rate": 1.4623373364655223e-07, + "loss": 0.3308, + "reward": 2.8263684511184692, + "reward_std": 0.26489363610744476, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027798308990895748, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1738 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.4375, + "epoch": 0.8695, + "grad_norm": 11.37070470078536, + "kl": 1.609375, + "learning_rate": 1.45887590907723e-07, + "loss": 0.3803, + "reward": 2.567717671394348, + "reward_std": 0.7005654871463776, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.019088009372353554, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1739 + }, + { + "clip_ratio": 0.0, + "completion_length": 534.7291717529297, + "epoch": 0.87, + "grad_norm": 15.389689880178889, + "kl": 2.359375, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.8358, + "reward": 2.2046847343444824, + "reward_std": 0.6854039132595062, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.909722238779068, + "rewards/repetition_penalty_reward": -0.017537596635520458, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1740 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.6041717529297, + "epoch": 0.8705, + "grad_norm": 13.823047244979128, + "kl": 1.76953125, + "learning_rate": 1.4519899947016888e-07, + "loss": 0.5483, + "reward": 2.4729528427124023, + "reward_std": 0.536044716835022, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.014894509688019753, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1741 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.02085876464844, + "epoch": 0.871, + "grad_norm": 10.689522653892224, + "kl": 1.3583984375, + "learning_rate": 1.448565528690129e-07, + "loss": 0.4994, + "reward": 2.4924492835998535, + "reward_std": 0.46057581901550293, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02838408574461937, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1742 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.5208435058594, + "epoch": 0.8715, + "grad_norm": 4.741298253263598, + "kl": 0.931640625, + "learning_rate": 1.4451534040505881e-07, + "loss": 0.1265, + "reward": 2.8301846981048584, + "reward_std": 0.25126277655363083, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03613481484353542, + "rewards/tag_count_reward": 0.984375, + "step": 1743 + }, + { + "clip_ratio": 0.0, + "completion_length": 393.8541717529297, + "epoch": 0.872, + "grad_norm": 17.650431883419923, + "kl": 2.62890625, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.4878, + "reward": 2.3627763986587524, + "reward_std": 0.4748759865760803, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.029584777541458607, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1744 + }, + { + "clip_ratio": 0.0, + "completion_length": 368.0833435058594, + "epoch": 0.8725, + "grad_norm": 4.887810570436975, + "kl": 1.171875, + "learning_rate": 1.438366220425628e-07, + "loss": 0.4411, + "reward": 2.735856294631958, + "reward_std": 0.47497308254241943, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.03671339526772499, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1745 + }, + { + "clip_ratio": 0.0, + "completion_length": 356.5416717529297, + "epoch": 0.873, + "grad_norm": 5.959026182042601, + "kl": 1.115234375, + "learning_rate": 1.4349911821151462e-07, + "loss": 0.4461, + "reward": 2.7494075298309326, + "reward_std": 0.3780593601986766, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.03010638989508152, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1746 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.04168701171875, + "epoch": 0.8735, + "grad_norm": 9.461995383287107, + "kl": 2.03125, + "learning_rate": 1.4316285265264978e-07, + "loss": 0.6131, + "reward": 2.443665385246277, + "reward_std": 0.32284732908010483, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.021612409502267838, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1747 + }, + { + "clip_ratio": 0.0, + "completion_length": 505.41668701171875, + "epoch": 0.874, + "grad_norm": 5.960798471551781, + "kl": 1.921875, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.6131, + "reward": 2.305721640586853, + "reward_std": 0.7335332632064819, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.018931284546852112, + "rewards/tag_count_reward": 0.8802083730697632, + "step": 1748 + }, + { + "clip_ratio": 0.0, + "completion_length": 230.1666717529297, + "epoch": 0.8745, + "grad_norm": 5.536492769825701, + "kl": 0.98046875, + "learning_rate": 1.4249404044498727e-07, + "loss": 0.1038, + "reward": 2.941461443901062, + "reward_std": 0.18287718016654253, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.008191618835553527, + "rewards/tag_count_reward": 0.984375, + "step": 1749 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.0208435058594, + "epoch": 0.875, + "grad_norm": 12.021308076552485, + "kl": 0.67578125, + "learning_rate": 1.4216149583350755e-07, + "loss": 0.2597, + "reward": 2.925000309944153, + "reward_std": 0.15377740375697613, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03506927099078894, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1750 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.375, + "epoch": 0.8755, + "grad_norm": 18.87149191181025, + "kl": 0.92578125, + "learning_rate": 1.418301935688408e-07, + "loss": 0.2581, + "reward": 2.644270658493042, + "reward_std": 0.24090787768363953, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.022396131418645382, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1751 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.0416793823242, + "epoch": 0.876, + "grad_norm": 4.302085628117804, + "kl": 0.828125, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.16, + "reward": 2.77993905544281, + "reward_std": 0.36863449215888977, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.015199816785752773, + "rewards/tag_count_reward": 0.96875, + "step": 1752 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.93751525878906, + "epoch": 0.8765, + "grad_norm": 10.151412864739319, + "kl": 1.16796875, + "learning_rate": 1.4117132011297528e-07, + "loss": 0.4048, + "reward": 2.7850375175476074, + "reward_std": 0.502088338136673, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.015309692360460758, + "rewards/tag_count_reward": 0.953125, + "step": 1753 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.8333435058594, + "epoch": 0.877, + "grad_norm": 6.5516024659716425, + "kl": 0.76953125, + "learning_rate": 1.4084375092881917e-07, + "loss": 0.294, + "reward": 2.7152035236358643, + "reward_std": 0.4315020889043808, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02611603494733572, + "rewards/tag_count_reward": 0.984375, + "step": 1754 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.45835876464844, + "epoch": 0.8775, + "grad_norm": 11.39389155107235, + "kl": 1.8203125, + "learning_rate": 1.405174281055556e-07, + "loss": 0.4952, + "reward": 2.452518105506897, + "reward_std": 0.5552013963460922, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.011023662984371185, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1755 + }, + { + "clip_ratio": 0.0, + "completion_length": 323.8333435058594, + "epoch": 0.878, + "grad_norm": 4.28402802913973, + "kl": 0.873046875, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.1875, + "reward": 2.5597667694091797, + "reward_std": 0.40155889838933945, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.030511243268847466, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1756 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.0208435058594, + "epoch": 0.8785, + "grad_norm": 13.359830394312729, + "kl": 2.0078125, + "learning_rate": 1.3986852551404962e-07, + "loss": 0.499, + "reward": 2.397765874862671, + "reward_std": 0.4639376848936081, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.024109240621328354, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1757 + }, + { + "clip_ratio": 0.0, + "completion_length": 434.4375, + "epoch": 0.879, + "grad_norm": 7.144462129204785, + "kl": 1.4609375, + "learning_rate": 1.395459477224772e-07, + "loss": 0.3825, + "reward": 2.5313947200775146, + "reward_std": 0.4858998954296112, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0311053404584527, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1758 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.25001525878906, + "epoch": 0.8795, + "grad_norm": 5.497608019794996, + "kl": 0.890625, + "learning_rate": 1.3922462024513075e-07, + "loss": 0.3099, + "reward": 2.6326241493225098, + "reward_std": 0.32827917486429214, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.009737157728523016, + "rewards/tag_count_reward": 0.96875, + "step": 1759 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.22918701171875, + "epoch": 0.88, + "grad_norm": 7.950064862238582, + "kl": 1.3828125, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.4321, + "reward": 2.515586256980896, + "reward_std": 0.5082004070281982, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.020872057415544987, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1760 + }, + { + "clip_ratio": 0.0, + "completion_length": 337.18751525878906, + "epoch": 0.8805, + "grad_norm": 4.154164825967078, + "kl": 0.740234375, + "learning_rate": 1.385857201445813e-07, + "loss": 0.2849, + "reward": 2.466183662414551, + "reward_std": 0.2627424318343401, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.030344081111252308, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1761 + }, + { + "clip_ratio": 0.0, + "completion_length": 357.7916717529297, + "epoch": 0.881, + "grad_norm": 8.822719078728802, + "kl": 1.125, + "learning_rate": 1.3826814946757888e-07, + "loss": 0.1549, + "reward": 2.106475353240967, + "reward_std": 0.4267265051603317, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/reasoning_steps_reward": 0.9791666269302368, + "rewards/repetition_penalty_reward": -0.03935814555734396, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1762 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.6041717529297, + "epoch": 0.8815, + "grad_norm": 4.2275590882335266, + "kl": 0.904296875, + "learning_rate": 1.3795183299719753e-07, + "loss": 0.1513, + "reward": 2.4813464879989624, + "reward_std": 0.5057013630867004, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03601475339382887, + "rewards/tag_count_reward": 0.96875, + "step": 1763 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.9583435058594, + "epoch": 0.882, + "grad_norm": 5.592735227584482, + "kl": 0.982421875, + "learning_rate": 1.3763677169699217e-07, + "loss": -0.0045, + "reward": 2.6361879110336304, + "reward_std": 0.3717052489519119, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.021798397414386272, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1764 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.3541717529297, + "epoch": 0.8825, + "grad_norm": 4.766850312236698, + "kl": 1.126953125, + "learning_rate": 1.3732296652669417e-07, + "loss": 0.2914, + "reward": 2.6185107231140137, + "reward_std": 0.407680407166481, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03253097087144852, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1765 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.00001525878906, + "epoch": 0.883, + "grad_norm": 4.908003911372998, + "kl": 1.236328125, + "learning_rate": 1.370104184422085e-07, + "loss": 0.4774, + "reward": 2.7331286668777466, + "reward_std": 0.5909359902143478, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.03423246555030346, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1766 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.31251525878906, + "epoch": 0.8835, + "grad_norm": 4.218081542001781, + "kl": 1.5625, + "learning_rate": 1.3669912839561083e-07, + "loss": 0.4539, + "reward": 2.497738838195801, + "reward_std": 0.3815242201089859, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.028302965685725212, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1767 + }, + { + "clip_ratio": 0.0, + "completion_length": 355.7708435058594, + "epoch": 0.884, + "grad_norm": 11.659886994958134, + "kl": 1.17578125, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.3457, + "reward": 2.616908311843872, + "reward_std": 0.5863338112831116, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.021980691701173782, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1768 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.93751525878906, + "epoch": 0.8845, + "grad_norm": 10.949518936212161, + "kl": 1.16015625, + "learning_rate": 1.3608032620521803e-07, + "loss": 0.5473, + "reward": 2.750458240509033, + "reward_std": 0.5212399363517761, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.025583509355783463, + "rewards/tag_count_reward": 0.9427083730697632, + "step": 1769 + }, + { + "clip_ratio": 0.0, + "completion_length": 354.81251525878906, + "epoch": 0.885, + "grad_norm": 10.36571400939137, + "kl": 1.19921875, + "learning_rate": 1.3577281594640182e-07, + "loss": 0.7841, + "reward": 2.636287212371826, + "reward_std": 0.4996710419654846, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.023435143288224936, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1770 + }, + { + "clip_ratio": 0.0, + "completion_length": 452.375, + "epoch": 0.8855, + "grad_norm": 9.00198300414043, + "kl": 1.53125, + "learning_rate": 1.354665674954255e-07, + "loss": 0.4026, + "reward": 2.6022530794143677, + "reward_std": 0.33922192733734846, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027955124154686928, + "rewards/tag_count_reward": 0.921875, + "step": 1771 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.68750762939453, + "epoch": 0.886, + "grad_norm": 7.279570069897522, + "kl": 0.958984375, + "learning_rate": 1.351615817851748e-07, + "loss": 0.297, + "reward": 2.481359362602234, + "reward_std": 0.33179375529289246, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02732121106237173, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1772 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.12501525878906, + "epoch": 0.8865, + "grad_norm": 10.916967026611704, + "kl": 1.41015625, + "learning_rate": 1.3485785974468913e-07, + "loss": 0.3221, + "reward": 2.6607974767684937, + "reward_std": 0.38768453896045685, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.035383082926273346, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1773 + }, + { + "clip_ratio": 0.0, + "completion_length": 440.12501525878906, + "epoch": 0.887, + "grad_norm": 12.121221337164327, + "kl": 2.30078125, + "learning_rate": 1.345554022991586e-07, + "loss": 0.5623, + "reward": 2.4204729795455933, + "reward_std": 0.6897162795066833, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.03265202045440674, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1774 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.7083435058594, + "epoch": 0.8875, + "grad_norm": 10.982971637870204, + "kl": 1.44140625, + "learning_rate": 1.3425421036992097e-07, + "loss": 0.3624, + "reward": 2.7605329751968384, + "reward_std": 0.5273626148700714, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.020717153325676918, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1775 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.8958435058594, + "epoch": 0.888, + "grad_norm": 10.54577580044071, + "kl": 1.63671875, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.39, + "reward": 2.5657081604003906, + "reward_std": 0.4877399206161499, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0332502406090498, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1776 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.4791717529297, + "epoch": 0.8885, + "grad_norm": 5.822202062218948, + "kl": 0.548828125, + "learning_rate": 1.3365562672639807e-07, + "loss": 0.0764, + "reward": 2.9392874240875244, + "reward_std": 0.10572412749752402, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.019045927096158266, + "rewards/tag_count_reward": 1.0, + "step": 1777 + }, + { + "clip_ratio": 0.0, + "completion_length": 367.0208435058594, + "epoch": 0.889, + "grad_norm": 12.670512447704958, + "kl": 1.421875, + "learning_rate": 1.3335823683550237e-07, + "loss": 0.7225, + "reward": 2.640958547592163, + "reward_std": 0.7428161203861237, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.017027711495757103, + "rewards/tag_count_reward": 0.921875, + "step": 1778 + }, + { + "clip_ratio": 0.0, + "completion_length": 403.1458435058594, + "epoch": 0.8895, + "grad_norm": 9.688022329481187, + "kl": 1.75, + "learning_rate": 1.3306211610767327e-07, + "loss": 0.8464, + "reward": 2.530915379524231, + "reward_std": 0.6522108912467957, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.021168189123272896, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1779 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.0208435058594, + "epoch": 0.89, + "grad_norm": 13.017237996007623, + "kl": 1.33984375, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.4707, + "reward": 2.7710498571395874, + "reward_std": 0.40325474739074707, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.025825275108218193, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1780 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.7291717529297, + "epoch": 0.8905, + "grad_norm": 5.387878254985672, + "kl": 1.51171875, + "learning_rate": 1.3247368574548605e-07, + "loss": 0.4883, + "reward": 2.2095279693603516, + "reward_std": 0.547009214758873, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/reasoning_steps_reward": 0.9236110746860504, + "rewards/repetition_penalty_reward": -0.02137490874156356, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1781 + }, + { + "clip_ratio": 0.0, + "completion_length": 348.8541717529297, + "epoch": 0.891, + "grad_norm": 4.93435481117121, + "kl": 1.052734375, + "learning_rate": 1.3218137790358892e-07, + "loss": 0.356, + "reward": 2.8138599395751953, + "reward_std": 0.295873099938035, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.029890276491642, + "rewards/tag_count_reward": 0.96875, + "step": 1782 + }, + { + "clip_ratio": 0.0, + "completion_length": 503.6250305175781, + "epoch": 0.8915, + "grad_norm": 18.667124890654204, + "kl": 1.90625, + "learning_rate": 1.3189034280967474e-07, + "loss": 0.4897, + "reward": 2.317874312400818, + "reward_std": 0.3144010305404663, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.051917336881160736, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1783 + }, + { + "clip_ratio": 0.0, + "completion_length": 509.16668701171875, + "epoch": 0.892, + "grad_norm": 137.39498749282393, + "kl": 6.7734375, + "learning_rate": 1.316005813502869e-07, + "loss": 0.6333, + "reward": 2.432840585708618, + "reward_std": 0.6739647388458252, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.025492852553725243, + "rewards/tag_count_reward": 0.875, + "step": 1784 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.2291717529297, + "epoch": 0.8925, + "grad_norm": 5.858027527176804, + "kl": 1.3203125, + "learning_rate": 1.31312094408089e-07, + "loss": 0.3111, + "reward": 2.6231387853622437, + "reward_std": 0.44334521889686584, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.0192225007340312, + "rewards/tag_count_reward": 0.96875, + "step": 1785 + }, + { + "clip_ratio": 0.0, + "completion_length": 402.58335876464844, + "epoch": 0.893, + "grad_norm": 4.541492575563727, + "kl": 1.345703125, + "learning_rate": 1.3102488286186234e-07, + "loss": 0.4852, + "reward": 2.777048349380493, + "reward_std": 0.41408103704452515, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.028507346287369728, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1786 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.72918701171875, + "epoch": 0.8935, + "grad_norm": 6.934337961180237, + "kl": 2.015625, + "learning_rate": 1.30738947586503e-07, + "loss": 0.6592, + "reward": 2.56930148601532, + "reward_std": 0.6575948297977448, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.031392961740493774, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1787 + }, + { + "clip_ratio": 0.0, + "completion_length": 290.25, + "epoch": 0.894, + "grad_norm": 6.966975877164384, + "kl": 1.09765625, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.3265, + "reward": 2.8180272579193115, + "reward_std": 0.34140200912952423, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.017042224761098623, + "rewards/tag_count_reward": 0.953125, + "step": 1788 + }, + { + "clip_ratio": 0.0, + "completion_length": 304.2708435058594, + "epoch": 0.8945, + "grad_norm": 6.272772314297587, + "kl": 1.0859375, + "learning_rate": 1.3017090932852998e-07, + "loss": 0.1709, + "reward": 2.655430555343628, + "reward_std": 0.332093209028244, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.014708399772644043, + "rewards/tag_count_reward": 0.96875, + "step": 1789 + }, + { + "clip_ratio": 0.0, + "completion_length": 458.9375305175781, + "epoch": 0.895, + "grad_norm": 8.12356036354293, + "kl": 1.83203125, + "learning_rate": 1.2988880807625927e-07, + "loss": 0.6271, + "reward": 2.6602718830108643, + "reward_std": 0.5047450065612793, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03070042561739683, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1790 + }, + { + "clip_ratio": 0.0, + "completion_length": 418.4166717529297, + "epoch": 0.8955, + "grad_norm": 8.535870766876632, + "kl": 1.236328125, + "learning_rate": 1.2960798655553673e-07, + "loss": 0.3997, + "reward": 2.491165280342102, + "reward_std": 0.27948543429374695, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9097222089767456, + "rewards/repetition_penalty_reward": -0.017515364568680525, + "rewards/tag_count_reward": 0.953125, + "step": 1791 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.7708435058594, + "epoch": 0.896, + "grad_norm": 6.5441149884787, + "kl": 1.076171875, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.3451, + "reward": 2.7869484424591064, + "reward_std": 0.531157523393631, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.023815508000552654, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1792 + }, + { + "clip_ratio": 0.0, + "completion_length": 296.8958435058594, + "epoch": 0.8965, + "grad_norm": 8.808586232096019, + "kl": 0.890625, + "learning_rate": 1.2905018612655974e-07, + "loss": 0.0777, + "reward": 2.684244394302368, + "reward_std": 0.2914978265762329, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.029297261498868465, + "rewards/tag_count_reward": 0.984375, + "step": 1793 + }, + { + "clip_ratio": 0.0, + "completion_length": 589.0208435058594, + "epoch": 0.897, + "grad_norm": 5.648691995344081, + "kl": 2.07421875, + "learning_rate": 1.2877320891746201e-07, + "loss": 0.7666, + "reward": 2.3528552055358887, + "reward_std": 0.4950469881296158, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.0290891882032156, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1794 + }, + { + "clip_ratio": 0.0, + "completion_length": 479.2708435058594, + "epoch": 0.8975, + "grad_norm": 5.153259121443274, + "kl": 1.52734375, + "learning_rate": 1.284975148382211e-07, + "loss": 0.4386, + "reward": 2.5888208150863647, + "reward_std": 0.37291720509529114, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.036179195158183575, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1795 + }, + { + "clip_ratio": 0.0, + "completion_length": 330.6041717529297, + "epoch": 0.898, + "grad_norm": 9.368121569370517, + "kl": 1.115234375, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.2721, + "reward": 2.7130978107452393, + "reward_std": 0.25604604184627533, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.023013423196971416, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1796 + }, + { + "clip_ratio": 0.0, + "completion_length": 347.7083435058594, + "epoch": 0.8985, + "grad_norm": 12.211005971271671, + "kl": 1.15234375, + "learning_rate": 1.2794997942464603e-07, + "loss": 0.3333, + "reward": 2.739255905151367, + "reward_std": 0.45043814182281494, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.026369189843535423, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1797 + }, + { + "clip_ratio": 0.0, + "completion_length": 422.35418701171875, + "epoch": 0.899, + "grad_norm": 10.548113579660656, + "kl": 2.203125, + "learning_rate": 1.2767813975819983e-07, + "loss": 0.3646, + "reward": 2.240885376930237, + "reward_std": 0.4887467324733734, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.02300364524126053, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1798 + }, + { + "clip_ratio": 0.0, + "completion_length": 417.1666717529297, + "epoch": 0.8995, + "grad_norm": 4.435343161658372, + "kl": 1.275390625, + "learning_rate": 1.274075865573809e-07, + "loss": 0.3645, + "reward": 2.541459321975708, + "reward_std": 0.3853719085454941, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03666570037603378, + "rewards/tag_count_reward": 0.953125, + "step": 1799 + }, + { + "clip_ratio": 0.0, + "completion_length": 572.6666870117188, + "epoch": 0.9, + "grad_norm": 12.274646361156972, + "kl": 2.36328125, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.5502, + "reward": 2.4783600568771362, + "reward_std": 0.6872712820768356, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.014695549616590142, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1800 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.7083435058594, + "epoch": 0.9005, + "grad_norm": 6.174999936072762, + "kl": 1.251953125, + "learning_rate": 1.2687034284531145e-07, + "loss": 0.219, + "reward": 2.5370699167251587, + "reward_std": 0.4170294851064682, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.027166323270648718, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1801 + }, + { + "clip_ratio": 0.0, + "completion_length": 423.16668701171875, + "epoch": 0.901, + "grad_norm": 8.371287383648244, + "kl": 1.984375, + "learning_rate": 1.2660365397059856e-07, + "loss": 0.6964, + "reward": 2.5543148517608643, + "reward_std": 0.6248021870851517, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02728235349059105, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1802 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.08335876464844, + "epoch": 0.9015, + "grad_norm": 4.999316997210919, + "kl": 1.60546875, + "learning_rate": 1.263382548345829e-07, + "loss": 0.4042, + "reward": 2.5936447381973267, + "reward_std": 0.49106191098690033, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0313554760068655, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1803 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.00001525878906, + "epoch": 0.902, + "grad_norm": 13.183151595537693, + "kl": 2.0703125, + "learning_rate": 1.260741462457165e-07, + "loss": 0.3495, + "reward": 2.561880946159363, + "reward_std": 0.6570396423339844, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.011035696603357792, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1804 + }, + { + "clip_ratio": 0.0, + "completion_length": 480.4375, + "epoch": 0.9025, + "grad_norm": 7.229161374154403, + "kl": 2.3125, + "learning_rate": 1.258113290085197e-07, + "loss": 0.405, + "reward": 2.4197758436203003, + "reward_std": 0.7334087789058685, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.03161316737532616, + "rewards/tag_count_reward": 0.875, + "step": 1805 + }, + { + "clip_ratio": 0.0, + "completion_length": 374.2291717529297, + "epoch": 0.903, + "grad_norm": 8.888192201627456, + "kl": 1.01171875, + "learning_rate": 1.2554980392357956e-07, + "loss": 0.5509, + "reward": 2.8383933305740356, + "reward_std": 0.3369765877723694, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.027926414273679256, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1806 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.31251525878906, + "epoch": 0.9035, + "grad_norm": 7.34813847799913, + "kl": 0.875, + "learning_rate": 1.2528957178754676e-07, + "loss": 0.1249, + "reward": 2.66613507270813, + "reward_std": 0.42864419519901276, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.026573394425213337, + "rewards/tag_count_reward": 0.984375, + "step": 1807 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.7291717529297, + "epoch": 0.904, + "grad_norm": 6.090013410671144, + "kl": 0.939453125, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.4151, + "reward": 2.8806700706481934, + "reward_std": 0.2488960325717926, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.025580264627933502, + "rewards/tag_count_reward": 0.96875, + "step": 1808 + }, + { + "clip_ratio": 0.0, + "completion_length": 430.2083435058594, + "epoch": 0.9045, + "grad_norm": 13.573272613458863, + "kl": 1.37890625, + "learning_rate": 1.2477298952911116e-07, + "loss": 0.5799, + "reward": 2.1159850358963013, + "reward_std": 0.4026012271642685, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.014223407953977585, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1809 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.0833435058594, + "epoch": 0.905, + "grad_norm": 8.300631259188407, + "kl": 2.8203125, + "learning_rate": 1.2451664098030743e-07, + "loss": 0.5028, + "reward": 2.1978728771209717, + "reward_std": 0.8104668259620667, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.8333333432674408, + "rewards/repetition_penalty_reward": -0.026085459627211094, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1810 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.7291717529297, + "epoch": 0.9055, + "grad_norm": 16.892046533776394, + "kl": 1.1953125, + "learning_rate": 1.242615885276046e-07, + "loss": 0.5455, + "reward": 2.822051763534546, + "reward_std": 0.5525839030742645, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01822614297270775, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1811 + }, + { + "clip_ratio": 0.0, + "completion_length": 369.9583435058594, + "epoch": 0.906, + "grad_norm": 6.741532569071179, + "kl": 0.923828125, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.3853, + "reward": 2.762416362762451, + "reward_std": 0.38367322087287903, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.025778336450457573, + "rewards/tag_count_reward": 0.96875, + "step": 1812 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.8333435058594, + "epoch": 0.9065, + "grad_norm": 7.797105914296452, + "kl": 1.375, + "learning_rate": 1.2375537501428706e-07, + "loss": 0.3492, + "reward": 2.350590944290161, + "reward_std": 0.4562319219112396, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.015728731639683247, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1813 + }, + { + "clip_ratio": 0.0, + "completion_length": 515.3958435058594, + "epoch": 0.907, + "grad_norm": 16.218331905338545, + "kl": 1.94921875, + "learning_rate": 1.235042154956865e-07, + "loss": 0.4092, + "reward": 2.4498504400253296, + "reward_std": 0.2465880587697029, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.024108044803142548, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1814 + }, + { + "clip_ratio": 0.0, + "completion_length": 463.27085876464844, + "epoch": 0.9075, + "grad_norm": 4.573817188249182, + "kl": 1.50390625, + "learning_rate": 1.232543551572103e-07, + "loss": 0.6756, + "reward": 2.6501909494400024, + "reward_std": 0.4004078805446625, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.025156395509839058, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1815 + }, + { + "clip_ratio": 0.0, + "completion_length": 484.3333435058594, + "epoch": 0.908, + "grad_norm": 21.11225799549021, + "kl": 2.875, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.6141, + "reward": 2.0554409623146057, + "reward_std": 0.6282560527324677, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.854166716337204, + "rewards/repetition_penalty_reward": -0.022684063762426376, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1816 + }, + { + "clip_ratio": 0.0, + "completion_length": 390.3958435058594, + "epoch": 0.9085, + "grad_norm": 4.270688484326959, + "kl": 1.458984375, + "learning_rate": 1.227585350611433e-07, + "loss": 0.4887, + "reward": 2.5446548461914062, + "reward_std": 0.4144492670893669, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026525692082941532, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1817 + }, + { + "clip_ratio": 0.0, + "completion_length": 510.0833435058594, + "epoch": 0.909, + "grad_norm": 10.195671953035172, + "kl": 1.78125, + "learning_rate": 1.2251257681390645e-07, + "loss": 0.5759, + "reward": 2.590057134628296, + "reward_std": 0.579290121793747, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02799857221543789, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1818 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.5208435058594, + "epoch": 0.9095, + "grad_norm": 14.858149785129092, + "kl": 1.1015625, + "learning_rate": 1.2226792076749734e-07, + "loss": 0.3952, + "reward": 2.7983046770095825, + "reward_std": 0.5069041550159454, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.031556460075080395, + "rewards/tag_count_reward": 0.96875, + "step": 1819 + }, + { + "clip_ratio": 0.0, + "completion_length": 498.27085876464844, + "epoch": 0.91, + "grad_norm": 13.474699117275431, + "kl": 2.5390625, + "learning_rate": 1.220245676671809e-07, + "loss": 0.6901, + "reward": 2.2409090995788574, + "reward_std": 0.5821706056594849, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01777158584445715, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1820 + }, + { + "clip_ratio": 0.0, + "completion_length": 321.2083435058594, + "epoch": 0.9105, + "grad_norm": 5.583742600416253, + "kl": 1.087890625, + "learning_rate": 1.2178251825425282e-07, + "loss": 0.3231, + "reward": 2.433494746685028, + "reward_std": 0.37168148159980774, + "rewards/accuracy_reward": 0.5208333544433117, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01963021163828671, + "rewards/tag_count_reward": 0.953125, + "step": 1821 + }, + { + "clip_ratio": 0.0, + "completion_length": 437.35418701171875, + "epoch": 0.911, + "grad_norm": 15.808710038962724, + "kl": 1.85546875, + "learning_rate": 1.2154177326603763e-07, + "loss": 0.3345, + "reward": 1.9915515780448914, + "reward_std": 0.4922345131635666, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.017128958366811275, + "rewards/tag_count_reward": 0.9114583730697632, + "step": 1822 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.66668701171875, + "epoch": 0.9115, + "grad_norm": 4.487639145726479, + "kl": 1.5859375, + "learning_rate": 1.2130233343588623e-07, + "loss": 0.6766, + "reward": 2.4462087154388428, + "reward_std": 0.48641470074653625, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.013860756065696478, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1823 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.5208435058594, + "epoch": 0.912, + "grad_norm": 5.206907795171928, + "kl": 1.11328125, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.298, + "reward": 2.293039083480835, + "reward_std": 0.4323887377977371, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.022933254949748516, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1824 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.7916717529297, + "epoch": 0.9125, + "grad_norm": 3.5810924881234034, + "kl": 1.447265625, + "learning_rate": 1.2082737216329792e-07, + "loss": 0.4868, + "reward": 2.722281336784363, + "reward_std": 0.5233409157954156, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02424659300595522, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1825 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.6875, + "epoch": 0.913, + "grad_norm": 10.549190317425618, + "kl": 1.34765625, + "learning_rate": 1.2059185216767543e-07, + "loss": 0.6111, + "reward": 2.620617389678955, + "reward_std": 0.4774337261915207, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.018271582201123238, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1826 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.50001525878906, + "epoch": 0.9135, + "grad_norm": 8.459023548215319, + "kl": 0.80859375, + "learning_rate": 1.203576402237412e-07, + "loss": 0.186, + "reward": 2.81669545173645, + "reward_std": 0.3177480548620224, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.046151867136359215, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 1827 + }, + { + "clip_ratio": 0.0, + "completion_length": 435.4791717529297, + "epoch": 0.914, + "grad_norm": 10.059114870695714, + "kl": 1.42578125, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.8145, + "reward": 2.4705541133880615, + "reward_std": 0.6846470832824707, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.024237760342657566, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1828 + }, + { + "clip_ratio": 0.0, + "completion_length": 396.3541717529297, + "epoch": 0.9145, + "grad_norm": 7.729647467876799, + "kl": 1.3515625, + "learning_rate": 1.1989314334075144e-07, + "loss": 0.3771, + "reward": 2.610179305076599, + "reward_std": 0.4845607876777649, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.025237280875444412, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1829 + }, + { + "clip_ratio": 0.0, + "completion_length": 404.16668701171875, + "epoch": 0.915, + "grad_norm": 6.446666593330831, + "kl": 1.466796875, + "learning_rate": 1.1966285981663407e-07, + "loss": 0.735, + "reward": 2.597218871116638, + "reward_std": 0.3849441111087799, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01562830712646246, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1830 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.66668701171875, + "epoch": 0.9155, + "grad_norm": 7.699253526367492, + "kl": 1.23046875, + "learning_rate": 1.1943388717407668e-07, + "loss": 0.5849, + "reward": 2.241459369659424, + "reward_std": 0.3354404419660568, + "rewards/accuracy_reward": 0.35416667722165585, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02069345023483038, + "rewards/tag_count_reward": 0.921875, + "step": 1831 + }, + { + "clip_ratio": 0.0, + "completion_length": 341.31251525878906, + "epoch": 0.916, + "grad_norm": 3.9150353521261585, + "kl": 1.09375, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.5796, + "reward": 2.800022602081299, + "reward_std": 0.3668653219938278, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.02810249850153923, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1832 + }, + { + "clip_ratio": 0.0, + "completion_length": 333.31251525878906, + "epoch": 0.9165, + "grad_norm": 6.331026738180072, + "kl": 1.453125, + "learning_rate": 1.1897987731960835e-07, + "loss": 0.5084, + "reward": 2.8525108098983765, + "reward_std": 0.3399234637618065, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01901690987870097, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1833 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.2708435058594, + "epoch": 0.917, + "grad_norm": 7.909012344076658, + "kl": 1.515625, + "learning_rate": 1.1875484149069004e-07, + "loss": 0.9793, + "reward": 2.418446898460388, + "reward_std": 0.4566657245159149, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.013844884466379881, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1834 + }, + { + "clip_ratio": 0.0, + "completion_length": 326.4375, + "epoch": 0.9175, + "grad_norm": 15.090270389095783, + "kl": 1.81640625, + "learning_rate": 1.1853111930931312e-07, + "loss": 0.3326, + "reward": 2.7400245666503906, + "reward_std": 0.5180048495531082, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.01692010648548603, + "rewards/tag_count_reward": 0.9375, + "step": 1835 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.33333587646484, + "epoch": 0.918, + "grad_norm": 14.385789840879672, + "kl": 1.265625, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.276, + "reward": 2.505857825279236, + "reward_std": 0.44202379882335663, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.018447946291416883, + "rewards/tag_count_reward": 0.96875, + "step": 1836 + }, + { + "clip_ratio": 0.0, + "completion_length": 474.4583435058594, + "epoch": 0.9185, + "grad_norm": 22.396059856717127, + "kl": 1.96484375, + "learning_rate": 1.1808761861116589e-07, + "loss": 0.9942, + "reward": 2.2753371000289917, + "reward_std": 0.6203365921974182, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9374999701976776, + "rewards/repetition_penalty_reward": -0.016329674050211906, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1837 + }, + { + "clip_ratio": 0.0, + "completion_length": 442.62501525878906, + "epoch": 0.919, + "grad_norm": 15.472496124027192, + "kl": 1.697265625, + "learning_rate": 1.1786784144537563e-07, + "loss": 0.6291, + "reward": 2.5869717597961426, + "reward_std": 0.6588756740093231, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.034556107595562935, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1838 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.6666717529297, + "epoch": 0.9195, + "grad_norm": 6.6879952807989715, + "kl": 1.015625, + "learning_rate": 1.1764938062908261e-07, + "loss": 0.3457, + "reward": 2.6010701656341553, + "reward_std": 0.44879356026649475, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02740222029387951, + "rewards/tag_count_reward": 0.96875, + "step": 1839 + }, + { + "clip_ratio": 0.0, + "completion_length": 565.2083587646484, + "epoch": 0.92, + "grad_norm": 14.940386937141579, + "kl": 2.140625, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.7608, + "reward": 2.381098985671997, + "reward_std": 0.6445316672325134, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.021678834222257137, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1840 + }, + { + "clip_ratio": 0.0, + "completion_length": 427.81251525878906, + "epoch": 0.9205, + "grad_norm": 7.0097527574764635, + "kl": 1.4765625, + "learning_rate": 1.172164107028549e-07, + "loss": 0.5432, + "reward": 2.608567476272583, + "reward_std": 0.7569788694381714, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.019904857501387596, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1841 + }, + { + "clip_ratio": 0.0, + "completion_length": 314.10418701171875, + "epoch": 0.921, + "grad_norm": 5.8284852901692314, + "kl": 0.796875, + "learning_rate": 1.1700190291182158e-07, + "loss": 0.2706, + "reward": 2.492654800415039, + "reward_std": 0.44027116894721985, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.028178581036627293, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1842 + }, + { + "clip_ratio": 0.0, + "completion_length": 383.10418701171875, + "epoch": 0.9215, + "grad_norm": 11.442490301965112, + "kl": 0.736328125, + "learning_rate": 1.1678871410808454e-07, + "loss": 0.2935, + "reward": 2.5915920734405518, + "reward_std": 0.4983036518096924, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03340807091444731, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1843 + }, + { + "clip_ratio": 0.0, + "completion_length": 519.125, + "epoch": 0.922, + "grad_norm": 14.031885386954286, + "kl": 1.98046875, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.2441, + "reward": 2.374902606010437, + "reward_std": 0.2330544777214527, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.019194713328033686, + "rewards/tag_count_reward": 0.8802083432674408, + "step": 1844 + }, + { + "clip_ratio": 0.0, + "completion_length": 438.875, + "epoch": 0.9225, + "grad_norm": 5.420069805796557, + "kl": 1.453125, + "learning_rate": 1.1636629605611966e-07, + "loss": 0.7929, + "reward": 2.498497247695923, + "reward_std": 0.4927578568458557, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.013655558926984668, + "rewards/tag_count_reward": 0.921875, + "step": 1845 + }, + { + "clip_ratio": 0.0, + "completion_length": 339.35418701171875, + "epoch": 0.923, + "grad_norm": 12.990900222582072, + "kl": 1.109375, + "learning_rate": 1.1615706809465051e-07, + "loss": 0.4619, + "reward": 2.6084574460983276, + "reward_std": 0.43079179525375366, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.02175095770508051, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1846 + }, + { + "clip_ratio": 0.0, + "completion_length": 318.1041717529297, + "epoch": 0.9235, + "grad_norm": 3.6306004992091196, + "kl": 0.671875, + "learning_rate": 1.1594916169399087e-07, + "loss": 0.0754, + "reward": 2.619461178779602, + "reward_std": 0.2529575452208519, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01942774746567011, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1847 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.2083435058594, + "epoch": 0.924, + "grad_norm": 12.262592006970328, + "kl": 1.064453125, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0744, + "reward": 2.376179814338684, + "reward_std": 0.5322179198265076, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.028334129601716995, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1848 + }, + { + "clip_ratio": 0.0, + "completion_length": 389.12501525878906, + "epoch": 0.9245, + "grad_norm": 4.963783636537933, + "kl": 1.248046875, + "learning_rate": 1.1553731610434876e-07, + "loss": 0.308, + "reward": 2.5988192558288574, + "reward_std": 0.4921792149543762, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.031389085575938225, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1849 + }, + { + "clip_ratio": 0.0, + "completion_length": 416.8125, + "epoch": 0.925, + "grad_norm": 11.2545524933753, + "kl": 1.724609375, + "learning_rate": 1.1533337816991931e-07, + "loss": 0.1345, + "reward": 2.312873065471649, + "reward_std": 0.45963311195373535, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.018724264577031136, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1850 + }, + { + "clip_ratio": 0.0, + "completion_length": 400.43751525878906, + "epoch": 0.9255, + "grad_norm": 6.837253831426617, + "kl": 1.1015625, + "learning_rate": 1.1513076430540177e-07, + "loss": 0.6187, + "reward": 2.7513691186904907, + "reward_std": 0.4550359845161438, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.035089388489723206, + "rewards/tag_count_reward": 0.953125, + "step": 1851 + }, + { + "clip_ratio": 0.0, + "completion_length": 349.2708435058594, + "epoch": 0.926, + "grad_norm": 11.15586838979647, + "kl": 0.6845703125, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.3386, + "reward": 2.8566954135894775, + "reward_std": 0.34863437712192535, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.02872132882475853, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1852 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.2291717529297, + "epoch": 0.9265, + "grad_norm": 7.058419371924635, + "kl": 1.044921875, + "learning_rate": 1.1472951125085547e-07, + "loss": 0.2017, + "reward": 2.3934552669525146, + "reward_std": 0.22491255030035973, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02668381668627262, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1853 + }, + { + "clip_ratio": 0.0, + "completion_length": 335.3958435058594, + "epoch": 0.927, + "grad_norm": 8.473916574225584, + "kl": 1.1015625, + "learning_rate": 1.1453087328311299e-07, + "loss": 0.3028, + "reward": 2.344975471496582, + "reward_std": 0.41645242273807526, + "rewards/accuracy_reward": 0.4583333544433117, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.026552507653832436, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1854 + }, + { + "clip_ratio": 0.0, + "completion_length": 267.3541717529297, + "epoch": 0.9275, + "grad_norm": 4.44552019323035, + "kl": 0.564453125, + "learning_rate": 1.1433356182985158e-07, + "loss": 0.0992, + "reward": 2.769820213317871, + "reward_std": 0.2516600340604782, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02184649184346199, + "rewards/tag_count_reward": 1.0, + "step": 1855 + }, + { + "clip_ratio": 0.0, + "completion_length": 496.75, + "epoch": 0.928, + "grad_norm": 6.704669428410124, + "kl": 1.333984375, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.494, + "reward": 2.636578679084778, + "reward_std": 0.5088631808757782, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03008817508816719, + "rewards/tag_count_reward": 0.9375, + "step": 1856 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.9166793823242, + "epoch": 0.9285, + "grad_norm": 5.283796075893927, + "kl": 1.150390625, + "learning_rate": 1.1394292086690874e-07, + "loss": 0.5762, + "reward": 2.6141971349716187, + "reward_std": 0.36808090656995773, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.019483450800180435, + "rewards/tag_count_reward": 0.953125, + "step": 1857 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.7708435058594, + "epoch": 0.929, + "grad_norm": 13.090857743748684, + "kl": 0.96875, + "learning_rate": 1.137495925471875e-07, + "loss": 0.2827, + "reward": 2.789952039718628, + "reward_std": 0.39836449921131134, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.019075598567724228, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1858 + }, + { + "clip_ratio": 0.0, + "completion_length": 289.7708435058594, + "epoch": 0.9295, + "grad_norm": 12.791650212488964, + "kl": 1.384765625, + "learning_rate": 1.1355759312186396e-07, + "loss": 0.0501, + "reward": 2.5625548362731934, + "reward_std": 0.6163710951805115, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.027723138220608234, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1859 + }, + { + "clip_ratio": 0.0, + "completion_length": 343.18751525878906, + "epoch": 0.93, + "grad_norm": 3.6089686265116567, + "kl": 1.65234375, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.4677, + "reward": 2.5494874715805054, + "reward_std": 0.5432776808738708, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.013012669514864683, + "rewards/tag_count_reward": 0.9375, + "step": 1860 + }, + { + "clip_ratio": 0.0, + "completion_length": 363.47918701171875, + "epoch": 0.9305, + "grad_norm": 4.537501307347617, + "kl": 1.193359375, + "learning_rate": 1.1317758328981414e-07, + "loss": 0.4101, + "reward": 2.4980164766311646, + "reward_std": 0.415087066590786, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01760861463844776, + "rewards/tag_count_reward": 0.953125, + "step": 1861 + }, + { + "clip_ratio": 0.0, + "completion_length": 552.3541870117188, + "epoch": 0.931, + "grad_norm": 26.096314917799425, + "kl": 3.484375, + "learning_rate": 1.1298957404066381e-07, + "loss": 0.5845, + "reward": 2.1797789335250854, + "reward_std": 0.6568257510662079, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.7986111044883728, + "rewards/repetition_penalty_reward": -0.014665620867162943, + "rewards/tag_count_reward": 0.8125, + "step": 1862 + }, + { + "clip_ratio": 0.0, + "completion_length": 424.1875, + "epoch": 0.9315, + "grad_norm": 8.73387409565459, + "kl": 1.814453125, + "learning_rate": 1.1280289600105928e-07, + "loss": 0.415, + "reward": 2.678765296936035, + "reward_std": 0.4467965252697468, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.03130429983139038, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1863 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.8750305175781, + "epoch": 0.932, + "grad_norm": 8.913281266427834, + "kl": 1.458984375, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.2109, + "reward": 2.7937296628952026, + "reward_std": 0.3128529414534569, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02918718010187149, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1864 + }, + { + "clip_ratio": 0.0, + "completion_length": 433.8333435058594, + "epoch": 0.9325, + "grad_norm": 6.364728489318106, + "kl": 1.65625, + "learning_rate": 1.1243353582104555e-07, + "loss": 0.4969, + "reward": 2.676972270011902, + "reward_std": 0.4657895863056183, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.029625079594552517, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1865 + }, + { + "clip_ratio": 0.0, + "completion_length": 387.31251525878906, + "epoch": 0.933, + "grad_norm": 13.019299595053573, + "kl": 1.962890625, + "learning_rate": 1.1225085480577158e-07, + "loss": 0.4171, + "reward": 2.349880337715149, + "reward_std": 0.5604653209447861, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.021647500805556774, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1866 + }, + { + "clip_ratio": 0.0, + "completion_length": 293.06251525878906, + "epoch": 0.9335, + "grad_norm": 8.091047902175246, + "kl": 0.810546875, + "learning_rate": 1.1206950725031034e-07, + "loss": 0.2837, + "reward": 2.5777370929718018, + "reward_std": 0.35544297099113464, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.022957434877753258, + "rewards/tag_count_reward": 0.9895833432674408, + "step": 1867 + }, + { + "clip_ratio": 0.0, + "completion_length": 235.00000762939453, + "epoch": 0.934, + "grad_norm": 6.992390350124804, + "kl": 0.6953125, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0188, + "reward": 2.7592689990997314, + "reward_std": 0.17366931587457657, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.011564383283257484, + "rewards/tag_count_reward": 1.0, + "step": 1868 + }, + { + "clip_ratio": 0.0, + "completion_length": 571.7708435058594, + "epoch": 0.9345, + "grad_norm": 21.933887010235615, + "kl": 2.62109375, + "learning_rate": 1.117108147244268e-07, + "loss": 1.082, + "reward": 2.3994613885879517, + "reward_std": 0.9821091592311859, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8333333432674408, + "rewards/repetition_penalty_reward": -0.01720538828521967, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1869 + }, + { + "clip_ratio": 0.0, + "completion_length": 377.81251525878906, + "epoch": 0.935, + "grad_norm": 12.81237348732685, + "kl": 1.54296875, + "learning_rate": 1.1153347084664419e-07, + "loss": 0.5125, + "reward": 2.731147050857544, + "reward_std": 0.4906059801578522, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.02058906713500619, + "rewards/tag_count_reward": 0.953125, + "step": 1870 + }, + { + "clip_ratio": 0.0, + "completion_length": 328.7916717529297, + "epoch": 0.9355, + "grad_norm": 13.632759689287054, + "kl": 1.46875, + "learning_rate": 1.1135746261395021e-07, + "loss": 0.456, + "reward": 2.5591378211975098, + "reward_std": 0.518625944852829, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.0224593966268003, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1871 + }, + { + "clip_ratio": 0.0, + "completion_length": 470.2708435058594, + "epoch": 0.936, + "grad_norm": 7.995683940604791, + "kl": 1.75390625, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.5945, + "reward": 2.374450922012329, + "reward_std": 0.7306532114744186, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.028326891362667084, + "rewards/tag_count_reward": 0.8958333730697632, + "step": 1872 + }, + { + "clip_ratio": 0.0, + "completion_length": 379.47918701171875, + "epoch": 0.9365, + "grad_norm": 8.269844718099108, + "kl": 1.5546875, + "learning_rate": 1.1100945522436453e-07, + "loss": 0.5047, + "reward": 2.2810773849487305, + "reward_std": 0.45627573132514954, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.021005945280194283, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1873 + }, + { + "clip_ratio": 0.0, + "completion_length": 247.1041717529297, + "epoch": 0.937, + "grad_norm": 4.879841129317827, + "kl": 0.658203125, + "learning_rate": 1.1083745712756364e-07, + "loss": 0.0382, + "reward": 2.8617427349090576, + "reward_std": 0.1286549223586917, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.013257297687232494, + "rewards/tag_count_reward": 1.0, + "step": 1874 + }, + { + "clip_ratio": 0.0, + "completion_length": 315.62501525878906, + "epoch": 0.9375, + "grad_norm": 5.772943696708483, + "kl": 1.052734375, + "learning_rate": 1.1066679679602998e-07, + "loss": 0.1094, + "reward": 2.2595136165618896, + "reward_std": 0.6155846416950226, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.02000036695972085, + "rewards/tag_count_reward": 0.953125, + "step": 1875 + }, + { + "clip_ratio": 0.0, + "completion_length": 562.0625152587891, + "epoch": 0.938, + "grad_norm": 12.685880213504184, + "kl": 2.75, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.3528, + "reward": 2.470315456390381, + "reward_std": 0.3357051908969879, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.01926790364086628, + "rewards/tag_count_reward": 0.8645833730697632, + "step": 1876 + }, + { + "clip_ratio": 0.0, + "completion_length": 399.00001525878906, + "epoch": 0.9385, + "grad_norm": 10.487921547989083, + "kl": 1.80078125, + "learning_rate": 1.1032949150413137e-07, + "loss": 0.2994, + "reward": 2.3582355976104736, + "reward_std": 0.48615631461143494, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.025445050559937954, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1877 + }, + { + "clip_ratio": 0.0, + "completion_length": 305.2708435058594, + "epoch": 0.939, + "grad_norm": 13.345702679349012, + "kl": 0.615234375, + "learning_rate": 1.1016284757125685e-07, + "loss": 0.0566, + "reward": 2.74407958984375, + "reward_std": 0.42066872119903564, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.023281564470380545, + "rewards/tag_count_reward": 0.96875, + "step": 1878 + }, + { + "clip_ratio": 0.0, + "completion_length": 381.2291717529297, + "epoch": 0.9395, + "grad_norm": 5.3082591079335915, + "kl": 1.546875, + "learning_rate": 1.099975434586272e-07, + "loss": 0.4069, + "reward": 2.447960615158081, + "reward_std": 0.7231379747390747, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.022525550797581673, + "rewards/tag_count_reward": 0.921875, + "step": 1879 + }, + { + "clip_ratio": 0.0, + "completion_length": 306.3541717529297, + "epoch": 0.94, + "grad_norm": 6.836011499419833, + "kl": 1.158203125, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.2872, + "reward": 2.6273016929626465, + "reward_std": 0.5883876979351044, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.013323335442692041, + "rewards/tag_count_reward": 0.953125, + "step": 1880 + }, + { + "clip_ratio": 0.0, + "completion_length": 385.75001525878906, + "epoch": 0.9405, + "grad_norm": 5.637240680811261, + "kl": 1.46875, + "learning_rate": 1.096709567041997e-07, + "loss": 0.3754, + "reward": 2.4940316677093506, + "reward_std": 0.5485326498746872, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.02332939486950636, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1881 + }, + { + "clip_ratio": 0.0, + "completion_length": 521.2083587646484, + "epoch": 0.941, + "grad_norm": 12.973036573210113, + "kl": 2.0078125, + "learning_rate": 1.0950967505724175e-07, + "loss": 0.3535, + "reward": 2.4711567163467407, + "reward_std": 0.4558318704366684, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.018426863476634026, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1882 + }, + { + "clip_ratio": 0.0, + "completion_length": 384.2291717529297, + "epoch": 0.9415, + "grad_norm": 13.634252652179764, + "kl": 1.35546875, + "learning_rate": 1.0934973522020537e-07, + "loss": 0.5317, + "reward": 2.6534253358840942, + "reward_std": 0.546121746301651, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.020185869187116623, + "rewards/tag_count_reward": 0.9375, + "step": 1883 + }, + { + "clip_ratio": 0.0, + "completion_length": 415.3541717529297, + "epoch": 0.942, + "grad_norm": 7.550722470094486, + "kl": 1.59375, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.5054, + "reward": 2.2631616592407227, + "reward_std": 0.5261791199445724, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.021560687571763992, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1884 + }, + { + "clip_ratio": 0.0, + "completion_length": 407.9375, + "epoch": 0.9425, + "grad_norm": 5.164750350944079, + "kl": 1.28515625, + "learning_rate": 1.0903388292062668e-07, + "loss": 0.6485, + "reward": 2.5368224382400513, + "reward_std": 0.6539618074893951, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.016997200436890125, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1885 + }, + { + "clip_ratio": 0.0, + "completion_length": 282.2916717529297, + "epoch": 0.943, + "grad_norm": 3.8431346982306622, + "kl": 0.546875, + "learning_rate": 1.0887797142022521e-07, + "loss": 0.0839, + "reward": 2.8730608224868774, + "reward_std": 0.1409488208591938, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02277265675365925, + "rewards/tag_count_reward": 1.0, + "step": 1886 + }, + { + "clip_ratio": 0.0, + "completion_length": 459.41668701171875, + "epoch": 0.9435, + "grad_norm": 10.504402038451365, + "kl": 1.92578125, + "learning_rate": 1.0872340365402415e-07, + "loss": 0.7645, + "reward": 2.4082196950912476, + "reward_std": 0.7337851822376251, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.020599967800080776, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1887 + }, + { + "clip_ratio": 0.0, + "completion_length": 287.29168701171875, + "epoch": 0.944, + "grad_norm": 12.459861460775484, + "kl": 1.25, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.1306, + "reward": 2.6860339641571045, + "reward_std": 0.3863665908575058, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.029244041070342064, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1888 + }, + { + "clip_ratio": 0.0, + "completion_length": 351.0833435058594, + "epoch": 0.9445, + "grad_norm": 6.926656405954893, + "kl": 1.1328125, + "learning_rate": 1.0841830120348969e-07, + "loss": 0.3466, + "reward": 2.717449426651001, + "reward_std": 0.533571720123291, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.020397868007421494, + "rewards/tag_count_reward": 0.9739583730697632, + "step": 1889 + }, + { + "clip_ratio": 0.0, + "completion_length": 362.81250762939453, + "epoch": 0.945, + "grad_norm": 7.389820923834504, + "kl": 1.1484375, + "learning_rate": 1.0826776744855121e-07, + "loss": 0.22, + "reward": 2.472840905189514, + "reward_std": 0.3649376714602113, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.018478597048670053, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1890 + }, + { + "clip_ratio": 0.0, + "completion_length": 264.6041793823242, + "epoch": 0.9455, + "grad_norm": 6.258707268362366, + "kl": 0.951171875, + "learning_rate": 1.0811857928660037e-07, + "loss": 0.2564, + "reward": 2.780505061149597, + "reward_std": 0.317706068046391, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01116166659630835, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1891 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.6458435058594, + "epoch": 0.946, + "grad_norm": 6.7122965752871995, + "kl": 1.60546875, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.432, + "reward": 2.605563759803772, + "reward_std": 0.6417441666126251, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.015964028425514698, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1892 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.9791717529297, + "epoch": 0.9465, + "grad_norm": 6.7300045965088735, + "kl": 0.9375, + "learning_rate": 1.0782424155537314e-07, + "loss": 0.2671, + "reward": 2.7784035205841064, + "reward_std": 0.41331613063812256, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.020207691006362438, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1893 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.9166717529297, + "epoch": 0.947, + "grad_norm": 11.551989318050323, + "kl": 1.875, + "learning_rate": 1.0767909288270063e-07, + "loss": 0.5684, + "reward": 2.411288857460022, + "reward_std": 0.6332357153296471, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.01753074210137129, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1894 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.4375305175781, + "epoch": 0.9475, + "grad_norm": 11.540534185730696, + "kl": 2.625, + "learning_rate": 1.0753529159622047e-07, + "loss": 0.8717, + "reward": 2.090899109840393, + "reward_std": 0.831304669380188, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.018475967459380627, + "rewards/tag_count_reward": 0.8177083730697632, + "step": 1895 + }, + { + "clip_ratio": 0.0, + "completion_length": 409.9791717529297, + "epoch": 0.948, + "grad_norm": 4.503871661371665, + "kl": 1.1015625, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.3769, + "reward": 2.603825807571411, + "reward_std": 0.39157669246196747, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.026382511481642723, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1896 + }, + { + "clip_ratio": 0.0, + "completion_length": 481.1875305175781, + "epoch": 0.9485, + "grad_norm": 14.294347009213048, + "kl": 1.68359375, + "learning_rate": 1.0725173292990626e-07, + "loss": 1.1626, + "reward": 2.711683988571167, + "reward_std": 0.6804588735103607, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.008802221855148673, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1897 + }, + { + "clip_ratio": 0.0, + "completion_length": 573.5208435058594, + "epoch": 0.949, + "grad_norm": 7.82241175608747, + "kl": 2.578125, + "learning_rate": 1.0711197641384115e-07, + "loss": 0.8068, + "reward": 2.370753049850464, + "reward_std": 0.9385839700698853, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.847222238779068, + "rewards/repetition_penalty_reward": -0.01813590247184038, + "rewards/tag_count_reward": 0.8333333730697632, + "step": 1898 + }, + { + "clip_ratio": 0.0, + "completion_length": 467.0833435058594, + "epoch": 0.9495, + "grad_norm": 10.691850459801657, + "kl": 1.5625, + "learning_rate": 1.0697356901150353e-07, + "loss": 0.7529, + "reward": 2.5153530836105347, + "reward_std": 0.8187885880470276, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9097222089767456, + "rewards/repetition_penalty_reward": -0.02457757294178009, + "rewards/tag_count_reward": 0.921875, + "step": 1899 + }, + { + "clip_ratio": 0.0, + "completion_length": 548.8958435058594, + "epoch": 0.95, + "grad_norm": 6.406869092852505, + "kl": 2.15234375, + "learning_rate": 1.068365111445064e-07, + "loss": 0.6496, + "reward": 2.331941604614258, + "reward_std": 0.6170355081558228, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.0291697159409523, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1900 + }, + { + "clip_ratio": 0.0, + "completion_length": 595.0000228881836, + "epoch": 0.9505, + "grad_norm": 17.669263047736123, + "kl": 2.453125, + "learning_rate": 1.0670080323035176e-07, + "loss": 0.3439, + "reward": 2.2904654145240784, + "reward_std": 0.47801604866981506, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.01509033516049385, + "rewards/tag_count_reward": 0.875, + "step": 1901 + }, + { + "clip_ratio": 0.0, + "completion_length": 541.0208435058594, + "epoch": 0.951, + "grad_norm": 4.730967706498896, + "kl": 2.078125, + "learning_rate": 1.0656644568242946e-07, + "loss": 0.5377, + "reward": 2.2303082942962646, + "reward_std": 0.7414398193359375, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.028372248634696007, + "rewards/tag_count_reward": 0.890625, + "step": 1902 + }, + { + "clip_ratio": 0.0, + "completion_length": 455.4583435058594, + "epoch": 0.9515, + "grad_norm": 10.877460950090613, + "kl": 1.27734375, + "learning_rate": 1.0643343891001591e-07, + "loss": 0.7656, + "reward": 2.5471227169036865, + "reward_std": 0.412681981921196, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.024057872593402863, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1903 + }, + { + "clip_ratio": 0.0, + "completion_length": 361.4583435058594, + "epoch": 0.952, + "grad_norm": 12.3885461588749, + "kl": 1.26953125, + "learning_rate": 1.063017833182728e-07, + "loss": 0.7706, + "reward": 2.5880606174468994, + "reward_std": 0.756420761346817, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.01610612729564309, + "rewards/tag_count_reward": 0.9375, + "step": 1904 + }, + { + "clip_ratio": 0.0, + "completion_length": 311.9166717529297, + "epoch": 0.9525, + "grad_norm": 10.093151031262789, + "kl": 1.091796875, + "learning_rate": 1.0617147930824586e-07, + "loss": 0.237, + "reward": 2.755587577819824, + "reward_std": 0.5843307077884674, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.016982081811875105, + "rewards/tag_count_reward": 0.9531250298023224, + "step": 1905 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.7708435058594, + "epoch": 0.953, + "grad_norm": 8.18441727922782, + "kl": 1.35546875, + "learning_rate": 1.0604252727686379e-07, + "loss": 0.6793, + "reward": 2.3721930980682373, + "reward_std": 0.6670109927654266, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.023640274070203304, + "rewards/tag_count_reward": 0.9375000298023224, + "step": 1906 + }, + { + "clip_ratio": 0.0, + "completion_length": 332.4583435058594, + "epoch": 0.9535, + "grad_norm": 5.6481670754710835, + "kl": 0.818359375, + "learning_rate": 1.0591492761693674e-07, + "loss": 0.2792, + "reward": 2.5908457040786743, + "reward_std": 0.29737700521945953, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.022001695819199085, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1907 + }, + { + "clip_ratio": 0.0, + "completion_length": 324.2083435058594, + "epoch": 0.954, + "grad_norm": 16.28687030981918, + "kl": 1.1875, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.4517, + "reward": 2.8130040168762207, + "reward_std": 0.47449105978012085, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.01859321352094412, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1908 + }, + { + "clip_ratio": 0.0, + "completion_length": 443.3333435058594, + "epoch": 0.9545, + "grad_norm": 6.091387449777901, + "kl": 1.541015625, + "learning_rate": 1.0566378696208987e-07, + "loss": 0.6145, + "reward": 2.5944347381591797, + "reward_std": 0.6039746999740601, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.020148571580648422, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1909 + }, + { + "clip_ratio": 0.0, + "completion_length": 468.6666717529297, + "epoch": 0.955, + "grad_norm": 5.333170833207152, + "kl": 1.3046875, + "learning_rate": 1.0554024673218806e-07, + "loss": 0.5553, + "reward": 2.528602719306946, + "reward_std": 0.48105429112911224, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03910571709275246, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1910 + }, + { + "clip_ratio": 0.0, + "completion_length": 298.7083435058594, + "epoch": 0.9555, + "grad_norm": 5.486537631035801, + "kl": 1.017578125, + "learning_rate": 1.054180604037749e-07, + "loss": 0.1708, + "reward": 2.73952579498291, + "reward_std": 0.3539083171635866, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03130759112536907, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1911 + }, + { + "clip_ratio": 0.0, + "completion_length": 327.4791717529297, + "epoch": 0.956, + "grad_norm": 9.305310519927465, + "kl": 0.92578125, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.2203, + "reward": 2.675445556640625, + "reward_std": 0.3721562922000885, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.031151846051216125, + "rewards/tag_count_reward": 0.984375, + "step": 1912 + }, + { + "clip_ratio": 0.0, + "completion_length": 364.5416717529297, + "epoch": 0.9565, + "grad_norm": 4.1629931880553315, + "kl": 1.1962890625, + "learning_rate": 1.0517775093609241e-07, + "loss": 0.5035, + "reward": 2.6332170963287354, + "reward_std": 0.21848932653665543, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03865810390561819, + "rewards/tag_count_reward": 0.984375, + "step": 1913 + }, + { + "clip_ratio": 0.0, + "completion_length": 388.91668701171875, + "epoch": 0.957, + "grad_norm": 6.405039325043405, + "kl": 1.72265625, + "learning_rate": 1.0505962852884739e-07, + "loss": 0.5846, + "reward": 2.712180018424988, + "reward_std": 0.5378515720367432, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02045894879847765, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1914 + }, + { + "clip_ratio": 0.0, + "completion_length": 412.83335876464844, + "epoch": 0.9575, + "grad_norm": 16.248265342729503, + "kl": 1.7421875, + "learning_rate": 1.0494286148713744e-07, + "loss": 0.9792, + "reward": 2.438920497894287, + "reward_std": 0.568135529756546, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.014204645762220025, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1915 + }, + { + "clip_ratio": 0.0, + "completion_length": 490.60418701171875, + "epoch": 0.958, + "grad_norm": 13.583903329253506, + "kl": 1.8203125, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.7488, + "reward": 2.5179425477981567, + "reward_std": 0.6293874979019165, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.037613097578287125, + "rewards/tag_count_reward": 0.9166666865348816, + "step": 1916 + }, + { + "clip_ratio": 0.0, + "completion_length": 268.9375, + "epoch": 0.9585, + "grad_norm": 3.8538643099347962, + "kl": 0.916015625, + "learning_rate": 1.0471339491896373e-07, + "loss": 0.3297, + "reward": 2.9169150590896606, + "reward_std": 0.2011337815783918, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.011904461309313774, + "rewards/tag_count_reward": 0.984375, + "step": 1917 + }, + { + "clip_ratio": 0.0, + "completion_length": 373.7083435058594, + "epoch": 0.959, + "grad_norm": 8.19149127254569, + "kl": 1.080078125, + "learning_rate": 1.0460069609149496e-07, + "loss": 0.3442, + "reward": 2.6776570081710815, + "reward_std": 0.4415852725505829, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02546807937324047, + "rewards/tag_count_reward": 0.953125, + "step": 1918 + }, + { + "clip_ratio": 0.0, + "completion_length": 425.1041717529297, + "epoch": 0.9595, + "grad_norm": 7.078016567150192, + "kl": 1.62109375, + "learning_rate": 1.044893540275491e-07, + "loss": 0.5886, + "reward": 2.3200796842575073, + "reward_std": 0.72013059258461, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.02193425875157118, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1919 + }, + { + "clip_ratio": 0.0, + "completion_length": 376.0416717529297, + "epoch": 0.96, + "grad_norm": 3.7664864308299464, + "kl": 1.33203125, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.6744, + "reward": 2.767207145690918, + "reward_std": 0.5426684468984604, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.01925129722803831, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1920 + }, + { + "clip_ratio": 0.0, + "completion_length": 493.375, + "epoch": 0.9605, + "grad_norm": 8.937614698943863, + "kl": 2.099609375, + "learning_rate": 1.0427074154276104e-07, + "loss": 0.5657, + "reward": 2.1462767124176025, + "reward_std": 0.6453874707221985, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.020390215329825878, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1921 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.91668701171875, + "epoch": 0.961, + "grad_norm": 13.323431549636778, + "kl": 2.11328125, + "learning_rate": 1.0416347178785039e-07, + "loss": 0.846, + "reward": 2.5011810064315796, + "reward_std": 0.6241348683834076, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.02138859312981367, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1922 + }, + { + "clip_ratio": 0.0, + "completion_length": 350.37501525878906, + "epoch": 0.9615, + "grad_norm": 7.917549405794224, + "kl": 1.013671875, + "learning_rate": 1.0405756012832367e-07, + "loss": 0.4971, + "reward": 2.590930223464966, + "reward_std": 0.4288761019706726, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02191714197397232, + "rewards/tag_count_reward": 0.9739583730697632, + "step": 1923 + }, + { + "clip_ratio": 0.0, + "completion_length": 309.7916717529297, + "epoch": 0.962, + "grad_norm": 9.512873967750881, + "kl": 1.076171875, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.5754, + "reward": 2.356972813606262, + "reward_std": 0.4319635033607483, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.011082816403359175, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1924 + }, + { + "clip_ratio": 0.0, + "completion_length": 451.14585876464844, + "epoch": 0.9625, + "grad_norm": 7.649494528326308, + "kl": 1.37890625, + "learning_rate": 1.0384981238178533e-07, + "loss": 0.7388, + "reward": 2.306758999824524, + "reward_std": 0.5872194170951843, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.019629981368780136, + "rewards/tag_count_reward": 0.9375, + "step": 1925 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.37501525878906, + "epoch": 0.963, + "grad_norm": 7.3664247239012175, + "kl": 0.87109375, + "learning_rate": 1.0374797692760933e-07, + "loss": 0.29, + "reward": 2.7261266708374023, + "reward_std": 0.4131031781435013, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02040126919746399, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1926 + }, + { + "clip_ratio": 0.0, + "completion_length": 457.72918701171875, + "epoch": 0.9635, + "grad_norm": 9.740860339646286, + "kl": 1.5390625, + "learning_rate": 1.036475008344867e-07, + "loss": 0.8369, + "reward": 2.6725724935531616, + "reward_std": 0.5475434064865112, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.02360812947154045, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1927 + }, + { + "clip_ratio": 0.0, + "completion_length": 526.0208435058594, "epoch": 0.964, - "grad_norm": 72.11734538341996, - "kl": 6.40625, + "grad_norm": 7.32137984381808, + "kl": 2.19140625, "learning_rate": 1.0354838440848501e-07, - "loss": 0.7886, - "reward": 2.2568776607513428, - "reward_std": 0.48433394730091095, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.017428037710487843, - "rewards/tag_count_reward": 0.8645833730697632, - "step": 964 + "loss": 0.8225, + "reward": 2.199436902999878, + "reward_std": 0.6961483359336853, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.008896507322788239, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1928 + }, + { + "clip_ratio": 0.0, + "completion_length": 312.00001525878906, + "epoch": 0.9645, + "grad_norm": 7.548627408249209, + "kl": 1.587890625, + "learning_rate": 1.0345062795153009e-07, + "loss": 0.22, + "reward": 2.5896737575531006, + "reward_std": 0.4959706515073776, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.024909449741244316, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1929 }, { "clip_ratio": 0.0, - "completion_length": 414.0833435058594, + "completion_length": 540.4375152587891, "epoch": 0.965, - "grad_norm": 22.298889854844614, - "kl": 3.390625, + "grad_norm": 4.690334600576642, + "kl": 1.890625, "learning_rate": 1.0335423176140511e-07, - "loss": 0.4795, - "reward": 2.5144201517105103, - "reward_std": 0.6362143456935883, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.016829868778586388, - "rewards/tag_count_reward": 0.9270833432674408, - "step": 965 + "loss": 0.6585, + "reward": 2.1660616397857666, + "reward_std": 0.6110673546791077, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.01796635054051876, + "rewards/tag_count_reward": 0.8854166865348816, + "step": 1930 + }, + { + "clip_ratio": 0.0, + "completion_length": 483.3958435058594, + "epoch": 0.9655, + "grad_norm": 4.681975241875133, + "kl": 1.26953125, + "learning_rate": 1.0325919613174951e-07, + "loss": 0.6228, + "reward": 2.6714521646499634, + "reward_std": 0.6622795760631561, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.02125630248337984, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1931 }, { "clip_ratio": 0.0, - "completion_length": 511.43751525878906, + "completion_length": 317.0208435058594, "epoch": 0.966, - "grad_norm": 21.828332739197077, - "kl": 5.1015625, + "grad_norm": 5.82480472157766, + "kl": 1.294921875, "learning_rate": 1.0316552135205837e-07, - "loss": 0.8053, - "reward": 2.537219762802124, - "reward_std": 0.5350492745637894, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.020071997307240963, - "rewards/tag_count_reward": 0.8906250298023224, - "step": 966 + "loss": 0.2464, + "reward": 2.7479259967803955, + "reward_std": 0.4953030524775386, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.010754720773547888, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1932 + }, + { + "clip_ratio": 0.0, + "completion_length": 331.5416717529297, + "epoch": 0.9665, + "grad_norm": 6.724427674491569, + "kl": 1.34765625, + "learning_rate": 1.0307320770768129e-07, + "loss": 0.0875, + "reward": 2.34666109085083, + "reward_std": 0.47824424505233765, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02660275436937809, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1933 }, { "clip_ratio": 0.0, - "completion_length": 437.9166717529297, + "completion_length": 532.9583435058594, "epoch": 0.967, - "grad_norm": 36.06941047437207, - "kl": 4.046875, + "grad_norm": 12.406632178922738, + "kl": 2.44140625, "learning_rate": 1.029822554798216e-07, - "loss": 0.7437, - "reward": 2.4883724451065063, - "reward_std": 0.756380021572113, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.8958333432674408, - "rewards/repetition_penalty_reward": -0.016835974529385567, - "rewards/tag_count_reward": 0.9218750298023224, - "step": 967 + "loss": 0.7136, + "reward": 2.3831958770751953, + "reward_std": 0.7278265357017517, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.014373642392456532, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 1934 }, { "clip_ratio": 0.0, - "completion_length": 361.47918701171875, + "completion_length": 274.7083435058594, + "epoch": 0.9675, + "grad_norm": 10.363127459700797, + "kl": 0.884765625, + "learning_rate": 1.0289266494553565e-07, + "loss": 0.1206, + "reward": 2.2719321250915527, + "reward_std": 0.19579820428043604, + "rewards/accuracy_reward": 0.2916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.019734575413167477, + "rewards/tag_count_reward": 1.0, + "step": 1935 + }, + { + "clip_ratio": 0.0, + "completion_length": 292.5208435058594, "epoch": 0.968, - "grad_norm": 21.352797175646803, - "kl": 1.361328125, + "grad_norm": 3.7229224037837403, + "kl": 1.138671875, "learning_rate": 1.0280443637773163e-07, - "loss": 0.2428, - "reward": 2.3535938262939453, - "reward_std": 0.34989143908023834, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.017933969385921955, + "loss": 0.1786, + "reward": 2.8120529651641846, + "reward_std": 0.497484490275383, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.01780821569263935, "rewards/tag_count_reward": 0.96875, - "step": 968 + "step": 1936 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.14585876464844, + "epoch": 0.9685, + "grad_norm": 6.842482946515753, + "kl": 1.5859375, + "learning_rate": 1.0271757004516918e-07, + "loss": 0.9308, + "reward": 2.6955126523971558, + "reward_std": 0.6962899565696716, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.016293123364448547, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1937 }, { "clip_ratio": 0.0, - "completion_length": 397.4583435058594, + "completion_length": 431.81251525878906, "epoch": 0.969, - "grad_norm": 35.203792265802775, - "kl": 2.890625, + "grad_norm": 4.528182947858323, + "kl": 1.62890625, "learning_rate": 1.0263206621245807e-07, - "loss": 0.7547, - "reward": 2.354074716567993, - "reward_std": 0.3742067515850067, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.00877267774194479, + "loss": 0.5407, + "reward": 2.654048204421997, + "reward_std": 0.5423067063093185, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.016090850345790386, + "rewards/tag_count_reward": 0.9062500298023224, + "step": 1938 + }, + { + "clip_ratio": 0.0, + "completion_length": 419.2916717529297, + "epoch": 0.9695, + "grad_norm": 10.72838395202056, + "kl": 1.0546875, + "learning_rate": 1.0254792514005792e-07, + "loss": 0.661, + "reward": 2.7793266773223877, + "reward_std": 0.44574533961713314, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.0418538823723793, "rewards/tag_count_reward": 0.953125, - "step": 969 + "step": 1939 }, { "clip_ratio": 0.0, - "completion_length": 422.56251525878906, + "completion_length": 409.1875, "epoch": 0.97, - "grad_norm": 38.23995456003063, - "kl": 4.59375, + "grad_norm": 5.836202176867066, + "kl": 1.3515625, "learning_rate": 1.0246514708427701e-07, - "loss": 0.5101, - "reward": 2.6607662439346313, - "reward_std": 0.5710070729255676, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.014581102412194014, - "rewards/tag_count_reward": 0.9322916865348816, - "step": 970 + "loss": 0.6253, + "reward": 2.5530662536621094, + "reward_std": 0.36100663244724274, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.021586645394563675, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1940 + }, + { + "clip_ratio": 0.0, + "completion_length": 352.56251525878906, + "epoch": 0.9705, + "grad_norm": 4.689568620130673, + "kl": 1.009765625, + "learning_rate": 1.0238373229727166e-07, + "loss": 0.2688, + "reward": 2.7452911138534546, + "reward_std": 0.22191456332802773, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.013389479368925095, + "rewards/tag_count_reward": 0.953125, + "step": 1941 }, { "clip_ratio": 0.0, - "completion_length": 518.6875305175781, + "completion_length": 301.7083435058594, "epoch": 0.971, - "grad_norm": 21.69225757308708, - "kl": 5.203125, + "grad_norm": 6.599491959106032, + "kl": 0.931640625, "learning_rate": 1.0230368102704531e-07, - "loss": 0.8592, - "reward": 2.31989324092865, - "reward_std": 0.6834602952003479, - "rewards/accuracy_reward": 0.5000000298023224, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.018648307770490646, - "rewards/tag_count_reward": 0.8802083432674408, - "step": 971 + "loss": 0.3311, + "reward": 2.8716362714767456, + "reward_std": 0.38382330536842346, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018988667987287045, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1942 + }, + { + "clip_ratio": 0.0, + "completion_length": 561.4375305175781, + "epoch": 0.9715, + "grad_norm": 8.295763130048279, + "kl": 2.08203125, + "learning_rate": 1.022249935174482e-07, + "loss": 1.1474, + "reward": 2.3098472356796265, + "reward_std": 0.7834429144859314, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.8611111640930176, + "rewards/repetition_penalty_reward": -0.020014054141938686, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1943 }, { "clip_ratio": 0.0, - "completion_length": 512.9375, + "completion_length": 446.3125305175781, "epoch": 0.972, - "grad_norm": 35.530778404533244, - "kl": 6.0859375, + "grad_norm": 8.663254346647918, + "kl": 1.36328125, "learning_rate": 1.0214767000817596e-07, - "loss": 0.8544, - "reward": 2.3701701164245605, - "reward_std": 0.8559737205505371, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.8958334028720856, - "rewards/repetition_penalty_reward": -0.02045489940792322, - "rewards/tag_count_reward": 0.8697916865348816, - "step": 972 + "loss": 0.5033, + "reward": 2.553582787513733, + "reward_std": 0.5564675778150558, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03322279639542103, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1944 }, { "clip_ratio": 0.0, - "completion_length": 524.1666870117188, + "completion_length": 355.62501525878906, + "epoch": 0.9725, + "grad_norm": 4.153430704410522, + "kl": 1.0390625, + "learning_rate": 1.0207171073476951e-07, + "loss": 0.247, + "reward": 2.5982961654663086, + "reward_std": 0.31510170828551054, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03712050523608923, + "rewards/tag_count_reward": 0.96875, + "step": 1945 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.6458435058594, "epoch": 0.973, - "grad_norm": 30.66580558458817, - "kl": 8.59375, + "grad_norm": 6.995041095518159, + "kl": 1.1484375, "learning_rate": 1.01997115928614e-07, - "loss": 1.0968, - "reward": 2.2603888511657715, - "reward_std": 0.8733722567558289, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.8888888955116272, - "rewards/repetition_penalty_reward": -0.019125062506645918, - "rewards/tag_count_reward": 0.8072916865348816, - "step": 973 + "loss": 0.5551, + "reward": 2.7796308994293213, + "reward_std": 0.35975193604826927, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.018980273976922035, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1946 + }, + { + "clip_ratio": 0.0, + "completion_length": 431.12501525878906, + "epoch": 0.9735, + "grad_norm": 4.410695064600253, + "kl": 1.57421875, + "learning_rate": 1.0192388581693806e-07, + "loss": 0.6488, + "reward": 2.5324418544769287, + "reward_std": 0.6425078958272934, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.033530326560139656, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1947 }, { "clip_ratio": 0.0, - "completion_length": 598.7916870117188, + "completion_length": 421.6041717529297, "epoch": 0.974, - "grad_norm": 21.256888216031047, - "kl": 6.8125, + "grad_norm": 10.096856330968942, + "kl": 1.796875, "learning_rate": 1.0185202062281336e-07, - "loss": 1.205, - "reward": 2.3526848554611206, - "reward_std": 0.7947600483894348, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9305555522441864, - "rewards/repetition_penalty_reward": -0.02578743826597929, - "rewards/tag_count_reward": 0.8229166865348816, - "step": 974 + "loss": 0.3495, + "reward": 2.2948466539382935, + "reward_std": 0.3842976242303848, + "rewards/accuracy_reward": 0.5000000204890966, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.019389580003917217, + "rewards/tag_count_reward": 0.9114583432674408, + "step": 1948 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.22918701171875, + "epoch": 0.9745, + "grad_norm": 4.028820810607798, + "kl": 1.22265625, + "learning_rate": 1.0178152056515371e-07, + "loss": 0.7743, + "reward": 2.539444088935852, + "reward_std": 0.5672547519207001, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.0161115531809628, + "rewards/tag_count_reward": 0.9375, + "step": 1949 }, { "clip_ratio": 0.0, - "completion_length": 607.625, + "completion_length": 301.3958435058594, "epoch": 0.975, - "grad_norm": 19.685066419282403, - "kl": 6.890625, + "grad_norm": 7.863861707517645, + "kl": 1.05859375, "learning_rate": 1.017123858587145e-07, - "loss": 0.9204, - "reward": 2.2289880514144897, - "reward_std": 0.5278165340423584, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9236111044883728, - "rewards/repetition_penalty_reward": -0.027956443838775158, - "rewards/tag_count_reward": 0.8125000298023224, - "step": 975 + "loss": 0.1674, + "reward": 2.773741364479065, + "reward_std": 0.4449689909815788, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.023133596405386925, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1950 }, { "clip_ratio": 0.0, - "completion_length": 585.8541870117188, + "completion_length": 621.6041870117188, + "epoch": 0.9755, + "grad_norm": 496.05634241089564, + "kl": 2.5625, + "learning_rate": 1.0164461671409212e-07, + "loss": 1.1447, + "reward": 2.413469076156616, + "reward_std": 0.8790780007839203, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.854166716337204, + "rewards/repetition_penalty_reward": -0.018822629936039448, + "rewards/tag_count_reward": 0.8489583730697632, + "step": 1951 + }, + { + "clip_ratio": 0.0, + "completion_length": 449.4166717529297, "epoch": 0.976, - "grad_norm": 21.89405265517972, - "kl": 7.140625, + "grad_norm": 4.37873588820322, + "kl": 1.5078125, "learning_rate": 1.0157821333772304e-07, - "loss": 1.0002, - "reward": 2.272312641143799, - "reward_std": 0.7230339646339417, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.01935403887182474, - "rewards/tag_count_reward": 0.7916666865348816, - "step": 976 + "loss": 0.6414, + "reward": 2.761192560195923, + "reward_std": 0.3843380808830261, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.025265809148550034, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1952 + }, + { + "clip_ratio": 0.0, + "completion_length": 313.8958435058594, + "epoch": 0.9765, + "grad_norm": 7.709851287682846, + "kl": 1.125, + "learning_rate": 1.0151317593188354e-07, + "loss": 0.4473, + "reward": 2.742506980895996, + "reward_std": 0.28937215672340244, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.009229286457411945, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1953 }, { "clip_ratio": 0.0, - "completion_length": 517.0833435058594, + "completion_length": 585.6041717529297, "epoch": 0.977, - "grad_norm": 19.225554130007215, - "kl": 4.0234375, + "grad_norm": 7.1975720249775526, + "kl": 2.0546875, "learning_rate": 1.014495046946888e-07, - "loss": 0.8685, - "reward": 2.6582794189453125, - "reward_std": 0.5991591513156891, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.022276088129729033, - "rewards/tag_count_reward": 0.8958333432674408, - "step": 977 + "loss": 0.6728, + "reward": 2.314143478870392, + "reward_std": 0.5873344540596008, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.026134257204830647, + "rewards/tag_count_reward": 0.8541666865348816, + "step": 1954 + }, + { + "clip_ratio": 0.0, + "completion_length": 461.8958435058594, + "epoch": 0.9775, + "grad_norm": 4.880043762137739, + "kl": 1.34765625, + "learning_rate": 1.0138719982009242e-07, + "loss": 0.6723, + "reward": 2.327051043510437, + "reward_std": 0.6554215252399445, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.021907367277890444, + "rewards/tag_count_reward": 0.9322916865348816, + "step": 1955 }, { "clip_ratio": 0.0, - "completion_length": 436.8333435058594, + "completion_length": 450.6041717529297, "epoch": 0.978, - "grad_norm": 40.49511138880158, - "kl": 2.71875, + "grad_norm": 4.714021383660871, + "kl": 1.515625, "learning_rate": 1.013262614978859e-07, - "loss": 0.8484, - "reward": 2.702500820159912, - "reward_std": 0.6443986296653748, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.024929875042289495, - "rewards/tag_count_reward": 0.9218750298023224, - "step": 978 + "loss": 0.3862, + "reward": 2.6686251163482666, + "reward_std": 0.49254344403743744, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03102776501327753, + "rewards/tag_count_reward": 0.921875, + "step": 1956 }, { "clip_ratio": 0.0, - "completion_length": 469.5416717529297, + "completion_length": 374.9166717529297, + "epoch": 0.9785, + "grad_norm": 4.593658438724619, + "kl": 1.19140625, + "learning_rate": 1.0126668991369792e-07, + "loss": 0.3453, + "reward": 2.5224099159240723, + "reward_std": 0.4157126843929291, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.024465198628604412, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1957 + }, + { + "clip_ratio": 0.0, + "completion_length": 270.00001525878906, "epoch": 0.979, - "grad_norm": 31.345278883427696, - "kl": 3.375, + "grad_norm": 8.529770354299277, + "kl": 0.966796875, "learning_rate": 1.0120848524899386e-07, - "loss": 0.9379, - "reward": 2.366294264793396, - "reward_std": 0.646638810634613, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.0278029702603817, - "rewards/tag_count_reward": 0.8802083730697632, - "step": 979 + "loss": 0.3444, + "reward": 2.6569780111312866, + "reward_std": 0.3028900623321533, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.006216405134182423, + "rewards/tag_count_reward": 0.96875, + "step": 1958 + }, + { + "clip_ratio": 0.0, + "completion_length": 473.10418701171875, + "epoch": 0.9795, + "grad_norm": 5.589279353041215, + "kl": 1.9375, + "learning_rate": 1.0115164768107522e-07, + "loss": 0.7369, + "reward": 2.5905479192733765, + "reward_std": 0.7043185234069824, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.025771519169211388, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1959 }, { "clip_ratio": 0.0, - "completion_length": 448.4583435058594, + "completion_length": 253.41667938232422, "epoch": 0.98, - "grad_norm": 14.372103163574602, - "kl": 2.703125, + "grad_norm": 4.817368128682914, + "kl": 0.62109375, "learning_rate": 1.0109617738307911e-07, - "loss": 0.4298, - "reward": 2.4362382888793945, - "reward_std": 0.5597494542598724, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.03251180611550808, - "rewards/tag_count_reward": 0.9062500298023224, - "step": 980 + "loss": 0.0064, + "reward": 2.6383495330810547, + "reward_std": 0.2589970678091049, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0213725995272398, + "rewards/tag_count_reward": 1.0, + "step": 1960 + }, + { + "clip_ratio": 0.0, + "completion_length": 366.6041717529297, + "epoch": 0.9805, + "grad_norm": 10.352217018184477, + "kl": 1.03125, + "learning_rate": 1.0104207452397761e-07, + "loss": 0.6742, + "reward": 2.5156456232070923, + "reward_std": 0.5356017798185349, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.015604355372488499, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1961 + }, + { + "clip_ratio": 0.0, + "completion_length": 340.18750762939453, + "epoch": 0.981, + "grad_norm": 5.328236970495506, + "kl": 0.94921875, + "learning_rate": 1.0098933926857752e-07, + "loss": 0.3793, + "reward": 2.6204140186309814, + "reward_std": 0.24081332981586456, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02541936282068491, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1962 + }, + { + "clip_ratio": 0.0, + "completion_length": 710.3125305175781, + "epoch": 0.9815, + "grad_norm": 17.365217085343485, + "kl": 2.859375, + "learning_rate": 1.0093797177751944e-07, + "loss": 1.1361, + "reward": 2.3238528966903687, + "reward_std": 0.8956755101680756, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.018161091022193432, + "rewards/tag_count_reward": 0.8281250298023224, + "step": 1963 }, { "clip_ratio": 0.0, - "completion_length": 372.0208435058594, - "epoch": 0.981, - "grad_norm": 40.416913288237026, - "kl": 2.2578125, - "learning_rate": 1.0098933926857752e-07, - "loss": 0.4458, - "reward": 2.7070990800857544, - "reward_std": 0.5603702366352081, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9444444477558136, - "rewards/repetition_penalty_reward": -0.018595370464026928, - "rewards/tag_count_reward": 0.9270833432674408, - "step": 981 + "completion_length": 306.7916717529297, + "epoch": 0.982, + "grad_norm": 5.0392308029500725, + "kl": 0.810546875, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.3007, + "reward": 2.6658294200897217, + "reward_std": 0.208114517852664, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.026879037730395794, + "rewards/tag_count_reward": 0.984375, + "step": 1964 }, { "clip_ratio": 0.0, - "completion_length": 503.1666717529297, - "epoch": 0.982, - "grad_norm": 23.377929734927985, - "kl": 4.28125, - "learning_rate": 1.0088797220727779e-07, - "loss": 1.1209, - "reward": 2.3728084564208984, - "reward_std": 0.7684433162212372, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.03170543722808361, - "rewards/tag_count_reward": 0.8906250298023224, - "step": 982 + "completion_length": 291.56251525878906, + "epoch": 0.9825, + "grad_norm": 7.335928965442494, + "kl": 0.9375, + "learning_rate": 1.0083934071015988e-07, + "loss": 0.0943, + "reward": 2.629801034927368, + "reward_std": 0.2985463812947273, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02297679055482149, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1965 }, { "clip_ratio": 0.0, - "completion_length": 457.6250305175781, + "completion_length": 289.5416793823242, "epoch": 0.983, - "grad_norm": 23.345141824922493, - "kl": 3.3046875, + "grad_norm": 8.189186451005964, + "kl": 1.376953125, "learning_rate": 1.007920774343056e-07, - "loss": 0.7212, - "reward": 2.3101412057876587, - "reward_std": 0.5889628529548645, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.016247691586613655, - "rewards/tag_count_reward": 0.8958333432674408, - "step": 983 + "loss": 0.051, + "reward": 2.853655695915222, + "reward_std": 0.26282477006316185, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04391396418213844, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1966 + }, + { + "clip_ratio": 0.0, + "completion_length": 345.4583435058594, + "epoch": 0.9835, + "grad_norm": 8.561365760313354, + "kl": 2.14453125, + "learning_rate": 1.0074618252368726e-07, + "loss": 0.5123, + "reward": 2.6916539669036865, + "reward_std": 0.6342622339725494, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.02015175297856331, + "rewards/tag_count_reward": 0.9270833432674408, + "step": 1967 }, { "clip_ratio": 0.0, - "completion_length": 370.7708435058594, + "completion_length": 316.7291717529297, "epoch": 0.984, - "grad_norm": 42.40361638319742, - "kl": 2.703125, + "grad_norm": 5.9460907756296475, + "kl": 1.0078125, "learning_rate": 1.0070165611810855e-07, - "loss": 0.4711, - "reward": 2.4439754486083984, - "reward_std": 0.4780399203300476, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02130233682692051, + "loss": 0.3107, + "reward": 2.4366408586502075, + "reward_std": 0.44192972034215927, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.016484168358147144, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1968 + }, + { + "clip_ratio": 0.0, + "completion_length": 359.85418701171875, + "epoch": 0.9845, + "grad_norm": 3.5890920698204845, + "kl": 1.2109375, + "learning_rate": 1.0065849835320473e-07, + "loss": 0.3065, + "reward": 2.7752894163131714, + "reward_std": 0.41077224910259247, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03721063770353794, "rewards/tag_count_reward": 0.9583333432674408, - "step": 984 + "step": 1969 }, { "clip_ratio": 0.0, - "completion_length": 569.6666870117188, + "completion_length": 339.6041717529297, "epoch": 0.985, - "grad_norm": 41.35640722099136, - "kl": 6.78125, + "grad_norm": 4.603168506552496, + "kl": 0.732421875, "learning_rate": 1.0061670936044178e-07, - "loss": 1.0564, - "reward": 2.4486454725265503, - "reward_std": 0.6525047123432159, - "rewards/accuracy_reward": 0.6458333432674408, + "loss": 0.1274, + "reward": 2.7143149375915527, + "reward_std": 0.3336441293358803, + "rewards/accuracy_reward": 0.7708333432674408, "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.020104597322642803, - "rewards/tag_count_reward": 0.8437500298023224, - "step": 985 + "rewards/repetition_penalty_reward": -0.020060266833752394, + "rewards/tag_count_reward": 0.984375, + "step": 1970 + }, + { + "clip_ratio": 0.0, + "completion_length": 320.125, + "epoch": 0.9855, + "grad_norm": 8.307150756656938, + "kl": 0.787109375, + "learning_rate": 1.0057628926711624e-07, + "loss": 0.308, + "reward": 2.7315473556518555, + "reward_std": 0.42569366097450256, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.016716606449335814, + "rewards/tag_count_reward": 0.984375, + "step": 1971 }, { "clip_ratio": 0.0, - "completion_length": 408.7916717529297, + "completion_length": 348.2291717529297, "epoch": 0.986, - "grad_norm": 47.381447307488855, - "kl": 4.13671875, + "grad_norm": 8.306320282400852, + "kl": 1.294921875, "learning_rate": 1.005372381963547e-07, - "loss": 0.6591, - "reward": 2.723273992538452, - "reward_std": 0.3431188315153122, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.023253954481333494, - "rewards/tag_count_reward": 0.9270833730697632, - "step": 986 + "loss": 0.2106, + "reward": 2.2555699348449707, + "reward_std": 0.2516388399526477, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.9791666269302368, + "rewards/repetition_penalty_reward": -0.01526356441900134, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1972 }, { "clip_ratio": 0.0, - "completion_length": 401.1041717529297, + "completion_length": 338.18751525878906, + "epoch": 0.9865, + "grad_norm": 5.448252350820911, + "kl": 1.0390625, + "learning_rate": 1.0049955626711354e-07, + "loss": 0.2096, + "reward": 2.6255258321762085, + "reward_std": 0.24584404285997152, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.013363123405724764, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1973 + }, + { + "clip_ratio": 0.0, + "completion_length": 401.6458435058594, "epoch": 0.987, - "grad_norm": 64.30330810008361, - "kl": 2.298828125, + "grad_norm": 6.478360593950035, + "kl": 1.6171875, "learning_rate": 1.0046324359417842e-07, - "loss": 0.4446, - "reward": 2.5345929861068726, - "reward_std": 0.383854431565851, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.026170868426561356, - "rewards/tag_count_reward": 0.9635416865348816, - "step": 987 + "loss": 0.3997, + "reward": 2.263159990310669, + "reward_std": 0.6182901561260223, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.026770692318677902, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1974 + }, + { + "clip_ratio": 0.0, + "completion_length": 285.00001525878906, + "epoch": 0.9875, + "grad_norm": 9.459666746367304, + "kl": 0.9453125, + "learning_rate": 1.0042830028816399e-07, + "loss": 0.1263, + "reward": 2.572731137275696, + "reward_std": 0.37718044966459274, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.014074573758989573, + "rewards/tag_count_reward": 0.96875, + "step": 1975 }, { "clip_ratio": 0.0, - "completion_length": 508.54168701171875, + "completion_length": 276.2708435058594, "epoch": 0.988, - "grad_norm": 18.399449871940522, - "kl": 5.25, + "grad_norm": 4.3871213558384845, + "kl": 0.861328125, "learning_rate": 1.0039472645551372e-07, - "loss": 0.7395, - "reward": 2.3243101835250854, - "reward_std": 0.6121802628040314, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.035064928233623505, - "rewards/tag_count_reward": 0.9010416865348816, - "step": 988 + "loss": 0.0517, + "reward": 2.809394598007202, + "reward_std": 0.18590081203728914, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.010049775708466768, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1976 + }, + { + "clip_ratio": 0.0, + "completion_length": 232.52084350585938, + "epoch": 0.9885, + "grad_norm": 8.470772145588477, + "kl": 0.68359375, + "learning_rate": 1.0036252219849932e-07, + "loss": 0.0676, + "reward": 2.9624515771865845, + "reward_std": 0.08178183203563094, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.01671524066478014, + "rewards/tag_count_reward": 1.0, + "step": 1977 }, { "clip_ratio": 0.0, - "completion_length": 360.4583435058594, + "completion_length": 424.5208435058594, "epoch": 0.989, - "grad_norm": 23.549397740992113, - "kl": 1.0419921875, + "grad_norm": 5.674149225074641, + "kl": 1.59375, "learning_rate": 1.0033168761522048e-07, - "loss": 0.3115, - "reward": 2.6984081268310547, - "reward_std": 0.3867932856082916, + "loss": 0.499, + "reward": 2.5746582746505737, + "reward_std": 0.4873664379119873, "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015133682638406754, - "rewards/tag_count_reward": 0.984375, - "step": 989 + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.02777231764048338, + "rewards/tag_count_reward": 0.921875, + "step": 1978 }, { "clip_ratio": 0.0, - "completion_length": 415.16668701171875, + "completion_length": 412.8958435058594, + "epoch": 0.9895, + "grad_norm": 5.813467245598919, + "kl": 1.2734375, + "learning_rate": 1.0030222279960469e-07, + "loss": 0.6803, + "reward": 2.581472635269165, + "reward_std": 0.5482227504253387, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02790247928351164, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1979 + }, + { + "clip_ratio": 0.0, + "completion_length": 454.9375, "epoch": 0.99, - "grad_norm": 33.19104469693633, - "kl": 4.4765625, + "grad_norm": 6.839115137637116, + "kl": 1.765625, "learning_rate": 1.002741278414069e-07, - "loss": 0.6084, - "reward": 2.583367347717285, - "reward_std": 0.5392884910106659, + "loss": 0.643, + "reward": 2.5423187017440796, + "reward_std": 0.4872732013463974, "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.019063206389546394, - "rewards/tag_count_reward": 0.9218750298023224, - "step": 990 + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.021917639300227165, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1980 + }, + { + "clip_ratio": 0.0, + "completion_length": 426.7083435058594, + "epoch": 0.9905, + "grad_norm": 12.357127431821414, + "kl": 1.1328125, + "learning_rate": 1.002474028262093e-07, + "loss": 0.6164, + "reward": 2.5697481632232666, + "reward_std": 0.3590293526649475, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.0205296752974391, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1981 }, { "clip_ratio": 0.0, - "completion_length": 623.3125305175781, + "completion_length": 442.4583435058594, "epoch": 0.991, - "grad_norm": 80.63884599695989, - "kl": 9.921875, + "grad_norm": 8.104455391245587, + "kl": 1.765625, "learning_rate": 1.0022204783542078e-07, - "loss": 1.1105, - "reward": 2.0245230197906494, - "reward_std": 0.5311869978904724, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.9097222089767456, - "rewards/repetition_penalty_reward": -0.010199269745498896, - "rewards/tag_count_reward": 0.7916666865348816, - "step": 991 + "loss": 1.1146, + "reward": 2.4505070447921753, + "reward_std": 0.7328682243824005, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.02692350000143051, + "rewards/tag_count_reward": 0.9010416865348816, + "step": 1982 + }, + { + "clip_ratio": 0.0, + "completion_length": 360.2083435058594, + "epoch": 0.9915, + "grad_norm": 7.862692984107953, + "kl": 0.900390625, + "learning_rate": 1.001980629462772e-07, + "loss": 0.4929, + "reward": 2.743220090866089, + "reward_std": 0.480159193277359, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02414090372622013, + "rewards/tag_count_reward": 0.9687500298023224, + "step": 1983 }, { "clip_ratio": 0.0, - "completion_length": 506.25, + "completion_length": 435.0, "epoch": 0.992, - "grad_norm": 18.011334199256787, - "kl": 5.390625, + "grad_norm": 8.155193647156521, + "kl": 1.6015625, "learning_rate": 1.0017544823184055e-07, - "loss": 0.9891, - "reward": 2.471301794052124, - "reward_std": 0.5606184005737305, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.014809360727667809, - "rewards/tag_count_reward": 0.8958333730697632, - "step": 992 + "loss": 0.4418, + "reward": 2.546668291091919, + "reward_std": 0.5175595879554749, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.0210402044467628, + "rewards/tag_count_reward": 0.9218750298023224, + "step": 1984 + }, + { + "clip_ratio": 0.0, + "completion_length": 307.6458435058594, + "epoch": 0.9925, + "grad_norm": 11.349064132553734, + "kl": 1.390625, + "learning_rate": 1.0015420376099923e-07, + "loss": 0.395, + "reward": 2.6141462326049805, + "reward_std": 0.3557308465242386, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01953439600765705, + "rewards/tag_count_reward": 0.9739583432674408, + "step": 1985 }, { "clip_ratio": 0.0, - "completion_length": 409.43751525878906, + "completion_length": 390.625, "epoch": 0.993, - "grad_norm": 28.657341564218928, - "kl": 1.66796875, + "grad_norm": 3.8219642488097927, + "kl": 1.1328125, "learning_rate": 1.001343295984676e-07, - "loss": 0.5813, - "reward": 2.359801173210144, - "reward_std": 0.4328533709049225, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.01867112284526229, - "rewards/tag_count_reward": 0.9687500298023224, - "step": 993 + "loss": 0.4817, + "reward": 2.7117778062820435, + "reward_std": 0.5155874937772751, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.034749857150018215, + "rewards/tag_count_reward": 0.9479166865348816, + "step": 1986 + }, + { + "clip_ratio": 0.0, + "completion_length": 606.7916870117188, + "epoch": 0.9935, + "grad_norm": 14.173929864366983, + "kl": 2.625, + "learning_rate": 1.0011582580478576e-07, + "loss": 0.6239, + "reward": 2.241394519805908, + "reward_std": 0.7578141689300537, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.025966664776206017, + "rewards/tag_count_reward": 0.8645833432674408, + "step": 1987 }, { "clip_ratio": 0.0, - "completion_length": 437.9583435058594, + "completion_length": 324.3541793823242, "epoch": 0.994, - "grad_norm": 14.15639088003171, - "kl": 2.4453125, + "grad_norm": 5.893395799651384, + "kl": 1.6328125, "learning_rate": 1.0009869243631952e-07, - "loss": 0.6178, - "reward": 2.6296534538269043, - "reward_std": 0.6427464187145233, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.012707796646282077, - "rewards/tag_count_reward": 0.9479166865348816, - "step": 994 + "loss": 0.4473, + "reward": 2.572134852409363, + "reward_std": 0.43454277515411377, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.016406969632953405, + "rewards/tag_count_reward": 0.9427083432674408, + "step": 1988 + }, + { + "clip_ratio": 0.0, + "completion_length": 365.5416793823242, + "epoch": 0.9945, + "grad_norm": 10.350146304014267, + "kl": 0.990234375, + "learning_rate": 1.000829295452601e-07, + "loss": 0.4707, + "reward": 2.8346047401428223, + "reward_std": 0.31515760254114866, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02650655061006546, + "rewards/tag_count_reward": 0.9791666865348816, + "step": 1989 }, { "clip_ratio": 0.0, - "completion_length": 422.2083435058594, + "completion_length": 510.5208435058594, "epoch": 0.995, - "grad_norm": 21.464668161928536, - "kl": 2.0234375, + "grad_norm": 10.609712135513492, + "kl": 1.94140625, "learning_rate": 1.0006853717962393e-07, - "loss": 0.5771, - "reward": 2.529154658317566, - "reward_std": 0.3804011940956116, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015984368277713656, - "rewards/tag_count_reward": 0.9479166865348816, - "step": 995 + "loss": 0.4845, + "reward": 2.408261299133301, + "reward_std": 0.6599603295326233, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.010141530307009816, + "rewards/tag_count_reward": 0.8906250298023224, + "step": 1990 + }, + { + "clip_ratio": 0.0, + "completion_length": 329.87501525878906, + "epoch": 0.9955, + "grad_norm": 8.570082105600441, + "kl": 1.505859375, + "learning_rate": 1.0005551538325274e-07, + "loss": 0.2282, + "reward": 2.8254886865615845, + "reward_std": 0.2351871496066451, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02173378225415945, + "rewards/tag_count_reward": 0.9583333432674408, + "step": 1991 }, { "clip_ratio": 0.0, - "completion_length": 450.04168701171875, + "completion_length": 315.1666717529297, "epoch": 0.996, - "grad_norm": 29.84614498881874, - "kl": 3.0390625, + "grad_norm": 6.189976928471374, + "kl": 0.90234375, "learning_rate": 1.000438641958131e-07, - "loss": 0.9596, - "reward": 2.2574844360351562, - "reward_std": 0.5413917303085327, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.015085038263350725, - "rewards/tag_count_reward": 0.9322916865348816, - "step": 996 + "loss": 0.2172, + "reward": 2.61415696144104, + "reward_std": 0.2505191368982196, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.022995917359367013, + "rewards/tag_count_reward": 0.984375, + "step": 1992 }, { "clip_ratio": 0.0, - "completion_length": 411.0208435058594, + "completion_length": 506.27085876464844, + "epoch": 0.9965, + "grad_norm": 9.237622275005121, + "kl": 4.1015625, + "learning_rate": 1.0003358365279661e-07, + "loss": 0.5888, + "reward": 2.3739354014396667, + "reward_std": 0.7014772593975067, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.020161897875368595, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 1993 + }, + { + "clip_ratio": 0.0, + "completion_length": 502.6458435058594, "epoch": 0.997, - "grad_norm": 22.404250638364193, - "kl": 4.38671875, + "grad_norm": 7.126787307275001, + "kl": 1.69140625, "learning_rate": 1.0002467378551954e-07, - "loss": 0.5886, - "reward": 2.3560396432876587, - "reward_std": 0.5860812664031982, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.013752035796642303, - "rewards/tag_count_reward": 0.9114583730697632, - "step": 997 + "loss": 0.7538, + "reward": 2.45101535320282, + "reward_std": 0.8071577250957489, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.021206957288086414, + "rewards/tag_count_reward": 0.8958333432674408, + "step": 1994 }, { "clip_ratio": 0.0, - "completion_length": 406.2083435058594, + "completion_length": 306.50001525878906, + "epoch": 0.9975, + "grad_norm": 4.0754860395849875, + "kl": 1.087890625, + "learning_rate": 1.000171346211229e-07, + "loss": 0.2516, + "reward": 2.844114303588867, + "reward_std": 0.32640238106250763, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.027413712814450264, + "rewards/tag_count_reward": 0.96875, + "step": 1995 + }, + { + "clip_ratio": 0.0, + "completion_length": 275.5208435058594, "epoch": 0.998, - "grad_norm": 14.675909635758037, - "kl": 4.03125, + "grad_norm": 10.126201588856906, + "kl": 0.859375, "learning_rate": 1.0001096618257236e-07, - "loss": 0.5068, - "reward": 2.554211378097534, - "reward_std": 0.657451719045639, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9444444477558136, - "rewards/repetition_penalty_reward": -0.025649813003838062, - "rewards/tag_count_reward": 0.9062500298023224, - "step": 998 + "loss": 0.3494, + "reward": 2.716609239578247, + "reward_std": 0.06195330573245883, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017765806522220373, + "rewards/tag_count_reward": 0.984375, + "step": 1996 + }, + { + "clip_ratio": 0.0, + "completion_length": 413.4791717529297, + "epoch": 0.9985, + "grad_norm": 8.06318792454859, + "kl": 1.3515625, + "learning_rate": 1.0000616848865797e-07, + "loss": 0.4413, + "reward": 2.815260648727417, + "reward_std": 0.36746685206890106, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02675328589975834, + "rewards/tag_count_reward": 0.953125, + "step": 1997 }, { "clip_ratio": 0.0, - "completion_length": 529.0416717529297, + "completion_length": 338.9583435058594, "epoch": 0.999, - "grad_norm": 25.41791128417754, - "kl": 4.296875, + "grad_norm": 5.2700476094611615, + "kl": 1.13671875, "learning_rate": 1.0000274155399433e-07, - "loss": 0.7834, - "reward": 2.529424786567688, - "reward_std": 0.7073009014129639, + "loss": 0.3144, + "reward": 2.762750267982483, + "reward_std": 0.3325686603784561, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.020235874690115452, + "rewards/tag_count_reward": 0.9635416865348816, + "step": 1998 + }, + { + "clip_ratio": 0.0, + "completion_length": 535.2083435058594, + "epoch": 0.9995, + "grad_norm": 3.755049968060971, + "kl": 1.91015625, + "learning_rate": 1.0000068538902053e-07, + "loss": 0.7273, + "reward": 2.472402811050415, + "reward_std": 0.8260309398174286, "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.019186487421393394, - "rewards/tag_count_reward": 0.8958333432674408, - "step": 999 + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.013708289712667465, + "rewards/tag_count_reward": 0.8750000298023224, + "step": 1999 }, { "clip_ratio": 0.0, - "completion_length": 462.625, + "completion_length": 969.0625, "epoch": 1.0, - "grad_norm": 21.030282475146205, - "kl": 3.8828125, + "grad_norm": 41.67279675930041, + "kl": 2.5078125, "learning_rate": 1e-07, - "loss": 1.0549, - "reward": 2.609420418739319, - "reward_std": 0.6920914947986603, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.012107519898563623, - "rewards/tag_count_reward": 0.9270833432674408, - "step": 1000 + "loss": 0.6262, + "reward": 2.2097694873809814, + "reward_std": 0.4844767898321152, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.010716728633269668, + "rewards/tag_count_reward": 0.8593750298023224, + "step": 2000 }, { "epoch": 1.0, - "step": 1000, + "step": 2000, "total_flos": 0.0, - "train_loss": 0.49418719741604583, - "train_runtime": 57748.1799, - "train_samples_per_second": 0.069, - "train_steps_per_second": 0.017 + "train_loss": 1.366488031092214, + "train_runtime": 85280.8428, + "train_samples_per_second": 0.094, + "train_steps_per_second": 0.023 } ], "logging_steps": 1, - "max_steps": 1000, + "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200,