diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,16012 +10,16012 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 483.91668701171875, + "completion_length": 503.75, "epoch": 0.001, - "grad_norm": 2.745151802079118, + "grad_norm": 2.9832106604609847, "kl": 0.0, "learning_rate": 1e-08, - "loss": -0.0775, - "reward": 0.8227441906929016, - "reward_std": 0.25334322452545166, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3680555522441864, - "rewards/repetition_penalty_reward": -0.0869779996573925, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.0412, + "reward": 1.165066421031952, + "reward_std": 0.4164372831583023, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.6388888955116272, + "rewards/repetition_penalty_reward": -0.03632250800728798, + "rewards/tag_count_reward": 0.5208333730697632, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 461.93751525878906, + "completion_length": 601.9375305175781, "epoch": 0.002, - "grad_norm": 2.4442255719251174, + "grad_norm": 2.6050906286301534, "kl": 0.0, "learning_rate": 2e-08, - "loss": 0.092, - "reward": 0.7306002080440521, - "reward_std": 0.2853996157646179, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3055555820465088, - "rewards/repetition_penalty_reward": -0.11141369119286537, - "rewards/tag_count_reward": 0.515625, + "loss": -0.0073, + "reward": 1.1193318367004395, + "reward_std": 0.2575419098138809, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.659722238779068, + "rewards/repetition_penalty_reward": -0.0508070383220911, + "rewards/tag_count_reward": 0.5104166865348816, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 418.2083435058594, + "completion_length": 624.4583740234375, "epoch": 0.003, - "grad_norm": 2.689175305758079, - "kl": 0.000141143798828125, + "grad_norm": 2.4392990354195887, + "kl": 0.00021076202392578125, "learning_rate": 3e-08, - "loss": 0.0035, - "reward": 0.6592943966388702, - "reward_std": 0.2810935229063034, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.2222222313284874, - "rewards/repetition_penalty_reward": -0.09417785331606865, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": 0.0166, + "reward": 1.1063403487205505, + "reward_std": 0.29120244085788727, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.6597222089767456, + "rewards/repetition_penalty_reward": -0.0533819030970335, + "rewards/tag_count_reward": 0.5, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 426.50001525878906, + "completion_length": 505.97918701171875, "epoch": 0.004, - "grad_norm": 2.689818650779635, - "kl": 0.00014209747314453125, + "grad_norm": 2.6745634307308377, + "kl": 0.000171661376953125, "learning_rate": 4e-08, - "loss": -0.0387, - "reward": 0.6954131722450256, - "reward_std": 0.2756202667951584, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.270833358168602, - "rewards/repetition_penalty_reward": -0.10667016357183456, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0265, + "reward": 1.1932607293128967, + "reward_std": 0.3999044597148895, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.6527777910232544, + "rewards/repetition_penalty_reward": -0.04805881343781948, + "rewards/tag_count_reward": 0.5260416865348816, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 499.375, + "completion_length": 604.6458435058594, "epoch": 0.005, - "grad_norm": 2.6048887694113243, - "kl": 0.00014638900756835938, + "grad_norm": 2.7835442273591835, + "kl": 0.00020742416381835938, "learning_rate": 5e-08, - "loss": -0.0091, - "reward": 0.7254918217658997, - "reward_std": 0.22710905969142914, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3194444477558136, - "rewards/repetition_penalty_reward": -0.09916100278496742, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0375, + "reward": 1.0637533068656921, + "reward_std": 0.3286152780056, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.569444477558136, + "rewards/repetition_penalty_reward": -0.04735783860087395, + "rewards/tag_count_reward": 0.5208333432674408, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 465.62501525878906, + "completion_length": 487.22918701171875, "epoch": 0.006, - "grad_norm": 2.540290969031232, - "kl": 0.00015306472778320312, + "grad_norm": 3.0412926302974874, + "kl": 0.00020647048950195312, "learning_rate": 6e-08, - "loss": -0.037, - "reward": 0.7006559371948242, - "reward_std": 0.23147724568843842, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.2916666865348816, - "rewards/repetition_penalty_reward": -0.09621911868453026, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0389, + "reward": 1.0956225991249084, + "reward_std": 0.38825175166130066, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.604166716337204, + "rewards/repetition_penalty_reward": -0.039794087409973145, + "rewards/tag_count_reward": 0.5104166865348816, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 397.16668701171875, + "completion_length": 613.5833435058594, "epoch": 0.007, - "grad_norm": 2.8056740585468534, - "kl": 0.0001430511474609375, + "grad_norm": 2.444811156842584, + "kl": 0.00019121170043945312, "learning_rate": 7e-08, - "loss": -0.0425, - "reward": 0.8157726526260376, - "reward_std": 0.2988554313778877, + "loss": -0.013, + "reward": 1.1998452544212341, + "reward_std": 0.3531786799430847, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3125000149011612, - "rewards/repetition_penalty_reward": -0.06964396685361862, - "rewards/tag_count_reward": 0.5312500298023224, + "rewards/reasoning_steps_reward": 0.6736111640930176, + "rewards/repetition_penalty_reward": -0.05189092084765434, + "rewards/tag_count_reward": 0.5364583730697632, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 452.375, + "completion_length": 526.9166870117188, "epoch": 0.008, - "grad_norm": 2.7813986320550557, - "kl": 0.00020360946655273438, + "grad_norm": 2.577759663416605, + "kl": 0.00021648406982421875, "learning_rate": 8e-08, - "loss": -0.0213, - "reward": 0.7878675162792206, - "reward_std": 0.20458921045064926, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3263889253139496, - "rewards/repetition_penalty_reward": -0.06977138668298721, - "rewards/tag_count_reward": 0.53125, + "loss": -0.1157, + "reward": 1.1657525897026062, + "reward_std": 0.40865209698677063, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.597222238779068, + "rewards/repetition_penalty_reward": -0.040844724513590336, + "rewards/tag_count_reward": 0.5468750298023224, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 425.25001525878906, + "completion_length": 527.5208435058594, "epoch": 0.009, - "grad_norm": 2.796083816997523, - "kl": 0.0001354217529296875, + "grad_norm": 2.8811327535844984, + "kl": 0.00019359588623046875, "learning_rate": 9e-08, - "loss": -0.0349, - "reward": 0.8030039668083191, - "reward_std": 0.16971854120492935, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3819444477558136, - "rewards/repetition_penalty_reward": -0.089357178658247, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0489, + "reward": 1.1031688451766968, + "reward_std": 0.41312260925769806, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/reasoning_steps_reward": 0.506944477558136, + "rewards/repetition_penalty_reward": -0.0444006510078907, + "rewards/tag_count_reward": 0.5572916865348816, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 422.6666717529297, + "completion_length": 556.5208435058594, "epoch": 0.01, - "grad_norm": 2.9261675838437533, - "kl": 0.0001506805419921875, + "grad_norm": 2.4760963797873985, + "kl": 0.00020551681518554688, "learning_rate": 1e-07, - "loss": -0.0298, - "reward": 0.7589910626411438, - "reward_std": 0.3277961164712906, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.2916666865348816, - "rewards/repetition_penalty_reward": -0.09517566859722137, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": 0.1082, + "reward": 1.057469666004181, + "reward_std": 0.2826480269432068, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.604166716337204, + "rewards/repetition_penalty_reward": -0.057113731279969215, + "rewards/tag_count_reward": 0.5104166865348816, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 418.8958435058594, + "completion_length": 505.10418701171875, "epoch": 0.011, - "grad_norm": 2.8292820583783236, - "kl": 0.0001983642578125, + "grad_norm": 2.7426745766245304, + "kl": 0.00023126602172851562, "learning_rate": 1.0999999999999999e-07, - "loss": 0.0446, - "reward": 0.7735037803649902, - "reward_std": 0.20309197902679443, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.361111119389534, - "rewards/repetition_penalty_reward": -0.09802401065826416, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0164, + "reward": 1.1937065124511719, + "reward_std": 0.39100518822669983, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.6736111342906952, + "rewards/repetition_penalty_reward": -0.042404673993587494, + "rewards/tag_count_reward": 0.5208333730697632, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 432.9583435058594, + "completion_length": 606.3333435058594, "epoch": 0.012, - "grad_norm": 2.696525620594043, - "kl": 0.00015544891357421875, + "grad_norm": 2.547656089396865, + "kl": 0.000209808349609375, "learning_rate": 1.2e-07, - "loss": 0.0303, - "reward": 0.8913688957691193, - "reward_std": 0.4012472331523895, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.3402777910232544, - "rewards/repetition_penalty_reward": -0.06870058178901672, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": -0.1084, + "reward": 1.0744403004646301, + "reward_std": 0.3415149748325348, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.6250000298023224, + "rewards/repetition_penalty_reward": -0.06097651459276676, + "rewards/tag_count_reward": 0.5104166865348816, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 485.4791717529297, + "completion_length": 624.8958740234375, "epoch": 0.013, - "grad_norm": 2.300230292640307, - "kl": 0.0001366138458251953, + "grad_norm": 2.5452173789968726, + "kl": 0.00021314620971679688, "learning_rate": 1.3e-07, - "loss": 0.0079, - "reward": 0.7123395800590515, - "reward_std": 0.1982438787817955, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3055555820465088, - "rewards/repetition_penalty_reward": -0.12446600943803787, + "loss": -0.0677, + "reward": 1.0435009598731995, + "reward_std": 0.2849584221839905, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.5902778208255768, + "rewards/repetition_penalty_reward": -0.057193491607904434, "rewards/tag_count_reward": 0.5104166865348816, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 530.2708435058594, + "completion_length": 534.3125152587891, "epoch": 0.014, - "grad_norm": 2.4942479036029925, - "kl": 0.00016641616821289062, + "grad_norm": 2.5568403376430475, + "kl": 0.000244140625, "learning_rate": 1.4e-07, - "loss": 0.0035, - "reward": 0.671058714389801, - "reward_std": 0.19315791130065918, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.2638889029622078, - "rewards/repetition_penalty_reward": -0.11366349086165428, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.0267, + "reward": 1.033350259065628, + "reward_std": 0.36252032220363617, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.5069444626569748, + "rewards/repetition_penalty_reward": -0.03609427623450756, + "rewards/tag_count_reward": 0.5208333730697632, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 443.9375, + "completion_length": 633.8541870117188, "epoch": 0.015, - "grad_norm": 3.048882089835645, - "kl": 0.000194549560546875, + "grad_norm": 2.4845137409121465, + "kl": 0.00019216537475585938, "learning_rate": 1.5e-07, - "loss": -0.0194, - "reward": 0.7475398182868958, - "reward_std": 0.17061816900968552, + "loss": -0.0353, + "reward": 1.1506143808364868, + "reward_std": 0.3063247799873352, "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3055555522441864, - "rewards/repetition_penalty_reward": -0.06843245401978493, - "rewards/tag_count_reward": 0.5104166865348816, + "rewards/reasoning_steps_reward": 0.6805556118488312, + "rewards/repetition_penalty_reward": -0.045566244050860405, + "rewards/tag_count_reward": 0.5156250298023224, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 432.625, + "completion_length": 570.9375152587891, "epoch": 0.016, - "grad_norm": 2.59429611959581, - "kl": 0.0001544952392578125, + "grad_norm": 2.7140477200825273, + "kl": 0.0002346038818359375, "learning_rate": 1.6e-07, - "loss": 0.0539, - "reward": 0.8378123641014099, - "reward_std": 0.37222471833229065, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.3333333432674408, - "rewards/repetition_penalty_reward": -0.09447931870818138, - "rewards/tag_count_reward": 0.5364583432674408, + "loss": -0.0089, + "reward": 1.0886216163635254, + "reward_std": 0.3162507861852646, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.6388888955116272, + "rewards/repetition_penalty_reward": -0.06068398430943489, + "rewards/tag_count_reward": 0.5104166865348816, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 428.62501525878906, + "completion_length": 585.2708740234375, "epoch": 0.017, - "grad_norm": 2.9883532673285766, - "kl": 0.00015592575073242188, + "grad_norm": 2.4713266581431625, + "kl": 0.0001678466796875, "learning_rate": 1.7000000000000001e-07, - "loss": -0.0556, - "reward": 0.7087865769863129, - "reward_std": 0.21792305260896683, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.270833358168602, - "rewards/repetition_penalty_reward": -0.07767177000641823, - "rewards/tag_count_reward": 0.515625, + "loss": -0.0658, + "reward": 1.1053802967071533, + "reward_std": 0.3452056348323822, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.631944477558136, + "rewards/repetition_penalty_reward": -0.06823080591857433, + "rewards/tag_count_reward": 0.5208333432674408, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 441.9583435058594, + "completion_length": 513.6458587646484, "epoch": 0.018, - "grad_norm": 2.8362158709146565, - "kl": 0.00018978118896484375, + "grad_norm": 2.751825490594234, + "kl": 0.00019788742065429688, "learning_rate": 1.8e-07, - "loss": -0.0209, - "reward": 0.8078553080558777, - "reward_std": 0.27446234226226807, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3125, - "rewards/repetition_penalty_reward": -0.072353046387434, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.0599, + "reward": 1.0880020260810852, + "reward_std": 0.318176731467247, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.61111119389534, + "rewards/repetition_penalty_reward": -0.04394245892763138, + "rewards/tag_count_reward": 0.5, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 466.1041717529297, + "completion_length": 586.6250305175781, "epoch": 0.019, - "grad_norm": 2.888970243800657, - "kl": 0.0001392364501953125, + "grad_norm": 2.39448557038407, + "kl": 0.0001811981201171875, "learning_rate": 1.8999999999999998e-07, - "loss": -0.0661, - "reward": 0.7331588566303253, - "reward_std": 0.21328338980674744, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3263889104127884, - "rewards/repetition_penalty_reward": -0.09843838959932327, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": 0.0141, + "reward": 1.0959751605987549, + "reward_std": 0.43034467101097107, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.5486111640930176, + "rewards/repetition_penalty_reward": -0.05159439332783222, + "rewards/tag_count_reward": 0.5364583432674408, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 473.10418701171875, + "completion_length": 652.1250305175781, "epoch": 0.02, - "grad_norm": 2.866577200548836, - "kl": 0.00012922286987304688, + "grad_norm": 2.0210344240592066, + "kl": 0.00016498565673828125, "learning_rate": 2e-07, - "loss": -0.0392, - "reward": 0.7010990679264069, - "reward_std": 0.23764295876026154, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.2847222536802292, - "rewards/repetition_penalty_reward": -0.1252898871898651, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.0102, + "reward": 1.2613900899887085, + "reward_std": 0.5556788444519043, + "rewards/accuracy_reward": 0.1250000037252903, + "rewards/reasoning_steps_reward": 0.6527777910232544, + "rewards/repetition_penalty_reward": -0.06847099214792252, + "rewards/tag_count_reward": 0.5520833432674408, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 435.9375, + "completion_length": 534.3333587646484, "epoch": 0.021, - "grad_norm": 2.5853516467538085, - "kl": 0.00013971328735351562, + "grad_norm": 2.723236250331092, + "kl": 0.000263214111328125, "learning_rate": 2.0999999999999997e-07, - "loss": -0.0247, - "reward": 0.7949627339839935, - "reward_std": 0.2942465543746948, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.333333358168602, - "rewards/repetition_penalty_reward": -0.09045393392443657, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0982, + "reward": 1.113393783569336, + "reward_std": 0.3082204759120941, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.6319444477558136, + "rewards/repetition_penalty_reward": -0.03938402608036995, + "rewards/tag_count_reward": 0.5, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 421.7916717529297, + "completion_length": 593.5208435058594, "epoch": 0.022, - "grad_norm": 2.81981352318438, - "kl": 0.000164031982421875, + "grad_norm": 2.802437133759923, + "kl": 0.00021219253540039062, "learning_rate": 2.1999999999999998e-07, - "loss": -0.0342, - "reward": 0.7032277882099152, - "reward_std": 0.22128960490226746, + "loss": 0.025, + "reward": 1.141821563243866, + "reward_std": 0.30135248601436615, "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.2638889029622078, - "rewards/repetition_penalty_reward": -0.08670276403427124, - "rewards/tag_count_reward": 0.5052083432674408, + "rewards/reasoning_steps_reward": 0.6527777910232544, + "rewards/repetition_penalty_reward": -0.042206283658742905, + "rewards/tag_count_reward": 0.5104166865348816, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 410.3958435058594, + "completion_length": 569.9166870117188, "epoch": 0.023, - "grad_norm": 2.801473910929597, - "kl": 0.00015163421630859375, + "grad_norm": 2.513753582519348, + "kl": 0.00023412704467773438, "learning_rate": 2.3e-07, - "loss": -0.0515, - "reward": 0.7014846205711365, - "reward_std": 0.3183808922767639, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.2361111268401146, - "rewards/repetition_penalty_reward": -0.09191818162798882, - "rewards/tag_count_reward": 0.5156250298023224, + "loss": -0.0394, + "reward": 1.033397912979126, + "reward_std": 0.3224620223045349, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.5833333730697632, + "rewards/repetition_penalty_reward": -0.04993540979921818, + "rewards/tag_count_reward": 0.5, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 446.1458435058594, + "completion_length": 538.8125305175781, "epoch": 0.024, - "grad_norm": 2.6483194251516093, - "kl": 0.0001544952392578125, + "grad_norm": 2.5472422948610527, + "kl": 0.0001964569091796875, "learning_rate": 2.4e-07, - "loss": -0.0489, - "reward": 0.7778151631355286, - "reward_std": 0.22074151039123535, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3541666716337204, - "rewards/repetition_penalty_reward": -0.09718482196331024, - "rewards/tag_count_reward": 0.5208333730697632, + "loss": -0.0033, + "reward": 1.2361397743225098, + "reward_std": 0.4402560144662857, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/reasoning_steps_reward": 0.6736111640930176, + "rewards/repetition_penalty_reward": -0.052054738625884056, + "rewards/tag_count_reward": 0.5312500298023224, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 394.3333435058594, + "completion_length": 537.1041870117188, "epoch": 0.025, - "grad_norm": 2.479195195026665, - "kl": 0.0001544952392578125, + "grad_norm": 2.734296760351383, + "kl": 0.00023365020751953125, "learning_rate": 2.5e-07, - "loss": -0.0233, - "reward": 0.8724274635314941, - "reward_std": 0.4244323670864105, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.3333333730697632, - "rewards/repetition_penalty_reward": -0.10153089836239815, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": -0.0501, + "reward": 1.1702337265014648, + "reward_std": 0.4403166174888611, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/reasoning_steps_reward": 0.6041666865348816, + "rewards/repetition_penalty_reward": -0.053724685683846474, + "rewards/tag_count_reward": 0.5364583432674408, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 459.0625, + "completion_length": 561.1041870117188, "epoch": 0.026, - "grad_norm": 2.562479280691169, - "kl": 0.00016546249389648438, + "grad_norm": 2.8294150280268386, + "kl": 0.0002675056457519531, "learning_rate": 2.6e-07, - "loss": -0.0271, - "reward": 0.7452935576438904, - "reward_std": 0.2175150215625763, + "loss": -0.007, + "reward": 1.1117247343063354, + "reward_std": 0.32203447818756104, "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3055555522441864, - "rewards/repetition_penalty_reward": -0.0915120430290699, + "rewards/reasoning_steps_reward": 0.6319444477558136, + "rewards/repetition_penalty_reward": -0.05146980658173561, "rewards/tag_count_reward": 0.5104166865348816, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 461.47918701171875, + "completion_length": 527.0, "epoch": 0.027, - "grad_norm": 2.487533682339491, - "kl": 0.00016164779663085938, + "grad_norm": 2.762760874256859, + "kl": 0.00025463104248046875, "learning_rate": 2.7e-07, - "loss": 0.0539, - "reward": 0.6953436434268951, - "reward_std": 0.18584085255861282, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.2847222313284874, - "rewards/repetition_penalty_reward": -0.09458697214722633, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0483, + "reward": 1.1202597618103027, + "reward_std": 0.36672815680503845, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.5972222685813904, + "rewards/repetition_penalty_reward": -0.03946254029870033, + "rewards/tag_count_reward": 0.5208333432674408, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 439.9166717529297, + "completion_length": 532.3125, "epoch": 0.028, - "grad_norm": 3.247885396180637, - "kl": 0.00018787384033203125, + "grad_norm": 2.5927248535988077, + "kl": 0.00019502639770507812, "learning_rate": 2.8e-07, - "loss": 0.0669, - "reward": 0.7925191521644592, - "reward_std": 0.19140200316905975, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3750000149011612, - "rewards/repetition_penalty_reward": -0.09289758652448654, - "rewards/tag_count_reward": 0.5104166716337204, + "loss": -0.0126, + "reward": 1.1283650398254395, + "reward_std": 0.4593771994113922, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.5833333730697632, + "rewards/repetition_penalty_reward": -0.04871835932135582, + "rewards/tag_count_reward": 0.5312500298023224, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 484.1666717529297, + "completion_length": 587.5416870117188, "epoch": 0.029, - "grad_norm": 2.694287664741282, - "kl": 0.00016832351684570312, + "grad_norm": 2.379176256693628, + "kl": 0.00020503997802734375, "learning_rate": 2.9e-07, - "loss": -0.0049, - "reward": 0.7252695262432098, - "reward_std": 0.26906292140483856, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.2916666865348816, - "rewards/repetition_penalty_reward": -0.09764716401696205, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0035, + "reward": 0.9848364293575287, + "reward_std": 0.31292901933193207, + "rewards/accuracy_reward": 0.0, + "rewards/reasoning_steps_reward": 0.541666716337204, + "rewards/repetition_penalty_reward": -0.06203855946660042, + "rewards/tag_count_reward": 0.5052083432674408, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 401.6458435058594, + "completion_length": 521.4791870117188, "epoch": 0.03, - "grad_norm": 2.525432336011363, - "kl": 0.00017595291137695312, + "grad_norm": 2.711197411130166, + "kl": 0.0002446174621582031, "learning_rate": 3e-07, - "loss": 0.0036, - "reward": 0.7998482286930084, - "reward_std": 0.27486903965473175, + "loss": 0.0671, + "reward": 1.1310882568359375, + "reward_std": 0.3909059464931488, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3055555820465088, - "rewards/repetition_penalty_reward": -0.08383234217762947, - "rewards/tag_count_reward": 0.5364583432674408, + "rewards/reasoning_steps_reward": 0.6180555820465088, + "rewards/repetition_penalty_reward": -0.049467260017991066, + "rewards/tag_count_reward": 0.5208333432674408, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 453.43751525878906, + "completion_length": 650.3125, "epoch": 0.031, - "grad_norm": 2.698278544513627, - "kl": 0.00021314620971679688, + "grad_norm": 2.094653949008266, + "kl": 0.00021648406982421875, "learning_rate": 3.1e-07, - "loss": -0.0796, - "reward": 0.7916192412376404, - "reward_std": 0.1992609053850174, - "rewards/accuracy_reward": 0.0, - "rewards/reasoning_steps_reward": 0.3680555671453476, - "rewards/repetition_penalty_reward": -0.1024780347943306, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.0303, + "reward": 1.3080359101295471, + "reward_std": 0.36785976588726044, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.784722238779068, + "rewards/repetition_penalty_reward": -0.05481144040822983, + "rewards/tag_count_reward": 0.515625, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 438.25, + "completion_length": 492.2708435058594, "epoch": 0.032, - "grad_norm": 2.6819385501820636, - "kl": 0.000213623046875, + "grad_norm": 2.6034268800973854, + "kl": 0.0003032684326171875, "learning_rate": 3.2e-07, - "loss": -0.059, - "reward": 0.8092588782310486, - "reward_std": 0.2647293135523796, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3680555373430252, - "rewards/repetition_penalty_reward": -0.10046344250440598, - "rewards/tag_count_reward": 0.5208333730697632, + "loss": -0.0383, + "reward": 1.2631294131278992, + "reward_std": 0.42681366205215454, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.701388955116272, + "rewards/repetition_penalty_reward": -0.03200950939208269, + "rewards/tag_count_reward": 0.5312500298023224, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 458.68751525878906, + "completion_length": 517.9375305175781, "epoch": 0.033, - "grad_norm": 2.463224136286953, - "kl": 0.00017976760864257812, + "grad_norm": 2.6134583516084144, + "kl": 0.0002884864807128906, "learning_rate": 3.3e-07, - "loss": -0.0079, - "reward": 0.8415437340736389, - "reward_std": 0.30653180181980133, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3750000298023224, - "rewards/repetition_penalty_reward": -0.10116463899612427, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": 0.085, + "reward": 1.3313005566596985, + "reward_std": 0.5551705211400986, + "rewards/accuracy_reward": 0.10416666977107525, + "rewards/reasoning_steps_reward": 0.7152777910232544, + "rewards/repetition_penalty_reward": -0.04022728279232979, + "rewards/tag_count_reward": 0.5520833730697632, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 421.93751525878906, + "completion_length": 575.625, "epoch": 0.034, - "grad_norm": 2.885034452855209, - "kl": 0.00023555755615234375, + "grad_norm": 2.785389250577513, + "kl": 0.00032901763916015625, "learning_rate": 3.4000000000000003e-07, - "loss": -0.0758, - "reward": 0.7612072229385376, - "reward_std": 0.29303793609142303, - "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3125000298023224, - "rewards/repetition_penalty_reward": -0.09816781431436539, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.0602, + "reward": 1.2880616784095764, + "reward_std": 0.4542257487773895, + "rewards/accuracy_reward": 0.0833333358168602, + "rewards/reasoning_steps_reward": 0.694444477558136, + "rewards/repetition_penalty_reward": -0.04179951548576355, + "rewards/tag_count_reward": 0.5520833730697632, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 436.5833435058594, + "completion_length": 521.4791870117188, "epoch": 0.035, - "grad_norm": 2.654753815065678, - "kl": 0.0002703666687011719, + "grad_norm": 2.5702386275125564, + "kl": 0.00037384033203125, "learning_rate": 3.5e-07, - "loss": -0.0198, - "reward": 0.8193124830722809, - "reward_std": 0.23815365880727768, + "loss": -0.0291, + "reward": 1.2401779294013977, + "reward_std": 0.3075665980577469, "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3611111342906952, - "rewards/repetition_penalty_reward": -0.0834653377532959, - "rewards/tag_count_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.7500000596046448, + "rewards/repetition_penalty_reward": -0.04628048092126846, + "rewards/tag_count_reward": 0.5156250298023224, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 396.31251525878906, + "completion_length": 583.4166870117188, "epoch": 0.036, - "grad_norm": 2.911044795762355, - "kl": 0.0002732276916503906, + "grad_norm": 2.4872780913924584, + "kl": 0.000377655029296875, "learning_rate": 3.6e-07, - "loss": -0.0484, - "reward": 0.9044705033302307, - "reward_std": 0.2895239144563675, - "rewards/accuracy_reward": 0.0625, - "rewards/reasoning_steps_reward": 0.3819444626569748, - "rewards/repetition_penalty_reward": -0.07122397050261497, - "rewards/tag_count_reward": 0.5312500298023224, + "loss": -0.0549, + "reward": 1.0867574214935303, + "reward_std": 0.3429463729262352, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.5972222685813904, + "rewards/repetition_penalty_reward": -0.052131447941064835, + "rewards/tag_count_reward": 0.5208333730697632, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 416.0208435058594, + "completion_length": 576.4583435058594, "epoch": 0.037, - "grad_norm": 2.911824101724759, - "kl": 0.0003604888916015625, + "grad_norm": 2.5982342131854668, + "kl": 0.00039768218994140625, "learning_rate": 3.7e-07, - "loss": -0.0074, - "reward": 0.9215789139270782, - "reward_std": 0.3956316262483597, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.409722238779068, - "rewards/repetition_penalty_reward": -0.09231004863977432, - "rewards/tag_count_reward": 0.5416666865348816, + "loss": -0.0891, + "reward": 1.198279321193695, + "reward_std": 0.26803672313690186, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.7152778208255768, + "rewards/repetition_penalty_reward": -0.0638735331594944, + "rewards/tag_count_reward": 0.5052083432674408, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 482.125, + "completion_length": 573.3541870117188, "epoch": 0.038, - "grad_norm": 2.803305738114807, - "kl": 0.0003185272216796875, + "grad_norm": 2.416596419208738, + "kl": 0.0005702972412109375, "learning_rate": 3.7999999999999996e-07, - "loss": 0.0091, - "reward": 0.838833212852478, - "reward_std": 0.2817804142832756, + "loss": 0.021, + "reward": 1.1571301221847534, + "reward_std": 0.37196002900600433, "rewards/accuracy_reward": 0.02083333395421505, - "rewards/reasoning_steps_reward": 0.3958333432674408, - "rewards/repetition_penalty_reward": -0.09345847740769386, - "rewards/tag_count_reward": 0.515625, + "rewards/reasoning_steps_reward": 0.6527777910232544, + "rewards/repetition_penalty_reward": -0.04773101769387722, + "rewards/tag_count_reward": 0.5312500298023224, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 432.3958435058594, + "completion_length": 489.2500305175781, "epoch": 0.039, - "grad_norm": 2.551763636801408, - "kl": 0.00043487548828125, + "grad_norm": 2.7311163851954614, + "kl": 0.0006694793701171875, "learning_rate": 3.8999999999999997e-07, - "loss": -0.0155, - "reward": 0.8995303511619568, - "reward_std": 0.3570392578840256, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3958333432674408, - "rewards/repetition_penalty_reward": -0.1004696749150753, - "rewards/tag_count_reward": 0.5625000298023224, + "loss": 0.0014, + "reward": 1.1946178078651428, + "reward_std": 0.42158831655979156, + "rewards/accuracy_reward": 0.06250000186264515, + "rewards/reasoning_steps_reward": 0.6319445073604584, + "rewards/repetition_penalty_reward": -0.03628497198224068, + "rewards/tag_count_reward": 0.5364583730697632, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 440.2708435058594, + "completion_length": 557.4791870117188, "epoch": 0.04, - "grad_norm": 2.5822257466140695, - "kl": 0.00044155120849609375, + "grad_norm": 2.5731368960029664, + "kl": 0.000751495361328125, "learning_rate": 4e-07, - "loss": 0.0305, - "reward": 0.9356189370155334, - "reward_std": 0.44639548659324646, - "rewards/accuracy_reward": 0.12500000558793545, - "rewards/reasoning_steps_reward": 0.3611111342906952, - "rewards/repetition_penalty_reward": -0.08695057034492493, - "rewards/tag_count_reward": 0.5364583432674408, + "loss": 0.0449, + "reward": 1.1931411623954773, + "reward_std": 0.4069296419620514, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.694444477558136, + "rewards/repetition_penalty_reward": -0.05338665284216404, + "rewards/tag_count_reward": 0.5104166865348816, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 450.85418701171875, + "completion_length": 630.5625, "epoch": 0.041, - "grad_norm": 2.5859059632619203, - "kl": 0.000621795654296875, + "grad_norm": 2.394979741576189, + "kl": 0.0008411407470703125, "learning_rate": 4.0999999999999994e-07, - "loss": -0.0974, - "reward": 0.9160508513450623, - "reward_std": 0.37560930848121643, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.3680555820465088, - "rewards/repetition_penalty_reward": -0.09262972325086594, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": -0.0326, + "reward": 1.2281219959259033, + "reward_std": 0.4181075543165207, + "rewards/accuracy_reward": 0.08333333395421505, + "rewards/reasoning_steps_reward": 0.6458333432674408, + "rewards/repetition_penalty_reward": -0.05312805995345116, + "rewards/tag_count_reward": 0.5520833730697632, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 440.29168701171875, + "completion_length": 608.2083740234375, "epoch": 0.042, - "grad_norm": 2.923074139707737, - "kl": 0.0007343292236328125, + "grad_norm": 2.3967041495309647, + "kl": 0.0009288787841796875, "learning_rate": 4.1999999999999995e-07, - "loss": -0.0335, - "reward": 0.9344533383846283, - "reward_std": 0.35851919651031494, - "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.4027777910232544, - "rewards/repetition_penalty_reward": -0.08290780335664749, - "rewards/tag_count_reward": 0.5520833432674408, + "loss": 0.0268, + "reward": 1.1586747765541077, + "reward_std": 0.34190115332603455, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.6666666865348816, + "rewards/repetition_penalty_reward": -0.04445029981434345, + "rewards/tag_count_reward": 0.5156250298023224, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 477.6250305175781, + "completion_length": 545.5208435058594, "epoch": 0.043, - "grad_norm": 2.3369155763895306, - "kl": 0.0006542205810546875, + "grad_norm": 2.7138542170454003, + "kl": 0.00109100341796875, "learning_rate": 4.2999999999999996e-07, - "loss": -0.0037, - "reward": 0.897548645734787, - "reward_std": 0.3905322998762131, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.3888889104127884, - "rewards/repetition_penalty_reward": -0.13196522742509842, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": -0.0442, + "reward": 1.2215397357940674, + "reward_std": 0.330724373459816, + "rewards/accuracy_reward": 0.02083333395421505, + "rewards/reasoning_steps_reward": 0.729166716337204, + "rewards/repetition_penalty_reward": -0.04929363913834095, + "rewards/tag_count_reward": 0.5208333730697632, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 512.1875305175781, + "completion_length": 520.8125, "epoch": 0.044, - "grad_norm": 2.5444553797831184, - "kl": 0.001239776611328125, + "grad_norm": 2.9707533290622923, + "kl": 0.001163482666015625, "learning_rate": 4.3999999999999997e-07, - "loss": 0.0506, - "reward": 0.8144256472587585, - "reward_std": 0.3337151110172272, + "loss": -0.0372, + "reward": 1.193300485610962, + "reward_std": 0.3503916561603546, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.333333358168602, - "rewards/repetition_penalty_reward": -0.11265767365694046, - "rewards/tag_count_reward": 0.5520833730697632, + "rewards/reasoning_steps_reward": 0.6527778208255768, + "rewards/repetition_penalty_reward": -0.0428106477484107, + "rewards/tag_count_reward": 0.5416666865348816, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 467.9583435058594, + "completion_length": 592.2291870117188, "epoch": 0.045, - "grad_norm": 2.624341092430347, - "kl": 0.00146484375, + "grad_norm": 2.8301357058974603, + "kl": 0.001316070556640625, "learning_rate": 4.5e-07, - "loss": -0.073, - "reward": 0.8722166121006012, - "reward_std": 0.3200044110417366, - "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.3819444626569748, - "rewards/repetition_penalty_reward": -0.09826955571770668, - "rewards/tag_count_reward": 0.5468750298023224, + "loss": -0.0149, + "reward": 1.5306417346000671, + "reward_std": 0.6089198887348175, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.7430555522441864, + "rewards/repetition_penalty_reward": -0.04574713110923767, + "rewards/tag_count_reward": 0.6041666865348816, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 491.2083435058594, + "completion_length": 560.0833435058594, "epoch": 0.046, - "grad_norm": 2.894598405118976, - "kl": 0.001781463623046875, + "grad_norm": 2.9830534003583997, + "kl": 0.001560211181640625, "learning_rate": 4.6e-07, - "loss": -0.1345, - "reward": 0.9022504687309265, - "reward_std": 0.31815651059150696, + "loss": -0.0654, + "reward": 1.2372803688049316, + "reward_std": 0.3439289480447769, "rewards/accuracy_reward": 0.0416666679084301, - "rewards/reasoning_steps_reward": 0.4236111342906952, - "rewards/repetition_penalty_reward": -0.10469403862953186, + "rewards/reasoning_steps_reward": 0.7013889253139496, + "rewards/repetition_penalty_reward": -0.04744199104607105, "rewards/tag_count_reward": 0.5416666865348816, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 484.43751525878906, + "completion_length": 595.5416870117188, "epoch": 0.047, - "grad_norm": 2.6519924149428267, - "kl": 0.002471923828125, + "grad_norm": 2.62124053942929, + "kl": 0.001735687255859375, "learning_rate": 4.6999999999999995e-07, - "loss": -0.1081, - "reward": 1.004030019044876, - "reward_std": 0.3833072930574417, + "loss": 0.0004, + "reward": 1.289455771446228, + "reward_std": 0.41812919080257416, "rewards/accuracy_reward": 0.06250000186264515, - "rewards/reasoning_steps_reward": 0.4791666865348816, - "rewards/repetition_penalty_reward": -0.08451167494058609, - "rewards/tag_count_reward": 0.5468750298023224, + "rewards/reasoning_steps_reward": 0.7222222685813904, + "rewards/repetition_penalty_reward": -0.04734986647963524, + "rewards/tag_count_reward": 0.5520833730697632, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 405.37501525878906, + "completion_length": 603.2500305175781, "epoch": 0.048, - "grad_norm": 2.6320490085609527, - "kl": 0.00336456298828125, + "grad_norm": 2.313447720284229, + "kl": 0.001773834228515625, "learning_rate": 4.8e-07, - "loss": -0.0364, - "reward": 1.1537357568740845, - "reward_std": 0.549789696931839, - "rewards/accuracy_reward": 0.2291666716337204, - "rewards/reasoning_steps_reward": 0.3888888955116272, - "rewards/repetition_penalty_reward": -0.08411154896020889, - "rewards/tag_count_reward": 0.6197916865348816, + "loss": 0.0089, + "reward": 1.3387857675552368, + "reward_std": 0.3809618651866913, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/reasoning_steps_reward": 0.7083333432674408, + "rewards/repetition_penalty_reward": -0.05183934420347214, + "rewards/tag_count_reward": 0.5572916865348816, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 369.66668701171875, + "completion_length": 523.3541717529297, "epoch": 0.049, - "grad_norm": 2.86975613443169, - "kl": 0.0047149658203125, + "grad_norm": 2.8891626678471063, + "kl": 0.00232696533203125, "learning_rate": 4.9e-07, - "loss": 0.0118, - "reward": 1.0437835454940796, - "reward_std": 0.3194990009069443, - "rewards/accuracy_reward": 0.0833333358168602, - "rewards/reasoning_steps_reward": 0.4513889104127884, - "rewards/repetition_penalty_reward": -0.0794803537428379, - "rewards/tag_count_reward": 0.5885416865348816, + "loss": -0.089, + "reward": 1.2625147104263306, + "reward_std": 0.4176745116710663, + "rewards/accuracy_reward": 0.0416666679084301, + "rewards/reasoning_steps_reward": 0.7083334028720856, + "rewards/repetition_penalty_reward": -0.039568664506077766, + "rewards/tag_count_reward": 0.5520833730697632, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 517.9166870117188, + "completion_length": 527.0625, "epoch": 0.05, - "grad_norm": 2.3195164111653, - "kl": 0.005706787109375, + "grad_norm": 2.6847464499943308, + "kl": 0.0029754638671875, "learning_rate": 5e-07, - "loss": -0.0003, - "reward": 1.1355286836624146, - "reward_std": 0.41902345418930054, - "rewards/accuracy_reward": 0.1458333395421505, - "rewards/reasoning_steps_reward": 0.4652778059244156, - "rewards/repetition_penalty_reward": -0.12141583114862442, - "rewards/tag_count_reward": 0.6458333432674408, + "loss": 0.0743, + "reward": 1.615307629108429, + "reward_std": 0.6159101724624634, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/reasoning_steps_reward": 0.7083333730697632, + "rewards/repetition_penalty_reward": -0.046150704845786095, + "rewards/tag_count_reward": 0.6406250298023224, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 472.3333435058594, + "completion_length": 558.1875305175781, "epoch": 0.051, - "grad_norm": 2.5962908671264606, - "kl": 0.0071868896484375, + "grad_norm": 2.4123895595368547, + "kl": 0.0031280517578125, "learning_rate": 5.1e-07, - "loss": -0.0586, - "reward": 1.1341796517372131, - "reward_std": 0.4505733996629715, - "rewards/accuracy_reward": 0.1875000074505806, - "rewards/reasoning_steps_reward": 0.4375000298023224, - "rewards/repetition_penalty_reward": -0.12102878466248512, - "rewards/tag_count_reward": 0.6302083432674408, + "loss": 0.0147, + "reward": 1.4028043746948242, + "reward_std": 0.43062111735343933, + "rewards/accuracy_reward": 0.1041666679084301, + "rewards/reasoning_steps_reward": 0.7430555522441864, + "rewards/repetition_penalty_reward": -0.04858442768454552, + "rewards/tag_count_reward": 0.6041666865348816, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 564.8125305175781, + "completion_length": 575.7291870117188, "epoch": 0.052, - "grad_norm": 2.3606402936926325, - "kl": 0.008544921875, + "grad_norm": 2.8249120385470197, + "kl": 0.00350189208984375, "learning_rate": 5.2e-07, - "loss": -0.087, - "reward": 1.1581702828407288, - "reward_std": 0.44441401958465576, - "rewards/accuracy_reward": 0.10416666977107525, - "rewards/reasoning_steps_reward": 0.5625000298023224, - "rewards/repetition_penalty_reward": -0.1178714707493782, - "rewards/tag_count_reward": 0.6093750298023224, + "loss": -0.0312, + "reward": 1.5281450748443604, + "reward_std": 0.6317126750946045, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/reasoning_steps_reward": 0.6805556118488312, + "rewards/repetition_penalty_reward": -0.058660563081502914, + "rewards/tag_count_reward": 0.6354166865348816, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 489.25001525878906, + "completion_length": 519.4166870117188, "epoch": 0.053, - "grad_norm": 2.949133053889624, - "kl": 0.01031494140625, + "grad_norm": 2.8646116438456453, + "kl": 0.00429534912109375, "learning_rate": 5.3e-07, - "loss": 0.0568, - "reward": 1.4706464409828186, - "reward_std": 0.4914465397596359, - "rewards/accuracy_reward": 0.3541666865348816, - "rewards/reasoning_steps_reward": 0.4930555671453476, - "rewards/repetition_penalty_reward": -0.11095088720321655, - "rewards/tag_count_reward": 0.7343750298023224, + "loss": 0.0331, + "reward": 1.6907492280006409, + "reward_std": 0.6883328557014465, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.6875000298023224, + "rewards/repetition_penalty_reward": -0.03841756656765938, + "rewards/tag_count_reward": 0.6875, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 472.0, + "completion_length": 578.1250305175781, "epoch": 0.054, - "grad_norm": 2.6612164840490324, - "kl": 0.01123046875, + "grad_norm": 2.532736796747683, + "kl": 0.0043792724609375, "learning_rate": 5.4e-07, - "loss": -0.0687, - "reward": 1.4638976454734802, - "reward_std": 0.5431383848190308, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/reasoning_steps_reward": 0.4583333432674408, - "rewards/repetition_penalty_reward": -0.10901909694075584, - "rewards/tag_count_reward": 0.6979166865348816, + "loss": 0.0327, + "reward": 1.4663925766944885, + "reward_std": 0.4452537000179291, + "rewards/accuracy_reward": 0.12500000558793545, + "rewards/reasoning_steps_reward": 0.7986111342906952, + "rewards/repetition_penalty_reward": -0.06138528883457184, + "rewards/tag_count_reward": 0.6041666865348816, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 459.8750305175781, + "completion_length": 556.3750305175781, "epoch": 0.055, - "grad_norm": 2.6780744391661955, - "kl": 0.009613037109375, + "grad_norm": 2.626064949357054, + "kl": 0.0063018798828125, "learning_rate": 5.5e-07, - "loss": -0.0152, - "reward": 1.5579619407653809, - "reward_std": 0.46172915399074554, - "rewards/accuracy_reward": 0.3541666865348816, - "rewards/reasoning_steps_reward": 0.5277778059244156, - "rewards/repetition_penalty_reward": -0.10002415627241135, - "rewards/tag_count_reward": 0.7760416865348816, + "loss": 0.0412, + "reward": 1.5763724446296692, + "reward_std": 0.5313678234815598, + "rewards/accuracy_reward": 0.2500000149011612, + "rewards/reasoning_steps_reward": 0.7083333730697632, + "rewards/repetition_penalty_reward": -0.05904424749314785, + "rewards/tag_count_reward": 0.6770833432674408, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 475.1250305175781, + "completion_length": 558.9166870117188, "epoch": 0.056, - "grad_norm": 2.7898762021953036, - "kl": 0.010833740234375, + "grad_norm": 2.6301403493903748, + "kl": 0.0066680908203125, "learning_rate": 5.6e-07, - "loss": 0.1002, - "reward": 1.8562687039375305, - "reward_std": 0.49579566717147827, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.4583333283662796, - "rewards/repetition_penalty_reward": -0.11248137429356575, - "rewards/tag_count_reward": 0.8229166865348816, + "loss": 0.0203, + "reward": 1.794304072856903, + "reward_std": 0.6143919825553894, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.7361111640930176, + "rewards/repetition_penalty_reward": -0.04597374238073826, + "rewards/tag_count_reward": 0.7291666865348816, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 491.8958435058594, + "completion_length": 581.75, "epoch": 0.057, - "grad_norm": 2.4903313840679133, - "kl": 0.01837158203125, + "grad_norm": 2.377807933864504, + "kl": 0.0074462890625, "learning_rate": 5.699999999999999e-07, - "loss": -0.0781, - "reward": 1.3609140515327454, - "reward_std": 0.46711015701293945, - "rewards/accuracy_reward": 0.25000000558793545, - "rewards/reasoning_steps_reward": 0.4791666716337204, - "rewards/repetition_penalty_reward": -0.11304439604282379, - "rewards/tag_count_reward": 0.7447916865348816, + "loss": 0.0249, + "reward": 1.681954801082611, + "reward_std": 0.5288814753293991, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/reasoning_steps_reward": 0.7430556118488312, + "rewards/repetition_penalty_reward": -0.05589243024587631, + "rewards/tag_count_reward": 0.703125, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 480.54168701171875, + "completion_length": 564.7916870117188, "epoch": 0.058, - "grad_norm": 2.670568943605974, - "kl": 0.0115966796875, + "grad_norm": 2.9351818937282297, + "kl": 0.0069580078125, "learning_rate": 5.8e-07, - "loss": 0.0861, - "reward": 1.5060352087020874, - "reward_std": 0.46976399421691895, - "rewards/accuracy_reward": 0.3333333358168602, - "rewards/reasoning_steps_reward": 0.486111119389534, - "rewards/repetition_penalty_reward": -0.1311175599694252, - "rewards/tag_count_reward": 0.8177083432674408, + "loss": 0.0467, + "reward": 2.0512834787368774, + "reward_std": 0.659260630607605, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.7708333730697632, + "rewards/repetition_penalty_reward": -0.04767502471804619, + "rewards/tag_count_reward": 0.7864583432674408, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 476.35418701171875, + "completion_length": 611.2916870117188, "epoch": 0.059, - "grad_norm": 2.7349874896537054, - "kl": 0.01104736328125, + "grad_norm": 2.5837982836551467, + "kl": 0.00921630859375, "learning_rate": 5.9e-07, - "loss": 0.1366, - "reward": 1.4265865087509155, - "reward_std": 0.364186555147171, - "rewards/accuracy_reward": 0.2083333358168602, - "rewards/reasoning_steps_reward": 0.4513888955116272, - "rewards/repetition_penalty_reward": -0.10292742773890495, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": -0.0409, + "reward": 1.758675456047058, + "reward_std": 0.5324381589889526, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/reasoning_steps_reward": 0.7222222089767456, + "rewards/repetition_penalty_reward": -0.05729677900671959, + "rewards/tag_count_reward": 0.84375, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 445.2083435058594, + "completion_length": 520.6666870117188, "epoch": 0.06, - "grad_norm": 2.8995500539101284, - "kl": 0.01019287109375, + "grad_norm": 3.05141171791299, + "kl": 0.011322021484375, "learning_rate": 6e-07, - "loss": 0.0277, - "reward": 1.6995399594306946, - "reward_std": 0.40546177327632904, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.3819444477558136, - "rewards/repetition_penalty_reward": -0.12511292472481728, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0112, + "reward": 2.136493146419525, + "reward_std": 0.587219700217247, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.756944477558136, + "rewards/repetition_penalty_reward": -0.0475346464663744, + "rewards/tag_count_reward": 0.8645833432674408, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 530.6041870117188, + "completion_length": 547.7916870117188, "epoch": 0.061, - "grad_norm": 2.4674507233355745, - "kl": 0.0126953125, + "grad_norm": 2.7073330792983072, + "kl": 0.011810302734375, "learning_rate": 6.1e-07, - "loss": 0.0575, - "reward": 1.9732111096382141, - "reward_std": 0.34852640330791473, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.4444444477558136, - "rewards/repetition_penalty_reward": -0.10665006563067436, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0477, + "reward": 2.2359567284584045, + "reward_std": 0.5013462156057358, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.6458333134651184, + "rewards/repetition_penalty_reward": -0.04008489940315485, + "rewards/tag_count_reward": 0.921875, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 356.93751525878906, + "completion_length": 449.5208435058594, "epoch": 0.062, - "grad_norm": 2.8847481386634417, - "kl": 0.015533447265625, + "grad_norm": 3.257750731526075, + "kl": 0.019744873046875, "learning_rate": 6.2e-07, - "loss": 0.0774, - "reward": 2.109173536300659, - "reward_std": 0.29093076288700104, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.298611119389534, - "rewards/repetition_penalty_reward": -0.11131284013390541, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0639, + "reward": 2.321463108062744, + "reward_std": 0.4783380925655365, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.7013889253139496, + "rewards/repetition_penalty_reward": -0.04138421919196844, + "rewards/tag_count_reward": 0.9114583432674408, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 416.3958435058594, + "completion_length": 547.2500305175781, "epoch": 0.063, - "grad_norm": 2.512440930796155, - "kl": 0.013580322265625, + "grad_norm": 2.7432912908420732, + "kl": 0.011871337890625, "learning_rate": 6.3e-07, - "loss": 0.1099, - "reward": 2.083560585975647, - "reward_std": 0.27802765369415283, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.3263889253139496, - "rewards/repetition_penalty_reward": -0.10741163045167923, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0445, + "reward": 2.019491195678711, + "reward_std": 0.5333467572927475, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.6736111640930176, + "rewards/repetition_penalty_reward": -0.04995330423116684, + "rewards/tag_count_reward": 0.9375, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 418.5, + "completion_length": 533.8125305175781, "epoch": 0.064, - "grad_norm": 2.7713948724178836, - "kl": 0.013946533203125, + "grad_norm": 2.775011190038876, + "kl": 0.01220703125, "learning_rate": 6.4e-07, - "loss": -0.009, - "reward": 2.1677645444869995, - "reward_std": 0.28540101647377014, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.3472222089767456, - "rewards/repetition_penalty_reward": -0.09612448513507843, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": -0.0156, + "reward": 2.413369059562683, + "reward_std": 0.517508327960968, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.79861119389534, + "rewards/repetition_penalty_reward": -0.041492147371172905, + "rewards/tag_count_reward": 0.96875, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 426.3958435058594, + "completion_length": 511.16668701171875, "epoch": 0.065, - "grad_norm": 2.56902045253022, - "kl": 0.01727294921875, + "grad_norm": 2.745662806864412, + "kl": 0.014892578125, "learning_rate": 6.5e-07, - "loss": 0.0438, - "reward": 1.8686636686325073, - "reward_std": 0.27829886972904205, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.3333333432674408, - "rewards/repetition_penalty_reward": -0.10529470071196556, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": -0.0064, + "reward": 2.327776312828064, + "reward_std": 0.5232683420181274, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.694444477558136, + "rewards/repetition_penalty_reward": -0.038543105125427246, + "rewards/tag_count_reward": 0.9427083730697632, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 398.1458435058594, + "completion_length": 538.875, "epoch": 0.066, - "grad_norm": 3.0619239713428135, - "kl": 0.01727294921875, + "grad_norm": 2.5297084422395555, + "kl": 0.0113525390625, "learning_rate": 6.6e-07, - "loss": -0.045, - "reward": 1.8041464686393738, - "reward_std": 0.16933216899633408, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.375, - "rewards/repetition_penalty_reward": -0.07606191188097, - "rewards/tag_count_reward": 0.984375, + "loss": -0.0286, + "reward": 2.1816118955612183, + "reward_std": 0.38229691982269287, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.6805555820465088, + "rewards/repetition_penalty_reward": -0.03540213964879513, + "rewards/tag_count_reward": 0.9947916865348816, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 426.2708435058594, + "completion_length": 559.8958435058594, "epoch": 0.067, - "grad_norm": 2.810713513324775, - "kl": 0.018310546875, + "grad_norm": 2.672746029201746, + "kl": 0.012908935546875, "learning_rate": 6.7e-07, - "loss": -0.0049, - "reward": 1.7448714971542358, - "reward_std": 0.33972424268722534, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.3125, - "rewards/repetition_penalty_reward": -0.09367027133703232, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": -0.035, + "reward": 2.2214502096176147, + "reward_std": 0.5396545231342316, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.7152777910232544, + "rewards/repetition_penalty_reward": -0.04591094329953194, + "rewards/tag_count_reward": 0.9895833730697632, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 347.7083435058594, + "completion_length": 537.3125, "epoch": 0.068, - "grad_norm": 2.7669031060953864, - "kl": 0.01416015625, + "grad_norm": 2.920361106100061, + "kl": 0.019287109375, "learning_rate": 6.800000000000001e-07, - "loss": 0.0191, - "reward": 2.210191011428833, - "reward_std": 0.21266353875398636, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.3263888955116272, - "rewards/repetition_penalty_reward": -0.07453135401010513, - "rewards/tag_count_reward": 1.0, + "loss": 0.0507, + "reward": 2.291020631790161, + "reward_std": 0.49400143325328827, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.7361111342906952, + "rewards/repetition_penalty_reward": -0.049257127568125725, + "rewards/tag_count_reward": 0.9583333730697632, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 412.62501525878906, + "completion_length": 469.2708435058594, "epoch": 0.069, - "grad_norm": 2.9053594696881015, - "kl": 0.013397216796875, + "grad_norm": 2.901120243600892, + "kl": 0.017242431640625, "learning_rate": 6.9e-07, - "loss": 0.0197, - "reward": 2.097227096557617, - "reward_std": 0.3291797488927841, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.3402777761220932, - "rewards/repetition_penalty_reward": -0.11284245550632477, - "rewards/tag_count_reward": 0.9739583432674408, - "step": 69 + "loss": 0.0654, + "reward": 2.4167280197143555, + "reward_std": 0.5367273986339569, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.7638889849185944, + "rewards/repetition_penalty_reward": -0.03466090187430382, + "rewards/tag_count_reward": 1.0, + "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 500.0416717529297, + "completion_length": 504.6666717529297, "epoch": 0.07, - "grad_norm": 2.7234996193396603, - "kl": 0.014678955078125, + "grad_norm": 3.137395976127796, + "kl": 0.018524169921875, "learning_rate": 7e-07, - "loss": -0.0179, - "reward": 1.9091373085975647, - "reward_std": 0.39123236387968063, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.472222238779068, - "rewards/repetition_penalty_reward": -0.08391834422945976, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0147, + "reward": 2.4454265832901, + "reward_std": 0.5369938015937805, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.7083333730697632, + "rewards/repetition_penalty_reward": -0.04415685310959816, + "rewards/tag_count_reward": 0.9895833432674408, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 354.12501525878906, + "completion_length": 535.5000305175781, "epoch": 0.071, - "grad_norm": 3.1346607990284188, - "kl": 0.014984130859375, + "grad_norm": 2.6854335457859206, + "kl": 0.01251220703125, "learning_rate": 7.1e-07, - "loss": -0.0507, - "reward": 2.0297670364379883, - "reward_std": 0.15240045171231031, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.3263889104127884, - "rewards/repetition_penalty_reward": -0.08308036252856255, + "loss": 0.0458, + "reward": 2.375571846961975, + "reward_std": 0.4342511296272278, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.7222222685813904, + "rewards/repetition_penalty_reward": -0.028942234814167023, "rewards/tag_count_reward": 0.9947916865348816, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 455.0, + "completion_length": 527.6250305175781, "epoch": 0.072, - "grad_norm": 2.7389757257052847, - "kl": 0.01416015625, + "grad_norm": 2.736209803149782, + "kl": 0.01373291015625, "learning_rate": 7.2e-07, - "loss": 0.1859, - "reward": 2.180327534675598, - "reward_std": 0.36164169013500214, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.347222238779068, - "rewards/repetition_penalty_reward": -0.08876965194940567, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0066, + "reward": 2.4244872331619263, + "reward_std": 0.4428471177816391, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.7361111640930176, + "rewards/repetition_penalty_reward": -0.04079079441726208, + "rewards/tag_count_reward": 1.0, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 366.1041717529297, + "completion_length": 540.7708740234375, "epoch": 0.073, - "grad_norm": 2.601338769225808, - "kl": 0.015350341796875, + "grad_norm": 3.26658788883033, + "kl": 0.0230712890625, "learning_rate": 7.3e-07, - "loss": -0.0427, - "reward": 2.2693206071853638, - "reward_std": 0.1210118979215622, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.3541666716337204, - "rewards/repetition_penalty_reward": -0.0848461240530014, - "rewards/tag_count_reward": 1.0, + "loss": 0.05, + "reward": 2.218494176864624, + "reward_std": 0.44005706906318665, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.7291666865348816, + "rewards/repetition_penalty_reward": -0.036714269779622555, + "rewards/tag_count_reward": 0.984375, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 398.79168701171875, + "completion_length": 477.5833435058594, "epoch": 0.074, - "grad_norm": 2.8912870955102727, - "kl": 0.01483154296875, + "grad_norm": 2.85387562733274, + "kl": 0.015350341796875, "learning_rate": 7.4e-07, - "loss": 0.0874, - "reward": 2.229883313179016, - "reward_std": 0.21679867804050446, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.3958333283662796, - "rewards/repetition_penalty_reward": -0.07740835472941399, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0614, + "reward": 2.5582029819488525, + "reward_std": 0.46813952922821045, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.784722238779068, + "rewards/repetition_penalty_reward": -0.03901927825063467, + "rewards/tag_count_reward": 1.0, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 437.16668701171875, + "completion_length": 530.1041870117188, "epoch": 0.075, - "grad_norm": 2.708056713906426, - "kl": 0.01739501953125, + "grad_norm": 2.963663960294806, + "kl": 0.017822265625, "learning_rate": 7.5e-07, - "loss": 0.0224, - "reward": 2.0687904953956604, - "reward_std": 0.35561710596084595, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.3958333432674408, - "rewards/repetition_penalty_reward": -0.0926678255200386, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0242, + "reward": 2.356380820274353, + "reward_std": 0.5158264935016632, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.7083334028720856, + "rewards/repetition_penalty_reward": -0.03945251181721687, + "rewards/tag_count_reward": 1.0, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 324.25001525878906, + "completion_length": 542.7500305175781, "epoch": 0.076, - "grad_norm": 2.853565059481583, - "kl": 0.015869140625, + "grad_norm": 8.251127258415144, + "kl": 0.02813720703125, "learning_rate": 7.599999999999999e-07, - "loss": 0.0456, - "reward": 2.269968271255493, - "reward_std": 0.13344183191657066, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.3541666716337204, - "rewards/repetition_penalty_reward": -0.0633650440722704, - "rewards/tag_count_reward": 1.0, + "loss": 0.0219, + "reward": 2.4178924560546875, + "reward_std": 0.44688859581947327, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.7222222089767456, + "rewards/repetition_penalty_reward": -0.043913234025239944, + "rewards/tag_count_reward": 0.9895833432674408, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 496.8125305175781, + "completion_length": 547.0625, "epoch": 0.077, - "grad_norm": 2.5942080838090726, - "kl": 0.015411376953125, + "grad_norm": 3.4452973677390144, + "kl": 0.021881103515625, "learning_rate": 7.699999999999999e-07, - "loss": -0.0632, - "reward": 1.7249847650527954, - "reward_std": 0.3642410486936569, - "rewards/accuracy_reward": 0.416666679084301, - "rewards/reasoning_steps_reward": 0.4236110895872116, - "rewards/repetition_penalty_reward": -0.11008473113179207, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0425, + "reward": 2.3173056840896606, + "reward_std": 0.49892735481262207, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.75, + "rewards/repetition_penalty_reward": -0.04727764055132866, + "rewards/tag_count_reward": 0.9895833432674408, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 394.5208435058594, + "completion_length": 554.3958435058594, "epoch": 0.078, - "grad_norm": 2.8329024140183563, - "kl": 0.018890380859375, + "grad_norm": 2.5762620979216746, + "kl": 0.017181396484375, "learning_rate": 7.799999999999999e-07, - "loss": 0.0595, - "reward": 1.9577640295028687, - "reward_std": 0.4134839177131653, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.3819444626569748, - "rewards/repetition_penalty_reward": -0.0856388658285141, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": -0.0855, + "reward": 2.4648728370666504, + "reward_std": 0.5124194473028183, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.04380778409540653, + "rewards/tag_count_reward": 0.9947916865348816, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 347.1458435058594, + "completion_length": 490.83335876464844, "epoch": 0.079, - "grad_norm": 2.892862738023272, - "kl": 0.025146484375, + "grad_norm": 2.8936310930522597, + "kl": 0.0186767578125, "learning_rate": 7.9e-07, - "loss": -0.0007, - "reward": 2.163182020187378, - "reward_std": 0.22357018291950226, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.3680555522441864, - "rewards/repetition_penalty_reward": -0.07987364754080772, + "loss": 0.0224, + "reward": 2.5715928077697754, + "reward_std": 0.4545498937368393, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.7708334028720856, + "rewards/repetition_penalty_reward": -0.03257405199110508, "rewards/tag_count_reward": 1.0, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 408.37501525878906, + "completion_length": 505.16668701171875, "epoch": 0.08, - "grad_norm": 3.0665545773316762, - "kl": 0.0177001953125, + "grad_norm": 3.386283681350372, + "kl": 0.018951416015625, "learning_rate": 8e-07, - "loss": -0.0167, - "reward": 2.115744471549988, - "reward_std": 0.35287149250507355, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.4027777761220932, - "rewards/repetition_penalty_reward": -0.08911668509244919, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0251, + "reward": 2.380759596824646, + "reward_std": 0.40306712687015533, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.7708333432674408, + "rewards/repetition_penalty_reward": -0.03590711485594511, + "rewards/tag_count_reward": 1.0, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 425.37501525878906, + "completion_length": 518.7708435058594, "epoch": 0.081, - "grad_norm": 2.4400568852403133, - "kl": 0.01898193359375, + "grad_norm": 2.774591505446817, + "kl": 0.016845703125, "learning_rate": 8.1e-07, - "loss": 0.0346, - "reward": 2.1127939224243164, - "reward_std": 0.25317414104938507, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.3194444477558136, - "rewards/repetition_penalty_reward": -0.10248396545648575, + "loss": 0.1045, + "reward": 2.4150757789611816, + "reward_std": 0.34570978581905365, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.7638889253139496, + "rewards/repetition_penalty_reward": -0.03631327673792839, "rewards/tag_count_reward": 1.0, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 373.8541717529297, + "completion_length": 542.4791717529297, "epoch": 0.082, - "grad_norm": 3.083593323517966, - "kl": 0.0255126953125, + "grad_norm": 2.9728145387844758, + "kl": 0.01812744140625, "learning_rate": 8.199999999999999e-07, - "loss": -0.0628, - "reward": 2.049175262451172, - "reward_std": 0.11791650578379631, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.4027777910232544, - "rewards/repetition_penalty_reward": -0.09839428588747978, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0635, + "reward": 2.5557461977005005, + "reward_std": 0.4178060442209244, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.04147614166140556, + "rewards/tag_count_reward": 1.0, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 348.25001525878906, + "completion_length": 540.0208435058594, "epoch": 0.083, - "grad_norm": 2.6873467136691342, - "kl": 0.0216064453125, + "grad_norm": 2.624113800462916, + "kl": 0.02069091796875, "learning_rate": 8.299999999999999e-07, - "loss": 0.0013, - "reward": 2.1261686086654663, - "reward_std": 0.15822361409664154, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.4791667014360428, - "rewards/repetition_penalty_reward": -0.09778975695371628, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0095, + "reward": 2.4058659076690674, + "reward_std": 0.49703338742256165, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.7708334028720856, + "rewards/repetition_penalty_reward": -0.04205096513032913, + "rewards/tag_count_reward": 0.9895833432674408, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 370.0833435058594, + "completion_length": 538.7291870117188, "epoch": 0.084, - "grad_norm": 2.8775283862863605, - "kl": 0.0233154296875, + "grad_norm": 2.654590798347164, + "kl": 0.0184326171875, "learning_rate": 8.399999999999999e-07, - "loss": -0.0352, - "reward": 2.17717444896698, - "reward_std": 0.27370330691337585, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.4375, - "rewards/repetition_penalty_reward": -0.06761737167835236, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": -0.0225, + "reward": 2.4977999925613403, + "reward_std": 0.5065296292304993, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.847222238779068, + "rewards/repetition_penalty_reward": -0.03692223224788904, + "rewards/tag_count_reward": 1.0, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 356.2916717529297, + "completion_length": 558.7916870117188, "epoch": 0.085, - "grad_norm": 2.6682973627532265, - "kl": 0.027099609375, + "grad_norm": 2.5236300531262033, + "kl": 0.02142333984375, "learning_rate": 8.499999999999999e-07, - "loss": 0.0445, - "reward": 2.264863967895508, - "reward_std": 0.14276781305670738, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.3680555671453476, - "rewards/repetition_penalty_reward": -0.09798330813646317, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0286, + "reward": 2.3374500274658203, + "reward_std": 0.4774511754512787, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.854166716337204, + "rewards/repetition_penalty_reward": -0.03755011223256588, + "rewards/tag_count_reward": 1.0, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 359.0208435058594, + "completion_length": 557.6458435058594, "epoch": 0.086, - "grad_norm": 2.864857579564143, - "kl": 0.026123046875, + "grad_norm": 2.4520679561302434, + "kl": 0.01904296875, "learning_rate": 8.599999999999999e-07, - "loss": -0.048, - "reward": 1.9023111462593079, - "reward_std": 0.2422098070383072, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.4097222238779068, - "rewards/repetition_penalty_reward": -0.06470286473631859, + "loss": -0.0126, + "reward": 2.469316601753235, + "reward_std": 0.3855314701795578, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.046308472752571106, "rewards/tag_count_reward": 0.9947916865348816, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 401.37501525878906, + "completion_length": 537.1458435058594, "epoch": 0.087, - "grad_norm": 2.7888810070866397, - "kl": 0.0267333984375, + "grad_norm": 2.8696069886274858, + "kl": 0.02032470703125, "learning_rate": 8.699999999999999e-07, - "loss": 0.1502, - "reward": 2.093042731285095, - "reward_std": 0.22690971940755844, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.4305555373430252, - "rewards/repetition_penalty_reward": -0.09272129088640213, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0288, + "reward": 2.775725483894348, + "reward_std": 0.3478439748287201, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.04545513913035393, + "rewards/tag_count_reward": 0.9947916865348816, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 471.6041717529297, + "completion_length": 552.0625152587891, "epoch": 0.088, - "grad_norm": 3.0697190666757455, - "kl": 0.029541015625, + "grad_norm": 2.7328359228843984, + "kl": 0.0185546875, "learning_rate": 8.799999999999999e-07, - "loss": 0.0655, - "reward": 2.059904932975769, - "reward_std": 0.46183615922927856, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.5347222685813904, - "rewards/repetition_penalty_reward": -0.07898403704166412, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.1113, + "reward": 2.5936840772628784, + "reward_std": 0.2873719036579132, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.03826039098203182, + "rewards/tag_count_reward": 1.0, "step": 88 }, { "clip_ratio": 0.0, - "completion_length": 342.75001525878906, + "completion_length": 540.8125305175781, "epoch": 0.089, - "grad_norm": 2.701919883619728, - "kl": 0.0299072265625, + "grad_norm": 2.6685590803479307, + "kl": 0.02239990234375, "learning_rate": 8.9e-07, - "loss": 0.0239, - "reward": 2.3514446020126343, - "reward_std": 0.24513769149780273, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.4722222238779068, - "rewards/repetition_penalty_reward": -0.07911098375916481, - "rewards/tag_count_reward": 1.0, + "loss": 0.0224, + "reward": 2.3886401653289795, + "reward_std": 0.48963838815689087, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.03670708276331425, + "rewards/tag_count_reward": 0.9947916865348816, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 496.1041717529297, + "completion_length": 490.06251525878906, "epoch": 0.09, - "grad_norm": 3.6295103528982775, - "kl": 0.0301513671875, + "grad_norm": 3.839427427471072, + "kl": 0.024169921875, "learning_rate": 9e-07, - "loss": 0.1867, - "reward": 1.6698536276817322, - "reward_std": 0.45128779113292694, - "rewards/accuracy_reward": 0.3125000111758709, - "rewards/reasoning_steps_reward": 0.5347222536802292, - "rewards/repetition_penalty_reward": -0.10966024175286293, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0817, + "reward": 2.394170045852661, + "reward_std": 0.5097104758024216, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.8680555820465088, + "rewards/repetition_penalty_reward": -0.03638560324907303, + "rewards/tag_count_reward": 1.0, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 454.39585876464844, + "completion_length": 524.9791870117188, "epoch": 0.091, - "grad_norm": 2.7229312090729336, - "kl": 0.0328369140625, + "grad_norm": 2.553614340168228, + "kl": 0.02398681640625, "learning_rate": 9.1e-07, - "loss": -0.0062, - "reward": 2.324805498123169, - "reward_std": 0.34519773721694946, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.5763889253139496, - "rewards/repetition_penalty_reward": -0.10054175183176994, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0725, + "reward": 2.6662930250167847, + "reward_std": 0.20492929220199585, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03509608097374439, + "rewards/tag_count_reward": 1.0, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 395.375, + "completion_length": 515.0625305175781, "epoch": 0.092, - "grad_norm": 3.2764187745329334, - "kl": 0.03564453125, + "grad_norm": 2.79663742057304, + "kl": 0.02203369140625, "learning_rate": 9.2e-07, - "loss": 0.0182, - "reward": 2.0280832648277283, - "reward_std": 0.34830891340970993, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.4722222536802292, - "rewards/repetition_penalty_reward": -0.08476399630308151, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0525, + "reward": 2.7029329538345337, + "reward_std": 0.3332049995660782, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.04185891151428223, + "rewards/tag_count_reward": 0.9947916865348816, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 377.8958435058594, + "completion_length": 586.6875, "epoch": 0.093, - "grad_norm": 2.687381236546485, - "kl": 0.0408935546875, + "grad_norm": 2.51813341735105, + "kl": 0.02301025390625, "learning_rate": 9.3e-07, - "loss": -0.0932, - "reward": 2.1497602462768555, - "reward_std": 0.2074139527976513, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.5763889253139496, - "rewards/repetition_penalty_reward": -0.08808711543679237, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0461, + "reward": 2.7553551197052, + "reward_std": 0.41791823506355286, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.04499218240380287, + "rewards/tag_count_reward": 0.9947916865348816, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 435.7083435058594, + "completion_length": 589.2708435058594, "epoch": 0.094, - "grad_norm": 2.4969522460438625, - "kl": 0.040283203125, + "grad_norm": 2.5842043324661863, + "kl": 0.0238037109375, "learning_rate": 9.399999999999999e-07, - "loss": -0.0224, - "reward": 2.3433005809783936, - "reward_std": 0.3568440079689026, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.590277761220932, - "rewards/repetition_penalty_reward": -0.10114402696490288, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0528, + "reward": 2.252669334411621, + "reward_std": 0.3811512589454651, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.04073341749608517, + "rewards/tag_count_reward": 0.9947916865348816, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 445.9791717529297, + "completion_length": 564.7708435058594, "epoch": 0.095, - "grad_norm": 2.9813291847297765, - "kl": 0.037109375, + "grad_norm": 2.5488718443741667, + "kl": 0.02752685546875, "learning_rate": 9.499999999999999e-07, - "loss": 0.1404, - "reward": 2.1387113332748413, - "reward_std": 0.32204214483499527, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.5902778208255768, - "rewards/repetition_penalty_reward": -0.09740003198385239, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.1107, + "reward": 2.6501858234405518, + "reward_std": 0.27840781956911087, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.04425876401364803, + "rewards/tag_count_reward": 1.0, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 520.3125, + "completion_length": 559.2708740234375, "epoch": 0.096, - "grad_norm": 2.6290316447165245, - "kl": 0.03778076171875, + "grad_norm": 2.631139494749351, + "kl": 0.02752685546875, "learning_rate": 9.6e-07, - "loss": 0.0233, - "reward": 2.0847679376602173, - "reward_std": 0.44419676065444946, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.666666716337204, - "rewards/repetition_penalty_reward": -0.09231549128890038, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.017, + "reward": 2.412258505821228, + "reward_std": 0.4537012577056885, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.04607492312788963, + "rewards/tag_count_reward": 1.0, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 417.4791717529297, + "completion_length": 606.6666870117188, "epoch": 0.097, - "grad_norm": 3.192037625566893, - "kl": 0.038330078125, + "grad_norm": 2.8652564620163288, + "kl": 0.02764892578125, "learning_rate": 9.7e-07, - "loss": 0.0554, - "reward": 2.368804693222046, - "reward_std": 0.3243536055088043, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.5555555522441864, - "rewards/repetition_penalty_reward": -0.08779259771108627, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0402, + "reward": 2.6432669162750244, + "reward_std": 0.47565995156764984, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.04596925526857376, + "rewards/tag_count_reward": 0.9947916865348816, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 354.29168701171875, + "completion_length": 506.4583435058594, "epoch": 0.098, - "grad_norm": 2.852420709803875, - "kl": 0.04541015625, + "grad_norm": 2.701632712968642, + "kl": 0.032470703125, "learning_rate": 9.8e-07, - "loss": -0.036, - "reward": 2.1490139961242676, - "reward_std": 0.32155825197696686, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.5555556416511536, - "rewards/repetition_penalty_reward": -0.08362487703561783, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": -0.0494, + "reward": 2.5521618127822876, + "reward_std": 0.4033654034137726, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.0450606532394886, + "rewards/tag_count_reward": 1.0, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 433.5416717529297, + "completion_length": 570.2916870117188, "epoch": 0.099, - "grad_norm": 2.767278805413835, - "kl": 0.040283203125, + "grad_norm": 2.578693808208252, + "kl": 0.03271484375, "learning_rate": 9.9e-07, - "loss": 0.0133, - "reward": 2.1486786603927612, - "reward_std": 0.2824034094810486, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.09090467914938927, - "rewards/tag_count_reward": 0.96875, + "loss": -0.0427, + "reward": 2.4133933782577515, + "reward_std": 0.30207425355911255, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03973175957798958, + "rewards/tag_count_reward": 0.9947916865348816, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 395.91668701171875, + "completion_length": 614.3333740234375, "epoch": 0.1, - "grad_norm": 3.230254871990257, - "kl": 0.047607421875, + "grad_norm": 2.67358470309149, + "kl": 0.028564453125, "learning_rate": 1e-06, - "loss": 0.0681, - "reward": 2.372018337249756, - "reward_std": 0.29652033746242523, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.06374558806419373, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": -0.0598, + "reward": 2.3288190364837646, + "reward_std": 0.3720841705799103, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.8819445073604584, + "rewards/repetition_penalty_reward": -0.05312554910778999, + "rewards/tag_count_reward": 1.0, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 390.2291717529297, + "completion_length": 558.4375, "epoch": 0.101, - "grad_norm": 2.546016708094516, - "kl": 0.0465087890625, + "grad_norm": 2.557702270091156, + "kl": 0.0347900390625, "learning_rate": 9.999972584460056e-07, - "loss": 0.0067, - "reward": 2.3440247774124146, - "reward_std": 0.32892371714115143, + "loss": 0.0339, + "reward": 2.7004672288894653, + "reward_std": 0.3648904711008072, "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.604166716337204, - "rewards/repetition_penalty_reward": -0.07785026356577873, - "rewards/tag_count_reward": 0.9843750298023224, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.05126911960542202, + "rewards/tag_count_reward": 0.9947916865348816, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 343.125, + "completion_length": 596.1458435058594, "epoch": 0.102, - "grad_norm": 2.8757844950968634, - "kl": 0.0611572265625, + "grad_norm": 2.7419447112432644, + "kl": 0.033935546875, "learning_rate": 9.999890338174275e-07, - "loss": -0.0118, - "reward": 2.403480887413025, - "reward_std": 0.28830624371767044, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.6250000596046448, - "rewards/repetition_penalty_reward": -0.07047748565673828, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0276, + "reward": 2.4844894409179688, + "reward_std": 0.4377788305282593, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.043288541957736015, + "rewards/tag_count_reward": 1.0, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 382.79168701171875, + "completion_length": 549.3125305175781, "epoch": 0.103, - "grad_norm": 2.831030324735425, - "kl": 0.0604248046875, + "grad_norm": 2.5449102277843556, + "kl": 0.0384521484375, "learning_rate": 9.999753262144804e-07, - "loss": 0.0259, - "reward": 2.2483108043670654, - "reward_std": 0.42505571246147156, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.09891153872013092, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.091, + "reward": 2.214192271232605, + "reward_std": 0.41757629811763763, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.930555522441864, + "rewards/repetition_penalty_reward": -0.04969660937786102, + "rewards/tag_count_reward": 1.0, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 393.8958435058594, + "completion_length": 503.6458435058594, "epoch": 0.104, - "grad_norm": 2.6419879754320323, - "kl": 0.055908203125, + "grad_norm": 2.5988919390766134, + "kl": 0.038330078125, "learning_rate": 9.999561358041868e-07, - "loss": -0.025, - "reward": 2.327350378036499, - "reward_std": 0.33883772790431976, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.604166716337204, - "rewards/repetition_penalty_reward": -0.06848318129777908, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": -0.0306, + "reward": 2.473710536956787, + "reward_std": 0.36740100383758545, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.04712275043129921, + "rewards/tag_count_reward": 1.0, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 490.9375, + "completion_length": 584.5833435058594, "epoch": 0.105, - "grad_norm": 2.6277015983617718, - "kl": 0.0469970703125, + "grad_norm": 2.2253173220096847, + "kl": 0.03759765625, "learning_rate": 9.99931462820376e-07, - "loss": 0.1988, - "reward": 2.3519784212112427, - "reward_std": 0.40161918103694916, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.6458333730697632, - "rewards/repetition_penalty_reward": -0.11677172780036926, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": -0.0418, + "reward": 2.6660202741622925, + "reward_std": 0.39214441180229187, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04231319762766361, + "rewards/tag_count_reward": 1.0, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 400.00001525878906, + "completion_length": 580.5833435058594, "epoch": 0.106, - "grad_norm": 3.0757465996070934, - "kl": 0.0616455078125, + "grad_norm": 2.740740338946049, + "kl": 0.03857421875, "learning_rate": 9.999013075636804e-07, - "loss": 0.0222, - "reward": 2.2983983755111694, - "reward_std": 0.24760716408491135, + "loss": -0.0301, + "reward": 2.671258807182312, + "reward_std": 0.35463356226682663, "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.06965714879333973, - "rewards/tag_count_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.044019101187586784, + "rewards/tag_count_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 368.5625, + "completion_length": 552.8958740234375, "epoch": 0.107, - "grad_norm": 2.3290020631157335, - "kl": 0.0606689453125, + "grad_norm": 2.6710939233020636, + "kl": 0.0406494140625, "learning_rate": 9.998656704015323e-07, - "loss": -0.0136, - "reward": 2.5443495512008667, - "reward_std": 0.17923131585121155, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.1066923514008522, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0481, + "reward": 2.4425666332244873, + "reward_std": 0.36045634746551514, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.04528068192303181, + "rewards/tag_count_reward": 0.9947916865348816, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 442.8750305175781, + "completion_length": 521.7500305175781, "epoch": 0.108, - "grad_norm": 2.718520980839577, - "kl": 0.0548095703125, + "grad_norm": 2.9062446352100086, + "kl": 0.0419921875, "learning_rate": 9.998245517681593e-07, - "loss": 0.0682, - "reward": 2.093179702758789, - "reward_std": 0.25339990854263306, - "rewards/accuracy_reward": 0.5208333544433117, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.07175076752901077, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0707, + "reward": 2.725650668144226, + "reward_std": 0.3536549210548401, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.05212709680199623, + "rewards/tag_count_reward": 1.0, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 363.4166717529297, + "completion_length": 473.10418701171875, "epoch": 0.109, - "grad_norm": 3.186327469598663, - "kl": 0.071533203125, + "grad_norm": 2.840139368335502, + "kl": 0.048583984375, "learning_rate": 9.997779521645791e-07, - "loss": -0.0026, - "reward": 2.538771867752075, - "reward_std": 0.134703166782856, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.6180555820465088, - "rewards/repetition_penalty_reward": -0.07928365468978882, + "loss": 0.0222, + "reward": 2.534466028213501, + "reward_std": 0.2846931293606758, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.028034010902047157, "rewards/tag_count_reward": 1.0, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 456.10418701171875, + "completion_length": 565.9375305175781, "epoch": 0.11, - "grad_norm": 5.949272957720982, - "kl": 0.055908203125, + "grad_norm": 2.2512653144943884, + "kl": 0.0401611328125, "learning_rate": 9.997258721585931e-07, - "loss": 0.3116, - "reward": 2.149649143218994, - "reward_std": 0.4325401335954666, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.6736111640930176, - "rewards/repetition_penalty_reward": -0.09687869995832443, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0075, + "reward": 2.7475154399871826, + "reward_std": 0.3540599048137665, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.04935969039797783, + "rewards/tag_count_reward": 0.9843750298023224, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 344.2083435058594, + "completion_length": 553.3125, "epoch": 0.111, - "grad_norm": 2.8410771586992376, - "kl": 0.076416015625, + "grad_norm": 2.527203584806038, + "kl": 0.04296875, "learning_rate": 9.996683123847795e-07, - "loss": -0.0838, - "reward": 2.291150450706482, - "reward_std": 0.11116452049463987, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.6458333730697632, - "rewards/repetition_penalty_reward": -0.07864130288362503, + "loss": 0.0512, + "reward": 2.7313212156295776, + "reward_std": 0.2547169327735901, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04124833457171917, "rewards/tag_count_reward": 0.9947916865348816, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 366.2291717529297, + "completion_length": 536.8958740234375, "epoch": 0.112, - "grad_norm": 3.0910502587176256, - "kl": 0.072509765625, + "grad_norm": 2.7442985147195307, + "kl": 0.044189453125, "learning_rate": 9.996052735444862e-07, - "loss": 0.0026, - "reward": 2.300261616706848, - "reward_std": 0.15234437957406044, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.0677939560264349, - "rewards/tag_count_reward": 1.0, + "loss": 0.0349, + "reward": 2.7351317405700684, + "reward_std": 0.31766992807388306, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.03917396813631058, + "rewards/tag_count_reward": 0.9895833730697632, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 437.00001525878906, + "completion_length": 633.9375, "epoch": 0.113, - "grad_norm": 2.695954355704197, - "kl": 0.0538330078125, + "grad_norm": 2.206627822892764, + "kl": 0.0399169921875, "learning_rate": 9.995367564058216e-07, - "loss": -0.0081, - "reward": 2.272398829460144, - "reward_std": 0.388952374458313, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.6458333730697632, - "rewards/repetition_penalty_reward": -0.08176786452531815, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0002, + "reward": 2.4217275381088257, + "reward_std": 0.37440885603427887, + "rewards/accuracy_reward": 0.5000000223517418, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.050494687631726265, + "rewards/tag_count_reward": 1.0, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 384.12501525878906, + "completion_length": 559.5000305175781, "epoch": 0.114, - "grad_norm": 4.792325770511148, - "kl": 0.09130859375, + "grad_norm": 2.9323115322073567, + "kl": 0.048583984375, "learning_rate": 9.994627618036452e-07, - "loss": 0.0385, - "reward": 2.3695743083953857, - "reward_std": 0.272603839635849, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.6527777910232544, - "rewards/repetition_penalty_reward": -0.07487009279429913, + "loss": 0.0838, + "reward": 2.7629553079605103, + "reward_std": 0.3546500727534294, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04260038584470749, "rewards/tag_count_reward": 1.0, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 361.4583435058594, + "completion_length": 600.7291870117188, "epoch": 0.115, - "grad_norm": 3.2951924014269696, - "kl": 0.077392578125, + "grad_norm": 2.281029530481033, + "kl": 0.0433349609375, "learning_rate": 9.993832906395582e-07, - "loss": 0.0976, - "reward": 2.5152515172958374, - "reward_std": 0.2316112518310547, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.08891518414020538, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0435, + "reward": 2.6696189641952515, + "reward_std": 0.3239249736070633, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05260329693555832, + "rewards/tag_count_reward": 1.0, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 392.5, + "completion_length": 571.4791870117188, "epoch": 0.116, - "grad_norm": 2.8048407252075314, - "kl": 0.071044921875, + "grad_norm": 2.339412143248174, + "kl": 0.0447998046875, "learning_rate": 9.992983438818915e-07, - "loss": 0.0331, - "reward": 2.024270176887512, - "reward_std": 0.3066788762807846, - "rewards/accuracy_reward": 0.4583333544433117, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.08510496094822884, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": -0.0459, + "reward": 2.695494771003723, + "reward_std": 0.3602631092071533, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.05276927165687084, + "rewards/tag_count_reward": 0.984375, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 371.9791717529297, + "completion_length": 569.2291870117188, "epoch": 0.117, - "grad_norm": 2.5739325280504755, - "kl": 0.0908203125, + "grad_norm": 2.6819516393809444, + "kl": 0.0435791015625, "learning_rate": 9.992079225656944e-07, - "loss": 0.0496, - "reward": 2.431204080581665, - "reward_std": 0.24455638974905014, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.08789315819740295, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0445, + "reward": 2.677255153656006, + "reward_std": 0.2550307661294937, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.038022806867957115, + "rewards/tag_count_reward": 1.0, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 344.04168701171875, + "completion_length": 606.5, "epoch": 0.118, - "grad_norm": 4.098782512032254, - "kl": 0.084716796875, + "grad_norm": 2.7553746000465016, + "kl": 0.0418701171875, "learning_rate": 9.991120277927223e-07, - "loss": 0.0221, - "reward": 2.1492127180099487, - "reward_std": 0.3605824261903763, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.5972222685813904, - "rewards/repetition_penalty_reward": -0.06780127808451653, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0683, + "reward": 2.683447241783142, + "reward_std": 0.4267844557762146, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.045719537883996964, + "rewards/tag_count_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 317.7083435058594, + "completion_length": 589.4583740234375, "epoch": 0.119, - "grad_norm": 2.530117932966495, - "kl": 0.086669921875, + "grad_norm": 2.5283730612634168, + "kl": 0.0426025390625, "learning_rate": 9.990106607314225e-07, - "loss": 0.0474, - "reward": 2.533785820007324, - "reward_std": 0.20052310824394226, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.06517258286476135, - "rewards/tag_count_reward": 0.9947916865348816, - "step": 119 + "loss": -0.0626, + "reward": 2.5398285388946533, + "reward_std": 0.37042136490345, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.05044914782047272, + "rewards/tag_count_reward": 1.0, + "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 405.6666717529297, + "completion_length": 581.7500305175781, "epoch": 0.12, - "grad_norm": 3.3247747829113155, - "kl": 0.0830078125, + "grad_norm": 2.649956713864194, + "kl": 0.04931640625, "learning_rate": 9.989038226169207e-07, - "loss": -0.0824, - "reward": 1.8791496753692627, - "reward_std": 0.42374372482299805, - "rewards/accuracy_reward": 0.375, - "rewards/reasoning_steps_reward": 0.6041667461395264, - "rewards/repetition_penalty_reward": -0.08960039168596268, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0635, + "reward": 2.569565773010254, + "reward_std": 0.4911084771156311, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.04154558852314949, + "rewards/tag_count_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 389.4583435058594, + "completion_length": 636.6666870117188, "epoch": 0.121, - "grad_norm": 3.1509140115178886, - "kl": 0.0830078125, + "grad_norm": 2.7429948870181233, + "kl": 0.0418701171875, "learning_rate": 9.98791514751006e-07, - "loss": 0.216, - "reward": 2.2920992374420166, - "reward_std": 0.3410159945487976, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.7222222685813904, - "rewards/repetition_penalty_reward": -0.08116474375128746, - "rewards/tag_count_reward": 0.984375, + "loss": 0.13, + "reward": 2.663628339767456, + "reward_std": 0.396682009100914, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.060329992324113846, + "rewards/tag_count_reward": 0.9947916865348816, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 359.7916717529297, + "completion_length": 596.2083435058594, "epoch": 0.122, - "grad_norm": 2.9375585818738337, - "kl": 0.072021484375, + "grad_norm": 2.4221170538253816, + "kl": 0.045166015625, "learning_rate": 9.98673738502114e-07, - "loss": 0.0407, - "reward": 2.3687384128570557, - "reward_std": 0.3248277008533478, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.6597222685813904, - "rewards/repetition_penalty_reward": -0.0618172250688076, - "rewards/tag_count_reward": 1.0, + "loss": 0.0342, + "reward": 2.6282721757888794, + "reward_std": 0.46036189794540405, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04707522317767143, + "rewards/tag_count_reward": 0.9947916865348816, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 357.7916717529297, + "completion_length": 610.0416870117188, "epoch": 0.123, - "grad_norm": 2.943759498608235, - "kl": 0.079833984375, + "grad_norm": 2.268992560964926, + "kl": 0.045166015625, "learning_rate": 9.985504953053113e-07, - "loss": -0.0106, - "reward": 2.511977434158325, - "reward_std": 0.1637960821390152, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.07135587558150291, + "loss": -0.0075, + "reward": 2.5075987577438354, + "reward_std": 0.4839261472225189, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.047956960275769234, "rewards/tag_count_reward": 1.0, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 372.50001525878906, + "completion_length": 637.8333435058594, "epoch": 0.124, - "grad_norm": 2.8101729095936925, - "kl": 0.076904296875, + "grad_norm": 2.3192412067695276, + "kl": 0.040283203125, "learning_rate": 9.98421786662277e-07, - "loss": -0.0021, - "reward": 2.5283435583114624, - "reward_std": 0.2503781318664551, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.6597222685813904, - "rewards/repetition_penalty_reward": -0.068878673017025, - "rewards/tag_count_reward": 1.0, + "loss": 0.0276, + "reward": 2.580567240715027, + "reward_std": 0.40886104106903076, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.060057852417230606, + "rewards/tag_count_reward": 0.9947916865348816, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 379.6666717529297, + "completion_length": 550.6875305175781, "epoch": 0.125, - "grad_norm": 2.8368100626005748, - "kl": 0.075439453125, + "grad_norm": 2.6428669262747935, + "kl": 0.0528564453125, "learning_rate": 9.982876141412855e-07, - "loss": -0.0021, - "reward": 2.1703072786331177, - "reward_std": 0.22792606800794601, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.06580384261906147, - "rewards/tag_count_reward": 1.0, + "loss": -0.021, + "reward": 2.6972975730895996, + "reward_std": 0.30313703417778015, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.0353414136916399, + "rewards/tag_count_reward": 0.9895833432674408, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 412.1458435058594, + "completion_length": 680.6041870117188, "epoch": 0.126, - "grad_norm": 2.4562837566860463, - "kl": 0.06787109375, + "grad_norm": 2.3227570325460047, + "kl": 0.0430908203125, "learning_rate": 9.981479793771866e-07, - "loss": -0.045, - "reward": 2.348684072494507, - "reward_std": 0.3084895759820938, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.07492720708251, - "rewards/tag_count_reward": 1.0, + "loss": 0.0752, + "reward": 2.591596007347107, + "reward_std": 0.49062085151672363, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.04902906343340874, + "rewards/tag_count_reward": 0.9947916865348816, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 413.8958435058594, + "completion_length": 567.7291870117188, "epoch": 0.127, - "grad_norm": 2.876398237422394, - "kl": 0.06787109375, + "grad_norm": 2.5341618125591894, + "kl": 0.0494384765625, "learning_rate": 9.98002884071386e-07, - "loss": -0.0249, - "reward": 2.1730523109436035, - "reward_std": 0.31859181821346283, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.04222557134926319, + "loss": 0.0338, + "reward": 2.735344886779785, + "reward_std": 0.3433926999568939, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04243295639753342, "rewards/tag_count_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 414.5208435058594, + "completion_length": 611.7291870117188, "epoch": 0.128, - "grad_norm": 2.964976670140139, - "kl": 0.095458984375, + "grad_norm": 84.62793212916526, + "kl": 0.50634765625, "learning_rate": 9.97852329991824e-07, - "loss": 0.0593, - "reward": 2.3709412813186646, - "reward_std": 0.2837936729192734, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.6875000596046448, - "rewards/repetition_penalty_reward": -0.0873921848833561, - "rewards/tag_count_reward": 1.0, + "loss": 0.0161, + "reward": 2.649895191192627, + "reward_std": 0.4249277561903, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.04628550261259079, + "rewards/tag_count_reward": 0.9947916865348816, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 408.91668701171875, + "completion_length": 587.2083740234375, "epoch": 0.129, - "grad_norm": 2.4227774313297106, - "kl": 0.06640625, + "grad_norm": 8.417728255751756, + "kl": 0.0889892578125, "learning_rate": 9.976963189729547e-07, - "loss": 0.0011, - "reward": 2.023462414741516, - "reward_std": 0.28090299665927887, - "rewards/accuracy_reward": 0.4583333358168602, - "rewards/reasoning_steps_reward": 0.6458333730697632, - "rewards/repetition_penalty_reward": -0.08070430159568787, + "loss": 0.0735, + "reward": 2.7577908039093018, + "reward_std": 0.3766750693321228, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.04082038067281246, "rewards/tag_count_reward": 1.0, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 418.0416717529297, + "completion_length": 592.8958435058594, "epoch": 0.13, - "grad_norm": 2.97356155376394, - "kl": 0.0703125, + "grad_norm": 2.422421343225394, + "kl": 0.047607421875, "learning_rate": 9.975348529157229e-07, - "loss": -0.0087, - "reward": 1.900796353816986, - "reward_std": 0.37193985283374786, - "rewards/accuracy_reward": 0.2916666716337204, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.05753708258271217, - "rewards/tag_count_reward": 1.0, + "loss": 0.0476, + "reward": 2.71326220035553, + "reward_std": 0.3403441533446312, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.05236278846859932, + "rewards/tag_count_reward": 0.9947916865348816, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 444.18751525878906, + "completion_length": 592.1250305175781, "epoch": 0.131, - "grad_norm": 2.7230165239061153, - "kl": 0.0732421875, + "grad_norm": 3.1437399948807276, + "kl": 0.0548095703125, "learning_rate": 9.973679337875418e-07, - "loss": 0.0744, - "reward": 2.3731343746185303, - "reward_std": 0.4187423288822174, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.6597222685813904, - "rewards/repetition_penalty_reward": -0.07304620742797852, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0466, + "reward": 2.6631596088409424, + "reward_std": 0.37047071754932404, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04170159809291363, + "rewards/tag_count_reward": 0.9895833432674408, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 427.8541717529297, + "completion_length": 531.9791870117188, "epoch": 0.132, - "grad_norm": 2.6377196224388655, - "kl": 0.073486328125, + "grad_norm": 2.5320955473297437, + "kl": 0.052490234375, "learning_rate": 9.971955636222684e-07, - "loss": 0.0429, - "reward": 2.569479465484619, - "reward_std": 0.21691420674324036, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.694444477558136, - "rewards/repetition_penalty_reward": -0.08329831808805466, + "loss": 0.0346, + "reward": 2.73369300365448, + "reward_std": 0.3214843422174454, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04408488981425762, "rewards/tag_count_reward": 1.0, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 349.0416717529297, + "completion_length": 654.1875305175781, "epoch": 0.133, - "grad_norm": 2.8574333678150747, - "kl": 0.07568359375, + "grad_norm": 2.3742871546515545, + "kl": 0.047607421875, "learning_rate": 9.970177445201783e-07, - "loss": 0.043, - "reward": 2.5894997119903564, - "reward_std": 0.19367820769548416, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.7291666865348816, - "rewards/repetition_penalty_reward": -0.05633382312953472, + "loss": 0.0042, + "reward": 2.523491144180298, + "reward_std": 0.49983178079128265, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.045953478664159775, "rewards/tag_count_reward": 1.0, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 368.9583435058594, + "completion_length": 567.6458740234375, "epoch": 0.134, - "grad_norm": 2.503403717143456, - "kl": 0.081298828125, + "grad_norm": 2.5745090813340257, + "kl": 0.052734375, "learning_rate": 9.968344786479415e-07, - "loss": 0.0002, - "reward": 2.6074156761169434, - "reward_std": 0.16955932043492794, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.7013888955116272, - "rewards/repetition_penalty_reward": -0.07314002513885498, - "rewards/tag_count_reward": 1.0, + "loss": 0.0497, + "reward": 2.654099702835083, + "reward_std": 0.32977308332920074, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.05770585313439369, + "rewards/tag_count_reward": 0.9895833730697632, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 350.0625, + "completion_length": 591.4375305175781, "epoch": 0.135, - "grad_norm": 2.797751039542394, - "kl": 0.083740234375, + "grad_norm": 2.613972316646787, + "kl": 0.0504150390625, "learning_rate": 9.96645768238595e-07, - "loss": 0.032, - "reward": 2.4297925233840942, - "reward_std": 0.3040465787053108, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.05284643918275833, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0264, + "reward": 2.715443730354309, + "reward_std": 0.4562966376543045, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.04323691129684448, + "rewards/tag_count_reward": 0.9947916865348816, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 348.56251525878906, + "completion_length": 608.5208435058594, "epoch": 0.136, - "grad_norm": 3.1995443879686944, - "kl": 0.083984375, + "grad_norm": 2.405934822243009, + "kl": 0.0482177734375, "learning_rate": 9.964516155915151e-07, - "loss": -0.083, - "reward": 2.290953040122986, - "reward_std": 0.17754915356636047, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.06668580323457718, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0707, + "reward": 2.7309588193893433, + "reward_std": 0.42438623309135437, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.06244398467242718, + "rewards/tag_count_reward": 0.9739583432674408, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 428.5208435058594, + "completion_length": 698.9375305175781, "epoch": 0.137, - "grad_norm": 2.7704057366539523, - "kl": 0.068359375, + "grad_norm": 2.4898466567094255, + "kl": 0.041259765625, "learning_rate": 9.962520230723906e-07, - "loss": 0.0314, - "reward": 2.2051353454589844, - "reward_std": 0.3088246285915375, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.6666666865348816, - "rewards/repetition_penalty_reward": -0.08132302761077881, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0047, + "reward": 2.7930511236190796, + "reward_std": 0.3140558898448944, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06458798050880432, + "rewards/tag_count_reward": 0.9895833432674408, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 457.5208435058594, + "completion_length": 529.0208435058594, "epoch": 0.138, - "grad_norm": 2.428709377180155, - "kl": 0.073486328125, + "grad_norm": 2.8868557758723306, + "kl": 0.0540771484375, "learning_rate": 9.960469931131936e-07, - "loss": 0.0206, - "reward": 2.332241177558899, - "reward_std": 0.1566816307604313, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.06706438213586807, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0636, + "reward": 2.6102973222732544, + "reward_std": 0.42000116407871246, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.0372721990570426, + "rewards/tag_count_reward": 0.9947916865348816, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 388.3333435058594, + "completion_length": 525.6875305175781, "epoch": 0.139, - "grad_norm": 2.8127717910115493, - "kl": 0.06640625, + "grad_norm": 2.8464042704145114, + "kl": 0.058349609375, "learning_rate": 9.958365282121496e-07, - "loss": 0.0156, - "reward": 2.2932348251342773, - "reward_std": 0.1739935651421547, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.7152778208255768, - "rewards/repetition_penalty_reward": -0.06787643954157829, + "loss": 0.0587, + "reward": 2.6924026012420654, + "reward_std": 0.3652355223894119, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.04370862618088722, "rewards/tag_count_reward": 1.0, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 407.5625, + "completion_length": 623.0416870117188, "epoch": 0.14, - "grad_norm": 2.722141296744586, - "kl": 0.06884765625, + "grad_norm": 2.5535766431944555, + "kl": 0.05078125, "learning_rate": 9.956206309337066e-07, - "loss": 0.0421, - "reward": 2.3178335428237915, - "reward_std": 0.2593873590230942, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.09188878536224365, + "loss": -0.0369, + "reward": 2.5890008211135864, + "reward_std": 0.43424585461616516, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.049888189882040024, "rewards/tag_count_reward": 1.0, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 358.9791717529297, + "completion_length": 527.7291870117188, "epoch": 0.141, - "grad_norm": 3.1334970804201157, - "kl": 0.082763671875, + "grad_norm": 2.5331968568084804, + "kl": 0.055419921875, "learning_rate": 9.953993039085048e-07, - "loss": 0.0777, - "reward": 2.665117383003235, - "reward_std": 0.10373461246490479, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.7291666865348816, - "rewards/repetition_penalty_reward": -0.06404940038919449, - "rewards/tag_count_reward": 1.0, + "loss": 0.0256, + "reward": 2.5839684009552, + "reward_std": 0.30562296509742737, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.0427678357809782, + "rewards/tag_count_reward": 0.9947916865348816, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 428.5208435058594, + "completion_length": 631.0625152587891, "epoch": 0.142, - "grad_norm": 2.814219695456218, - "kl": 0.072265625, + "grad_norm": 3.9726646160122283, + "kl": 0.0572509765625, "learning_rate": 9.951725498333448e-07, - "loss": 0.0225, - "reward": 2.2822248935699463, - "reward_std": 0.2850118428468704, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.6875000298023224, - "rewards/repetition_penalty_reward": -0.0719418115913868, + "loss": 0.0009, + "reward": 2.697226047515869, + "reward_std": 0.40920713543891907, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03888525255024433, "rewards/tag_count_reward": 1.0, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 326.0833435058594, + "completion_length": 608.5416870117188, "epoch": 0.143, - "grad_norm": 2.9383236871980687, - "kl": 0.091552734375, + "grad_norm": 2.415561560430353, + "kl": 0.052734375, "learning_rate": 9.949403714711526e-07, - "loss": 0.0479, - "reward": 2.626361608505249, - "reward_std": 0.04842622019350529, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.6805556118488312, - "rewards/repetition_penalty_reward": -0.05419414862990379, - "rewards/tag_count_reward": 1.0, + "loss": 0.0716, + "reward": 2.7793259620666504, + "reward_std": 0.3527311235666275, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.045327022671699524, + "rewards/tag_count_reward": 0.9843750298023224, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 357.1458435058594, + "completion_length": 589.875, "epoch": 0.144, - "grad_norm": 2.8343189390720314, - "kl": 0.09033203125, + "grad_norm": 2.9100333446880744, + "kl": 0.063232421875, "learning_rate": 9.947027716509488e-07, - "loss": -0.0253, - "reward": 2.5396625995635986, - "reward_std": 0.21431493014097214, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.057559750974178314, - "rewards/tag_count_reward": 1.0, + "loss": 0.0006, + "reward": 2.563896417617798, + "reward_std": 0.4373708665370941, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.043742526322603226, + "rewards/tag_count_reward": 0.9895833432674408, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 401.60418701171875, + "completion_length": 566.2916870117188, "epoch": 0.145, - "grad_norm": 3.005269072462132, - "kl": 0.08349609375, + "grad_norm": 2.4477067800919987, + "kl": 0.0556640625, "learning_rate": 9.944597532678119e-07, - "loss": 0.0173, - "reward": 2.501742959022522, - "reward_std": 0.3763193339109421, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.7152778208255768, - "rewards/repetition_penalty_reward": -0.06770160421729088, + "loss": -0.0456, + "reward": 2.466762065887451, + "reward_std": 0.48124293982982635, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.040182387456297874, "rewards/tag_count_reward": 1.0, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 384.7083435058594, + "completion_length": 527.2708740234375, "epoch": 0.146, - "grad_norm": 2.684066882388958, - "kl": 0.078369140625, + "grad_norm": 2.735483077486635, + "kl": 0.06005859375, "learning_rate": 9.942113192828444e-07, - "loss": 0.0024, - "reward": 2.0255126357078552, - "reward_std": 0.2608904615044594, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.6944445371627808, - "rewards/repetition_penalty_reward": -0.043931856751441956, + "loss": 0.0496, + "reward": 2.605388879776001, + "reward_std": 0.47325506806373596, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0335000604391098, "rewards/tag_count_reward": 1.0, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 403.8333435058594, + "completion_length": 508.66668701171875, "epoch": 0.147, - "grad_norm": 2.496885466114844, - "kl": 0.075927734375, + "grad_norm": 2.5279842832232133, + "kl": 0.0618896484375, "learning_rate": 9.939574727231362e-07, - "loss": 0.0108, - "reward": 2.3045772314071655, - "reward_std": 0.34026581048965454, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.659722238779068, - "rewards/repetition_penalty_reward": -0.08431186527013779, + "loss": -0.0023, + "reward": 2.484343409538269, + "reward_std": 0.5071098208427429, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04343440383672714, "rewards/tag_count_reward": 1.0, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 423.22918701171875, + "completion_length": 525.5000305175781, "epoch": 0.148, - "grad_norm": 2.6372859476209873, - "kl": 0.099365234375, + "grad_norm": 2.2840152072866364, + "kl": 0.057373046875, "learning_rate": 9.93698216681727e-07, - "loss": 0.0204, - "reward": 2.3333483934402466, - "reward_std": 0.3280728757381439, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.6875000298023224, - "rewards/repetition_penalty_reward": -0.062485139816999435, + "loss": 0.0172, + "reward": 2.810342311859131, + "reward_std": 0.24958577752113342, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.029935498721897602, "rewards/tag_count_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 375.18751525878906, + "completion_length": 587.7083435058594, "epoch": 0.149, - "grad_norm": 3.0549740814611415, - "kl": 0.090576171875, + "grad_norm": 2.495321625160956, + "kl": 0.055908203125, "learning_rate": 9.934335543175705e-07, - "loss": 0.0589, - "reward": 2.607046604156494, - "reward_std": 0.24691469967365265, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.7222222685813904, - "rewards/repetition_penalty_reward": -0.052675798535346985, - "rewards/tag_count_reward": 1.0, + "loss": 0.0162, + "reward": 2.811280369758606, + "reward_std": 0.40344125032424927, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03941434063017368, + "rewards/tag_count_reward": 0.9895833432674408, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 321.5208435058594, + "completion_length": 634.9166870117188, "epoch": 0.15, - "grad_norm": 3.385179018479211, - "kl": 0.09765625, + "grad_norm": 2.2429641351216834, + "kl": 0.0501708984375, "learning_rate": 9.931634888554935e-07, - "loss": 0.0037, - "reward": 2.412143111228943, - "reward_std": 0.12580449134111404, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.708333432674408, - "rewards/repetition_penalty_reward": -0.04619025066494942, + "loss": 0.0582, + "reward": 2.6869924068450928, + "reward_std": 0.39581531286239624, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.056063249707221985, "rewards/tag_count_reward": 1.0, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 361.06251525878906, + "completion_length": 500.54168701171875, "epoch": 0.151, - "grad_norm": 2.668496225188547, - "kl": 0.105712890625, + "grad_norm": 2.468490484200265, + "kl": 0.0621337890625, "learning_rate": 9.928880235861588e-07, - "loss": 0.0028, - "reward": 2.569979667663574, - "reward_std": 0.266011580824852, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.7500000298023224, - "rewards/repetition_penalty_reward": -0.0550205297768116, + "loss": 0.1059, + "reward": 2.7397245168685913, + "reward_std": 0.32457810640335083, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.03805328160524368, "rewards/tag_count_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 325.8125, + "completion_length": 566.6875, "epoch": 0.152, - "grad_norm": 3.2127804730383382, - "kl": 0.09326171875, + "grad_norm": 2.5723071707401126, + "kl": 0.05517578125, "learning_rate": 9.926071618660237e-07, - "loss": -0.0792, - "reward": 2.437727928161621, - "reward_std": 0.11045113950967789, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.7222222685813904, - "rewards/repetition_penalty_reward": -0.05532771721482277, + "loss": 0.0205, + "reward": 2.7304086685180664, + "reward_std": 0.34727388620376587, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.04042468871921301, "rewards/tag_count_reward": 1.0, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 419.43751525878906, + "completion_length": 591.1875, "epoch": 0.153, - "grad_norm": 2.5156223065017156, - "kl": 0.08544921875, + "grad_norm": 2.2620129683268595, + "kl": 0.058349609375, "learning_rate": 9.923209071172994e-07, - "loss": 0.0392, - "reward": 2.3198297023773193, - "reward_std": 0.21060329675674438, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.7083333432674408, - "rewards/repetition_penalty_reward": -0.07600365206599236, + "loss": 0.0377, + "reward": 2.5389167070388794, + "reward_std": 0.3387536555528641, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.044416630640625954, "rewards/tag_count_reward": 1.0, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 324.4166717529297, + "completion_length": 565.8958435058594, "epoch": 0.154, - "grad_norm": 2.782084588216173, - "kl": 0.1015625, + "grad_norm": 2.7014878630289547, + "kl": 0.064453125, "learning_rate": 9.9202926282791e-07, - "loss": 0.0103, - "reward": 2.5981401205062866, - "reward_std": 0.2492058426141739, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.7013889253139496, - "rewards/repetition_penalty_reward": -0.040748924016952515, - "rewards/tag_count_reward": 1.0, + "loss": 0.0875, + "reward": 2.7602481842041016, + "reward_std": 0.4075208753347397, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.04357131011784077, + "rewards/tag_count_reward": 0.984375, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 370.50001525878906, + "completion_length": 558.3541870117188, "epoch": 0.155, - "grad_norm": 3.1229310866317, - "kl": 0.104248046875, + "grad_norm": 2.6542156680058486, + "kl": 0.060791015625, "learning_rate": 9.917322325514487e-07, - "loss": 0.191, - "reward": 2.4774017333984375, - "reward_std": 0.21329060196876526, + "loss": 0.0558, + "reward": 2.7091113328933716, + "reward_std": 0.3156071752309799, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.7708333432674408, - "rewards/repetition_penalty_reward": -0.0538483802229166, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03568043000996113, + "rewards/tag_count_reward": 0.9947916865348816, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 305.0833435058594, + "completion_length": 547.8333435058594, "epoch": 0.156, - "grad_norm": 2.961746395349351, - "kl": 0.117919921875, + "grad_norm": 2.6130648886630397, + "kl": 0.0601806640625, "learning_rate": 9.91429819907136e-07, - "loss": -0.0456, - "reward": 2.5482953786849976, - "reward_std": 0.1972944438457489, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.819444477558136, - "rewards/repetition_penalty_reward": -0.04198233038187027, + "loss": -0.002, + "reward": 2.6609745025634766, + "reward_std": 0.2819754481315613, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04041440971195698, "rewards/tag_count_reward": 1.0, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 365.1666717529297, + "completion_length": 535.5208435058594, "epoch": 0.157, - "grad_norm": 2.9042194629218407, - "kl": 0.126220703125, + "grad_norm": 2.4079001126290325, + "kl": 0.064208984375, "learning_rate": 9.911220285797748e-07, - "loss": -0.0288, - "reward": 2.6360727548599243, - "reward_std": 0.2432560846209526, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.8333334028720856, - "rewards/repetition_penalty_reward": -0.05142738111317158, + "loss": 0.0085, + "reward": 2.513691782951355, + "reward_std": 0.3074871450662613, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04880848899483681, "rewards/tag_count_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 379.7708435058594, + "completion_length": 539.7500305175781, "epoch": 0.158, - "grad_norm": 3.1512747019800305, - "kl": 0.10595703125, + "grad_norm": 2.772614283171138, + "kl": 0.0655517578125, "learning_rate": 9.908088623197048e-07, - "loss": 0.1085, - "reward": 2.4349732398986816, - "reward_std": 0.30785757303237915, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.7638888955116272, - "rewards/repetition_penalty_reward": -0.05808231420814991, + "loss": -0.0064, + "reward": 2.664715528488159, + "reward_std": 0.265642948448658, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.029728862456977367, "rewards/tag_count_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 397.2916717529297, + "completion_length": 541.2291870117188, "epoch": 0.159, - "grad_norm": 2.651710357131677, - "kl": 0.114990234375, + "grad_norm": 2.4163026682291426, + "kl": 0.0654296875, "learning_rate": 9.904903249427582e-07, - "loss": 0.0932, - "reward": 2.543499231338501, - "reward_std": 0.33671319484710693, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.7986111342906952, - "rewards/repetition_penalty_reward": -0.057195279747247696, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0569, + "reward": 2.6886651515960693, + "reward_std": 0.3855375796556473, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03355725575238466, + "rewards/tag_count_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 326.31251525878906, + "completion_length": 576.7916870117188, "epoch": 0.16, - "grad_norm": 2.7278044148272107, - "kl": 0.14111328125, + "grad_norm": 2.4138081235417306, + "kl": 0.06298828125, "learning_rate": 9.901664203302124e-07, - "loss": -0.0182, - "reward": 2.6059353351593018, - "reward_std": 0.15745962411165237, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9027778506278992, - "rewards/repetition_penalty_reward": -0.04684258997440338, - "rewards/tag_count_reward": 1.0, + "loss": 0.0256, + "reward": 2.8020124435424805, + "reward_std": 0.28147071599960327, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03305710572749376, + "rewards/tag_count_reward": 0.9947916865348816, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 343.4375, + "completion_length": 538.8125, "epoch": 0.161, - "grad_norm": 3.5086869532321567, - "kl": 0.125, + "grad_norm": 2.3732077198637116, + "kl": 0.0616455078125, "learning_rate": 9.89837152428743e-07, - "loss": 0.0497, - "reward": 2.6332504749298096, - "reward_std": 0.28190523386001587, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.86111119389534, - "rewards/repetition_penalty_reward": -0.04036063142120838, - "rewards/tag_count_reward": 1.0, + "loss": -0.0005, + "reward": 2.4948108196258545, + "reward_std": 0.3342844396829605, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04338358715176582, + "rewards/tag_count_reward": 0.9895833432674408, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 380.50001525878906, + "completion_length": 593.4166870117188, "epoch": 0.162, - "grad_norm": 2.5122864468777766, - "kl": 0.134765625, + "grad_norm": 2.1912046412336528, + "kl": 0.06591796875, "learning_rate": 9.895025252503755e-07, - "loss": 0.06, - "reward": 2.426963210105896, - "reward_std": 0.18724749609827995, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9444444477558136, - "rewards/repetition_penalty_reward": -0.05914814583957195, - "rewards/tag_count_reward": 1.0, + "loss": -0.0436, + "reward": 2.7147765159606934, + "reward_std": 0.40104810893535614, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05084844306111336, + "rewards/tag_count_reward": 0.9947916865348816, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 390.75001525878906, + "completion_length": 604.3750305175781, "epoch": 0.163, - "grad_norm": 2.5444915792264626, - "kl": 0.1181640625, + "grad_norm": 2.276675908100595, + "kl": 0.06494140625, "learning_rate": 9.891625428724364e-07, - "loss": 0.0081, - "reward": 2.4827873706817627, - "reward_std": 0.38760950416326523, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.8611111044883728, - "rewards/repetition_penalty_reward": -0.05540710873901844, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0477, + "reward": 2.567950963973999, + "reward_std": 0.3306031674146652, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.036215804517269135, + "rewards/tag_count_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 374.0416717529297, + "completion_length": 522.5416870117188, "epoch": 0.164, - "grad_norm": 2.6501523297554415, - "kl": 0.136962890625, + "grad_norm": 3.181028204076106, + "kl": 0.071044921875, "learning_rate": 9.888172094375033e-07, - "loss": 0.0605, - "reward": 2.673776149749756, - "reward_std": 0.22138488292694092, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.8541666567325592, - "rewards/repetition_penalty_reward": -0.07101555913686752, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1641, + "reward": 2.7617541551589966, + "reward_std": 0.19565748795866966, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02991252951323986, + "rewards/tag_count_reward": 1.0, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 351.62501525878906, + "completion_length": 604.3333435058594, "epoch": 0.165, - "grad_norm": 3.070808836747664, - "kl": 0.1533203125, + "grad_norm": 2.3335259823634105, + "kl": 0.06494140625, "learning_rate": 9.88466529153356e-07, - "loss": 0.0308, - "reward": 2.7332472801208496, - "reward_std": 0.16363364458084106, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.04453066922724247, - "rewards/tag_count_reward": 1.0, + "loss": -0.1023, + "reward": 2.599223256111145, + "reward_std": 0.4700702428817749, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.055290715768933296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 326.8958435058594, + "completion_length": 583.4791870117188, "epoch": 0.166, - "grad_norm": 2.978929616260445, - "kl": 0.1455078125, + "grad_norm": 2.4208829670581853, + "kl": 0.070556640625, "learning_rate": 9.881105062929221e-07, - "loss": -0.0148, - "reward": 2.6116230487823486, - "reward_std": 0.2165299728512764, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.8958334028720856, - "rewards/repetition_penalty_reward": -0.03421040251851082, - "rewards/tag_count_reward": 1.0, + "loss": 0.0434, + "reward": 2.332135558128357, + "reward_std": 0.36515676975250244, + "rewards/accuracy_reward": 0.3958333544433117, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.039392316713929176, + "rewards/tag_count_reward": 0.9895833730697632, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 376.8333435058594, + "completion_length": 630.8125, "epoch": 0.167, - "grad_norm": 2.6780375221325086, - "kl": 0.13818359375, + "grad_norm": 2.4360050121887475, + "kl": 0.06396484375, "learning_rate": 9.877491451942284e-07, - "loss": 0.095, - "reward": 2.518862009048462, - "reward_std": 0.42348942160606384, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9097222685813904, - "rewards/repetition_penalty_reward": -0.05231846868991852, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0026, + "reward": 2.669437289237976, + "reward_std": 0.23492664098739624, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.03889598324894905, + "rewards/tag_count_reward": 1.0, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 361.31251525878906, + "completion_length": 653.8750305175781, "epoch": 0.168, - "grad_norm": 3.1072073372250872, - "kl": 0.15380859375, + "grad_norm": 2.299477713264774, + "kl": 0.066162109375, "learning_rate": 9.873824502603459e-07, - "loss": 0.1231, - "reward": 2.8115198612213135, - "reward_std": 0.24008309096097946, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.06348029710352421, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0313, + "reward": 2.6158487796783447, + "reward_std": 0.346173420548439, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04387335851788521, + "rewards/tag_count_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 456.1041717529297, + "completion_length": 625.8541870117188, "epoch": 0.169, - "grad_norm": 3.3606151437586145, - "kl": 0.1572265625, + "grad_norm": 2.4011318956925503, + "kl": 0.0640869140625, "learning_rate": 9.870104259593362e-07, - "loss": 0.2551, - "reward": 2.5517321825027466, - "reward_std": 0.3905438333749771, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.07153183594346046, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0797, + "reward": 2.6212562322616577, + "reward_std": 0.1528831347823143, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.034993914887309074, + "rewards/tag_count_reward": 0.9895833432674408, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 493.50001525878906, + "completion_length": 579.7083435058594, "epoch": 0.17, - "grad_norm": 6.376932604167781, - "kl": 0.1748046875, + "grad_norm": 2.5134518850907948, + "kl": 0.069580078125, "learning_rate": 9.866330768241983e-07, - "loss": 0.4857, - "reward": 2.6348665952682495, - "reward_std": 0.35951171815395355, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.08561956509947777, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0129, + "reward": 2.715871572494507, + "reward_std": 0.325920432806015, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03760070167481899, + "rewards/tag_count_reward": 0.9895833730697632, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 536.0833587646484, + "completion_length": 633.0625305175781, "epoch": 0.171, - "grad_norm": 4.335741164998474, - "kl": 0.1796875, + "grad_norm": 2.4224007332086424, + "kl": 0.0693359375, "learning_rate": 9.862504074528126e-07, - "loss": 0.4908, - "reward": 2.537351965904236, - "reward_std": 0.31542840600013733, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0841759443283081, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0557, + "reward": 2.5395129919052124, + "reward_std": 0.4677208960056305, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04382044076919556, + "rewards/tag_count_reward": 1.0, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 485.8333435058594, + "completion_length": 621.0416870117188, "epoch": 0.172, - "grad_norm": 2.6021267445345644, - "kl": 0.141845703125, + "grad_norm": 2.1277905450282595, + "kl": 0.0667724609375, "learning_rate": 9.85862422507884e-07, - "loss": 0.2001, - "reward": 2.707563281059265, - "reward_std": 0.3819567710161209, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.06847843155264854, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0193, + "reward": 2.8295018672943115, + "reward_std": 0.13751935493201017, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04549824073910713, + "rewards/tag_count_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 387.0208435058594, + "completion_length": 648.7916870117188, "epoch": 0.173, - "grad_norm": 2.6629217396132554, - "kl": 0.1865234375, + "grad_norm": 2.324532047104547, + "kl": 0.069091796875, "learning_rate": 9.854691267168871e-07, - "loss": -0.0179, - "reward": 2.554840326309204, - "reward_std": 0.35031288862228394, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9375, - "rewards/repetition_penalty_reward": -0.04932638257741928, - "rewards/tag_count_reward": 1.0, + "loss": 0.1049, + "reward": 2.722783327102661, + "reward_std": 0.41735316812992096, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0376333836466074, + "rewards/tag_count_reward": 0.9895833432674408, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 460.25001525878906, + "completion_length": 671.7083435058594, "epoch": 0.174, - "grad_norm": 2.538846941500733, - "kl": 0.16943359375, + "grad_norm": 1.9577431816288606, + "kl": 0.066650390625, "learning_rate": 9.850705248720068e-07, - "loss": 0.0751, - "reward": 2.6158581972122192, - "reward_std": 0.37689241766929626, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.06469768099486828, + "loss": 0.0282, + "reward": 2.7458845376968384, + "reward_std": 0.3250259757041931, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04578226990997791, "rewards/tag_count_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 437.8958435058594, + "completion_length": 700.7916870117188, "epoch": 0.175, - "grad_norm": 3.003203808605003, - "kl": 0.1552734375, + "grad_norm": 1.8987257817841927, + "kl": 0.0645751953125, "learning_rate": 9.846666218300807e-07, - "loss": -0.0156, - "reward": 2.4194644689559937, - "reward_std": 0.3525353968143463, - "rewards/accuracy_reward": 0.541666679084301, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.07011887058615685, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0104, + "reward": 2.6111772060394287, + "reward_std": 0.39906148612499237, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.06243372894823551, + "rewards/tag_count_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 416.3333435058594, + "completion_length": 628.1041870117188, "epoch": 0.176, - "grad_norm": 2.95409892177437, - "kl": 0.134521484375, + "grad_norm": 2.404386166811554, + "kl": 0.071533203125, "learning_rate": 9.8425742251254e-07, - "loss": -0.03, - "reward": 2.22793447971344, - "reward_std": 0.32642531394958496, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/reasoning_steps_reward": 0.8680556118488312, - "rewards/repetition_penalty_reward": -0.05157955177128315, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0773, + "reward": 2.6240488290786743, + "reward_std": 0.37613917887210846, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.06692333333194256, + "rewards/tag_count_reward": 0.9895833432674408, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 360.79168701171875, + "completion_length": 587.6458435058594, "epoch": 0.177, - "grad_norm": 2.662376544209673, - "kl": 0.18212890625, + "grad_norm": 2.4043287387592125, + "kl": 0.075439453125, "learning_rate": 9.838429319053495e-07, - "loss": 0.0148, - "reward": 2.8403353691101074, - "reward_std": 0.21480560302734375, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.03813690226525068, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0096, + "reward": 2.5300588607788086, + "reward_std": 0.3319057375192642, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04112182557582855, + "rewards/tag_count_reward": 0.9947916865348816, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 316.5833435058594, + "completion_length": 698.8125305175781, "epoch": 0.178, - "grad_norm": 3.24184793690645, - "kl": 0.18896484375, + "grad_norm": 2.028048720044857, + "kl": 0.072265625, "learning_rate": 9.83423155058946e-07, - "loss": 0.0788, - "reward": 2.916623115539551, - "reward_std": 0.16335924714803696, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.02782137133181095, + "loss": -0.0443, + "reward": 2.4296261072158813, + "reward_std": 0.34993553161621094, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04954059235751629, "rewards/tag_count_reward": 1.0, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 337.4791717529297, + "completion_length": 643.4166870117188, "epoch": 0.179, - "grad_norm": 2.6431662790726045, - "kl": 0.19287109375, + "grad_norm": 2.5526900934607544, + "kl": 0.07763671875, "learning_rate": 9.829980970881784e-07, - "loss": -0.0306, - "reward": 2.447614073753357, - "reward_std": 0.26418814063072205, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.038497064262628555, - "rewards/tag_count_reward": 1.0, + "loss": 0.1619, + "reward": 2.847709059715271, + "reward_std": 0.24474234879016876, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04291607812047005, + "rewards/tag_count_reward": 0.9739583432674408, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 349.0, + "completion_length": 617.2291870117188, "epoch": 0.18, - "grad_norm": 2.4268967639792405, - "kl": 0.201171875, + "grad_norm": 1.9749460293522159, + "kl": 0.082275390625, "learning_rate": 9.825677631722435e-07, - "loss": 0.0424, - "reward": 2.659801959991455, - "reward_std": 0.12255865335464478, + "loss": 0.0669, + "reward": 2.691727042198181, + "reward_std": 0.3086921200156212, "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.041587039828300476, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03743956796824932, "rewards/tag_count_reward": 1.0, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 385.625, + "completion_length": 619.3541870117188, "epoch": 0.181, - "grad_norm": 2.7474043855560493, - "kl": 0.1865234375, + "grad_norm": 2.115751260793685, + "kl": 0.0771484375, "learning_rate": 9.821321585546243e-07, - "loss": 0.0351, - "reward": 2.642500877380371, - "reward_std": 0.27992890775203705, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.06062416732311249, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0106, + "reward": 2.788450598716736, + "reward_std": 0.2660830020904541, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.05529944226145744, + "rewards/tag_count_reward": 0.9895833432674408, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 528.4166870117188, + "completion_length": 631.7500305175781, "epoch": 0.182, - "grad_norm": 2.5451366053177837, - "kl": 0.177734375, + "grad_norm": 2.195356667493195, + "kl": 0.084716796875, "learning_rate": 9.816912885430258e-07, - "loss": 0.2003, - "reward": 2.7395867109298706, - "reward_std": 0.4115413725376129, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0902744010090828, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0269, + "reward": 2.6126253604888916, + "reward_std": 0.32306814193725586, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.054041286930441856, + "rewards/tag_count_reward": 1.0, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 394.8541717529297, + "completion_length": 634.8125305175781, "epoch": 0.183, - "grad_norm": 2.3148058104091964, - "kl": 0.19580078125, + "grad_norm": 2.1090909584715587, + "kl": 0.0791015625, "learning_rate": 9.812451585093098e-07, - "loss": -0.0157, - "reward": 2.751864433288574, - "reward_std": 0.36297309398651123, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.04674675129354, + "loss": 0.0252, + "reward": 2.7393864393234253, + "reward_std": 0.3596974164247513, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.052280182018876076, "rewards/tag_count_reward": 1.0, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 357.54168701171875, + "completion_length": 661.7083435058594, "epoch": 0.184, - "grad_norm": 2.6255777725667757, - "kl": 0.21533203125, + "grad_norm": 2.077351011146557, + "kl": 0.0771484375, "learning_rate": 9.807937738894303e-07, - "loss": 0.0929, - "reward": 2.7035133838653564, - "reward_std": 0.15313967317342758, - "rewards/accuracy_reward": 0.7500000298023224, + "loss": 0.0566, + "reward": 2.869002103805542, + "reward_std": 0.24853621423244476, + "rewards/accuracy_reward": 0.9166666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04648665338754654, + "rewards/repetition_penalty_reward": -0.04766450449824333, "rewards/tag_count_reward": 1.0, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 584.0416717529297, + "completion_length": 603.0208435058594, "epoch": 0.185, - "grad_norm": 2.37177858580137, - "kl": 0.17236328125, + "grad_norm": 2.1637455121074223, + "kl": 0.087158203125, "learning_rate": 9.80337140183366e-07, - "loss": 0.1587, - "reward": 2.703560948371887, - "reward_std": 0.33490853011608124, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.07248069904744625, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0752, + "reward": 2.8117637634277344, + "reward_std": 0.24315628595650196, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.035458519123494625, + "rewards/tag_count_reward": 1.0, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 408.93751525878906, + "completion_length": 583.1458435058594, "epoch": 0.186, - "grad_norm": 2.6431314031134963, - "kl": 0.1875, + "grad_norm": 2.275135225584201, + "kl": 0.09326171875, "learning_rate": 9.798752629550546e-07, - "loss": 0.0391, - "reward": 2.509796619415283, - "reward_std": 0.30193691700696945, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.050967441871762276, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0655, + "reward": 2.8143444061279297, + "reward_std": 0.21083202213048935, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.039822228252887726, + "rewards/tag_count_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 399.3333435058594, + "completion_length": 583.2916870117188, "epoch": 0.187, - "grad_norm": 2.5674308961884162, - "kl": 0.1953125, + "grad_norm": 2.233756675170646, + "kl": 0.089599609375, "learning_rate": 9.794081478323245e-07, - "loss": -0.0258, - "reward": 2.6311999559402466, - "reward_std": 0.41848231852054596, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.049355631694197655, + "loss": 0.0115, + "reward": 2.55490505695343, + "reward_std": 0.20559479296207428, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04231713339686394, "rewards/tag_count_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 353.66668701171875, + "completion_length": 577.3958435058594, "epoch": 0.188, - "grad_norm": 2.605549752312026, - "kl": 0.1943359375, + "grad_norm": 2.312702195384097, + "kl": 0.09033203125, "learning_rate": 9.78935800506826e-07, - "loss": 0.0815, - "reward": 2.3135520219802856, - "reward_std": 0.22824274003505707, - "rewards/accuracy_reward": 0.3750000149011612, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04755915887653828, + "loss": 0.0262, + "reward": 2.8108856678009033, + "reward_std": 0.29880291223526, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04328102432191372, "rewards/tag_count_reward": 1.0, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 351.7083435058594, + "completion_length": 621.6458435058594, "epoch": 0.189, - "grad_norm": 2.9652335128549416, - "kl": 0.22412109375, + "grad_norm": 2.450467067468715, + "kl": 0.095458984375, "learning_rate": 9.784582267339622e-07, - "loss": 0.1421, - "reward": 2.709948182106018, - "reward_std": 0.15605415403842926, - "rewards/accuracy_reward": 0.7500000298023224, + "loss": 0.1076, + "reward": 2.691537618637085, + "reward_std": 0.46115170419216156, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04005194641649723, + "rewards/repetition_penalty_reward": -0.03762921877205372, "rewards/tag_count_reward": 1.0, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 418.5833435058594, + "completion_length": 552.2291870117188, "epoch": 0.19, - "grad_norm": 2.652354660767287, - "kl": 0.2041015625, + "grad_norm": 2.33663470812737, + "kl": 0.096923828125, "learning_rate": 9.779754323328192e-07, - "loss": 0.0066, - "reward": 2.211930513381958, - "reward_std": 0.2278308868408203, - "rewards/accuracy_reward": 0.3125, - "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.0589029211550951, + "loss": 0.0829, + "reward": 2.556964159011841, + "reward_std": 0.2390810027718544, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02636928204447031, "rewards/tag_count_reward": 1.0, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 342.66668701171875, + "completion_length": 589.0416870117188, "epoch": 0.191, - "grad_norm": 2.771903462610168, - "kl": 0.2392578125, + "grad_norm": 2.145945974879737, + "kl": 0.094482421875, "learning_rate": 9.774874231860935e-07, - "loss": 0.0982, - "reward": 2.9450162649154663, - "reward_std": 0.08517741318792105, - "rewards/accuracy_reward": 0.9791666865348816, + "loss": -0.0278, + "reward": 2.621474266052246, + "reward_std": 0.42711225152015686, + "rewards/accuracy_reward": 0.6666666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03415041510015726, + "rewards/repetition_penalty_reward": -0.04519243072718382, "rewards/tag_count_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 453.8125, + "completion_length": 539.6875305175781, "epoch": 0.192, - "grad_norm": 2.394349630579676, - "kl": 0.1796875, + "grad_norm": 2.147974112089912, + "kl": 0.090087890625, "learning_rate": 9.769942052400235e-07, - "loss": 0.0938, - "reward": 2.169591546058655, - "reward_std": 0.3903626799583435, - "rewards/accuracy_reward": 0.3333333432674408, - "rewards/reasoning_steps_reward": 0.92361119389534, - "rewards/repetition_penalty_reward": -0.066519720479846, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": -0.0237, + "reward": 2.76838481426239, + "reward_std": 0.310782328248024, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.04411514103412628, + "rewards/tag_count_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 426.00001525878906, + "completion_length": 525.2916870117188, "epoch": 0.193, - "grad_norm": 5.6685150390503205, - "kl": 0.24658203125, + "grad_norm": 2.1483746204852827, + "kl": 0.096923828125, "learning_rate": 9.764957845043135e-07, - "loss": 0.1891, - "reward": 2.8668434619903564, - "reward_std": 0.21791008114814758, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.061975859105587006, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0031, + "reward": 2.842332124710083, + "reward_std": 0.13549592159688473, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.032667966559529305, + "rewards/tag_count_reward": 1.0, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 338.6875, + "completion_length": 595.1875305175781, "epoch": 0.194, - "grad_norm": 2.708847901330575, - "kl": 0.22607421875, + "grad_norm": 1.981195616724217, + "kl": 0.09228515625, "learning_rate": 9.759921670520634e-07, - "loss": 0.0441, - "reward": 2.7790828943252563, - "reward_std": 0.2629942800849676, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.03688951954245567, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0784, + "reward": 2.706938862800598, + "reward_std": 0.286833293735981, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03611667454242706, + "rewards/tag_count_reward": 1.0, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 374.1041717529297, + "completion_length": 576.2708587646484, "epoch": 0.195, - "grad_norm": 2.5160147798100856, - "kl": 0.23193359375, + "grad_norm": 2.648586034553415, + "kl": 0.09375, "learning_rate": 9.754833590196926e-07, - "loss": 0.0775, - "reward": 2.6829413175582886, - "reward_std": 0.08678778074681759, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.04622538760304451, + "loss": 0.198, + "reward": 2.8884334564208984, + "reward_std": 0.24114538729190826, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.035177784971892834, "rewards/tag_count_reward": 1.0, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 423.5, + "completion_length": 593.3750305175781, "epoch": 0.196, - "grad_norm": 2.928972776987841, - "kl": 0.21435546875, + "grad_norm": 2.403156236141252, + "kl": 0.10107421875, "learning_rate": 9.749693666068663e-07, - "loss": 0.1779, - "reward": 2.688473343849182, - "reward_std": 0.22662825137376785, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.040693264454603195, + "loss": -0.0295, + "reward": 2.310402512550354, + "reward_std": 0.2717669606208801, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03681975603103638, "rewards/tag_count_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 432.25001525878906, + "completion_length": 466.60418701171875, "epoch": 0.197, - "grad_norm": 3.8972392977198593, - "kl": 0.23046875, + "grad_norm": 5.530819309602341, + "kl": 0.12060546875, "learning_rate": 9.744501960764203e-07, - "loss": 0.3185, - "reward": 2.8815797567367554, - "reward_std": 0.19053060561418533, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.06112846918404102, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0515, + "reward": 2.7814574241638184, + "reward_std": 0.3644670993089676, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02062598057091236, + "rewards/tag_count_reward": 0.9895833432674408, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 356.56251525878906, + "completion_length": 528.1666870117188, "epoch": 0.198, - "grad_norm": 2.689800196992468, - "kl": 0.23388671875, + "grad_norm": 2.484071909086178, + "kl": 0.1015625, "learning_rate": 9.739258537542835e-07, - "loss": 0.0009, - "reward": 2.6005579233169556, - "reward_std": 0.23926106840372086, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.04701168276369572, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0689, + "reward": 2.845056176185608, + "reward_std": 0.29101284593343735, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0229995958507061, + "rewards/tag_count_reward": 1.0, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 391.4791717529297, + "completion_length": 472.9583435058594, "epoch": 0.199, - "grad_norm": 2.602749454178963, - "kl": 0.22216796875, + "grad_norm": 2.111890817265467, + "kl": 0.10791015625, "learning_rate": 9.733963460294015e-07, - "loss": 0.105, - "reward": 2.6414283514022827, - "reward_std": 0.17605525813996792, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04780779778957367, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0139, + "reward": 2.8303216695785522, + "reward_std": 0.34869830310344696, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.034261688590049744, + "rewards/tag_count_reward": 0.9895833432674408, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 500.1250305175781, + "completion_length": 515.3333587646484, "epoch": 0.2, - "grad_norm": 2.402492232810374, - "kl": 0.2265625, + "grad_norm": 2.557342178020337, + "kl": 0.1005859375, "learning_rate": 9.728616793536587e-07, - "loss": 0.1604, - "reward": 2.5097321271896362, - "reward_std": 0.29343516379594803, - "rewards/accuracy_reward": 0.6041666865348816, + "loss": -0.0301, + "reward": 2.5907087326049805, + "reward_std": 0.3476664125919342, + "rewards/accuracy_reward": 0.625, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.045823450200259686, - "rewards/tag_count_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.027346878312528133, + "rewards/tag_count_reward": 1.0, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 461.12501525878906, + "completion_length": 531.8333435058594, "epoch": 0.201, - "grad_norm": 2.005525658148089, - "kl": 0.21484375, + "grad_norm": 2.699850984290719, + "kl": 0.107177734375, "learning_rate": 9.723218602418e-07, - "loss": -0.0028, - "reward": 2.548290967941284, - "reward_std": 0.30597639828920364, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.05587571673095226, + "loss": 0.026, + "reward": 2.548583984375, + "reward_std": 0.4529786705970764, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.027805176563560963, "rewards/tag_count_reward": 1.0, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 409.5416717529297, + "completion_length": 506.7708435058594, "epoch": 0.202, - "grad_norm": 2.3136005995586535, - "kl": 0.248046875, + "grad_norm": 2.3216160000213297, + "kl": 0.088623046875, "learning_rate": 9.717768952713511e-07, - "loss": -0.004, - "reward": 2.7236886024475098, - "reward_std": 0.241857148706913, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.04714471846818924, - "rewards/tag_count_reward": 1.0, + "loss": -0.0301, + "reward": 2.6107916831970215, + "reward_std": 0.3744069039821625, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.03851393796503544, + "rewards/tag_count_reward": 0.9895833432674408, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 410.0416717529297, + "completion_length": 530.9375152587891, "epoch": 0.203, - "grad_norm": 2.603770633632376, - "kl": 0.24853515625, + "grad_norm": 3.0552289765907665, + "kl": 0.105712890625, "learning_rate": 9.71226791082538e-07, - "loss": 0.0012, - "reward": 2.5697896480560303, - "reward_std": 0.29889051616191864, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.04132155515253544, - "rewards/tag_count_reward": 1.0, + "loss": 0.0276, + "reward": 2.899757981300354, + "reward_std": 0.19229509681463242, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.032533735036849976, + "rewards/tag_count_reward": 0.9947916865348816, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 343.9791717529297, + "completion_length": 613.8333435058594, "epoch": 0.204, - "grad_norm": 2.2680333100336147, - "kl": 0.232421875, + "grad_norm": 2.25614862930677, + "kl": 0.094482421875, "learning_rate": 9.706715543782064e-07, - "loss": 0.032, - "reward": 2.832016110420227, - "reward_std": 0.2716123014688492, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.03603947255760431, - "rewards/tag_count_reward": 1.0, + "loss": -0.0024, + "reward": 2.8068939447402954, + "reward_std": 0.3218880593776703, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.0438005393370986, + "rewards/tag_count_reward": 0.9895833432674408, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 425.5833435058594, + "completion_length": 548.3333740234375, "epoch": 0.205, - "grad_norm": 2.7075607767447805, - "kl": 0.23095703125, + "grad_norm": 2.1739212203171405, + "kl": 0.100830078125, "learning_rate": 9.701111919237408e-07, - "loss": 0.0586, - "reward": 2.690392851829529, - "reward_std": 0.2373129427433014, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.944444477558136, - "rewards/repetition_penalty_reward": -0.04571827873587608, + "loss": -0.0353, + "reward": 2.628522038459778, + "reward_std": 0.37829773128032684, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.03120046854019165, "rewards/tag_count_reward": 1.0, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 496.8958435058594, + "completion_length": 575.6666870117188, "epoch": 0.206, - "grad_norm": 2.6712887661409974, - "kl": 0.2431640625, + "grad_norm": 2.142090873135409, + "kl": 0.081298828125, "learning_rate": 9.695457105469804e-07, - "loss": 0.2061, - "reward": 2.5488892793655396, - "reward_std": 0.3880208730697632, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04659699276089668, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0283, + "reward": 2.6884310245513916, + "reward_std": 0.1302720569074154, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.042471904307603836, + "rewards/tag_count_reward": 0.9947916865348816, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 610.2083435058594, + "completion_length": 537.5833587646484, "epoch": 0.207, - "grad_norm": 2.574506336632449, - "kl": 0.232421875, + "grad_norm": 2.5678922668456563, + "kl": 0.092529296875, "learning_rate": 9.689751171381377e-07, - "loss": 0.2071, - "reward": 2.3815261125564575, - "reward_std": 0.47317539155483246, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9374999701976776, - "rewards/repetition_penalty_reward": -0.06639067735522985, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0451, + "reward": 2.6688740253448486, + "reward_std": 0.40736155211925507, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.025570510886609554, + "rewards/tag_count_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 633.8125152587891, + "completion_length": 585.875, "epoch": 0.208, - "grad_norm": 2.4837133322051357, - "kl": 0.2294921875, + "grad_norm": 2.5924187225102355, + "kl": 0.098388671875, "learning_rate": 9.683994186497132e-07, - "loss": 0.2463, - "reward": 2.661625385284424, - "reward_std": 0.4536067247390747, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.06580524891614914, - "rewards/tag_count_reward": 0.921875, + "loss": 0.169, + "reward": 2.3460363149642944, + "reward_std": 0.4289597123861313, + "rewards/accuracy_reward": 0.4375000111758709, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.034172071143984795, + "rewards/tag_count_reward": 0.984375, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 553.5833740234375, + "completion_length": 488.35418701171875, "epoch": 0.209, - "grad_norm": 2.2002075463407844, - "kl": 0.244140625, + "grad_norm": 2.6979608618101913, + "kl": 0.099609375, "learning_rate": 9.67818622096411e-07, - "loss": 0.0584, - "reward": 2.6819825172424316, - "reward_std": 0.4266183227300644, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.057600950822234154, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0553, + "reward": 2.830100178718567, + "reward_std": 0.3162241727113724, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.017122075892984867, + "rewards/tag_count_reward": 0.9791666865348816, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 546.2291870117188, + "completion_length": 535.6041870117188, "epoch": 0.21, - "grad_norm": 2.352500862076168, - "kl": 0.232421875, + "grad_norm": 2.6398456904077494, + "kl": 0.09814453125, "learning_rate": 9.672327345550543e-07, - "loss": 0.1839, - "reward": 2.6210367679595947, - "reward_std": 0.46432720124721527, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.07687987759709358, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0618, + "reward": 2.509260654449463, + "reward_std": 0.28831499069929123, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.023725682869553566, + "rewards/tag_count_reward": 0.984375, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 403.9583435058594, + "completion_length": 599.6041870117188, "epoch": 0.211, - "grad_norm": 2.8855563522295027, - "kl": 0.2646484375, + "grad_norm": 4.517806643959562, + "kl": 0.109130859375, "learning_rate": 9.666417631644976e-07, - "loss": 0.1873, - "reward": 2.6186695098876953, - "reward_std": 0.16608004923909903, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.04799719527363777, - "rewards/tag_count_reward": 1.0, + "loss": 0.0023, + "reward": 2.5767656564712524, + "reward_std": 0.3708747327327728, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03608160652220249, + "rewards/tag_count_reward": 0.9947916865348816, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 438.62501525878906, + "completion_length": 589.4166870117188, "epoch": 0.212, - "grad_norm": 3.0393991802862863, - "kl": 0.2861328125, + "grad_norm": 2.261259945602269, + "kl": 0.10205078125, "learning_rate": 9.66045715125541e-07, - "loss": 0.1309, - "reward": 2.5795925855636597, - "reward_std": 0.17754635773599148, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.03151867352426052, - "rewards/tag_count_reward": 1.0, + "loss": 0.097, + "reward": 2.5739282369613647, + "reward_std": 0.3682664930820465, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.040655218064785004, + "rewards/tag_count_reward": 0.9895833432674408, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 420.87501525878906, + "completion_length": 515.7291870117188, "epoch": 0.213, - "grad_norm": 2.4803650070833454, - "kl": 0.279296875, + "grad_norm": 2.397926932727844, + "kl": 0.1044921875, "learning_rate": 9.654445977008414e-07, - "loss": 0.1117, - "reward": 2.8217967748641968, - "reward_std": 0.27948014438152313, - "rewards/accuracy_reward": 0.8958333432674408, + "loss": 0.0394, + "reward": 2.682453751564026, + "reward_std": 0.4673650562763214, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.035842034965753555, - "rewards/tag_count_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.024143685586750507, + "rewards/tag_count_reward": 0.984375, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 559.3333435058594, + "completion_length": 691.2916870117188, "epoch": 0.214, - "grad_norm": 2.4221808596964913, - "kl": 0.2978515625, + "grad_norm": 8.712060550440611, + "kl": 0.124755859375, "learning_rate": 9.648384182148252e-07, - "loss": 0.1297, - "reward": 2.693021297454834, - "reward_std": 0.5070927739143372, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.05524260923266411, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0044, + "reward": 2.719285011291504, + "reward_std": 0.3266633450984955, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.04807615838944912, + "rewards/tag_count_reward": 0.9895833432674408, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 451.10418701171875, + "completion_length": 609.5416870117188, "epoch": 0.215, - "grad_norm": 3.1920621510823484, - "kl": 0.3349609375, + "grad_norm": 2.215014758114719, + "kl": 0.09326171875, "learning_rate": 9.64227184053598e-07, - "loss": 0.2194, - "reward": 2.557488441467285, - "reward_std": 0.33413204550743103, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03626162186264992, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0437, + "reward": 2.835343837738037, + "reward_std": 0.26147839426994324, + "rewards/accuracy_reward": 0.9375000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.04312858823686838, + "rewards/tag_count_reward": 0.9895833432674408, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 294.18751525878906, + "completion_length": 600.7291870117188, "epoch": 0.216, - "grad_norm": 2.764380110656695, - "kl": 0.3154296875, + "grad_norm": 2.0090732793867403, + "kl": 0.124267578125, "learning_rate": 9.636109026648554e-07, - "loss": 0.0889, - "reward": 2.9289251565933228, - "reward_std": 0.15140239149332047, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015519283711910248, + "loss": -0.091, + "reward": 2.805617928504944, + "reward_std": 0.33824414014816284, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.04160444252192974, "rewards/tag_count_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 488.6041717529297, + "completion_length": 645.2083740234375, "epoch": 0.217, - "grad_norm": 3.3944147591962706, - "kl": 0.3583984375, + "grad_norm": 13.509335743253168, + "kl": 0.146484375, "learning_rate": 9.629895815577915e-07, - "loss": 0.099, - "reward": 2.1621663570404053, - "reward_std": 0.3560364469885826, - "rewards/accuracy_reward": 0.2500000111758709, + "loss": -0.0366, + "reward": 2.7113317251205444, + "reward_std": 0.34692464768886566, + "rewards/accuracy_reward": 0.7708333432674408, "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.042694730684161186, - "rewards/tag_count_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.045612769201397896, + "rewards/tag_count_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 324.79168701171875, + "completion_length": 644.5208435058594, "epoch": 0.218, - "grad_norm": 2.6844696325030903, - "kl": 0.3505859375, + "grad_norm": 2.1194540845072596, + "kl": 0.09033203125, "learning_rate": 9.623632283030077e-07, - "loss": 0.0671, - "reward": 2.567893862724304, - "reward_std": 0.43815483152866364, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.02932846639305353, + "loss": 0.0715, + "reward": 2.900136113166809, + "reward_std": 0.12626729905605316, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.04430842399597168, "rewards/tag_count_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 318.1666717529297, + "completion_length": 621.1041870117188, "epoch": 0.219, - "grad_norm": 3.486069913729495, - "kl": 0.42578125, + "grad_norm": 2.2781852426998395, + "kl": 0.1044921875, "learning_rate": 9.617318505324212e-07, - "loss": 0.0933, - "reward": 2.9045804738998413, - "reward_std": 0.16637740656733513, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.019030763767659664, + "loss": -0.0177, + "reward": 2.647742986679077, + "reward_std": 0.42986422777175903, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0328126884996891, "rewards/tag_count_reward": 1.0, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 373.2083435058594, + "completion_length": 605.4791870117188, "epoch": 0.22, - "grad_norm": 4.3045603064493765, - "kl": 0.4111328125, + "grad_norm": 2.061772640741915, + "kl": 0.096435546875, "learning_rate": 9.610954559391704e-07, - "loss": 0.2256, - "reward": 2.6810883283615112, - "reward_std": 0.3095013499259949, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.032453480176627636, - "rewards/tag_count_reward": 0.984375, + "loss": 0.041, + "reward": 2.9348647594451904, + "reward_std": 0.0986992521211505, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03735771402716637, + "rewards/tag_count_reward": 1.0, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 358.16668701171875, + "completion_length": 548.5416870117188, "epoch": 0.221, - "grad_norm": 2.7616567924045095, - "kl": 0.4755859375, + "grad_norm": 2.174295218510334, + "kl": 0.10888671875, "learning_rate": 9.604540522775227e-07, - "loss": 0.0416, - "reward": 2.660899043083191, - "reward_std": 0.37445715069770813, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.03354557789862156, - "rewards/tag_count_reward": 1.0, + "loss": 0.0495, + "reward": 2.896275758743286, + "reward_std": 0.1799733191728592, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02386310324072838, + "rewards/tag_count_reward": 0.9895833432674408, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 389.0208435058594, + "completion_length": 544.0, "epoch": 0.222, - "grad_norm": 3.8553475873080862, - "kl": 0.4912109375, + "grad_norm": 2.1466512297504834, + "kl": 0.10986328125, "learning_rate": 9.598076473627796e-07, - "loss": -0.0273, - "reward": 2.6129144430160522, - "reward_std": 0.33815373480319977, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.032918866723775864, - "rewards/tag_count_reward": 1.0, + "loss": 0.0578, + "reward": 2.740749478340149, + "reward_std": 0.31613367795944214, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03182023763656616, + "rewards/tag_count_reward": 0.9947916865348816, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 271.6666717529297, + "completion_length": 535.7708435058594, "epoch": 0.223, - "grad_norm": 2.7935737780309817, - "kl": 0.4853515625, + "grad_norm": 2.280033428835585, + "kl": 0.10107421875, "learning_rate": 9.59156249071181e-07, - "loss": 0.0592, - "reward": 2.632933259010315, - "reward_std": 0.1580705177038908, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.019844571594148874, + "loss": 0.0484, + "reward": 2.7902063131332397, + "reward_std": 0.30508676171302795, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03618272300809622, "rewards/tag_count_reward": 1.0, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 426.2708435058594, + "completion_length": 611.3958435058594, "epoch": 0.224, - "grad_norm": 2.9631029028858182, - "kl": 0.50390625, + "grad_norm": 2.78636229997098, + "kl": 0.103515625, "learning_rate": 9.58499865339809e-07, - "loss": 0.0885, - "reward": 2.5158601999282837, - "reward_std": 0.2534521669149399, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03969540446996689, + "loss": -0.0115, + "reward": 2.8599945306777954, + "reward_std": 0.2055322751402855, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03583900257945061, "rewards/tag_count_reward": 1.0, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 281.8541717529297, + "completion_length": 539.375, "epoch": 0.225, - "grad_norm": 2.769972036225527, - "kl": 0.4814453125, + "grad_norm": 2.3136617005859286, + "kl": 0.100341796875, "learning_rate": 9.578385041664925e-07, - "loss": 0.0679, - "reward": 2.679645538330078, - "reward_std": 0.11014634743332863, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.021743278950452805, + "loss": 0.0279, + "reward": 2.5980384349823, + "reward_std": 0.35135801136493683, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.026961631141602993, "rewards/tag_count_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 431.9583435058594, + "completion_length": 599.7291870117188, "epoch": 0.226, - "grad_norm": 4.727320292383479, - "kl": 0.556640625, + "grad_norm": 2.7312479125410287, + "kl": 0.11376953125, "learning_rate": 9.571721736097088e-07, - "loss": 0.1839, - "reward": 2.4493629932403564, - "reward_std": 0.3959372490644455, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.02459544874727726, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": -0.0362, + "reward": 2.461398482322693, + "reward_std": 0.46244145929813385, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024712713435292244, + "rewards/tag_count_reward": 1.0, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 316.4375, + "completion_length": 503.6458435058594, "epoch": 0.227, - "grad_norm": 2.6159714393344804, - "kl": 0.560546875, + "grad_norm": 2.9850488724293416, + "kl": 0.11083984375, "learning_rate": 9.565008817884854e-07, - "loss": 0.0326, - "reward": 2.673804759979248, - "reward_std": 0.15345264226198196, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03279241733253002, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1147, + "reward": 2.619432210922241, + "reward_std": 0.47723488509655, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.028137334622442722, + "rewards/tag_count_reward": 0.9947916865348816, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 320.43751525878906, + "completion_length": 523.9583435058594, "epoch": 0.228, - "grad_norm": 3.5210640917128675, - "kl": 0.560546875, + "grad_norm": 2.171102325634445, + "kl": 0.107421875, "learning_rate": 9.55824636882301e-07, - "loss": 0.0642, - "reward": 2.6521875858306885, - "reward_std": 0.22128037735819817, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.951388955116272, - "rewards/repetition_penalty_reward": -0.03878478333353996, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0038, + "reward": 2.581455111503601, + "reward_std": 0.357488214969635, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.036600593477487564, + "rewards/tag_count_reward": 1.0, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 256.375, + "completion_length": 552.9375152587891, "epoch": 0.229, - "grad_norm": 2.9609259902077696, - "kl": 0.5546875, + "grad_norm": 2.3604731804980457, + "kl": 0.115234375, "learning_rate": 9.55143447130987e-07, - "loss": 0.0886, - "reward": 2.7338963747024536, - "reward_std": 0.014361805282533169, - "rewards/accuracy_reward": 0.75, + "loss": 0.044, + "reward": 2.7417712211608887, + "reward_std": 0.29566267877817154, + "rewards/accuracy_reward": 0.7708333432674408, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016103758476674557, + "rewards/repetition_penalty_reward": -0.02906225249171257, "rewards/tag_count_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 306.9375, + "completion_length": 599.6666870117188, "epoch": 0.23, - "grad_norm": 5.604185835817082, - "kl": 0.611328125, + "grad_norm": 2.247209523128909, + "kl": 0.1123046875, "learning_rate": 9.54457320834625e-07, - "loss": -0.0495, - "reward": 2.731146812438965, - "reward_std": 0.20065782219171524, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.958333432674408, - "rewards/repetition_penalty_reward": -0.018853235989809036, + "loss": 0.0325, + "reward": 2.77982234954834, + "reward_std": 0.38850660622119904, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03267781715840101, "rewards/tag_count_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 298.8333435058594, + "completion_length": 498.91668701171875, "epoch": 0.231, - "grad_norm": 4.202790444366677, - "kl": 0.529296875, + "grad_norm": 2.1665114909176872, + "kl": 0.103515625, "learning_rate": 9.537662663534477e-07, - "loss": 0.117, - "reward": 2.8252217769622803, - "reward_std": 0.25127677619457245, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02373660635203123, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.01, + "reward": 2.9319541454315186, + "reward_std": 0.12954077869653702, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03332359157502651, + "rewards/tag_count_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 263.6666717529297, + "completion_length": 565.1458435058594, "epoch": 0.232, - "grad_norm": 6.068065005931034, - "kl": 0.568359375, + "grad_norm": 2.397445136474842, + "kl": 0.1044921875, "learning_rate": 9.530702921077358e-07, - "loss": -0.063, - "reward": 2.838685154914856, - "reward_std": 0.3905899375677109, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.018953759223222733, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0176, + "reward": 2.7926957607269287, + "reward_std": 0.3122672885656357, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.04063776135444641, + "rewards/tag_count_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 311.00001525878906, + "completion_length": 515.5625152587891, "epoch": 0.233, - "grad_norm": 3.647765888753802, - "kl": 0.60546875, + "grad_norm": 2.3670675203551528, + "kl": 0.113525390625, "learning_rate": 9.523694065777156e-07, - "loss": 0.0296, - "reward": 2.2541710138320923, - "reward_std": 0.18214759789407253, - "rewards/accuracy_reward": 0.3125, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.023606806993484497, + "loss": 0.0494, + "reward": 2.6802080869674683, + "reward_std": 0.28352178633213043, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02812536619603634, "rewards/tag_count_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 339.16668701171875, + "completion_length": 524.875, "epoch": 0.234, - "grad_norm": 5.12106854551638, - "kl": 0.5107421875, + "grad_norm": 2.1646989735165527, + "kl": 0.121826171875, "learning_rate": 9.516636183034564e-07, - "loss": -0.0535, - "reward": 2.629412055015564, - "reward_std": 0.29604433476924896, - "rewards/accuracy_reward": 0.6666666865348816, + "loss": 0.0408, + "reward": 2.791554570198059, + "reward_std": 0.38019663095474243, + "rewards/accuracy_reward": 0.8541666865348816, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.023365740664303303, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03309844061732292, + "rewards/tag_count_reward": 0.984375, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 278.5416717529297, + "completion_length": 560.0208435058594, "epoch": 0.235, - "grad_norm": 4.294377229443609, - "kl": 0.4228515625, + "grad_norm": 2.2756987535611404, + "kl": 0.10693359375, "learning_rate": 9.509529358847654e-07, - "loss": -0.0316, - "reward": 2.7256810665130615, - "reward_std": 0.2235753796994686, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.024319014512002468, + "loss": 0.0085, + "reward": 2.8194351196289062, + "reward_std": 0.3406111150979996, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03473140113055706, "rewards/tag_count_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 289.6041717529297, + "completion_length": 575.375, "epoch": 0.236, - "grad_norm": 3.964097993133612, - "kl": 0.396484375, + "grad_norm": 2.174744939943205, + "kl": 0.100830078125, "learning_rate": 9.502373679810839e-07, - "loss": -0.0157, - "reward": 2.489555239677429, - "reward_std": 0.10523704253137112, - "rewards/accuracy_reward": 0.520833333954215, + "loss": -0.0023, + "reward": 2.7065417766571045, + "reward_std": 0.15310228383168578, + "rewards/accuracy_reward": 0.75, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01912526786327362, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.03651383891701698, + "rewards/tag_count_reward": 1.0, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 254.02083587646484, + "completion_length": 484.9583435058594, "epoch": 0.237, - "grad_norm": 2.774435745085441, - "kl": 0.3544921875, + "grad_norm": 2.6379983576818242, + "kl": 0.119140625, "learning_rate": 9.495169233113806e-07, - "loss": 0.066, - "reward": 2.7271549701690674, - "reward_std": 0.029675951227545738, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015900670550763607, + "loss": 0.0566, + "reward": 2.538248300552368, + "reward_std": 0.3112593740224838, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.031196235679090023, "rewards/tag_count_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 323.50000762939453, + "completion_length": 565.7916870117188, "epoch": 0.238, - "grad_norm": 2.766337642883494, - "kl": 0.3388671875, + "grad_norm": 2.052140048933374, + "kl": 0.114990234375, "learning_rate": 9.487916106540465e-07, - "loss": 0.049, - "reward": 2.8437864780426025, - "reward_std": 0.24735151696950197, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.024269170127809048, + "loss": -0.0027, + "reward": 2.778996706008911, + "reward_std": 0.32464583218097687, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03350352682173252, "rewards/tag_count_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 263.81251525878906, + "completion_length": 507.2291717529297, "epoch": 0.239, - "grad_norm": 3.146721984690368, - "kl": 0.361328125, + "grad_norm": 2.3870284690729915, + "kl": 0.103515625, "learning_rate": 9.480614388467877e-07, - "loss": -0.0076, - "reward": 2.551085948944092, - "reward_std": 0.17887626215815544, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.018358652479946613, + "loss": 0.0266, + "reward": 2.865510582923889, + "reward_std": 0.2045399323105812, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.037267398089170456, "rewards/tag_count_reward": 1.0, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 294.7291717529297, + "completion_length": 473.0833435058594, "epoch": 0.24, - "grad_norm": 2.823111206302459, - "kl": 0.3212890625, + "grad_norm": 2.340901673290699, + "kl": 0.120849609375, "learning_rate": 9.473264167865171e-07, - "loss": 0.0229, - "reward": 2.902875542640686, - "reward_std": 0.16621639393270016, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.020735724829137325, + "loss": -0.0045, + "reward": 2.849257707595825, + "reward_std": 0.26365862786769867, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.018797898665070534, "rewards/tag_count_reward": 1.0, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 242.02084350585938, + "completion_length": 502.3125, "epoch": 0.241, - "grad_norm": 2.768047531401508, - "kl": 0.35546875, + "grad_norm": 1.9771145576165876, + "kl": 0.10791015625, "learning_rate": 9.465865534292464e-07, - "loss": 0.0321, - "reward": 2.9354735612869263, - "reward_std": 0.17485684901475906, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.015915566124022007, + "loss": 0.0245, + "reward": 2.665071487426758, + "reward_std": 0.25069355964660645, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.036317549645900726, "rewards/tag_count_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 255.9791717529297, + "completion_length": 469.0625, "epoch": 0.242, - "grad_norm": 2.6848042463097466, - "kl": 0.380859375, + "grad_norm": 2.429463773148552, + "kl": 0.12890625, "learning_rate": 9.458418577899774e-07, - "loss": 0.035, - "reward": 2.9760119915008545, - "reward_std": 0.03133458364754915, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0170435830950737, + "loss": 0.0219, + "reward": 2.832092523574829, + "reward_std": 0.3335033804178238, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.015129742678254843, "rewards/tag_count_reward": 1.0, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 389.7708435058594, + "completion_length": 509.1041717529297, "epoch": 0.243, - "grad_norm": 2.522587626804693, - "kl": 0.333984375, + "grad_norm": 2.2888490947191453, + "kl": 0.11669921875, "learning_rate": 9.450923389425911e-07, - "loss": 0.0405, - "reward": 2.7551279067993164, - "reward_std": 0.2791627198457718, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.029594331979751587, + "loss": 0.0679, + "reward": 2.8057637214660645, + "reward_std": 0.3620642125606537, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.027569908648729324, "rewards/tag_count_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 304.3541717529297, + "completion_length": 462.3958435058594, "epoch": 0.244, - "grad_norm": 2.77297928840735, - "kl": 0.3720703125, + "grad_norm": 2.7695341753126383, + "kl": 0.12841796875, "learning_rate": 9.443380060197385e-07, - "loss": -0.0179, - "reward": 2.8342981338500977, - "reward_std": 0.20725210011005402, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.019868516363203526, - "rewards/tag_count_reward": 1.0, + "loss": 0.028, + "reward": 2.687015175819397, + "reward_std": 0.19807551801204681, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02305438881739974, + "rewards/tag_count_reward": 0.9947916865348816, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 320.25000762939453, + "completion_length": 562.0625152587891, "epoch": 0.245, - "grad_norm": 3.1679197914818404, - "kl": 0.322265625, + "grad_norm": 2.0280976947569536, + "kl": 0.12060546875, "learning_rate": 9.43578868212728e-07, - "loss": -0.0621, - "reward": 2.6348594427108765, - "reward_std": 0.31440991163253784, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.017918367870151997, + "loss": -0.044, + "reward": 2.79758882522583, + "reward_std": 0.2576555460691452, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.028800049796700478, "rewards/tag_count_reward": 1.0, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 365.43751525878906, + "completion_length": 529.3333435058594, "epoch": 0.246, - "grad_norm": 2.62272326098995, - "kl": 0.349609375, + "grad_norm": 2.3299542232310477, + "kl": 0.119873046875, "learning_rate": 9.428149347714143e-07, - "loss": -0.0162, - "reward": 2.487512946128845, - "reward_std": 0.3192945420742035, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.02637607231736183, + "loss": -0.0061, + "reward": 2.660530924797058, + "reward_std": 0.16930758208036423, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026969049125909805, "rewards/tag_count_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 467.3333435058594, + "completion_length": 488.72918701171875, "epoch": 0.247, - "grad_norm": 2.9019860675823295, - "kl": 0.337890625, + "grad_norm": 2.300003946060058, + "kl": 0.1318359375, "learning_rate": 9.420462150040852e-07, - "loss": 0.1817, - "reward": 2.817419409751892, - "reward_std": 0.21217607846483588, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03153882268816233, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0401, + "reward": 2.5491377115249634, + "reward_std": 0.3199358731508255, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.027251286432147026, + "rewards/tag_count_reward": 1.0, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 305.87501525878906, + "completion_length": 528.2291870117188, "epoch": 0.248, - "grad_norm": 2.70760754513365, - "kl": 0.3310546875, + "grad_norm": 2.242455600453007, + "kl": 0.119384765625, "learning_rate": 9.412727182773486e-07, - "loss": -0.0027, - "reward": 2.8553617000579834, - "reward_std": 0.2359490469098091, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.019638150930404663, + "loss": 0.0059, + "reward": 2.7635743618011475, + "reward_std": 0.3389218971133232, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02809229213744402, "rewards/tag_count_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 331.2083435058594, + "completion_length": 639.5625305175781, "epoch": 0.249, - "grad_norm": 2.7177251033441157, - "kl": 0.3486328125, + "grad_norm": 1.9733887624956274, + "kl": 0.110595703125, "learning_rate": 9.404944540160177e-07, - "loss": -0.0116, - "reward": 2.83267879486084, - "reward_std": 0.2832423448562622, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02148800529539585, + "loss": -0.0684, + "reward": 2.6618722677230835, + "reward_std": 0.2736282553523779, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03951651602983475, "rewards/tag_count_reward": 1.0, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 312.4791717529297, + "completion_length": 501.12501525878906, "epoch": 0.25, - "grad_norm": 2.9325420553490305, - "kl": 0.3271484375, + "grad_norm": 2.167863484844858, + "kl": 0.13525390625, "learning_rate": 9.397114317029974e-07, - "loss": 0.1489, - "reward": 2.847796678543091, - "reward_std": 0.17942998930811882, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.037619972601532936, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.004, + "reward": 2.526219606399536, + "reward_std": 0.380461186170578, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.0293360473588109, + "rewards/tag_count_reward": 1.0, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 348.50001525878906, + "completion_length": 592.2083435058594, "epoch": 0.251, - "grad_norm": 2.8433413824399154, - "kl": 0.3271484375, + "grad_norm": 2.441314951742952, + "kl": 0.11767578125, "learning_rate": 9.38923660879167e-07, - "loss": 0.0626, - "reward": 2.7041455507278442, - "reward_std": 0.26819342374801636, - "rewards/accuracy_reward": 0.75, + "loss": 0.1345, + "reward": 2.7307329177856445, + "reward_std": 0.2994953393936157, + "rewards/accuracy_reward": 0.7708333730697632, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023285221308469772, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.033156177029013634, + "rewards/tag_count_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 449.00001525878906, + "completion_length": 564.2708435058594, "epoch": 0.252, - "grad_norm": 2.3894856749387747, - "kl": 0.3154296875, + "grad_norm": 2.49677783518446, + "kl": 0.122802734375, "learning_rate": 9.381311511432658e-07, - "loss": 0.0764, - "reward": 2.486729145050049, - "reward_std": 0.38556669652462006, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02889585867524147, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1067, + "reward": 2.4708348512649536, + "reward_std": 0.36502179503440857, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.036109643056988716, + "rewards/tag_count_reward": 1.0, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 275.9791717529297, + "completion_length": 601.3125305175781, "epoch": 0.253, - "grad_norm": 3.308957349942137, - "kl": 0.3369140625, + "grad_norm": 2.031276195067717, + "kl": 0.119384765625, "learning_rate": 9.373339121517746e-07, - "loss": 0.081, - "reward": 2.875459671020508, - "reward_std": 0.20478565990924835, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.020373898558318615, - "rewards/tag_count_reward": 1.0, + "loss": 0.0435, + "reward": 2.9231903553009033, + "reward_std": 0.10614164918661118, + "rewards/accuracy_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04382369481027126, + "rewards/tag_count_reward": 0.9947916865348816, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 391.7083435058594, + "completion_length": 572.4791870117188, "epoch": 0.254, - "grad_norm": 2.7663683490086393, - "kl": 0.3544921875, + "grad_norm": 2.1020100515227313, + "kl": 0.12060546875, "learning_rate": 9.36531953618799e-07, - "loss": 0.0862, - "reward": 2.8472323417663574, - "reward_std": 0.260253444314003, - "rewards/accuracy_reward": 0.8958333730697632, + "loss": 0.0657, + "reward": 2.7917503118515015, + "reward_std": 0.26876559667289257, + "rewards/accuracy_reward": 0.8333333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015615224838256836, - "rewards/tag_count_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.029430264607071877, + "rewards/tag_count_reward": 0.9947916865348816, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 396.3125, + "completion_length": 553.8333740234375, "epoch": 0.255, - "grad_norm": 2.8117307785301535, - "kl": 0.3369140625, + "grad_norm": 3.5327461787319776, + "kl": 0.13232421875, "learning_rate": 9.357252853159505e-07, - "loss": 0.0311, - "reward": 2.5658544301986694, - "reward_std": 0.4280482977628708, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.02963178977370262, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0351, + "reward": 2.589944839477539, + "reward_std": 0.22619716823101044, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02811075560748577, + "rewards/tag_count_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 301.2708435058594, + "completion_length": 548.0416870117188, "epoch": 0.256, - "grad_norm": 2.87891254388211, - "kl": 0.34765625, + "grad_norm": 2.183159179082376, + "kl": 0.13525390625, "learning_rate": 9.34913917072228e-07, - "loss": 0.073, - "reward": 2.86317241191864, - "reward_std": 0.2617315109819174, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.030924628488719463, - "rewards/tag_count_reward": 0.984375, + "loss": -0.0696, + "reward": 2.6049808263778687, + "reward_std": 0.3286217898130417, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.028699966147542, + "rewards/tag_count_reward": 0.9947916865348816, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 414.4166717529297, + "completion_length": 580.6041870117188, "epoch": 0.257, - "grad_norm": 3.2595404882778056, - "kl": 0.3564453125, + "grad_norm": 2.401471424731539, + "kl": 0.12744140625, "learning_rate": 9.340978587738972e-07, - "loss": 0.0132, - "reward": 2.71917188167572, - "reward_std": 0.32954995334148407, - "rewards/accuracy_reward": 0.7500000298023224, + "loss": 0.0462, + "reward": 2.8021459579467773, + "reward_std": 0.3006982207298279, + "rewards/accuracy_reward": 0.8333333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02041140664368868, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.031187265180051327, + "rewards/tag_count_reward": 1.0, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 294.3958435058594, + "completion_length": 570.6041870117188, "epoch": 0.258, - "grad_norm": 2.989660481107389, - "kl": 0.3681640625, + "grad_norm": 7.315971327716766, + "kl": 0.12353515625, "learning_rate": 9.332771203643714e-07, - "loss": -0.0372, - "reward": 2.7095645666122437, - "reward_std": 0.21496595442295074, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01265768427401781, - "rewards/tag_count_reward": 1.0, + "loss": 0.1692, + "reward": 2.6945351362228394, + "reward_std": 0.3729802221059799, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0398398581892252, + "rewards/tag_count_reward": 0.984375, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 379.5208435058594, + "completion_length": 462.18751525878906, "epoch": 0.259, - "grad_norm": 2.457450895989442, - "kl": 0.3544921875, + "grad_norm": 2.249715244876303, + "kl": 0.1328125, "learning_rate": 9.324517118440888e-07, - "loss": 0.0042, - "reward": 2.899809718132019, - "reward_std": 0.11693940963596106, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.023801439441740513, + "loss": 0.0048, + "reward": 2.757077693939209, + "reward_std": 0.31304194778203964, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.020700284279882908, "rewards/tag_count_reward": 1.0, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 362.68751525878906, + "completion_length": 508.2083435058594, "epoch": 0.26, - "grad_norm": 2.7414137576471402, - "kl": 0.3798828125, + "grad_norm": 5.193269569172354, + "kl": 0.1416015625, "learning_rate": 9.316216432703916e-07, - "loss": 0.0314, - "reward": 2.765600085258484, - "reward_std": 0.1935584181919694, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0191219886764884, - "rewards/tag_count_reward": 1.0, + "loss": 0.1727, + "reward": 2.8865363597869873, + "reward_std": 0.19043887220323086, + "rewards/accuracy_reward": 0.9583333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04228321462869644, + "rewards/tag_count_reward": 0.984375, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 306.2708435058594, + "completion_length": 496.58335876464844, "epoch": 0.261, - "grad_norm": 2.660574547389871, - "kl": 0.359375, + "grad_norm": 2.2509057041996443, + "kl": 0.13623046875, "learning_rate": 9.307869247574038e-07, - "loss": 0.0345, - "reward": 2.8953992128372192, - "reward_std": 0.1925100740045309, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02126746135763824, - "rewards/tag_count_reward": 1.0, + "loss": 0.0359, + "reward": 2.8467074632644653, + "reward_std": 0.2996975928544998, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.02829269878566265, + "rewards/tag_count_reward": 1.0, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 353.2083435058594, + "completion_length": 628.9375, "epoch": 0.262, - "grad_norm": 3.495196800396644, - "kl": 0.3447265625, + "grad_norm": 4.668495262967337, + "kl": 0.13623046875, "learning_rate": 9.299475664759068e-07, - "loss": 0.1955, - "reward": 2.715879440307617, - "reward_std": 0.1999235600233078, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.023703853599727154, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.1609, + "reward": 2.720739483833313, + "reward_std": 0.4649975746870041, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.04141336679458618, + "rewards/tag_count_reward": 0.9843750298023224, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 321.10418701171875, + "completion_length": 519.0, "epoch": 0.263, - "grad_norm": 3.534417152793247, - "kl": 0.361328125, + "grad_norm": 2.3541332969660975, + "kl": 0.13671875, "learning_rate": 9.291035786532163e-07, - "loss": 0.1788, - "reward": 2.926344156265259, - "reward_std": 0.1562669388949871, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.018100355518981814, - "rewards/tag_count_reward": 1.0, + "loss": 0.0638, + "reward": 2.4355462789535522, + "reward_std": 0.21204119874164462, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033203769475221634, + "rewards/tag_count_reward": 0.9895833432674408, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 418.5208435058594, + "completion_length": 546.3125, "epoch": 0.264, - "grad_norm": 3.7535388419862548, - "kl": 0.34765625, + "grad_norm": 2.1456751308727213, + "kl": 0.1416015625, "learning_rate": 9.282549715730579e-07, - "loss": 0.3327, - "reward": 2.719529151916504, - "reward_std": 0.2675071656703949, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03567917086184025, - "rewards/tag_count_reward": 0.984375, + "loss": -0.0358, + "reward": 2.4012598991394043, + "reward_std": 0.44198279082775116, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03971245139837265, + "rewards/tag_count_reward": 0.9895833432674408, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 501.2083435058594, + "completion_length": 460.12501525878906, "epoch": 0.265, - "grad_norm": 2.6668962419506688, - "kl": 0.3583984375, + "grad_norm": 2.3449066935567697, + "kl": 0.15087890625, "learning_rate": 9.274017555754407e-07, - "loss": 0.0528, - "reward": 2.599488615989685, - "reward_std": 0.3290487378835678, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.03419201076030731, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0136, + "reward": 2.7137099504470825, + "reward_std": 0.326103575527668, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.02240123925730586, + "rewards/tag_count_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 338.68751525878906, + "completion_length": 526.7291870117188, "epoch": 0.266, - "grad_norm": 2.9171604581691075, - "kl": 0.3740234375, + "grad_norm": 2.521194502817804, + "kl": 0.1572265625, "learning_rate": 9.265439410565328e-07, - "loss": 0.106, - "reward": 2.8173811435699463, - "reward_std": 0.3043472170829773, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015952279791235924, - "rewards/tag_count_reward": 1.0, + "loss": 0.1172, + "reward": 2.733197331428528, + "reward_std": 0.43402716517448425, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.028955545276403427, + "rewards/tag_count_reward": 0.984375, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 492.18751525878906, + "completion_length": 580.3541870117188, "epoch": 0.267, - "grad_norm": 3.5797475292008394, - "kl": 0.3798828125, + "grad_norm": 2.2571936139784374, + "kl": 0.15185546875, "learning_rate": 9.256815384685328e-07, - "loss": 0.2554, - "reward": 2.771204352378845, - "reward_std": 0.3193429633975029, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03608737327158451, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0205, + "reward": 2.665441632270813, + "reward_std": 0.2782168686389923, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.029003008268773556, + "rewards/tag_count_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 401.9166717529297, + "completion_length": 528.8541870117188, "epoch": 0.268, - "grad_norm": 2.631093943670801, - "kl": 0.4140625, + "grad_norm": 2.527114445757883, + "kl": 0.15478515625, "learning_rate": 9.248145583195447e-07, - "loss": 0.0077, - "reward": 2.728670120239258, - "reward_std": 0.33099038898944855, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.028274414129555225, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0642, + "reward": 2.634427309036255, + "reward_std": 0.35257890820503235, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.025295117869973183, + "rewards/tag_count_reward": 1.0, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 422.8958435058594, + "completion_length": 480.4791717529297, "epoch": 0.269, - "grad_norm": 2.7769233292530155, - "kl": 0.404296875, + "grad_norm": 2.565433677851082, + "kl": 0.15185546875, "learning_rate": 9.239430111734476e-07, - "loss": 0.0158, - "reward": 2.6283435821533203, - "reward_std": 0.24908730387687683, + "loss": -0.0129, + "reward": 2.6094166040420532, + "reward_std": 0.32510536164045334, "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017489885911345482, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02252791728824377, + "rewards/tag_count_reward": 1.0, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 392.1666717529297, + "completion_length": 535.3333435058594, "epoch": 0.27, - "grad_norm": 3.995553313633592, - "kl": 0.3740234375, + "grad_norm": 7.328177927840423, + "kl": 0.15185546875, "learning_rate": 9.230669076497687e-07, - "loss": 0.247, - "reward": 2.7079943418502808, - "reward_std": 0.38716357946395874, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019436247646808624, + "loss": 0.1255, + "reward": 2.5093064308166504, + "reward_std": 0.3799414336681366, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02367980219423771, "rewards/tag_count_reward": 0.984375, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 448.66668701171875, + "completion_length": 509.85418701171875, "epoch": 0.271, - "grad_norm": 5.429612814412277, - "kl": 0.4091796875, + "grad_norm": 2.718362036973218, + "kl": 0.15576171875, "learning_rate": 9.221862584235526e-07, - "loss": 0.3253, - "reward": 2.8226706981658936, - "reward_std": 0.3070548251271248, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024551517330110073, + "loss": 0.0344, + "reward": 2.775928258895874, + "reward_std": 0.3645378649234772, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.02962728962302208, "rewards/tag_count_reward": 1.0, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 397.2708435058594, + "completion_length": 511.50001525878906, "epoch": 0.272, - "grad_norm": 3.0329360747244674, - "kl": 0.474609375, + "grad_norm": 2.272195946333135, + "kl": 0.1591796875, "learning_rate": 9.213010742252327e-07, - "loss": 0.1074, - "reward": 2.7572423219680786, - "reward_std": 0.2572716176509857, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013591074384748936, - "rewards/tag_count_reward": 1.0, + "loss": -0.0293, + "reward": 2.742081642150879, + "reward_std": 0.2952452003955841, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.03048785123974085, + "rewards/tag_count_reward": 0.9947916865348816, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 467.0000305175781, + "completion_length": 585.25, "epoch": 0.273, - "grad_norm": 4.023031066533007, - "kl": 0.4453125, + "grad_norm": 2.32139747171617, + "kl": 0.14794921875, "learning_rate": 9.204113658404989e-07, - "loss": 0.2193, - "reward": 2.751101016998291, - "reward_std": 0.45732370018959045, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0284128924831748, - "rewards/tag_count_reward": 0.953125, + "loss": -0.0799, + "reward": 2.521125078201294, + "reward_std": 0.2736772522330284, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.04137510620057583, + "rewards/tag_count_reward": 0.9791666865348816, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 273.43751525878906, + "completion_length": 559.2708435058594, "epoch": 0.274, - "grad_norm": 3.716780735934786, - "kl": 0.4189453125, + "grad_norm": 3.852408203301354, + "kl": 0.1982421875, "learning_rate": 9.195171441101668e-07, - "loss": 0.1002, - "reward": 2.955238103866577, - "reward_std": 0.11242054030299187, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013511963188648224, + "loss": -0.0098, + "reward": 2.5533807277679443, + "reward_std": 0.35553011298179626, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.019536098465323448, "rewards/tag_count_reward": 0.9895833432674408, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 360.5416793823242, + "completion_length": 633.4583435058594, "epoch": 0.275, - "grad_norm": 3.141537512405704, - "kl": 0.474609375, + "grad_norm": 2.0627187344345166, + "kl": 0.1513671875, "learning_rate": 9.186184199300463e-07, - "loss": 0.092, - "reward": 2.7825783491134644, - "reward_std": 0.38917484879493713, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.021241238340735435, - "rewards/tag_count_reward": 0.984375, + "loss": -0.0293, + "reward": 2.7091498374938965, + "reward_std": 0.1623903214931488, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04432236962020397, + "rewards/tag_count_reward": 0.9895833432674408, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 401.0208435058594, + "completion_length": 532.0416717529297, "epoch": 0.276, - "grad_norm": 5.879862522656149, - "kl": 0.5078125, + "grad_norm": 2.401556778445366, + "kl": 0.1552734375, "learning_rate": 9.177152042508077e-07, - "loss": 0.2415, - "reward": 2.831485390663147, - "reward_std": 0.3835846334695816, - "rewards/accuracy_reward": 0.8958333432674408, + "loss": 0.0535, + "reward": 2.728344440460205, + "reward_std": 0.3177375793457031, + "rewards/accuracy_reward": 0.7916666865348816, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02094521652907133, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.025128038600087166, + "rewards/tag_count_reward": 0.9895833432674408, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 342.3958435058594, + "completion_length": 497.7500305175781, "epoch": 0.277, - "grad_norm": 4.5700348001725555, - "kl": 0.607421875, + "grad_norm": 2.544090680740168, + "kl": 0.17333984375, "learning_rate": 9.168075080778494e-07, - "loss": 0.1393, - "reward": 2.8642842769622803, - "reward_std": 0.2969040870666504, + "loss": 0.0797, + "reward": 2.892575979232788, + "reward_std": 0.2266939841210842, "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.02286839485168457, - "rewards/tag_count_reward": 0.984375, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01714644394814968, + "rewards/tag_count_reward": 1.0, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 307.5833435058594, + "completion_length": 496.6666717529297, "epoch": 0.278, - "grad_norm": 3.2294045797839637, - "kl": 0.701171875, + "grad_norm": 2.35427197118846, + "kl": 0.15576171875, "learning_rate": 9.158953424711624e-07, - "loss": 0.0686, - "reward": 2.657663941383362, - "reward_std": 0.2474864050745964, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01073892181739211, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0443, + "reward": 2.8280975818634033, + "reward_std": 0.15652123093605042, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.029541232623159885, + "rewards/tag_count_reward": 0.9895833432674408, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 331.66668701171875, + "completion_length": 505.7708435058594, "epoch": 0.279, - "grad_norm": 7.441080087991311, - "kl": 0.6796875, + "grad_norm": 2.8854860164238882, + "kl": 0.1611328125, "learning_rate": 9.149787185451969e-07, - "loss": 0.266, - "reward": 2.9158542156219482, - "reward_std": 0.20566503703594208, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01122913183644414, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.1633, + "reward": 2.6974780559539795, + "reward_std": 0.26681460440158844, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.019535831175744534, + "rewards/tag_count_reward": 0.9947916865348816, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 295.1041717529297, + "completion_length": 544.1458587646484, "epoch": 0.28, - "grad_norm": 4.696335292121163, - "kl": 0.85546875, + "grad_norm": 2.1507469824718592, + "kl": 0.15576171875, "learning_rate": 9.140576474687263e-07, - "loss": 0.036, - "reward": 2.945027709007263, - "reward_std": 0.17141056433320045, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.006361227482557297, - "rewards/tag_count_reward": 1.0, + "loss": 0.0214, + "reward": 2.660393714904785, + "reward_std": 0.35248108208179474, + "rewards/accuracy_reward": 0.708333358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02884255349636078, + "rewards/tag_count_reward": 0.9947916865348816, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 282.7291793823242, + "completion_length": 582.9791870117188, "epoch": 0.281, - "grad_norm": 13.259634202794716, - "kl": 0.95703125, + "grad_norm": 2.2081213981330974, + "kl": 0.1484375, "learning_rate": 9.131321404647109e-07, - "loss": -0.1536, - "reward": 2.7471028566360474, - "reward_std": 0.04579891404137015, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013313933741301298, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0259, + "reward": 2.749055504798889, + "reward_std": 0.3296816051006317, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.035666871815919876, + "rewards/tag_count_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 401.56251525878906, + "completion_length": 495.8958435058594, "epoch": 0.282, - "grad_norm": 12.730208805255582, - "kl": 1.09375, + "grad_norm": 2.6311621368175055, + "kl": 0.185546875, "learning_rate": 9.122022088101613e-07, - "loss": 0.2685, - "reward": 2.495549201965332, - "reward_std": 0.5612721145153046, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.014867515303194523, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.0722, + "reward": 2.718705654144287, + "reward_std": 0.37944258749485016, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01740554254502058, + "rewards/tag_count_reward": 1.0, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 257.9166717529297, + "completion_length": 585.125, "epoch": 0.283, - "grad_norm": 5.236483615327856, - "kl": 1.06640625, + "grad_norm": 2.435559416337875, + "kl": 0.16943359375, "learning_rate": 9.112678638360015e-07, - "loss": 0.0392, - "reward": 2.9387857913970947, - "reward_std": 0.17011093348264694, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.007394707296043634, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0748, + "reward": 2.535888671875, + "reward_std": 0.3460448384284973, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.04918085224926472, + "rewards/tag_count_reward": 0.9739583730697632, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 275.625, + "completion_length": 568.4583740234375, "epoch": 0.284, - "grad_norm": 35.63565072852818, - "kl": 1.11328125, + "grad_norm": 2.3346177606876206, + "kl": 0.1689453125, "learning_rate": 9.103291169269299e-07, - "loss": 0.4187, - "reward": 2.9108535051345825, - "reward_std": 0.21760105341672897, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.011021499056369066, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0921, + "reward": 2.8603535890579224, + "reward_std": 0.25171563029289246, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0389520637691021, + "rewards/tag_count_reward": 0.9895833432674408, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 490.7916717529297, + "completion_length": 663.0416870117188, "epoch": 0.285, - "grad_norm": 10.711892307067096, - "kl": 1.58984375, + "grad_norm": 2.193370842368294, + "kl": 0.15673828125, "learning_rate": 9.093859795212817e-07, - "loss": 0.3561, - "reward": 2.664412260055542, - "reward_std": 0.36060677468776703, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.009199028369039297, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.1456, + "reward": 2.744846820831299, + "reward_std": 0.27857429534196854, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04508392512798309, + "rewards/tag_count_reward": 0.984375, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 345.56251525878906, + "completion_length": 563.3125305175781, "epoch": 0.286, - "grad_norm": 18.39170901959641, - "kl": 2.8125, + "grad_norm": 2.4347017189444737, + "kl": 0.20166015625, "learning_rate": 9.084384631108882e-07, - "loss": 0.392, - "reward": 2.446242332458496, - "reward_std": 0.3337973803281784, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010354849509894848, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0819, + "reward": 2.745976209640503, + "reward_std": 0.29341720789670944, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.01964883040636778, + "rewards/tag_count_reward": 0.9947916865348816, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 362.1041717529297, + "completion_length": 530.8333435058594, "epoch": 0.287, - "grad_norm": 33.35178475022627, - "kl": 3.671875, + "grad_norm": 2.5821740187900097, + "kl": 0.1962890625, "learning_rate": 9.074865792409381e-07, - "loss": 0.8645, - "reward": 2.8375598192214966, - "reward_std": 0.34689949452877045, - "rewards/accuracy_reward": 0.8958333432674408, + "loss": 0.0427, + "reward": 2.4354602098464966, + "reward_std": 0.31608445942401886, + "rewards/accuracy_reward": 0.458333358168602, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009662610478699207, - "rewards/tag_count_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.015928767155855894, + "rewards/tag_count_reward": 1.0, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 315.25001525878906, + "completion_length": 516.6875, "epoch": 0.288, - "grad_norm": 17.86618868600841, - "kl": 3.484375, + "grad_norm": 2.4879515053528927, + "kl": 0.1796875, "learning_rate": 9.065303395098358e-07, - "loss": 0.4558, - "reward": 2.867647409439087, - "reward_std": 0.3230934739112854, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.014297273010015488, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0885, + "reward": 2.7357919216156006, + "reward_std": 0.21385888010263443, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.022888831794261932, + "rewards/tag_count_reward": 0.9947916865348816, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 421.50001525878906, + "completion_length": 610.2291870117188, "epoch": 0.289, - "grad_norm": 76.98364168686291, - "kl": 6.296875, + "grad_norm": 10.258023745667739, + "kl": 0.24951171875, "learning_rate": 9.055697555690607e-07, - "loss": 1.3624, - "reward": 2.4802991151809692, - "reward_std": 0.45680928230285645, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.009284371510148048, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.2087, + "reward": 2.447946548461914, + "reward_std": 0.3903558999300003, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0399008896201849, + "rewards/tag_count_reward": 0.9739583432674408, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 319.5416717529297, + "completion_length": 488.2500305175781, "epoch": 0.29, - "grad_norm": 72.76634544330949, - "kl": 3.7734375, + "grad_norm": 2.72857938643871, + "kl": 0.2431640625, "learning_rate": 9.046048391230247e-07, - "loss": 0.7484, - "reward": 2.4455480575561523, - "reward_std": 0.1240087493788451, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.009313223417848349, - "rewards/tag_count_reward": 0.96875, + "loss": -0.0029, + "reward": 2.764933466911316, + "reward_std": 0.31763769686222076, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.019788892939686775, + "rewards/tag_count_reward": 1.0, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 425.06251525878906, + "completion_length": 548.4166717529297, "epoch": 0.291, - "grad_norm": 17.675439622341106, - "kl": 3.44921875, + "grad_norm": 13.446114403763556, + "kl": 0.3310546875, "learning_rate": 9.036356019289309e-07, - "loss": 0.73, - "reward": 2.6398208141326904, - "reward_std": 0.3441198170185089, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0164292948320508, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.1304, + "reward": 2.442963719367981, + "reward_std": 0.4253944456577301, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.020578143652528524, + "rewards/tag_count_reward": 0.984375, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 270.3958435058594, + "completion_length": 527.5000305175781, "epoch": 0.292, - "grad_norm": 40.60216279597499, - "kl": 1.34765625, + "grad_norm": 2.399625776751827, + "kl": 0.275390625, "learning_rate": 9.026620557966279e-07, - "loss": 0.3472, - "reward": 2.8886016607284546, - "reward_std": 0.29884637892246246, - "rewards/accuracy_reward": 0.9375000298023224, + "loss": 0.03, + "reward": 2.5269211530685425, + "reward_std": 0.2096565067768097, + "rewards/accuracy_reward": 0.5625000298023224, "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.012439928948879242, - "rewards/tag_count_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.01474552322179079, + "rewards/tag_count_reward": 1.0, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 357.8958435058594, + "completion_length": 557.6458435058594, "epoch": 0.293, - "grad_norm": 48.256551683115056, - "kl": 2.15625, + "grad_norm": 2.6309659598581225, + "kl": 0.28955078125, "learning_rate": 9.016842125884684e-07, - "loss": 0.9207, - "reward": 2.8681055307388306, - "reward_std": 0.3581779897212982, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.010366793721914291, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0651, + "reward": 2.6259918212890625, + "reward_std": 0.3636874854564667, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.018105541355907917, + "rewards/tag_count_reward": 0.984375, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 532.2083435058594, + "completion_length": 591.3958435058594, "epoch": 0.294, - "grad_norm": 61.50543866717565, - "kl": 6.609375, + "grad_norm": 2.594070997434929, + "kl": 0.2685546875, "learning_rate": 9.007020842191634e-07, - "loss": 1.2975, - "reward": 2.3665781021118164, - "reward_std": 0.723650187253952, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9236111044883728, - "rewards/repetition_penalty_reward": -0.01536642899736762, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0742, + "reward": 2.6963618993759155, + "reward_std": 0.27481937408447266, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.0258603785187006, + "rewards/tag_count_reward": 1.0, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 285.7291717529297, + "completion_length": 503.7916717529297, "epoch": 0.295, - "grad_norm": 11.699489720391307, - "kl": 1.78125, + "grad_norm": 2.797055325097868, + "kl": 0.3544921875, "learning_rate": 8.997156826556369e-07, - "loss": 0.4253, - "reward": 2.4907814264297485, - "reward_std": 0.22814809903502464, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012690847273916006, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0827, + "reward": 2.522792100906372, + "reward_std": 0.4541844576597214, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01887462381273508, + "rewards/tag_count_reward": 1.0, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 424.25001525878906, + "completion_length": 572.1875305175781, "epoch": 0.296, - "grad_norm": 40.28188014679377, - "kl": 4.1875, + "grad_norm": 3.656654339941009, + "kl": 0.388671875, "learning_rate": 8.987250199168808e-07, - "loss": 0.8123, - "reward": 2.730523943901062, - "reward_std": 0.36739376187324524, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01253165677189827, + "loss": 0.0611, + "reward": 2.7958298921585083, + "reward_std": 0.3831692487001419, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03750366624444723, "rewards/tag_count_reward": 0.9791666865348816, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 401.5833435058594, + "completion_length": 679.375, "epoch": 0.297, - "grad_norm": 60.07282330403106, - "kl": 2.796875, + "grad_norm": 5.294668622093825, + "kl": 0.4609375, "learning_rate": 8.977301080738079e-07, - "loss": 0.9001, - "reward": 2.8031165599823, - "reward_std": 0.4816116392612457, - "rewards/accuracy_reward": 0.8541666865348816, + "loss": 0.1776, + "reward": 2.7792227268218994, + "reward_std": 0.2825077772140503, + "rewards/accuracy_reward": 0.8333333432674408, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009383588563650846, - "rewards/tag_count_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04369393363595009, + "rewards/tag_count_reward": 0.9895833432674408, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 317.6666793823242, + "completion_length": 777.9375, "epoch": 0.298, - "grad_norm": 37.69005118517701, - "kl": 2.466796875, + "grad_norm": 9.442864310512464, + "kl": 0.658203125, "learning_rate": 8.967309592491052e-07, - "loss": 0.2842, - "reward": 2.44062602519989, - "reward_std": 0.16654927376657724, - "rewards/accuracy_reward": 0.5208333432674408, + "loss": 0.3089, + "reward": 2.554570198059082, + "reward_std": 0.5709795355796814, + "rewards/accuracy_reward": 0.6875, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.01076290593482554, - "rewards/tag_count_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.04265192709863186, + "rewards/tag_count_reward": 0.9375, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 347.125, + "completion_length": 774.7708435058594, "epoch": 0.299, - "grad_norm": 61.921737312804765, - "kl": 1.951171875, + "grad_norm": 4.044252023077153, + "kl": 1.21484375, "learning_rate": 8.957275856170855e-07, - "loss": 0.6645, - "reward": 2.417535662651062, - "reward_std": 0.18784819543361664, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.009547823574393988, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.1666, + "reward": 2.4033294916152954, + "reward_std": 0.4724307656288147, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028962312266230583, + "rewards/tag_count_reward": 0.9114583432674408, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 349.41668701171875, + "completion_length": 775.9166870117188, "epoch": 0.3, - "grad_norm": 29.758501719120176, - "kl": 1.828125, + "grad_norm": 13.271553563881938, + "kl": 1.8515625, "learning_rate": 8.9471999940354e-07, - "loss": 0.6036, - "reward": 2.854809045791626, - "reward_std": 0.3670079857110977, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.011510440614074469, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.2894, + "reward": 2.3227447271347046, + "reward_std": 0.5490213930606842, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03663041442632675, + "rewards/tag_count_reward": 0.9010416865348816, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 296.7708435058594, + "completion_length": 862.0625305175781, "epoch": 0.301, - "grad_norm": 33.88745189663411, - "kl": 2.4765625, + "grad_norm": 11.329717412687812, + "kl": 2.6328125, "learning_rate": 8.937082128855891e-07, - "loss": 0.7192, - "reward": 2.89434552192688, - "reward_std": 0.25644664466381073, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010168483480811119, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.303, + "reward": 2.497220277786255, + "reward_std": 0.5546972751617432, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.025349291041493416, + "rewards/tag_count_reward": 0.9114583432674408, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 277.06251525878906, + "completion_length": 718.0000305175781, "epoch": 0.302, - "grad_norm": 18.819156414990914, - "kl": 2.146484375, + "grad_norm": 11.620481772542528, + "kl": 1.78125, "learning_rate": 8.926922383915315e-07, - "loss": 0.7377, - "reward": 2.641542673110962, - "reward_std": 0.24526511318981647, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.011235354468226433, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1349, + "reward": 2.820877194404602, + "reward_std": 0.2959998771548271, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.026345071382820606, + "rewards/tag_count_reward": 0.9583333432674408, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 357.35418701171875, + "completion_length": 699.8958740234375, "epoch": 0.303, - "grad_norm": 39.50549927060914, - "kl": 3.2109375, + "grad_norm": 18.84955689848848, + "kl": 0.779296875, "learning_rate": 8.916720883006963e-07, - "loss": 0.8041, - "reward": 2.57657790184021, - "reward_std": 0.3627946972846985, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.017172069288790226, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.2289, + "reward": 2.662257194519043, + "reward_std": 0.4918932765722275, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02697897609323263, + "rewards/tag_count_reward": 0.9531250298023224, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 371.97918701171875, + "completion_length": 854.3333435058594, "epoch": 0.304, - "grad_norm": 34.11018576277812, - "kl": 2.171875, + "grad_norm": 16.06283643508135, + "kl": 0.9453125, "learning_rate": 8.906477750432903e-07, - "loss": 0.6381, - "reward": 2.747861623764038, - "reward_std": 0.29682640731334686, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012555128429085016, - "rewards/tag_count_reward": 0.96875, + "loss": 0.2394, + "reward": 2.463517665863037, + "reward_std": 0.5290980041027069, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.034746479243040085, + "rewards/tag_count_reward": 0.9010416865348816, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 369.6666717529297, + "completion_length": 795.7291870117188, "epoch": 0.305, - "grad_norm": 16.618138166114957, - "kl": 1.30078125, + "grad_norm": 7.368811609366542, + "kl": 1.072265625, "learning_rate": 8.896193111002475e-07, - "loss": 0.015, - "reward": 2.2844594717025757, - "reward_std": 0.36854127049446106, - "rewards/accuracy_reward": 0.3125000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017623926512897015, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.1781, + "reward": 2.7576704025268555, + "reward_std": 0.3518691807985306, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03399639390408993, + "rewards/tag_count_reward": 0.9166666865348816, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 302.0208435058594, + "completion_length": 646.9583435058594, "epoch": 0.306, - "grad_norm": 78.36971773402463, - "kl": 1.158203125, + "grad_norm": 5.727482347866686, + "kl": 1.28125, "learning_rate": 8.88586709003076e-07, - "loss": 0.5731, - "reward": 2.9113956689834595, - "reward_std": 0.22667651623487473, - "rewards/accuracy_reward": 0.9375000298023224, + "loss": 0.1126, + "reward": 2.540480852127075, + "reward_std": 0.4421972632408142, + "rewards/accuracy_reward": 0.6250000298023224, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010479431133717299, - "rewards/tag_count_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.03243602532893419, + "rewards/tag_count_reward": 0.9479166865348816, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 348.25001525878906, + "completion_length": 876.6041870117188, "epoch": 0.307, - "grad_norm": 28.97587648635268, - "kl": 1.068359375, + "grad_norm": 6.1244435821869905, + "kl": 1.9296875, "learning_rate": 8.875499813337067e-07, - "loss": 0.2691, - "reward": 2.4367611408233643, - "reward_std": 0.28679367154836655, - "rewards/accuracy_reward": 0.5000000111758709, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.011155676562339067, - "rewards/tag_count_reward": 0.96875, + "loss": 0.2147, + "reward": 2.5565717220306396, + "reward_std": 0.5118266344070435, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026761652901768684, + "rewards/tag_count_reward": 0.9166666865348816, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 245.6041717529297, + "completion_length": 735.1041870117188, "epoch": 0.308, - "grad_norm": 4.263228510899138, - "kl": 0.5166015625, + "grad_norm": 6.2567586393756685, + "kl": 1.25390625, "learning_rate": 8.865091407243394e-07, - "loss": 0.0724, - "reward": 2.4779577255249023, - "reward_std": 0.033920885529369116, - "rewards/accuracy_reward": 0.5, + "loss": 0.1521, + "reward": 2.8010218143463135, + "reward_std": 0.3402135968208313, + "rewards/accuracy_reward": 0.9166666865348816, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015097890980541706, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03057546727359295, + "rewards/tag_count_reward": 0.9218750298023224, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 285.9583435058594, + "completion_length": 884.6666870117188, "epoch": 0.309, - "grad_norm": 22.05984317009652, - "kl": 2.03515625, + "grad_norm": 11.171414148004054, + "kl": 2.3984375, "learning_rate": 8.85464199857288e-07, - "loss": 0.6187, - "reward": 2.908278703689575, - "reward_std": 0.20823773369193077, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.00491589680314064, - "rewards/tag_count_reward": 0.96875, + "loss": 0.2805, + "reward": 2.442511558532715, + "reward_std": 0.5785179138183594, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026238556019961834, + "rewards/tag_count_reward": 0.8854166865348816, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 458.5833435058594, + "completion_length": 982.625, "epoch": 0.31, - "grad_norm": 20.933487397076487, - "kl": 3.609375, + "grad_norm": 5.999562164727096, + "kl": 2.0234375, "learning_rate": 8.844151714648274e-07, - "loss": 1.1318, - "reward": 2.5730291604995728, - "reward_std": 0.601581871509552, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.02072106394916773, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.2776, + "reward": 2.3847498893737793, + "reward_std": 0.6050747036933899, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026708428747951984, + "rewards/tag_count_reward": 0.8906250298023224, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 250.25001525878906, + "completion_length": 750.9166870117188, "epoch": 0.311, - "grad_norm": 4.3999685618121935, - "kl": 0.560546875, + "grad_norm": 10.25459242080789, + "kl": 1.228515625, "learning_rate": 8.833620683290375e-07, - "loss": 0.0613, - "reward": 2.9593318700790405, - "reward_std": 0.10326157324016094, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01289046322926879, - "rewards/tag_count_reward": 1.0, + "loss": 0.2226, + "reward": 2.489295244216919, + "reward_std": 0.3828739821910858, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.031538188457489014, + "rewards/tag_count_reward": 0.9375, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 306.12500762939453, + "completion_length": 920.7083435058594, "epoch": 0.312, - "grad_norm": 7.6586898826038, - "kl": 0.7509765625, + "grad_norm": 11.341166194035626, + "kl": 3.0390625, "learning_rate": 8.823049032816478e-07, - "loss": 0.021, - "reward": 2.673188805580139, - "reward_std": 0.28602842800319195, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.021255791187286377, - "rewards/tag_count_reward": 1.0, + "loss": 0.3465, + "reward": 2.3931429386138916, + "reward_std": 0.5464552640914917, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.033940425142645836, + "rewards/tag_count_reward": 0.8854166865348816, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 482.81251525878906, + "completion_length": 824.3541870117188, "epoch": 0.313, - "grad_norm": 34.53058556422272, - "kl": 3.1015625, + "grad_norm": 14.499216833551465, + "kl": 2.5703125, "learning_rate": 8.812436892038805e-07, - "loss": 0.4534, - "reward": 2.5731600522994995, - "reward_std": 0.3059232383966446, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01885397359728813, - "rewards/tag_count_reward": 0.8906250298023224, + "loss": 0.4531, + "reward": 2.4514354467391968, + "reward_std": 0.4615107327699661, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03120344504714012, + "rewards/tag_count_reward": 0.8645833432674408, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 309.0208435058594, + "completion_length": 752.2500305175781, "epoch": 0.314, - "grad_norm": 6.2749926210643965, - "kl": 0.734375, + "grad_norm": 9.160833779249737, + "kl": 2.52734375, "learning_rate": 8.801784390262943e-07, - "loss": 0.039, - "reward": 2.797974109649658, - "reward_std": 0.13754124753177166, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01278979703783989, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4578, + "reward": 2.459816098213196, + "reward_std": 0.5396545231342316, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02629512920975685, + "rewards/tag_count_reward": 0.8750000298023224, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 459.2083435058594, + "completion_length": 666.5625305175781, "epoch": 0.315, - "grad_norm": 70.4484863364483, - "kl": 1.26171875, + "grad_norm": 16.821527260355506, + "kl": 1.90625, "learning_rate": 8.791091657286267e-07, - "loss": 0.6687, - "reward": 2.7114145755767822, - "reward_std": 0.5056033432483673, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01948827039450407, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.1424, + "reward": 2.7689753770828247, + "reward_std": 0.27851930260658264, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03137190733104944, + "rewards/tag_count_reward": 0.953125, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 256.08333587646484, + "completion_length": 749.3958435058594, "epoch": 0.316, - "grad_norm": 4.46489687930188, - "kl": 0.4853515625, + "grad_norm": 15.034225144072645, + "kl": 2.33984375, "learning_rate": 8.780358823396352e-07, - "loss": 0.0082, - "reward": 2.550494432449341, - "reward_std": 0.12301526172086596, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006797325797379017, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.2368, + "reward": 2.8358311653137207, + "reward_std": 0.2934058606624603, + "rewards/accuracy_reward": 0.9375, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03396068140864372, + "rewards/tag_count_reward": 0.953125, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 390.0833435058594, + "completion_length": 740.875, "epoch": 0.317, - "grad_norm": 7.822024755252251, - "kl": 0.576171875, + "grad_norm": 8.197253327861047, + "kl": 2.1796875, "learning_rate": 8.769586019369391e-07, - "loss": 0.1726, - "reward": 2.701343536376953, - "reward_std": 0.30785155296325684, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01567037357017398, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.2262, + "reward": 2.721475839614868, + "reward_std": 0.3922436535358429, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023315943777561188, + "rewards/tag_count_reward": 0.9114583432674408, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 372.2083435058594, + "completion_length": 731.8125, "epoch": 0.318, - "grad_norm": 43.28879860910786, - "kl": 0.986328125, + "grad_norm": 17.02162446986935, + "kl": 0.9296875, "learning_rate": 8.758773376468604e-07, - "loss": 0.5577, - "reward": 2.804646611213684, - "reward_std": 0.3991493731737137, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016533964313566685, - "rewards/tag_count_reward": 0.953125, + "loss": 0.2494, + "reward": 2.5680298805236816, + "reward_std": 0.4028843492269516, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030928438529372215, + "rewards/tag_count_reward": 0.9114583432674408, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 293.62501525878906, + "completion_length": 646.0625305175781, "epoch": 0.319, - "grad_norm": 30.388960723639432, - "kl": 0.904296875, + "grad_norm": 8.403943914356434, + "kl": 0.6796875, "learning_rate": 8.747921026442629e-07, - "loss": 0.3412, - "reward": 2.693268656730652, - "reward_std": 0.1956123784184456, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9583334028720856, - "rewards/repetition_penalty_reward": -0.009856293676421046, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.1368, + "reward": 2.6740156412124634, + "reward_std": 0.3067634850740433, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018692771438509226, + "rewards/tag_count_reward": 0.9427083432674408, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 389.375, + "completion_length": 612.8125, "epoch": 0.32, - "grad_norm": 18.882466744370163, - "kl": 0.9375, + "grad_norm": 3.055603498237129, + "kl": 0.384765625, "learning_rate": 8.737029101523929e-07, - "loss": 0.2045, - "reward": 2.8057843446731567, - "reward_std": 0.3706630915403366, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.02060467004776001, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0287, + "reward": 2.7557495832443237, + "reward_std": 0.21633769571781158, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.028972744941711426, + "rewards/tag_count_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 369.5416717529297, + "completion_length": 619.375, "epoch": 0.321, - "grad_norm": 8.904166362421526, - "kl": 1.482421875, + "grad_norm": 3.8882839855022113, + "kl": 0.3203125, "learning_rate": 8.726097734427172e-07, - "loss": 0.3554, - "reward": 2.5382190942764282, - "reward_std": 0.42692097276449203, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.013864347711205482, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0372, + "reward": 2.877520799636841, + "reward_std": 0.2280464619398117, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028729302808642387, + "rewards/tag_count_reward": 0.9895833432674408, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 433.56251525878906, + "completion_length": 642.2291870117188, "epoch": 0.322, - "grad_norm": 74.92718593357132, - "kl": 6.5546875, + "grad_norm": 5.934518493796481, + "kl": 0.5390625, "learning_rate": 8.715127058347614e-07, - "loss": 1.4038, - "reward": 2.511315107345581, - "reward_std": 0.47513845562934875, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9375000298023224, - "rewards/repetition_penalty_reward": -0.009518398437649012, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.1573, + "reward": 2.8018211126327515, + "reward_std": 0.2535254070535302, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.029776128008961678, + "rewards/tag_count_reward": 0.984375, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 268.6041717529297, + "completion_length": 584.3958435058594, "epoch": 0.323, - "grad_norm": 5.6518495493286025, - "kl": 0.78515625, + "grad_norm": 12.30823020424776, + "kl": 0.6533203125, "learning_rate": 8.704117206959484e-07, - "loss": 0.027, - "reward": 2.715887427330017, - "reward_std": 0.2659030854701996, - "rewards/accuracy_reward": 0.7291666865348816, + "loss": -0.0165, + "reward": 2.274023652076721, + "reward_std": 0.3736593574285507, + "rewards/accuracy_reward": 0.3125000111758709, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008070975309237838, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.022851460613310337, + "rewards/tag_count_reward": 0.984375, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 291.7708435058594, + "completion_length": 616.7083435058594, "epoch": 0.324, - "grad_norm": 95.69379610952801, - "kl": 2.904296875, + "grad_norm": 11.03962394142098, + "kl": 0.708984375, "learning_rate": 8.693068314414344e-07, - "loss": 0.4528, - "reward": 2.6881362199783325, - "reward_std": 0.13930337294004858, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008044424466788769, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.2153, + "reward": 2.9491621255874634, + "reward_std": 0.0671940129250288, + "rewards/accuracy_reward": 1.0, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030004790984094143, + "rewards/tag_count_reward": 0.9791666865348816, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 380.7916717529297, + "completion_length": 586.2916870117188, "epoch": 0.325, - "grad_norm": 14.165946492722295, - "kl": 1.685546875, + "grad_norm": 21.7519973984484, + "kl": 0.9091796875, "learning_rate": 8.681980515339463e-07, - "loss": 0.3598, - "reward": 2.792818069458008, - "reward_std": 0.33109084889292717, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023154238238930702, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.1511, + "reward": 2.7508959770202637, + "reward_std": 0.3181898444890976, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.025145714171230793, + "rewards/tag_count_reward": 0.9843750298023224, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 372.0833435058594, + "completion_length": 609.3125305175781, "epoch": 0.326, - "grad_norm": 58.37365484616897, - "kl": 3.380859375, + "grad_norm": 7.475624041841047, + "kl": 1.08984375, "learning_rate": 8.670853944836176e-07, - "loss": 0.7338, - "reward": 2.4957873821258545, - "reward_std": 0.5664017200469971, - "rewards/accuracy_reward": 0.5625, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.01983764488250017, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.1854, + "reward": 2.718400478363037, + "reward_std": 0.3192872703075409, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024655278772115707, + "rewards/tag_count_reward": 0.9791666865348816, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 397.1041717529297, + "completion_length": 537.0833435058594, "epoch": 0.327, - "grad_norm": 41.19786996726065, - "kl": 1.58203125, + "grad_norm": 3.934696523513004, + "kl": 0.4482421875, "learning_rate": 8.659688738478231e-07, - "loss": 0.5564, - "reward": 2.8201701641082764, - "reward_std": 0.39665576815605164, - "rewards/accuracy_reward": 0.875, + "loss": 0.0237, + "reward": 2.5368727445602417, + "reward_std": 0.2816409021615982, + "rewards/accuracy_reward": 0.5833333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016635391861200333, - "rewards/tag_count_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.029099617153406143, + "rewards/tag_count_reward": 0.9895833432674408, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 288.7916717529297, + "completion_length": 561.1875, "epoch": 0.328, - "grad_norm": 18.462005969003272, - "kl": 1.126953125, + "grad_norm": 4.769237705857611, + "kl": 0.576171875, "learning_rate": 8.648485032310144e-07, - "loss": 0.2269, - "reward": 2.6695051193237305, - "reward_std": 0.29302695021033287, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9722222983837128, - "rewards/repetition_penalty_reward": -0.01105071953497827, + "loss": 0.0438, + "reward": 2.7067657709121704, + "reward_std": 0.287298321723938, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.029345519840717316, "rewards/tag_count_reward": 0.9791666865348816, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 332.1041717529297, + "completion_length": 630.4375, "epoch": 0.329, - "grad_norm": 18.67557243566907, - "kl": 2.216796875, + "grad_norm": 10.245397889363776, + "kl": 2.02734375, "learning_rate": 8.63724296284554e-07, - "loss": 0.6714, - "reward": 2.814840316772461, - "reward_std": 0.3265114799141884, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.018493151292204857, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.2262, + "reward": 2.599774479866028, + "reward_std": 0.4478468745946884, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023489387705922127, + "rewards/tag_count_reward": 0.9427083432674408, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 250.1041717529297, + "completion_length": 571.2916870117188, "epoch": 0.33, - "grad_norm": 14.134762836281471, - "kl": 2.34375, + "grad_norm": 22.88975372356627, + "kl": 1.796875, "learning_rate": 8.625962667065487e-07, - "loss": 0.5478, - "reward": 2.917124032974243, - "reward_std": 0.25805456936359406, - "rewards/accuracy_reward": 0.9583333730697632, + "loss": 0.206, + "reward": 2.5127453804016113, + "reward_std": 0.4001367390155792, + "rewards/accuracy_reward": 0.5625000149011612, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009959449991583824, - "rewards/tag_count_reward": 0.9687500298023224, + "rewards/repetition_penalty_reward": -0.028921468183398247, + "rewards/tag_count_reward": 0.9791666865348816, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 888.7708435058594, + "completion_length": 524.7083740234375, "epoch": 0.331, - "grad_norm": 248.8165438961169, - "kl": 28.8125, + "grad_norm": 5.9516087463505025, + "kl": 0.42626953125, "learning_rate": 8.614644282416831e-07, - "loss": 2.508, - "reward": 2.2433557510375977, - "reward_std": 0.5838175415992737, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.017060973681509495, - "rewards/tag_count_reward": 0.7395833432674408, + "loss": 0.0881, + "reward": 2.731919050216675, + "reward_std": 0.31574640423059464, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.040650567039847374, + "rewards/tag_count_reward": 0.9739583432674408, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 306.5, + "completion_length": 558.9583740234375, "epoch": 0.332, - "grad_norm": 34.660629557664016, - "kl": 4.484375, + "grad_norm": 5.083242340459772, + "kl": 0.27294921875, "learning_rate": 8.603287946810513e-07, - "loss": 0.4247, - "reward": 2.776843309402466, - "reward_std": 0.3961004763841629, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013087108265608549, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.05, + "reward": 2.7102267742156982, + "reward_std": 0.2585765942931175, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03456501290202141, + "rewards/tag_count_reward": 0.9947916865348816, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 333.8541717529297, + "completion_length": 618.1041870117188, "epoch": 0.333, - "grad_norm": 19.508661577504974, - "kl": 3.921875, + "grad_norm": 6.1273870030040065, + "kl": 1.0595703125, "learning_rate": 8.591893798619903e-07, - "loss": 0.5355, - "reward": 2.717034101486206, - "reward_std": 0.5191467255353928, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.015604821499437094, - "rewards/tag_count_reward": 0.9270833730697632, + "loss": 0.2075, + "reward": 2.7363253831863403, + "reward_std": 0.37927868962287903, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.018883120268583298, + "rewards/tag_count_reward": 0.9635416865348816, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 414.5208435058594, + "completion_length": 603.9375, "epoch": 0.334, - "grad_norm": 12.14388635151008, - "kl": 2.59375, + "grad_norm": 5.897033604873569, + "kl": 1.12890625, "learning_rate": 8.580461976679099e-07, - "loss": 0.5599, - "reward": 2.6168153285980225, - "reward_std": 0.3208754360675812, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.027281965129077435, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.2075, + "reward": 2.6124191284179688, + "reward_std": 0.4683973491191864, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017789172008633614, + "rewards/tag_count_reward": 0.921875, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 471.1458435058594, + "completion_length": 515.75, "epoch": 0.335, - "grad_norm": 29.801902151222833, - "kl": 3.07421875, + "grad_norm": 7.88708659023299, + "kl": 0.53515625, "learning_rate": 8.568992620281243e-07, - "loss": 0.9618, - "reward": 2.668967604637146, - "reward_std": 0.4781971871852875, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016796366777271032, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.0447, + "reward": 2.5799560546875, + "reward_std": 0.4440341293811798, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.019002487882971764, + "rewards/tag_count_reward": 0.9947916865348816, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 380.06251525878906, + "completion_length": 618.2291870117188, "epoch": 0.336, - "grad_norm": 6.932275776456119, - "kl": 1.611328125, + "grad_norm": 13.855588768970593, + "kl": 1.8125, "learning_rate": 8.557485869176825e-07, - "loss": 0.2886, - "reward": 2.6852755546569824, - "reward_std": 0.27035200595855713, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.030002295039594173, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.1842, + "reward": 2.804866313934326, + "reward_std": 0.27131783962249756, + "rewards/accuracy_reward": 0.8958333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.023258699104189873, + "rewards/tag_count_reward": 0.9322916865348816, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 371.3333435058594, + "completion_length": 544.0416870117188, "epoch": 0.337, - "grad_norm": 10.08685619246109, - "kl": 2.46875, + "grad_norm": 10.22565696968507, + "kl": 0.556640625, "learning_rate": 8.545941863571973e-07, - "loss": 0.2943, - "reward": 2.8558069467544556, - "reward_std": 0.2918730303645134, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012248746817931533, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0697, + "reward": 2.7841432094573975, + "reward_std": 0.22680224478244781, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.028356771916151047, + "rewards/tag_count_reward": 0.9791666865348816, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 323.37501525878906, + "completion_length": 660.1875305175781, "epoch": 0.338, - "grad_norm": 6.834406988492541, - "kl": 1.14453125, + "grad_norm": 13.275393254857, + "kl": 1.57421875, "learning_rate": 8.534360744126753e-07, - "loss": 0.0956, - "reward": 2.9468610286712646, - "reward_std": 0.14941991260275245, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011472368380054832, - "rewards/tag_count_reward": 1.0, + "loss": 0.3522, + "reward": 2.626600503921509, + "reward_std": 0.5416260808706284, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026177333667874336, + "rewards/tag_count_reward": 0.9375000298023224, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 366.81251525878906, + "completion_length": 538.2916870117188, "epoch": 0.339, - "grad_norm": 12.820513619490347, - "kl": 1.990234375, + "grad_norm": 9.284294976639861, + "kl": 0.890625, "learning_rate": 8.522742651953456e-07, - "loss": 0.6153, - "reward": 2.8665162324905396, - "reward_std": 0.2634586850181222, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013692285865545273, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.2327, + "reward": 2.6350467205047607, + "reward_std": 0.30000850558280945, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024675646796822548, + "rewards/tag_count_reward": 0.9791666865348816, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 458.56251525878906, + "completion_length": 572.5208740234375, "epoch": 0.34, - "grad_norm": 14.34972615832187, - "kl": 2.255859375, + "grad_norm": 11.869158855207635, + "kl": 1.427734375, "learning_rate": 8.511087728614862e-07, - "loss": 0.6387, - "reward": 2.777496576309204, - "reward_std": 0.36730538308620453, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02285083942115307, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.2899, + "reward": 2.693272352218628, + "reward_std": 0.4559956192970276, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023741761222481728, + "rewards/tag_count_reward": 0.9322916865348816, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 357.60418701171875, + "completion_length": 559.7083435058594, "epoch": 0.341, - "grad_norm": 6.208773682663241, - "kl": 1.41796875, + "grad_norm": 16.084642786041737, + "kl": 2.78125, "learning_rate": 8.499396116122535e-07, - "loss": 0.2272, - "reward": 2.631097197532654, - "reward_std": 0.3990493267774582, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.019944587722420692, - "rewards/tag_count_reward": 0.984375, + "loss": 0.404, + "reward": 2.5177054405212402, + "reward_std": 0.358899861574173, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018752962350845337, + "rewards/tag_count_reward": 0.953125, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 327.43751525878906, + "completion_length": 552.3125305175781, "epoch": 0.342, - "grad_norm": 3.811852351785347, - "kl": 0.5634765625, + "grad_norm": 22.00620591226436, + "kl": 2.71875, "learning_rate": 8.487667956935087e-07, - "loss": 0.0261, - "reward": 2.915947198867798, - "reward_std": 0.13628114759922028, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.028497262857854366, - "rewards/tag_count_reward": 1.0, + "loss": 0.1789, + "reward": 2.8170549869537354, + "reward_std": 0.35434219241142273, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026695125736296177, + "rewards/tag_count_reward": 0.9479166865348816, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 317.2291717529297, + "completion_length": 522.9791717529297, "epoch": 0.343, - "grad_norm": 11.349107657754852, - "kl": 1.578125, + "grad_norm": 22.634127347183696, + "kl": 3.59375, "learning_rate": 8.475903393956433e-07, - "loss": 0.3765, - "reward": 2.7598495483398438, - "reward_std": 0.2619144544005394, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.012720022583380342, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.3774, + "reward": 2.7711684703826904, + "reward_std": 0.3556895852088928, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.018762326799333096, + "rewards/tag_count_reward": 0.9427083730697632, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 297.7083435058594, + "completion_length": 594.7291870117188, "epoch": 0.344, - "grad_norm": 7.7203726727342294, - "kl": 0.740234375, + "grad_norm": 25.37619859311342, + "kl": 2.046875, "learning_rate": 8.464102570534061e-07, - "loss": 0.1124, - "reward": 2.888232111930847, - "reward_std": 0.2508256062865257, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.018018093891441822, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0985, + "reward": 2.51139497756958, + "reward_std": 0.4084962010383606, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.03374398034065962, + "rewards/tag_count_reward": 0.9687500298023224, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 353.1458435058594, + "completion_length": 584.375, "epoch": 0.345, - "grad_norm": 15.49721785780986, - "kl": 2.42578125, + "grad_norm": 8.691345932821196, + "kl": 2.03515625, "learning_rate": 8.452265630457282e-07, - "loss": 0.2393, - "reward": 2.7500698566436768, - "reward_std": 0.28690239787101746, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01381902676075697, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.2564, + "reward": 2.603990077972412, + "reward_std": 0.5245843231678009, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02448232937604189, + "rewards/tag_count_reward": 0.9479166865348816, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 448.875, + "completion_length": 533.9166870117188, "epoch": 0.346, - "grad_norm": 19.70926907915403, - "kl": 2.7734375, + "grad_norm": 27.37252896321387, + "kl": 1.34375, "learning_rate": 8.440392717955475e-07, - "loss": 0.8212, - "reward": 2.787039637565613, - "reward_std": 0.4537549167871475, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.020251897163689137, - "rewards/tag_count_reward": 0.953125, + "loss": 0.4947, + "reward": 2.6430338621139526, + "reward_std": 0.4521474093198776, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.014952600467950106, + "rewards/tag_count_reward": 0.9427083432674408, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 327.54168701171875, + "completion_length": 528.8541870117188, "epoch": 0.347, - "grad_norm": 7.093058788121615, - "kl": 1.5859375, + "grad_norm": 3.9349545954124743, + "kl": 0.39453125, "learning_rate": 8.428483977696328e-07, - "loss": 0.2942, - "reward": 2.7902220487594604, - "reward_std": 0.29879920184612274, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.00665304739959538, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0493, + "reward": 2.7178937196731567, + "reward_std": 0.36671870201826096, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026898046024143696, + "rewards/tag_count_reward": 0.9947916865348816, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 295.4583435058594, + "completion_length": 536.8333740234375, "epoch": 0.348, - "grad_norm": 8.099616607017232, - "kl": 1.2578125, + "grad_norm": 5.777524675922901, + "kl": 0.3681640625, "learning_rate": 8.416539554784089e-07, - "loss": 0.1772, - "reward": 2.750554084777832, - "reward_std": 0.28315818309783936, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.013334698043763638, - "rewards/tag_count_reward": 1.0, + "loss": 0.049, + "reward": 2.5611852407455444, + "reward_std": 0.3319101259112358, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.025620201602578163, + "rewards/tag_count_reward": 0.9895833432674408, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 247.81251525878906, + "completion_length": 553.625, "epoch": 0.349, - "grad_norm": 5.385204283310242, - "kl": 0.72265625, + "grad_norm": 5.2707839803098055, + "kl": 0.34375, "learning_rate": 8.404559594757777e-07, - "loss": 0.0633, - "reward": 2.955636143684387, - "reward_std": 0.10240116296336055, - "rewards/accuracy_reward": 0.9791666865348816, + "loss": 0.0007, + "reward": 2.545145034790039, + "reward_std": 0.22191336005926132, + "rewards/accuracy_reward": 0.5833333730697632, "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.016586333978921175, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026035414077341557, + "rewards/tag_count_reward": 0.9947916865348816, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 256.14583587646484, + "completion_length": 462.12501525878906, "epoch": 0.35, - "grad_norm": 9.872088380114736, - "kl": 0.724609375, + "grad_norm": 3.643489393286096, + "kl": 0.2626953125, "learning_rate": 8.392544243589427e-07, - "loss": -0.0081, - "reward": 2.839182496070862, - "reward_std": 0.18973393738269806, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.016720325220376253, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0118, + "reward": 2.5923283100128174, + "reward_std": 0.3613494336605072, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.022255297750234604, + "rewards/tag_count_reward": 0.9895833432674408, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 346.3541717529297, + "completion_length": 563.3333740234375, "epoch": 0.351, - "grad_norm": 12.548929086667865, - "kl": 1.830078125, + "grad_norm": 7.73557681949032, + "kl": 0.56640625, "learning_rate": 8.3804936476823e-07, - "loss": 0.2942, - "reward": 2.8285974264144897, - "reward_std": 0.17067108117043972, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.023833095096051693, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1536, + "reward": 2.4618531465530396, + "reward_std": 0.3520192950963974, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024257982149720192, + "rewards/tag_count_reward": 0.9791666865348816, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 318.75001525878906, + "completion_length": 523.5625305175781, "epoch": 0.352, - "grad_norm": 26.853325769832182, - "kl": 1.759765625, + "grad_norm": 3.134584879525316, + "kl": 0.16259765625, "learning_rate": 8.368407953869103e-07, - "loss": 0.6289, - "reward": 2.8765352964401245, - "reward_std": 0.3505753315985203, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.012353679165244102, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0164, + "reward": 2.8710557222366333, + "reward_std": 0.16795307025313377, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.03345821052789688, + "rewards/tag_count_reward": 0.9947916865348816, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 252.1666717529297, + "completion_length": 526.0208435058594, "epoch": 0.353, - "grad_norm": 11.904950232584664, - "kl": 0.7421875, + "grad_norm": 4.312118426726596, + "kl": 0.224609375, "learning_rate": 8.356287309410204e-07, - "loss": 0.1552, - "reward": 2.927272319793701, - "reward_std": 0.17897653579711914, - "rewards/accuracy_reward": 0.9375000298023224, + "loss": 0.0462, + "reward": 2.8158966302871704, + "reward_std": 0.22574415802955627, + "rewards/accuracy_reward": 0.8541666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010227864608168602, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0226451912894845, + "rewards/tag_count_reward": 0.984375, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 316.00001525878906, + "completion_length": 490.1666717529297, "epoch": 0.354, - "grad_norm": 20.841686985519985, - "kl": 1.23046875, + "grad_norm": 2.993028075064908, + "kl": 0.1904296875, "learning_rate": 8.344131861991828e-07, - "loss": 0.4109, - "reward": 2.6631776094436646, - "reward_std": 0.24949096888303757, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.019114283844828606, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": -0.0266, + "reward": 2.8288527727127075, + "reward_std": 0.15067671798169613, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01836968120187521, + "rewards/tag_count_reward": 1.0, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 312.68751525878906, + "completion_length": 577.9583435058594, "epoch": 0.355, - "grad_norm": 12.468057495516762, - "kl": 2.013671875, + "grad_norm": 4.692169147728827, + "kl": 0.69189453125, "learning_rate": 8.331941759724268e-07, - "loss": 0.1786, - "reward": 2.820387601852417, - "reward_std": 0.23722103238105774, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01989032607525587, + "loss": 0.1281, + "reward": 2.5701699256896973, + "reward_std": 0.3895218074321747, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02705218270421028, "rewards/tag_count_reward": 0.9791666865348816, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 359.25000762939453, + "completion_length": 601.1458435058594, "epoch": 0.356, - "grad_norm": 25.51437747882916, - "kl": 3.64453125, + "grad_norm": 10.169896342116992, + "kl": 0.734375, "learning_rate": 8.319717151140072e-07, - "loss": 0.2652, - "reward": 2.6489123106002808, - "reward_std": 0.1955174282193184, - "rewards/accuracy_reward": 0.6875000298023224, + "loss": 0.2155, + "reward": 2.395414352416992, + "reward_std": 0.416961133480072, + "rewards/accuracy_reward": 0.4791666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01254612673074007, - "rewards/tag_count_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.026460560970008373, + "rewards/tag_count_reward": 0.9427083432674408, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 411.60418701171875, + "completion_length": 546.5416870117188, "epoch": 0.357, - "grad_norm": 10.585679306255473, - "kl": 3.51953125, + "grad_norm": 11.244917008930972, + "kl": 0.6279296875, "learning_rate": 8.307458185192238e-07, - "loss": 0.4823, - "reward": 2.7355072498321533, - "reward_std": 0.36401769518852234, - "rewards/accuracy_reward": 0.8125000298023224, + "loss": 0.1956, + "reward": 2.8225170373916626, + "reward_std": 0.34238358587026596, + "rewards/accuracy_reward": 0.8958333730697632, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01970117725431919, - "rewards/tag_count_reward": 0.9427083432674408, + "rewards/repetition_penalty_reward": -0.021233050152659416, + "rewards/tag_count_reward": 0.9479166865348816, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 308.18751525878906, + "completion_length": 601.2916870117188, "epoch": 0.358, - "grad_norm": 7.207803354205841, - "kl": 1.41015625, + "grad_norm": 6.159922849123736, + "kl": 0.908203125, "learning_rate": 8.295165011252396e-07, - "loss": 0.1827, - "reward": 2.8859267234802246, - "reward_std": 0.16146743949502707, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.1542, + "reward": 2.7077341079711914, + "reward_std": 0.3084152042865753, + "rewards/accuracy_reward": 0.75, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015115112066268921, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.03184933587908745, + "rewards/tag_count_reward": 0.9895833432674408, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 262.1041717529297, + "completion_length": 506.85418701171875, "epoch": 0.359, - "grad_norm": 13.054462977290376, - "kl": 1.5537109375, + "grad_norm": 4.104290958705583, + "kl": 0.3466796875, "learning_rate": 8.282837779108993e-07, - "loss": 0.4139, - "reward": 2.9265706539154053, - "reward_std": 0.2239571064710617, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.010929480195045471, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0297, + "reward": 2.7414658069610596, + "reward_std": 0.21230606734752655, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02415929874405265, + "rewards/tag_count_reward": 0.9947916865348816, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 283.7708435058594, + "completion_length": 547.5000305175781, "epoch": 0.36, - "grad_norm": 8.140167696518287, - "kl": 0.734375, + "grad_norm": 20.57792012790243, + "kl": 2.328125, "learning_rate": 8.270476638965461e-07, - "loss": 0.1334, - "reward": 2.9586756229400635, - "reward_std": 0.0983630595728755, - "rewards/accuracy_reward": 0.9791666865348816, + "loss": 0.502, + "reward": 2.5797927379608154, + "reward_std": 0.31256987154483795, + "rewards/accuracy_reward": 0.645833358168602, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015282800886780024, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.01916569098830223, + "rewards/tag_count_reward": 0.9531250298023224, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 326.0833435058594, + "completion_length": 729.3541870117188, "epoch": 0.361, - "grad_norm": 19.257786649718646, - "kl": 1.11328125, + "grad_norm": 42.22333450542258, + "kl": 6.875, "learning_rate": 8.258081741438394e-07, - "loss": 0.3238, - "reward": 2.686263680458069, - "reward_std": 0.17666732892394066, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.013388962484896183, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8517, + "reward": 2.3151514530181885, + "reward_std": 0.5804566144943237, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.021654206328094006, + "rewards/tag_count_reward": 0.8229166865348816, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 249.27084350585938, + "completion_length": 597.5416870117188, "epoch": 0.362, - "grad_norm": 7.04844779313925, - "kl": 0.60546875, + "grad_norm": 20.91301673590996, + "kl": 3.27734375, "learning_rate": 8.245653237555705e-07, - "loss": 0.0856, - "reward": 2.971284031867981, - "reward_std": 0.07957913680002093, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007882710546255112, - "rewards/tag_count_reward": 1.0, + "loss": 0.4718, + "reward": 2.5699336528778076, + "reward_std": 0.4157796800136566, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03076085541397333, + "rewards/tag_count_reward": 0.9479166865348816, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 278.1666717529297, + "completion_length": 526.9166717529297, "epoch": 0.363, - "grad_norm": 9.821600972103582, - "kl": 0.4599609375, + "grad_norm": 15.16442092292686, + "kl": 1.78515625, "learning_rate": 8.23319127875479e-07, - "loss": 0.1021, - "reward": 2.930161237716675, - "reward_std": 0.15336985141038895, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.021227712742984295, - "rewards/tag_count_reward": 1.0, + "loss": 0.2643, + "reward": 2.8612154722213745, + "reward_std": 0.2645837068557739, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.020729007199406624, + "rewards/tag_count_reward": 0.9791666865348816, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 240.39583587646484, + "completion_length": 574.6250152587891, "epoch": 0.364, - "grad_norm": 9.013709838641676, - "kl": 0.705078125, + "grad_norm": 27.79134380026425, + "kl": 2.84375, "learning_rate": 8.220696016880687e-07, - "loss": 0.1419, - "reward": 2.990858793258667, - "reward_std": 0.009336329298093915, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00914114573970437, - "rewards/tag_count_reward": 1.0, + "loss": 0.4106, + "reward": 2.6281174421310425, + "reward_std": 0.4037089943885803, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.015979719813913107, + "rewards/tag_count_reward": 0.9218750298023224, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 212.75000762939453, + "completion_length": 736.8958435058594, "epoch": 0.365, - "grad_norm": 5.6397315200664, - "kl": 0.5703125, + "grad_norm": 20.741864587098874, + "kl": 3.328125, "learning_rate": 8.208167604184217e-07, - "loss": 0.0127, - "reward": 2.9352588653564453, - "reward_std": 0.15303438156843185, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.009185637114569545, - "rewards/tag_count_reward": 1.0, + "loss": 0.5224, + "reward": 2.400440216064453, + "reward_std": 0.5106681287288666, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024907216429710388, + "rewards/tag_count_reward": 0.8489583432674408, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 402.4583435058594, + "completion_length": 651.7708435058594, "epoch": 0.366, - "grad_norm": 13.9750719900113, - "kl": 3.6328125, + "grad_norm": 14.227390419980614, + "kl": 2.10546875, "learning_rate": 8.195606193320136e-07, - "loss": 0.5713, - "reward": 2.5722696781158447, - "reward_std": 0.2334899678826332, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01453589741140604, - "rewards/tag_count_reward": 0.9270833730697632, + "loss": 0.3847, + "reward": 2.4665273427963257, + "reward_std": 0.6216453611850739, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03347271308302879, + "rewards/tag_count_reward": 0.9166666865348816, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 277.37501525878906, + "completion_length": 714.1458740234375, "epoch": 0.367, - "grad_norm": 12.381300964710869, - "kl": 1.68359375, + "grad_norm": 15.698648427320858, + "kl": 2.15625, "learning_rate": 8.183011937345271e-07, - "loss": 0.0884, - "reward": 2.6966644525527954, - "reward_std": 0.1265738122165203, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.013405262492597103, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3797, + "reward": 2.530668020248413, + "reward_std": 0.5888173580169678, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02141545619815588, + "rewards/tag_count_reward": 0.8437500298023224, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 352.93751525878906, + "completion_length": 646.0000305175781, "epoch": 0.368, - "grad_norm": 21.0149586822577, - "kl": 3.34375, + "grad_norm": 24.843348556432016, + "kl": 1.98828125, "learning_rate": 8.170384989716657e-07, - "loss": 0.3657, - "reward": 2.709786891937256, - "reward_std": 0.13821916095912457, - "rewards/accuracy_reward": 0.7708333432674408, + "loss": 0.5184, + "reward": 2.5058876276016235, + "reward_std": 0.6075495481491089, + "rewards/accuracy_reward": 0.6458333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.02806040644645691, - "rewards/tag_count_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.023626457899808884, + "rewards/tag_count_reward": 0.8906250298023224, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 360.3958435058594, + "completion_length": 597.1875, "epoch": 0.369, - "grad_norm": 10.082919357705237, - "kl": 2.9921875, + "grad_norm": 16.338985391404897, + "kl": 1.15625, "learning_rate": 8.157725504289664e-07, - "loss": 0.507, - "reward": 2.8243778944015503, - "reward_std": 0.22792188078165054, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008955407422035933, + "loss": 0.3953, + "reward": 2.7404117584228516, + "reward_std": 0.3872096836566925, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02347709983587265, "rewards/tag_count_reward": 0.9583333432674408, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 358.64583587646484, + "completion_length": 596.0833435058594, "epoch": 0.37, - "grad_norm": 14.839482248100737, - "kl": 3.037109375, + "grad_norm": 13.06241235347548, + "kl": 1.5, "learning_rate": 8.145033635316128e-07, - "loss": 0.622, - "reward": 2.661393404006958, - "reward_std": 0.162936769425869, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.012217745650559664, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.3765, + "reward": 2.6401225328445435, + "reward_std": 0.5106720924377441, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01959980558604002, + "rewards/tag_count_reward": 0.9166666865348816, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 370.27083587646484, + "completion_length": 726.5208435058594, "epoch": 0.371, - "grad_norm": 18.49437890117159, - "kl": 2.82421875, + "grad_norm": 14.668565421161508, + "kl": 2.65625, "learning_rate": 8.13230953744247e-07, - "loss": 0.2179, - "reward": 2.4758044481277466, - "reward_std": 0.34404291212558746, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.018987320829182863, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.5462, + "reward": 2.342048168182373, + "reward_std": 0.4522154927253723, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02774341218173504, + "rewards/tag_count_reward": 0.8906250298023224, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 236.2291717529297, + "completion_length": 634.5416870117188, "epoch": 0.372, - "grad_norm": 9.41273712843433, - "kl": 0.85546875, + "grad_norm": 10.126972374008618, + "kl": 2.734375, "learning_rate": 8.119553365707802e-07, - "loss": 0.0027, - "reward": 2.9694904088974, - "reward_std": 0.06044703675433993, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.009676362853497267, - "rewards/tag_count_reward": 1.0, + "loss": 0.5343, + "reward": 2.6384775638580322, + "reward_std": 0.559327244758606, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.016036429908126593, + "rewards/tag_count_reward": 0.9114583432674408, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 227.0, + "completion_length": 620.6041870117188, "epoch": 0.373, - "grad_norm": 6.867023920714141, - "kl": 0.796875, + "grad_norm": 12.491483118539476, + "kl": 3.515625, "learning_rate": 8.106765275542053e-07, - "loss": 0.1236, - "reward": 2.992671012878418, - "reward_std": 0.008701992686837912, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007329111685976386, - "rewards/tag_count_reward": 1.0, + "loss": 0.608, + "reward": 2.324834704399109, + "reward_std": 0.6106734275817871, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.022387592121958733, + "rewards/tag_count_reward": 0.9166666865348816, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 322.93751525878906, + "completion_length": 595.9166870117188, "epoch": 0.374, - "grad_norm": 12.095618376072478, - "kl": 1.328125, + "grad_norm": 34.92698420205304, + "kl": 4.8046875, "learning_rate": 8.093945422764069e-07, - "loss": 0.1833, - "reward": 2.659416437149048, - "reward_std": 0.3187124878168106, - "rewards/accuracy_reward": 0.6875000149011612, + "loss": 0.7281, + "reward": 2.6930631399154663, + "reward_std": 0.4513649195432663, + "rewards/accuracy_reward": 0.8125000298023224, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012458545621484518, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.015270282980054617, + "rewards/tag_count_reward": 0.8958333432674408, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 334.43751525878906, + "completion_length": 555.0000152587891, "epoch": 0.375, - "grad_norm": 29.105348822902826, - "kl": 1.04296875, + "grad_norm": 20.20783740449395, + "kl": 3.421875, "learning_rate": 8.081093963579707e-07, - "loss": -0.0011, - "reward": 2.7296379804611206, - "reward_std": 0.1736055612564087, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013417747803032398, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.6528, + "reward": 2.706920862197876, + "reward_std": 0.5360372513532639, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.01703750714659691, + "rewards/tag_count_reward": 0.9322916865348816, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 303.89583587646484, + "completion_length": 809.8125, "epoch": 0.376, - "grad_norm": 5.262771831358241, - "kl": 0.888671875, + "grad_norm": 42.45590838876723, + "kl": 8.359375, "learning_rate": 8.068211054579943e-07, - "loss": 0.0742, - "reward": 2.8428704738616943, - "reward_std": 0.2906629294157028, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016504602041095495, - "rewards/tag_count_reward": 0.984375, + "loss": 0.939, + "reward": 2.4243576526641846, + "reward_std": 0.6719434857368469, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.033975621685385704, + "rewards/tag_count_reward": 0.8750000298023224, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 234.77084350585938, + "completion_length": 618.2083435058594, "epoch": 0.377, - "grad_norm": 6.893334873917138, - "kl": 0.5498046875, + "grad_norm": 8.055288053209214, + "kl": 2.9609375, "learning_rate": 8.055296852738956e-07, - "loss": 0.0456, - "reward": 2.85650098323822, - "reward_std": 0.2420366182923317, - "rewards/accuracy_reward": 0.875, + "loss": 0.5277, + "reward": 2.515875458717346, + "reward_std": 0.6309832334518433, + "rewards/accuracy_reward": 0.6458333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.00634615495800972, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.018846786580979824, + "rewards/tag_count_reward": 0.8958333730697632, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 291.7916717529297, + "completion_length": 531.5625, "epoch": 0.378, - "grad_norm": 4.261736016512869, - "kl": 0.4951171875, + "grad_norm": 10.73266293127411, + "kl": 2.0234375, "learning_rate": 8.04235151541222e-07, - "loss": 0.0601, - "reward": 2.9579402208328247, - "reward_std": 0.09960247576236725, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014282059390097857, - "rewards/tag_count_reward": 1.0, + "loss": 0.5295, + "reward": 2.736036777496338, + "reward_std": 0.5330559611320496, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.012227283790707588, + "rewards/tag_count_reward": 0.9218750298023224, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 279.9166717529297, + "completion_length": 577.9166717529297, "epoch": 0.379, - "grad_norm": 22.521378985902814, - "kl": 0.806640625, + "grad_norm": 30.827479254698062, + "kl": 1.96484375, "learning_rate": 8.029375200334587e-07, - "loss": 0.3781, - "reward": 2.861446499824524, - "reward_std": 0.32641811668872833, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015289702918380499, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.4051, + "reward": 2.63044536113739, + "reward_std": 0.4179770350456238, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02059647301211953, + "rewards/tag_count_reward": 0.9427083432674408, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 254.27083587646484, + "completion_length": 571.625, "epoch": 0.38, - "grad_norm": 11.574906523808227, - "kl": 0.6015625, + "grad_norm": 11.564342505897628, + "kl": 0.927734375, "learning_rate": 8.01636806561836e-07, - "loss": 0.1322, - "reward": 2.908362627029419, - "reward_std": 0.28985975682735443, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.286, + "reward": 2.6674641370773315, + "reward_std": 0.36413097381591797, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008304051123559475, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.030452590435743332, + "rewards/tag_count_reward": 0.96875, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 257.1041793823242, + "completion_length": 627.7083587646484, "epoch": 0.381, - "grad_norm": 4.454428707075612, - "kl": 0.55859375, + "grad_norm": 20.79576339395751, + "kl": 2.859375, "learning_rate": 8.003330269751372e-07, - "loss": 0.0378, - "reward": 2.9804463386535645, - "reward_std": 0.050026701763272285, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007400926435366273, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.473, + "reward": 2.5600701570510864, + "reward_std": 0.5540246367454529, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.02152709010988474, + "rewards/tag_count_reward": 0.8802083432674408, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 353.25, + "completion_length": 549.8125305175781, "epoch": 0.382, - "grad_norm": 26.036197516019396, - "kl": 1.115234375, + "grad_norm": 5.584732514662178, + "kl": 0.400390625, "learning_rate": 7.990261971595048e-07, - "loss": 0.4664, - "reward": 2.5783530473709106, - "reward_std": 0.45483914017677307, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013660957105457783, - "rewards/tag_count_reward": 0.953125, + "loss": 0.037, + "reward": 2.806509017944336, + "reward_std": 0.29304996132850647, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.030296694487333298, + "rewards/tag_count_reward": 0.9895833432674408, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 342.83333587646484, + "completion_length": 804.3541870117188, "epoch": 0.383, - "grad_norm": 17.04219935805869, - "kl": 1.140625, + "grad_norm": 11.399359693772158, + "kl": 2.4296875, "learning_rate": 7.977163330382479e-07, - "loss": 0.3428, - "reward": 2.8301628828048706, - "reward_std": 0.23999100923538208, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011851201299577951, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.4973, + "reward": 2.5117881298065186, + "reward_std": 0.5285173058509827, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.028142395429313183, + "rewards/tag_count_reward": 0.8177083730697632, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 343.6666717529297, + "completion_length": 529.2291870117188, "epoch": 0.384, - "grad_norm": 11.187708212106639, - "kl": 1.671875, + "grad_norm": 15.184188235037553, + "kl": 1.04296875, "learning_rate": 7.964034505716476e-07, - "loss": 0.3868, - "reward": 2.833388090133667, - "reward_std": 0.3260490596294403, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01730638463050127, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.3124, + "reward": 2.5725693702697754, + "reward_std": 0.3802667409181595, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.015972374938428402, + "rewards/tag_count_reward": 0.9635416865348816, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 609.4166870117188, + "completion_length": 677.3333435058594, "epoch": 0.385, - "grad_norm": 20.468524308014135, - "kl": 6.421875, + "grad_norm": 6.513600915558159, + "kl": 2.9375, "learning_rate": 7.950875657567621e-07, - "loss": 0.5787, - "reward": 2.567959189414978, - "reward_std": 0.35809822380542755, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01016602711752057, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.5198, + "reward": 2.4022648334503174, + "reward_std": 0.6920067071914673, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0178740993142128, + "rewards/tag_count_reward": 0.84375, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 255.04167938232422, + "completion_length": 572.7708435058594, "epoch": 0.386, - "grad_norm": 13.647544624335573, - "kl": 1.419921875, + "grad_norm": 10.893931836031141, + "kl": 1.95703125, "learning_rate": 7.93768694627233e-07, - "loss": 0.0371, - "reward": 2.7234532833099365, - "reward_std": 0.16390015184879303, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010921777226030827, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4043, + "reward": 2.5226422548294067, + "reward_std": 0.502282902598381, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.022496800869703293, + "rewards/tag_count_reward": 0.90625, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 254.75001525878906, + "completion_length": 650.7083435058594, "epoch": 0.387, - "grad_norm": 9.622481923732613, - "kl": 1.3046875, + "grad_norm": 11.694285585261715, + "kl": 2.919921875, "learning_rate": 7.924468532530883e-07, - "loss": 0.3274, - "reward": 2.8835262060165405, - "reward_std": 0.16932791541330516, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.3373, + "reward": 2.491190791130066, + "reward_std": 0.4670180529356003, + "rewards/accuracy_reward": 0.645833358168602, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005362672731280327, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03311488404870033, + "rewards/tag_count_reward": 0.8854166865348816, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 303.875, + "completion_length": 834.1041870117188, "epoch": 0.388, - "grad_norm": 8.548662805697377, - "kl": 2.2734375, + "grad_norm": 11.93709525868574, + "kl": 5.203125, "learning_rate": 7.911220577405484e-07, - "loss": 0.2493, - "reward": 2.9338642358779907, - "reward_std": 0.15618407726287842, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015788794495165348, - "rewards/tag_count_reward": 0.984375, + "loss": 0.6908, + "reward": 2.1863056421279907, + "reward_std": 0.6824186444282532, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.020291661843657494, + "rewards/tag_count_reward": 0.7968750298023224, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 448.16668701171875, + "completion_length": 692.3333435058594, "epoch": 0.389, - "grad_norm": 14.69295282027617, - "kl": 6.5234375, + "grad_norm": 9.021930665026163, + "kl": 3.2265625, "learning_rate": 7.897943242318285e-07, - "loss": 0.908, - "reward": 2.3935057520866394, - "reward_std": 0.4311106204986572, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012744141276925802, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.5433, + "reward": 2.5233638286590576, + "reward_std": 0.5364340543746948, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.026983547024428844, + "rewards/tag_count_reward": 0.8697916865348816, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 305.2083435058594, + "completion_length": 705.8750305175781, "epoch": 0.39, - "grad_norm": 17.2734009221056, - "kl": 2.859375, + "grad_norm": 13.205610036016818, + "kl": 3.453125, "learning_rate": 7.884636689049422e-07, - "loss": 0.2772, - "reward": 2.5650092363357544, - "reward_std": 0.25631172955036163, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011379900854080915, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.76, + "reward": 2.53139591217041, + "reward_std": 0.6152072250843048, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.018951344303786755, + "rewards/tag_count_reward": 0.8281250298023224, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 230.97917938232422, + "completion_length": 733.6250305175781, "epoch": 0.391, - "grad_norm": 20.597174178428695, - "kl": 1.3359375, + "grad_norm": 7.373593323078646, + "kl": 3.265625, "learning_rate": 7.871301079735049e-07, - "loss": 0.508, - "reward": 2.6912208795547485, - "reward_std": 0.17258714139461517, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.0032235970720648766, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5816, + "reward": 2.51271653175354, + "reward_std": 0.5442458987236023, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.028950226493179798, + "rewards/tag_count_reward": 0.8541666865348816, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 361.8541717529297, + "completion_length": 897.1666870117188, "epoch": 0.392, - "grad_norm": 8.584409404449747, - "kl": 2.53515625, + "grad_norm": 24.00799478501999, + "kl": 6.640625, "learning_rate": 7.857936576865356e-07, - "loss": 0.3897, - "reward": 2.677772879600525, - "reward_std": 0.3170725256204605, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.01667182147502899, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.9816, + "reward": 2.3230772018432617, + "reward_std": 0.7363496124744415, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.013728468678891659, + "rewards/tag_count_reward": 0.6979166865348816, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 256.5208435058594, + "completion_length": 753.1666870117188, "epoch": 0.393, - "grad_norm": 17.445829937923943, - "kl": 1.240234375, + "grad_norm": 12.878430832163207, + "kl": 3.63671875, "learning_rate": 7.844543343282595e-07, - "loss": 0.4272, - "reward": 2.9350606203079224, - "reward_std": 0.15632967790588737, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.005911729473154992, - "rewards/tag_count_reward": 0.96875, + "loss": 0.7256, + "reward": 2.5792795419692993, + "reward_std": 0.5005324482917786, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026623360812664032, + "rewards/tag_count_reward": 0.8281250298023224, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 301.35418701171875, + "completion_length": 723.9791870117188, "epoch": 0.394, - "grad_norm": 10.602385167372207, - "kl": 1.3828125, + "grad_norm": 22.790620068840962, + "kl": 3.4140625, "learning_rate": 7.831121542179086e-07, - "loss": 0.3526, - "reward": 2.804153561592102, - "reward_std": 0.3132985904812813, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008346541319042444, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.794, + "reward": 2.396026372909546, + "reward_std": 0.5379204005002975, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02237648330628872, + "rewards/tag_count_reward": 0.8489583730697632, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 246.87500762939453, + "completion_length": 863.2291870117188, "epoch": 0.395, - "grad_norm": 17.112565449463606, - "kl": 0.79296875, + "grad_norm": 19.51867731167796, + "kl": 5.0625, "learning_rate": 7.817671337095244e-07, - "loss": 0.2047, - "reward": 2.9179338216781616, - "reward_std": 0.13691373681649566, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012621679343283176, - "rewards/tag_count_reward": 1.0, + "loss": 0.5991, + "reward": 2.26111102104187, + "reward_std": 0.6565420031547546, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02013904694467783, + "rewards/tag_count_reward": 0.7812500298023224, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 225.14583587646484, + "completion_length": 900.2708740234375, "epoch": 0.396, - "grad_norm": 12.071383569028448, - "kl": 1.6171875, + "grad_norm": 17.88490632655815, + "kl": 6.6171875, "learning_rate": 7.804192891917571e-07, - "loss": 0.3204, - "reward": 2.893914580345154, - "reward_std": 0.22353263571858406, - "rewards/accuracy_reward": 0.9375, + "loss": 0.7442, + "reward": 2.096524953842163, + "reward_std": 0.6609176695346832, + "rewards/accuracy_reward": 0.4375000149011612, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.003654828295111656, - "rewards/tag_count_reward": 0.9739583730697632, + "rewards/repetition_penalty_reward": -0.01458633691072464, + "rewards/tag_count_reward": 0.6875, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 310.8958435058594, + "completion_length": 727.7083435058594, "epoch": 0.397, - "grad_norm": 272.3827360029816, - "kl": 5.501953125, + "grad_norm": 7.902766105189536, + "kl": 3.4375, "learning_rate": 7.79068637087667e-07, - "loss": 1.2608, - "reward": 2.5784049034118652, - "reward_std": 0.38914042711257935, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008400833932682872, - "rewards/tag_count_reward": 0.96875, + "loss": 0.619, + "reward": 2.3182718753814697, + "reward_std": 0.6975450813770294, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.027214372530579567, + "rewards/tag_count_reward": 0.796875, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 284.0208435058594, + "completion_length": 923.0833435058594, "epoch": 0.398, - "grad_norm": 10.678052160225592, - "kl": 2.439453125, + "grad_norm": 8.813050784830676, + "kl": 4.828125, "learning_rate": 7.777151938545235e-07, - "loss": 0.2944, - "reward": 2.7569762468338013, - "reward_std": 0.34162452071905136, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012120912782847881, - "rewards/tag_count_reward": 0.984375, + "loss": 0.744, + "reward": 2.218664765357971, + "reward_std": 0.687986433506012, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.026126965880393982, + "rewards/tag_count_reward": 0.7239583730697632, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 304.1666717529297, + "completion_length": 731.9583435058594, "epoch": 0.399, - "grad_norm": 18.574868914365272, - "kl": 3.4609375, + "grad_norm": 17.369257892310884, + "kl": 3.0703125, "learning_rate": 7.763589759836058e-07, - "loss": 0.9017, - "reward": 2.83078134059906, - "reward_std": 0.37965691089630127, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.00428810523590073, - "rewards/tag_count_reward": 0.953125, + "loss": 0.5943, + "reward": 2.4915345907211304, + "reward_std": 0.6682489514350891, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.020618132315576077, + "rewards/tag_count_reward": 0.8177083432674408, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 345.31251525878906, + "completion_length": 669.4583435058594, "epoch": 0.4, - "grad_norm": 18.773027879774, - "kl": 3.7890625, + "grad_norm": 28.5217854549646, + "kl": 2.2109375, "learning_rate": 7.75e-07, - "loss": 0.2659, - "reward": 2.7037516832351685, - "reward_std": 0.38868679106235504, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.016734644304960966, - "rewards/tag_count_reward": 0.984375, + "loss": 0.495, + "reward": 2.6985886096954346, + "reward_std": 0.4976983517408371, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.027106057852506638, + "rewards/tag_count_reward": 0.8645833730697632, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 247.43751525878906, + "completion_length": 747.625, "epoch": 0.401, - "grad_norm": 11.066744261093644, - "kl": 2.24609375, + "grad_norm": 8.8439464948951, + "kl": 2.84375, "learning_rate": 7.736382824623999e-07, - "loss": 0.188, - "reward": 2.784433126449585, - "reward_std": 0.30970167368650436, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.010705627035349607, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.4971, + "reward": 2.371925711631775, + "reward_std": 0.5836665034294128, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02390767727047205, + "rewards/tag_count_reward": 0.8333333432674408, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 288.9583435058594, + "completion_length": 632.9791870117188, "epoch": 0.402, - "grad_norm": 21.83992473829985, - "kl": 2.37890625, + "grad_norm": 7.326080260643342, + "kl": 2.3515625, "learning_rate": 7.72273839962904e-07, - "loss": 0.151, - "reward": 2.62851345539093, - "reward_std": 0.2669844478368759, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.019056010991334915, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.4332, + "reward": 2.304866313934326, + "reward_std": 0.5863041877746582, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026731058955192566, + "rewards/tag_count_reward": 0.859375, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 254.50000762939453, + "completion_length": 852.3958740234375, "epoch": 0.403, - "grad_norm": 6.243676667747818, - "kl": 1.068359375, + "grad_norm": 12.84430423544328, + "kl": 4.61328125, "learning_rate": 7.709066891268133e-07, - "loss": 0.0598, - "reward": 2.6921645402908325, - "reward_std": 0.11518762912601233, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01096044760197401, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.5593, + "reward": 2.23075133562088, + "reward_std": 0.5370394438505173, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.029665381647646427, + "rewards/tag_count_reward": 0.7395833432674408, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 244.43750762939453, + "completion_length": 815.9375, "epoch": 0.404, - "grad_norm": 7.792073767279535, - "kl": 1.078125, + "grad_norm": 19.101834754723175, + "kl": 5.0, "learning_rate": 7.695368466124296e-07, - "loss": 0.1444, - "reward": 2.84457790851593, - "reward_std": 0.1354450937360525, - "rewards/accuracy_reward": 0.8541666865348816, + "loss": 0.5902, + "reward": 2.4460870027542114, + "reward_std": 0.5804774314165115, + "rewards/accuracy_reward": 0.6875, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009588914457708597, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0174547852948308, + "rewards/tag_count_reward": 0.7760416865348816, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 236.14584350585938, + "completion_length": 951.1458740234375, "epoch": 0.405, - "grad_norm": 5.3185713493671045, - "kl": 0.978515625, + "grad_norm": 11.760380958455592, + "kl": 6.1875, "learning_rate": 7.681643291108517e-07, - "loss": 0.0981, - "reward": 2.669191598892212, - "reward_std": 0.14930030331015587, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.014836111571639776, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8655, + "reward": 2.131704032421112, + "reward_std": 0.7297748029232025, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.017601476050913334, + "rewards/tag_count_reward": 0.7187500298023224, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 304.77083587646484, + "completion_length": 657.4375305175781, "epoch": 0.406, - "grad_norm": 9.31833590147285, - "kl": 1.470703125, + "grad_norm": 8.483281318881998, + "kl": 2.7734375, "learning_rate": 7.667891533457718e-07, - "loss": 0.197, - "reward": 2.6477270126342773, - "reward_std": 0.2534653600305319, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.011995228473097086, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.3648, + "reward": 2.377819776535034, + "reward_std": 0.6631312072277069, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.019749819301068783, + "rewards/tag_count_reward": 0.8072916865348816, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 330.75001525878906, + "completion_length": 532.6458587646484, "epoch": 0.407, - "grad_norm": 24.00739457089737, - "kl": 1.5302734375, + "grad_norm": 14.061905199941569, + "kl": 0.779296875, "learning_rate": 7.654113360732732e-07, - "loss": 0.6077, - "reward": 2.8755991458892822, - "reward_std": 0.3171325586736202, - "rewards/accuracy_reward": 0.9375, + "loss": 0.1726, + "reward": 2.7471961975097656, + "reward_std": 0.508865624666214, + "rewards/accuracy_reward": 0.8333333730697632, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018498200457543135, - "rewards/tag_count_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.027109289541840553, + "rewards/tag_count_reward": 0.9479166865348816, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 199.70834350585938, + "completion_length": 706.9583435058594, "epoch": 0.408, - "grad_norm": 9.599826256945484, - "kl": 0.5390625, + "grad_norm": 11.779388411702078, + "kl": 2.3046875, "learning_rate": 7.640308940816239e-07, - "loss": 0.0626, - "reward": 2.952404022216797, - "reward_std": 0.1330920085310936, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.005929233739152551, - "rewards/tag_count_reward": 1.0, + "loss": 0.4889, + "reward": 2.349950671195984, + "reward_std": 0.6774967312812805, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.014632808044552803, + "rewards/tag_count_reward": 0.8020833730697632, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 238.4375, + "completion_length": 765.8125305175781, "epoch": 0.409, - "grad_norm": 20.18559430087595, - "kl": 0.697265625, + "grad_norm": 15.586280796607687, + "kl": 3.109375, "learning_rate": 7.626478441910744e-07, - "loss": 0.1904, - "reward": 2.9667248725891113, - "reward_std": 0.07727183401584625, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.012441886123269796, - "rewards/tag_count_reward": 1.0, + "loss": 0.578, + "reward": 2.306677460670471, + "reward_std": 0.7071038484573364, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.016239337623119354, + "rewards/tag_count_reward": 0.7812500298023224, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 260.31251525878906, + "completion_length": 665.75, "epoch": 0.41, - "grad_norm": 16.106097999513374, - "kl": 1.30078125, + "grad_norm": 7.201254956040354, + "kl": 2.546875, "learning_rate": 7.612622032536507e-07, - "loss": 0.4696, - "reward": 2.4825315475463867, - "reward_std": 0.1622295081615448, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007051846478134394, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.3043, + "reward": 2.2321949005126953, + "reward_std": 0.5692197978496552, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.019541208632290363, + "rewards/tag_count_reward": 0.8281250298023224, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 226.6666717529297, + "completion_length": 557.1250305175781, "epoch": 0.411, - "grad_norm": 6.517847576483415, - "kl": 1.146484375, + "grad_norm": 12.200566811103256, + "kl": 1.060546875, "learning_rate": 7.59873988152951e-07, - "loss": 0.0413, - "reward": 2.6654216051101685, - "reward_std": 0.1699606329202652, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.008189610671252012, - "rewards/tag_count_reward": 1.0, + "loss": 0.2802, + "reward": 2.6842163801193237, + "reward_std": 0.292233943939209, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.031061380170285702, + "rewards/tag_count_reward": 0.9583333730697632, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 294.5, + "completion_length": 589.4375305175781, "epoch": 0.412, - "grad_norm": 13.646229012067966, - "kl": 2.75, + "grad_norm": 20.90214024265737, + "kl": 1.5390625, "learning_rate": 7.584832158039378e-07, - "loss": 0.3343, - "reward": 2.712114095687866, - "reward_std": 0.18451374024152756, - "rewards/accuracy_reward": 0.7500000298023224, + "loss": 0.2278, + "reward": 2.5838751792907715, + "reward_std": 0.5441871881484985, + "rewards/accuracy_reward": 0.6875, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006636099424213171, - "rewards/tag_count_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.03070823848247528, + "rewards/tag_count_reward": 0.9270833432674408, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 262.18751525878906, + "completion_length": 641.4791870117188, "epoch": 0.413, - "grad_norm": 6.69201053922183, - "kl": 0.740234375, + "grad_norm": 15.755392833494586, + "kl": 3.9609375, "learning_rate": 7.570899031527332e-07, - "loss": 0.0431, - "reward": 2.9610793590545654, - "reward_std": 0.07591623580083251, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.018087436445057392, - "rewards/tag_count_reward": 1.0, + "loss": 0.4663, + "reward": 2.525357723236084, + "reward_std": 0.607357531785965, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.0145728699862957, + "rewards/tag_count_reward": 0.8593750298023224, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 243.89584350585938, + "completion_length": 899.6250305175781, "epoch": 0.414, - "grad_norm": 7.993128205603922, - "kl": 0.64453125, + "grad_norm": 21.34036856462466, + "kl": 6.265625, "learning_rate": 7.556940671764124e-07, - "loss": 0.0699, - "reward": 2.989240884780884, - "reward_std": 0.012998570688068867, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010759284719824791, - "rewards/tag_count_reward": 1.0, + "loss": 0.7498, + "reward": 2.206377148628235, + "reward_std": 0.7690772414207458, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.021053357981145382, + "rewards/tag_count_reward": 0.7135416865348816, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 264.43751525878906, + "completion_length": 902.6250305175781, "epoch": 0.415, - "grad_norm": 16.430977334889395, - "kl": 2.078125, + "grad_norm": 14.645548610311325, + "kl": 5.703125, "learning_rate": 7.54295724882796e-07, - "loss": 0.4182, - "reward": 2.7359074354171753, - "reward_std": 0.11975578963756561, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017565005458891392, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.7326, + "reward": 2.3041797876358032, + "reward_std": 0.5700157135725021, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.020473050884902477, + "rewards/tag_count_reward": 0.7343750298023224, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 225.47917938232422, + "completion_length": 601.4375, "epoch": 0.416, - "grad_norm": 17.419905952579377, - "kl": 1.05859375, + "grad_norm": 9.196475410607844, + "kl": 1.935546875, "learning_rate": 7.528948933102438e-07, - "loss": 0.0044, - "reward": 2.7541191577911377, - "reward_std": 0.20759083330631256, + "loss": 0.3524, + "reward": 2.62975537776947, + "reward_std": 0.5219498127698898, "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006297626765444875, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.024758631363511086, + "rewards/tag_count_reward": 0.890625, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 323.47918701171875, + "completion_length": 541.1666870117188, "epoch": 0.417, - "grad_norm": 6.481118351227645, - "kl": 1.140625, + "grad_norm": 6.279974669254126, + "kl": 1.14453125, "learning_rate": 7.514915895274463e-07, - "loss": 0.1027, - "reward": 2.947334408760071, - "reward_std": 0.10196847002953291, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.024887832812964916, - "rewards/tag_count_reward": 1.0, + "loss": 0.2295, + "reward": 2.4813188314437866, + "reward_std": 0.307245125528425, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.022153427824378014, + "rewards/tag_count_reward": 0.96875, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 234.64584350585938, + "completion_length": 608.6666870117188, "epoch": 0.418, - "grad_norm": 10.82200038781914, - "kl": 1.072265625, + "grad_norm": 13.670357428378258, + "kl": 2.1640625, "learning_rate": 7.500858306332172e-07, - "loss": 0.103, - "reward": 2.95858895778656, - "reward_std": 0.0965597927570343, - "rewards/accuracy_reward": 0.9791666865348816, + "loss": 0.4449, + "reward": 2.4430500268936157, + "reward_std": 0.5130402147769928, + "rewards/accuracy_reward": 0.5625, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010160942561924458, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.015283319167792797, + "rewards/tag_count_reward": 0.8958333432674408, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 228.5, + "completion_length": 513.375, "epoch": 0.419, - "grad_norm": 7.669364077967064, - "kl": 1.23828125, + "grad_norm": 8.83072473507965, + "kl": 1.2265625, "learning_rate": 7.486776337562853e-07, - "loss": 0.1605, - "reward": 2.8517900705337524, - "reward_std": 0.1703078057616949, - "rewards/accuracy_reward": 0.875, + "loss": 0.2575, + "reward": 2.5676904916763306, + "reward_std": 0.4607061445713043, + "rewards/accuracy_reward": 0.6666666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007585049606859684, - "rewards/tag_count_reward": 0.9843750298023224, + "rewards/repetition_penalty_reward": -0.026059521362185478, + "rewards/tag_count_reward": 0.9270833432674408, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 364.0, + "completion_length": 527.9166717529297, "epoch": 0.42, - "grad_norm": 14.759754537499324, - "kl": 3.71875, + "grad_norm": 9.224829985446288, + "kl": 1.548828125, "learning_rate": 7.472670160550848e-07, - "loss": 0.8881, - "reward": 2.4980452060699463, - "reward_std": 0.4056100994348526, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.017579803243279457, + "loss": 0.1764, + "reward": 2.311278820037842, + "reward_std": 0.4501464366912842, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.023790807463228703, "rewards/tag_count_reward": 0.953125, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 300.43750762939453, + "completion_length": 707.0208435058594, "epoch": 0.421, - "grad_norm": 19.215958439968322, - "kl": 2.671875, + "grad_norm": 9.815342189617178, + "kl": 3.53125, "learning_rate": 7.458539947175473e-07, - "loss": 0.5943, - "reward": 2.8830156326293945, - "reward_std": 0.2273974046111107, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007609529187902808, - "rewards/tag_count_reward": 0.953125, + "loss": 0.7108, + "reward": 2.3707098960876465, + "reward_std": 0.5124354809522629, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.019915098324418068, + "rewards/tag_count_reward": 0.8697916865348816, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 403.7291717529297, + "completion_length": 500.8750305175781, "epoch": 0.422, - "grad_norm": 12.644744252137519, - "kl": 3.4921875, + "grad_norm": 5.188831712175436, + "kl": 1.2841796875, "learning_rate": 7.444385869608921e-07, - "loss": 0.3828, - "reward": 2.652065634727478, - "reward_std": 0.4860363006591797, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01286498922854662, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.2224, + "reward": 2.582339286804199, + "reward_std": 0.25641510635614395, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02529969811439514, + "rewards/tag_count_reward": 0.9479166865348816, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 333.25001525878906, + "completion_length": 470.8333435058594, "epoch": 0.423, - "grad_norm": 11.20878290976847, - "kl": 2.3671875, + "grad_norm": 8.589041831141525, + "kl": 0.720703125, "learning_rate": 7.430208100314156e-07, - "loss": 0.3149, - "reward": 2.755776882171631, - "reward_std": 0.43349410593509674, - "rewards/accuracy_reward": 0.8125, + "loss": 0.1102, + "reward": 2.710915446281433, + "reward_std": 0.390904039144516, + "rewards/accuracy_reward": 0.7708333730697632, "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.013320384547114372, - "rewards/tag_count_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.026931931264698505, + "rewards/tag_count_reward": 0.9739583432674408, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 283.3958435058594, + "completion_length": 489.8333435058594, "epoch": 0.424, - "grad_norm": 10.636459245130991, - "kl": 2.21875, + "grad_norm": 10.226503973140625, + "kl": 1.58984375, "learning_rate": 7.416006812042827e-07, - "loss": 0.4015, - "reward": 2.8386212587356567, - "reward_std": 0.2844984009861946, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.008601217065006495, + "loss": 0.3074, + "reward": 2.462380290031433, + "reward_std": 0.39044664800167084, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.016786448657512665, "rewards/tag_count_reward": 0.9791666865348816, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 404.93751525878906, + "completion_length": 642.6666870117188, "epoch": 0.425, - "grad_norm": 17.97141631212035, - "kl": 3.6796875, + "grad_norm": 14.863563634108994, + "kl": 4.3671875, "learning_rate": 7.401782177833147e-07, - "loss": 0.8092, - "reward": 2.8467488288879395, - "reward_std": 0.3366597145795822, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007417984306812286, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.7175, + "reward": 2.3808945417404175, + "reward_std": 0.6382828205823898, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03750831447541714, + "rewards/tag_count_reward": 0.890625, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 435.5625305175781, + "completion_length": 619.1666870117188, "epoch": 0.426, - "grad_norm": 17.37729426506878, - "kl": 3.875, + "grad_norm": 10.866155494400388, + "kl": 2.935546875, "learning_rate": 7.387534371007797e-07, - "loss": 1.1299, - "reward": 2.7906486988067627, - "reward_std": 0.4539821445941925, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.3152, + "reward": 2.6118627786636353, + "reward_std": 0.29477719962596893, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.021851486759260297, - "rewards/tag_count_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.028762279078364372, + "rewards/tag_count_reward": 0.9114583730697632, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 238.1041717529297, + "completion_length": 638.8333435058594, "epoch": 0.427, - "grad_norm": 11.698938755968873, - "kl": 1.275390625, + "grad_norm": 11.888028322568847, + "kl": 3.5625, "learning_rate": 7.373263565171805e-07, - "loss": 0.2203, - "reward": 2.948538661003113, - "reward_std": 0.1473785564303398, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00979476748034358, - "rewards/tag_count_reward": 1.0, + "loss": 0.3712, + "reward": 2.460385322570801, + "reward_std": 0.48997509479522705, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.025725997984409332, + "rewards/tag_count_reward": 0.8958333730697632, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 273.6875, + "completion_length": 579.4375, "epoch": 0.428, - "grad_norm": 7.90979453410796, - "kl": 2.236328125, + "grad_norm": 12.847512777948014, + "kl": 2.53515625, "learning_rate": 7.358969934210438e-07, - "loss": 0.2656, - "reward": 2.817924976348877, - "reward_std": 0.29837460815906525, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010199957992881536, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.6338, + "reward": 2.6187379360198975, + "reward_std": 0.53090900182724, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02362314984202385, + "rewards/tag_count_reward": 0.8854166865348816, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 420.22918701171875, + "completion_length": 483.70835876464844, "epoch": 0.429, - "grad_norm": 14.018561166957593, - "kl": 3.921875, + "grad_norm": 11.418979135114895, + "kl": 1.544921875, "learning_rate": 7.344653652287077e-07, - "loss": 0.7596, - "reward": 2.6629719734191895, - "reward_std": 0.43267548084259033, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012375576887279749, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.2723, + "reward": 2.708911895751953, + "reward_std": 0.45899534225463867, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02199092786759138, + "rewards/tag_count_reward": 0.953125, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 304.20833587646484, + "completion_length": 705.2083435058594, "epoch": 0.43, - "grad_norm": 16.084290973845988, - "kl": 3.51171875, + "grad_norm": 25.61496750803982, + "kl": 5.7578125, "learning_rate": 7.330314893841101e-07, - "loss": 0.3177, - "reward": 2.8543320894241333, - "reward_std": 0.27815040200948715, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008515107212588191, - "rewards/tag_count_reward": 0.953125, + "loss": 0.6231, + "reward": 2.4273844957351685, + "reward_std": 0.5663270652294159, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.02400454506278038, + "rewards/tag_count_reward": 0.8333333432674408, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 330.43751525878906, + "completion_length": 462.1666717529297, "epoch": 0.431, - "grad_norm": 32.49188981156383, - "kl": 4.9453125, + "grad_norm": 16.91868590026825, + "kl": 1.2421875, "learning_rate": 7.315953833585755e-07, - "loss": 0.9476, - "reward": 2.794061064720154, - "reward_std": 0.46746546030044556, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.009758562548086047, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1736, + "reward": 2.821496605873108, + "reward_std": 0.40228843688964844, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.020517424680292606, + "rewards/tag_count_reward": 0.9531250298023224, "step": 431 }, { "clip_ratio": 0.0, - "completion_length": 372.8958435058594, + "completion_length": 564.7708435058594, "epoch": 0.432, - "grad_norm": 8.305770651864805, - "kl": 2.7734375, + "grad_norm": 10.05316608237381, + "kl": 3.078125, "learning_rate": 7.301570646506027e-07, - "loss": 0.3665, - "reward": 2.5656776428222656, - "reward_std": 0.3194030672311783, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017655752133578062, - "rewards/tag_count_reward": 0.9375, + "loss": 0.7108, + "reward": 2.748389720916748, + "reward_std": 0.52855084836483, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02070755325257778, + "rewards/tag_count_reward": 0.9010416865348816, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 338.7916717529297, + "completion_length": 779.3333435058594, "epoch": 0.433, - "grad_norm": 11.195575939484351, - "kl": 2.060546875, + "grad_norm": 27.40142017382295, + "kl": 7.359375, "learning_rate": 7.287165507856512e-07, - "loss": 0.6233, - "reward": 2.7856305837631226, - "reward_std": 0.3946501612663269, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014716661535203457, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.8687, + "reward": 2.269049644470215, + "reward_std": 0.7071286737918854, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.015672642271965742, + "rewards/tag_count_reward": 0.75, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 233.62500762939453, + "completion_length": 671.7916870117188, "epoch": 0.434, - "grad_norm": 16.626958838382755, - "kl": 1.138671875, + "grad_norm": 21.75332490966816, + "kl": 4.8125, "learning_rate": 7.27273859315928e-07, - "loss": 0.2701, - "reward": 2.9222382307052612, - "reward_std": 0.1406011739745736, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004845138406381011, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6014, + "reward": 2.483176350593567, + "reward_std": 0.5736861526966095, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.02550439164042473, + "rewards/tag_count_reward": 0.8697916865348816, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 455.50001525878906, + "completion_length": 847.4375305175781, "epoch": 0.435, - "grad_norm": 29.675328876623947, - "kl": 3.0234375, + "grad_norm": 27.079037368491598, + "kl": 6.625, "learning_rate": 7.258290078201731e-07, - "loss": 1.1172, - "reward": 2.445867896080017, - "reward_std": 0.5425393283367157, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.012465503066778183, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.8844, + "reward": 1.987673282623291, + "reward_std": 0.6426667273044586, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027951689437031746, + "rewards/tag_count_reward": 0.7447916865348816, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 321.9791717529297, + "completion_length": 611.125, "epoch": 0.436, - "grad_norm": 25.371836632661847, - "kl": 1.87890625, + "grad_norm": 10.091665515824934, + "kl": 4.046875, "learning_rate": 7.243820139034464e-07, - "loss": 0.7732, - "reward": 2.633955955505371, - "reward_std": 0.36667102575302124, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.010141498409211636, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.6799, + "reward": 2.2479928731918335, + "reward_std": 0.6026458740234375, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.015896069817245007, + "rewards/tag_count_reward": 0.8125000298023224, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 258.5, + "completion_length": 750.5833740234375, "epoch": 0.437, - "grad_norm": 8.855478252795859, - "kl": 1.078125, + "grad_norm": 9.40390565894805, + "kl": 4.390625, "learning_rate": 7.229328951969115e-07, - "loss": 0.0244, - "reward": 2.6492111682891846, - "reward_std": 0.13810409186407924, + "loss": 0.7124, + "reward": 2.4180572032928467, + "reward_std": 0.6964816451072693, "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010510906344279647, - "rewards/tag_count_reward": 1.0, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01944286935031414, + "rewards/tag_count_reward": 0.7916666865348816, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 242.06250762939453, + "completion_length": 597.7500305175781, "epoch": 0.438, - "grad_norm": 11.963028453699632, - "kl": 1.2734375, + "grad_norm": 14.732573596337566, + "kl": 1.73828125, "learning_rate": 7.214816693576234e-07, - "loss": 0.1783, - "reward": 2.8399312496185303, - "reward_std": 0.17648932174779475, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0055551567347720265, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4652, + "reward": 2.5944327116012573, + "reward_std": 0.4662973880767822, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.021886682137846947, + "rewards/tag_count_reward": 0.8802083432674408, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 231.64584350585938, + "completion_length": 666.2083587646484, "epoch": 0.439, - "grad_norm": 21.671409471748014, - "kl": 1.236328125, + "grad_norm": 15.776670792271592, + "kl": 3.07421875, "learning_rate": 7.200283540683102e-07, - "loss": -0.032, - "reward": 2.598492741584778, - "reward_std": 0.27336232364177704, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005674049956724048, - "rewards/tag_count_reward": 1.0, + "loss": 0.5074, + "reward": 2.3149194717407227, + "reward_std": 0.568027138710022, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02014992106705904, + "rewards/tag_count_reward": 0.8072916865348816, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 225.7291717529297, + "completion_length": 579.7500305175781, "epoch": 0.44, - "grad_norm": 21.828812504472463, - "kl": 1.46875, + "grad_norm": 13.110846882696643, + "kl": 1.84765625, "learning_rate": 7.185729670371604e-07, - "loss": 0.3567, - "reward": 2.6878061294555664, - "reward_std": 0.15182771906256676, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004902319284155965, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.3424, + "reward": 2.5865591764450073, + "reward_std": 0.5619917958974838, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024551907554268837, + "rewards/tag_count_reward": 0.8958333432674408, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 404.8125, + "completion_length": 691.2916870117188, "epoch": 0.441, - "grad_norm": 27.648451934044843, - "kl": 5.078125, + "grad_norm": 13.083680161708491, + "kl": 2.3515625, "learning_rate": 7.171155259976057e-07, - "loss": 0.7272, - "reward": 2.6540826559066772, - "reward_std": 0.44992175698280334, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.014320255257189274, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.5228, + "reward": 2.60994291305542, + "reward_std": 0.5069815963506699, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.034154389053583145, + "rewards/tag_count_reward": 0.9010416865348816, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 249.12500762939453, + "completion_length": 646.6041870117188, "epoch": 0.442, - "grad_norm": 10.419892933320272, - "kl": 1.8359375, + "grad_norm": 50.887150157363536, + "kl": 4.890625, "learning_rate": 7.156560487081051e-07, - "loss": 0.42, - "reward": 2.8949460983276367, - "reward_std": 0.21132793929427862, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.006095779070165008, - "rewards/tag_count_reward": 0.984375, + "loss": 0.9085, + "reward": 2.2685351371765137, + "reward_std": 0.7841026484966278, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03354842588305473, + "rewards/tag_count_reward": 0.7812500298023224, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 184.39583587646484, + "completion_length": 755.8750305175781, "epoch": 0.443, - "grad_norm": 14.540377312001233, - "kl": 0.873046875, + "grad_norm": 15.956258650623388, + "kl": 5.609375, "learning_rate": 7.141945529519288e-07, - "loss": 0.1231, - "reward": 2.993991494178772, - "reward_std": 0.011944914236664772, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006008585449308157, - "rewards/tag_count_reward": 1.0, + "loss": 0.7977, + "reward": 2.418252944946289, + "reward_std": 0.683345377445221, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.027927756309509277, + "rewards/tag_count_reward": 0.828125, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 271.25000762939453, + "completion_length": 743.5416870117188, "epoch": 0.444, - "grad_norm": 17.09004745008883, - "kl": 1.48046875, + "grad_norm": 24.6724782512694, + "kl": 7.890625, "learning_rate": 7.127310565369415e-07, - "loss": 0.33, - "reward": 2.7574501037597656, - "reward_std": 0.19106110045686364, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.013383325655013323, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.0148, + "reward": 2.227971076965332, + "reward_std": 0.8724583983421326, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.04980688448995352, + "rewards/tag_count_reward": 0.7500000298023224, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 247.1666717529297, + "completion_length": 617.1458435058594, "epoch": 0.445, - "grad_norm": 11.595192617068363, - "kl": 2.318359375, + "grad_norm": 18.521282427965453, + "kl": 4.4140625, "learning_rate": 7.11265577295385e-07, - "loss": 0.2551, - "reward": 2.9505834579467773, - "reward_std": 0.11514858528971672, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011222275323234499, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.5287, + "reward": 2.4945348501205444, + "reward_std": 0.5981617867946625, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.01761806895956397, + "rewards/tag_count_reward": 0.8385416865348816, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 286.9166793823242, + "completion_length": 732.5625305175781, "epoch": 0.446, - "grad_norm": 18.680897407222055, - "kl": 3.734375, + "grad_norm": 34.24653385750582, + "kl": 6.484375, "learning_rate": 7.097981330836616e-07, - "loss": 0.2912, - "reward": 2.6779263019561768, - "reward_std": 0.36124061048030853, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009573756018653512, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7675, + "reward": 1.965806484222412, + "reward_std": 0.6163901686668396, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.04113807622343302, + "rewards/tag_count_reward": 0.75, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 257.12500762939453, + "completion_length": 672.2708435058594, "epoch": 0.447, - "grad_norm": 11.804832877485316, - "kl": 2.453125, + "grad_norm": 15.152108234478831, + "kl": 5.34375, "learning_rate": 7.083287417821157e-07, - "loss": 0.2602, - "reward": 2.763852119445801, - "reward_std": 0.1756526604294777, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006981401238590479, - "rewards/tag_count_reward": 1.0, + "loss": 0.7173, + "reward": 2.316531181335449, + "reward_std": 0.5732046067714691, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.018538246862590313, + "rewards/tag_count_reward": 0.8281250298023224, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 266.0416793823242, + "completion_length": 730.1875305175781, "epoch": 0.448, - "grad_norm": 12.30412227520385, - "kl": 1.57421875, + "grad_norm": 9.314353876415918, + "kl": 4.265625, "learning_rate": 7.068574212948169e-07, - "loss": 0.2714, - "reward": 2.898730993270874, - "reward_std": 0.20964787900447845, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0109912040643394, - "rewards/tag_count_reward": 1.0, + "loss": 0.7717, + "reward": 2.5467538833618164, + "reward_std": 0.5365928113460541, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.03137107007205486, + "rewards/tag_count_reward": 0.8281250298023224, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 187.8541717529297, + "completion_length": 650.6875305175781, "epoch": 0.449, - "grad_norm": 11.066113037706055, - "kl": 1.23046875, + "grad_norm": 46.24553233658842, + "kl": 2.59375, "learning_rate": 7.053841895493406e-07, - "loss": 0.1982, - "reward": 2.977561354637146, - "reward_std": 0.04039582028053701, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0016053106519393623, - "rewards/tag_count_reward": 1.0, + "loss": 0.778, + "reward": 2.618269443511963, + "reward_std": 0.6291141211986542, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.029300065711140633, + "rewards/tag_count_reward": 0.8906250298023224, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 205.50000762939453, + "completion_length": 628.1458435058594, "epoch": 0.45, - "grad_norm": 8.214840475178983, - "kl": 0.990234375, + "grad_norm": 22.01768222586756, + "kl": 1.95703125, "learning_rate": 7.039090644965509e-07, - "loss": 0.0391, - "reward": 2.969719409942627, - "reward_std": 0.07768429722636938, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009447223274037242, - "rewards/tag_count_reward": 1.0, + "loss": 0.4547, + "reward": 2.60259747505188, + "reward_std": 0.5832863450050354, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02934711705893278, + "rewards/tag_count_reward": 0.8958333432674408, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 355.4166717529297, + "completion_length": 517.1458435058594, "epoch": 0.451, - "grad_norm": 19.276343643182592, - "kl": 3.53125, + "grad_norm": 33.402974244613176, + "kl": 1.6484375, "learning_rate": 7.024320641103811e-07, - "loss": 0.382, - "reward": 2.527764320373535, - "reward_std": 0.3032260835170746, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01390251237899065, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4482, + "reward": 2.561974287033081, + "reward_std": 0.5507599115371704, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.017886925488710403, + "rewards/tag_count_reward": 0.90625, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 251.81251525878906, + "completion_length": 501.41668701171875, "epoch": 0.452, - "grad_norm": 12.253339596009525, - "kl": 2.25, + "grad_norm": 12.388889897731671, + "kl": 1.1015625, "learning_rate": 7.009532063876148e-07, - "loss": 0.5179, - "reward": 2.908603549003601, - "reward_std": 0.24857314629480243, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.006326982693281025, - "rewards/tag_count_reward": 0.984375, + "loss": -0.028, + "reward": 2.4829607009887695, + "reward_std": 0.4351721853017807, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.022247745655477047, + "rewards/tag_count_reward": 0.921875, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 323.4583435058594, + "completion_length": 663.8125305175781, "epoch": 0.453, - "grad_norm": 17.217793364184836, - "kl": 2.56640625, + "grad_norm": 8.122371694455087, + "kl": 3.46875, "learning_rate": 6.994725093476664e-07, - "loss": 0.4318, - "reward": 2.7413212060928345, - "reward_std": 0.3314635679125786, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012150994269177318, - "rewards/tag_count_reward": 0.96875, + "loss": 0.586, + "reward": 2.5677295923233032, + "reward_std": 0.5909285247325897, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.0173398619517684, + "rewards/tag_count_reward": 0.8697916865348816, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 189.52083587646484, + "completion_length": 724.4375457763672, "epoch": 0.454, - "grad_norm": 12.078609668023535, - "kl": 1.673828125, + "grad_norm": 24.750328325907926, + "kl": 3.8046875, "learning_rate": 6.979899910323624e-07, - "loss": 0.2312, - "reward": 2.9633864164352417, - "reward_std": 0.07754162582568824, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.0018913548556156456, - "rewards/tag_count_reward": 1.0, + "loss": 0.4282, + "reward": 2.418397545814514, + "reward_std": 0.5793856680393219, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022574756294488907, + "rewards/tag_count_reward": 0.8229166865348816, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 169.8125, + "completion_length": 670.1875, "epoch": 0.455, - "grad_norm": 15.920796040915208, - "kl": 0.76171875, + "grad_norm": 32.10532320367166, + "kl": 2.8515625, "learning_rate": 6.965056695057204e-07, - "loss": 0.0464, - "reward": 2.927971601486206, - "reward_std": 0.173157200217247, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.002584027766715735, - "rewards/tag_count_reward": 1.0, + "loss": 0.5623, + "reward": 2.5078253746032715, + "reward_std": 0.7500589489936829, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.04252194054424763, + "rewards/tag_count_reward": 0.8281250298023224, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 253.375, + "completion_length": 769.7083435058594, "epoch": 0.456, - "grad_norm": 7.995068686055259, - "kl": 1.4140625, + "grad_norm": 35.5370005747395, + "kl": 5.8203125, "learning_rate": 6.950195628537299e-07, - "loss": 0.0637, - "reward": 2.8680626153945923, - "reward_std": 0.2435438260436058, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.7239, + "reward": 2.236100912094116, + "reward_std": 0.6354574412107468, + "rewards/accuracy_reward": 0.4791666865348816, "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.0173541740514338, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.024315819144248962, + "rewards/tag_count_reward": 0.8020833432674408, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 312.2916717529297, + "completion_length": 598.4166870117188, "epoch": 0.457, - "grad_norm": 10.99706180411646, - "kl": 2.8515625, + "grad_norm": 15.54748803668102, + "kl": 2.38671875, "learning_rate": 6.935316891841315e-07, - "loss": 0.4685, - "reward": 2.8859946727752686, - "reward_std": 0.2174585685133934, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01678317505866289, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5143, + "reward": 2.461942434310913, + "reward_std": 0.5678964406251907, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02764105051755905, + "rewards/tag_count_reward": 0.8645833432674408, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 292.95833587646484, + "completion_length": 687.0208740234375, "epoch": 0.458, - "grad_norm": 10.30490284856631, - "kl": 1.52734375, + "grad_norm": 9.674950437917838, + "kl": 3.5078125, "learning_rate": 6.920420666261961e-07, - "loss": 0.398, - "reward": 2.85658597946167, - "reward_std": 0.2730413416866213, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.009733541868627071, - "rewards/tag_count_reward": 0.984375, + "loss": 0.525, + "reward": 2.373473048210144, + "reward_std": 0.6431159377098083, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02409642282873392, + "rewards/tag_count_reward": 0.8281250298023224, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 274.7291793823242, + "completion_length": 597.1666870117188, "epoch": 0.459, - "grad_norm": 15.714034242580542, - "kl": 2.890625, + "grad_norm": 11.267949423552796, + "kl": 2.6328125, "learning_rate": 6.905507133305047e-07, - "loss": 0.3013, - "reward": 2.988945960998535, - "reward_std": 0.006963400868698955, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01105411583557725, - "rewards/tag_count_reward": 1.0, + "loss": 0.5289, + "reward": 2.4548171758651733, + "reward_std": 0.5164096057415009, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.017405156511813402, + "rewards/tag_count_reward": 0.8750000298023224, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 319.1666793823242, + "completion_length": 624.125, "epoch": 0.46, - "grad_norm": 12.20715746544976, - "kl": 3.46875, + "grad_norm": 11.88684382845764, + "kl": 3.703125, "learning_rate": 6.890576474687263e-07, - "loss": 0.4873, - "reward": 2.765836477279663, - "reward_std": 0.2891187369823456, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01541365310549736, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6898, + "reward": 2.5472575426101685, + "reward_std": 0.6651158332824707, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.016978577245026827, + "rewards/tag_count_reward": 0.8489583730697632, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 303.20833587646484, + "completion_length": 752.25, "epoch": 0.461, - "grad_norm": 38.62287019446842, - "kl": 4.25, + "grad_norm": 14.448078905297072, + "kl": 5.765625, "learning_rate": 6.875628872333975e-07, - "loss": 0.3742, - "reward": 2.554042100906372, - "reward_std": 0.396420918405056, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.004985676612704992, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.7562, + "reward": 2.2633214592933655, + "reward_std": 0.6216214001178741, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02660918142646551, + "rewards/tag_count_reward": 0.7968750298023224, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 267.1666717529297, + "completion_length": 708.3958740234375, "epoch": 0.462, - "grad_norm": 30.51307885830115, - "kl": 3.515625, + "grad_norm": 20.449164875872643, + "kl": 7.078125, "learning_rate": 6.860664508377001e-07, - "loss": 0.4237, - "reward": 2.764556646347046, - "reward_std": 0.3790958672761917, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011485280469059944, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7331, + "reward": 2.1556172370910645, + "reward_std": 0.7438893914222717, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01799399685114622, + "rewards/tag_count_reward": 0.7291666865348816, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 269.1458435058594, + "completion_length": 650.5000305175781, "epoch": 0.463, - "grad_norm": 28.719414086512987, - "kl": 2.4921875, + "grad_norm": 14.487854173063429, + "kl": 4.7265625, "learning_rate": 6.84568356515239e-07, - "loss": 0.492, - "reward": 2.834352970123291, - "reward_std": 0.41617171466350555, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.011133111780509353, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.818, + "reward": 2.314304828643799, + "reward_std": 0.6960805356502533, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.025973046198487282, + "rewards/tag_count_reward": 0.8125000298023224, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 326.2916717529297, + "completion_length": 727.3125, "epoch": 0.464, - "grad_norm": 41.380165153751626, - "kl": 2.55078125, + "grad_norm": 26.696071790504657, + "kl": 7.0625, "learning_rate": 6.83068622519821e-07, - "loss": 0.544, - "reward": 2.8248093128204346, - "reward_std": 0.28381817042827606, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008524150121957064, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8028, + "reward": 2.3517472743988037, + "reward_std": 0.6738306879997253, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02498907968401909, + "rewards/tag_count_reward": 0.7864583432674408, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 230.18750762939453, + "completion_length": 644.5000305175781, "epoch": 0.465, - "grad_norm": 29.64089972611228, - "kl": 1.26953125, + "grad_norm": 13.62258323479659, + "kl": 3.65625, "learning_rate": 6.815672671252315e-07, - "loss": 0.209, - "reward": 2.735522150993347, - "reward_std": 0.31543679535388947, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0092695722123608, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8709, + "reward": 2.5122939348220825, + "reward_std": 0.6799641847610474, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0224284203723073, + "rewards/tag_count_reward": 0.8541666865348816, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 244.2916717529297, + "completion_length": 696.3125305175781, "epoch": 0.466, - "grad_norm": 15.73699488805263, - "kl": 1.05078125, + "grad_norm": 16.003897480590176, + "kl": 4.6328125, "learning_rate": 6.800643086250121e-07, - "loss": 0.0851, - "reward": 2.8762727975845337, - "reward_std": 0.1973465383052826, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012616324238479137, - "rewards/tag_count_reward": 1.0, + "loss": 0.8121, + "reward": 2.034106969833374, + "reward_std": 0.537964329123497, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.021448652260005474, + "rewards/tag_count_reward": 0.8125000298023224, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 395.12501525878906, + "completion_length": 834.3333740234375, "epoch": 0.467, - "grad_norm": 109.3348899491789, - "kl": 6.359375, + "grad_norm": 78.01009343549657, + "kl": 9.53125, "learning_rate": 6.78559765332238e-07, - "loss": 1.2095, - "reward": 2.626768112182617, - "reward_std": 0.4595881998538971, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.01038473192602396, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.9044, + "reward": 1.9304699301719666, + "reward_std": 0.6387233734130859, + "rewards/accuracy_reward": 0.3125, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02786353789269924, + "rewards/tag_count_reward": 0.6666666865348816, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 304.8125, + "completion_length": 653.5000305175781, "epoch": 0.468, - "grad_norm": 17.611421298445574, - "kl": 2.3359375, + "grad_norm": 26.98679371952424, + "kl": 4.2109375, "learning_rate": 6.770536555792944e-07, - "loss": 0.4842, - "reward": 2.794470429420471, - "reward_std": 0.399617999792099, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0249741836450994, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.8219, + "reward": 2.4220420122146606, + "reward_std": 0.6534742116928101, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.018930173479020596, + "rewards/tag_count_reward": 0.84375, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 304.4375, + "completion_length": 627.3750305175781, "epoch": 0.469, - "grad_norm": 16.326515615230953, - "kl": 3.375, + "grad_norm": 149.59812362533228, + "kl": 6.53125, "learning_rate": 6.755459977176532e-07, - "loss": 0.6251, - "reward": 2.8299753665924072, - "reward_std": 0.4067747890949249, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.005094195483252406, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.8048, + "reward": 2.1985827684402466, + "reward_std": 0.603963702917099, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.03405630309134722, + "rewards/tag_count_reward": 0.8229166865348816, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 355.56251525878906, + "completion_length": 470.20835876464844, "epoch": 0.47, - "grad_norm": 15.89185600106967, - "kl": 4.3125, + "grad_norm": 17.189852933380237, + "kl": 1.8984375, "learning_rate": 6.740368101176495e-07, - "loss": 0.5398, - "reward": 2.787715196609497, - "reward_std": 0.18601080728694797, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012632016558200121, - "rewards/tag_count_reward": 0.953125, + "loss": 0.2061, + "reward": 2.4457980394363403, + "reward_std": 0.44828473031520844, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.01774365920573473, + "rewards/tag_count_reward": 0.9218750298023224, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 344.8958435058594, + "completion_length": 547.5625, "epoch": 0.471, - "grad_norm": 14.774842647953179, - "kl": 4.875, + "grad_norm": 15.50803120857332, + "kl": 2.32421875, "learning_rate": 6.725261111682584e-07, - "loss": 1.0535, - "reward": 2.855059504508972, - "reward_std": 0.3954618573188782, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9791667461395264, - "rewards/repetition_penalty_reward": -0.014732431387528777, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.4077, + "reward": 2.327059745788574, + "reward_std": 0.5171791017055511, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.023634711280465126, + "rewards/tag_count_reward": 0.8854166865348816, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 433.37501525878906, + "completion_length": 642.6666870117188, "epoch": 0.472, - "grad_norm": 26.75576523716219, - "kl": 4.875, + "grad_norm": 21.870962603035192, + "kl": 3.46875, "learning_rate": 6.710139192768694e-07, - "loss": 0.7723, - "reward": 2.653918743133545, - "reward_std": 0.3631540536880493, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016220235731452703, - "rewards/tag_count_reward": 0.9479166865348816, - "step": 472 + "loss": 0.7413, + "reward": 2.383070707321167, + "reward_std": 0.5808857977390289, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02838774584233761, + "rewards/tag_count_reward": 0.8489583432674408, + "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 380.27083587646484, + "completion_length": 642.8958435058594, "epoch": 0.473, - "grad_norm": 27.581830903427907, - "kl": 6.1875, + "grad_norm": 10.842744450293143, + "kl": 3.8203125, "learning_rate": 6.695002528690639e-07, - "loss": 0.4787, - "reward": 2.801064968109131, - "reward_std": 0.29212646931409836, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.014907423872500658, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.6264, + "reward": 2.4516608715057373, + "reward_std": 0.5648539513349533, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.024033674970269203, + "rewards/tag_count_reward": 0.8020833432674408, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 362.125, + "completion_length": 724.2708435058594, "epoch": 0.474, - "grad_norm": 14.79478361269408, - "kl": 2.7734375, + "grad_norm": 29.595685417395913, + "kl": 5.6640625, "learning_rate": 6.679851303883891e-07, - "loss": 0.5538, - "reward": 2.8165420293807983, - "reward_std": 0.33146366477012634, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.021999629214406013, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.4416, + "reward": 2.2453513741493225, + "reward_std": 0.5234881341457367, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.020273788832128048, + "rewards/tag_count_reward": 0.7656250298023224, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 377.9375, + "completion_length": 648.6458435058594, "epoch": 0.475, - "grad_norm": 11.893689335159557, - "kl": 4.109375, + "grad_norm": 12.791295320250482, + "kl": 4.8125, "learning_rate": 6.664685702961344e-07, - "loss": 0.6569, - "reward": 2.879771113395691, - "reward_std": 0.3006248101592064, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009117891313508153, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.6559, + "reward": 2.5282609462738037, + "reward_std": 0.6692875623703003, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.030766917392611504, + "rewards/tag_count_reward": 0.8437500298023224, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 408.2083435058594, + "completion_length": 548.0833435058594, "epoch": 0.476, - "grad_norm": 18.18403338652072, - "kl": 5.140625, + "grad_norm": 14.816045187511754, + "kl": 2.8046875, "learning_rate": 6.649505910711058e-07, - "loss": 1.2212, - "reward": 2.753008008003235, - "reward_std": 0.47393013536930084, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01956150634214282, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.4964, + "reward": 2.352592945098877, + "reward_std": 0.5124609172344208, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03282391466200352, + "rewards/tag_count_reward": 0.8854166865348816, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 405.0208435058594, + "completion_length": 653.0208435058594, "epoch": 0.477, - "grad_norm": 27.373071605343785, - "kl": 6.078125, + "grad_norm": 17.711933807676367, + "kl": 4.5625, "learning_rate": 6.634312112094013e-07, - "loss": 1.0244, - "reward": 2.728609800338745, - "reward_std": 0.48706677556037903, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010973602533340454, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.5825, + "reward": 2.517646074295044, + "reward_std": 0.46930187940597534, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.020548363216221333, + "rewards/tag_count_reward": 0.8645833432674408, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 249.33334350585938, + "completion_length": 569.8541870117188, "epoch": 0.478, - "grad_norm": 15.083408724555492, - "kl": 0.87109375, + "grad_norm": 15.060185969581552, + "kl": 2.9296875, "learning_rate": 6.619104492241847e-07, - "loss": 0.1685, - "reward": 2.567692518234253, - "reward_std": 0.14475586265325546, + "loss": 0.6103, + "reward": 2.419161558151245, + "reward_std": 0.5570693910121918, "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010432600043714046, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01833842322230339, + "rewards/tag_count_reward": 0.8750000298023224, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 278.7916793823242, + "completion_length": 820.8541870117188, "epoch": 0.479, - "grad_norm": 15.230261719124337, - "kl": 2.30078125, + "grad_norm": 67.25826820013505, + "kl": 5.484375, "learning_rate": 6.603883236454612e-07, - "loss": 0.5105, - "reward": 2.8713775873184204, - "reward_std": 0.33212073147296906, - "rewards/accuracy_reward": 0.9166666865348816, + "loss": 0.759, + "reward": 2.107418715953827, + "reward_std": 0.7182769775390625, + "rewards/accuracy_reward": 0.4375000298023224, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.010567050776444376, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.050567377358675, + "rewards/tag_count_reward": 0.7343750298023224, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 391.5208435058594, + "completion_length": 534.2291870117188, "epoch": 0.48, - "grad_norm": 10.357252958419537, - "kl": 3.4140625, + "grad_norm": 19.548359681394896, + "kl": 2.03515625, "learning_rate": 6.588648530198504e-07, - "loss": 0.4834, - "reward": 2.718873381614685, - "reward_std": 0.28838878870010376, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01897380780428648, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.5902, + "reward": 2.236539840698242, + "reward_std": 0.47467684745788574, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.03082139603793621, + "rewards/tag_count_reward": 0.9062500298023224, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 313.1458435058594, + "completion_length": 557.5833587646484, "epoch": 0.481, - "grad_norm": 25.521863668265123, - "kl": 2.34375, + "grad_norm": 27.83836565318464, + "kl": 2.5, "learning_rate": 6.573400559103613e-07, - "loss": 0.5594, - "reward": 2.433735966682434, - "reward_std": 0.2864895761013031, - "rewards/accuracy_reward": 0.47916667722165585, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012444715015590191, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.6866, + "reward": 2.6256160736083984, + "reward_std": 0.6559298038482666, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.018481258302927017, + "rewards/tag_count_reward": 0.8802083432674408, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 321.5833435058594, + "completion_length": 544.4791870117188, "epoch": 0.482, - "grad_norm": 16.284045938467507, - "kl": 2.5078125, + "grad_norm": 16.75197360847501, + "kl": 2.359375, "learning_rate": 6.558139508961654e-07, - "loss": 0.5418, - "reward": 2.869401454925537, - "reward_std": 0.24261815659701824, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016015302389860153, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.4691, + "reward": 2.4990181922912598, + "reward_std": 0.5695154368877411, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02528748568147421, + "rewards/tag_count_reward": 0.9062500298023224, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 291.375, + "completion_length": 513.3125152587891, "epoch": 0.483, - "grad_norm": 36.736071740062556, - "kl": 2.7421875, + "grad_norm": 11.150322642215151, + "kl": 3.21875, "learning_rate": 6.542865565723707e-07, - "loss": 0.6645, - "reward": 2.9086687564849854, - "reward_std": 0.22905682167038321, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01320632640272379, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.6304, + "reward": 2.340551257133484, + "reward_std": 0.5843985080718994, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.02229604311287403, + "rewards/tag_count_reward": 0.8697916865348816, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 525.5208435058594, + "completion_length": 555.0, "epoch": 0.484, - "grad_norm": 32.813299806674735, - "kl": 6.921875, + "grad_norm": 25.161999743535905, + "kl": 4.7734375, "learning_rate": 6.527578915497951e-07, - "loss": 0.9861, - "reward": 2.333793520927429, - "reward_std": 0.6775176525115967, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.02384539693593979, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.7346, + "reward": 2.534891724586487, + "reward_std": 0.6275463998317719, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.029344591312110424, + "rewards/tag_count_reward": 0.8281250298023224, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 335.5416717529297, + "completion_length": 780.125, "epoch": 0.485, - "grad_norm": 11.224518053503527, - "kl": 3.673828125, + "grad_norm": 51.74758202769363, + "kl": 10.09375, "learning_rate": 6.512279744547392e-07, - "loss": 0.548, - "reward": 2.8074876070022583, - "reward_std": 0.2918330039829016, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.010220712749287486, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.9894, + "reward": 1.9237075448036194, + "reward_std": 0.6222628057003021, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01726479548960924, + "rewards/tag_count_reward": 0.6979166865348816, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 361.54168701171875, + "completion_length": 694.1875305175781, "epoch": 0.486, - "grad_norm": 12.584296670452938, - "kl": 2.6484375, + "grad_norm": 26.567315708037814, + "kl": 5.890625, "learning_rate": 6.496968239287603e-07, - "loss": 0.4334, - "reward": 2.722707986831665, - "reward_std": 0.34765733778476715, - "rewards/accuracy_reward": 0.7916666865348816, + "loss": 0.7402, + "reward": 2.210293173789978, + "reward_std": 0.7222527861595154, + "rewards/accuracy_reward": 0.4791666865348816, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02381980326026678, - "rewards/tag_count_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.02060973085463047, + "rewards/tag_count_reward": 0.7656250298023224, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 451.22918701171875, + "completion_length": 701.7291870117188, "epoch": 0.487, - "grad_norm": 16.438033111437058, - "kl": 5.2734375, + "grad_norm": 23.60086720039633, + "kl": 5.703125, "learning_rate": 6.481644586284442e-07, - "loss": 0.9588, - "reward": 2.72256863117218, - "reward_std": 0.3559862896800041, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.027431411668658257, - "rewards/tag_count_reward": 0.9375, + "loss": 0.8466, + "reward": 2.353806734085083, + "reward_std": 0.6038880944252014, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03334605507552624, + "rewards/tag_count_reward": 0.8177083432674408, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 184.33334350585938, + "completion_length": 748.4583435058594, "epoch": 0.488, - "grad_norm": 9.244410387486095, - "kl": 1.40234375, + "grad_norm": 19.03851418441834, + "kl": 6.2734375, "learning_rate": 6.466308972251785e-07, - "loss": 0.0326, - "reward": 2.916001558303833, - "reward_std": 0.21936041116714478, - "rewards/accuracy_reward": 0.9583333730697632, + "loss": 1.0066, + "reward": 2.5121508836746216, + "reward_std": 0.7225759625434875, + "rewards/accuracy_reward": 0.75, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.009345855098217726, - "rewards/tag_count_reward": 0.9947916865348816, + "rewards/repetition_penalty_reward": -0.038196416571736336, + "rewards/tag_count_reward": 0.8281250298023224, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 459.16668701171875, + "completion_length": 558.0416870117188, "epoch": 0.489, - "grad_norm": 14.539758029710416, - "kl": 2.642578125, + "grad_norm": 12.839961288051784, + "kl": 3.6796875, "learning_rate": 6.45096158404925e-07, - "loss": 0.3171, - "reward": 2.3459954261779785, - "reward_std": 0.3811195343732834, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.04115736857056618, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.7412, + "reward": 2.5207144021987915, + "reward_std": 0.6570230424404144, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.015743907541036606, + "rewards/tag_count_reward": 0.890625, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 196.95833587646484, + "completion_length": 594.1666870117188, "epoch": 0.49, - "grad_norm": 30.034706832606794, - "kl": 0.751953125, + "grad_norm": 11.918218758688853, + "kl": 4.0390625, "learning_rate": 6.435602608679916e-07, - "loss": 0.1755, - "reward": 2.9789345264434814, - "reward_std": 0.05226236814633012, - "rewards/accuracy_reward": 1.0, + "loss": 0.5896, + "reward": 2.576896905899048, + "reward_std": 0.6758164465427399, + "rewards/accuracy_reward": 0.7500000298023224, "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.007176772924140096, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.018589303828775883, + "rewards/tag_count_reward": 0.8593750298023224, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 378.3125, + "completion_length": 531.25, "epoch": 0.491, - "grad_norm": 9.673308309943636, - "kl": 2.494140625, + "grad_norm": 12.572669494859374, + "kl": 4.7421875, "learning_rate": 6.420232233288055e-07, - "loss": 0.5684, - "reward": 2.676477789878845, - "reward_std": 0.31330945109948516, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016230588778853416, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.4827, + "reward": 2.256177067756653, + "reward_std": 0.543413519859314, + "rewards/accuracy_reward": 0.4375000223517418, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01639253133907914, + "rewards/tag_count_reward": 0.8697916865348816, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 228.1875, + "completion_length": 400.9375, "epoch": 0.492, - "grad_norm": 11.738371356075815, - "kl": 0.6962890625, + "grad_norm": 25.196826952891467, + "kl": 1.765625, "learning_rate": 6.404850645156841e-07, - "loss": 0.1092, - "reward": 2.964345932006836, - "reward_std": 0.0826022494584322, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.014820618322119117, - "rewards/tag_count_reward": 1.0, + "loss": 0.3313, + "reward": 2.747916102409363, + "reward_std": 0.490363210439682, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.022917456924915314, + "rewards/tag_count_reward": 0.9166666865348816, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 238.39584350585938, + "completion_length": 536.6458435058594, "epoch": 0.493, - "grad_norm": 15.139611008350808, - "kl": 0.9453125, + "grad_norm": 12.14349711767654, + "kl": 2.984375, "learning_rate": 6.389458031706068e-07, - "loss": 0.1978, - "reward": 2.7019978761672974, - "reward_std": 0.10218662954866886, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.021960550919175148, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.5739, + "reward": 2.6953213214874268, + "reward_std": 0.5101250112056732, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026901046745479107, + "rewards/tag_count_reward": 0.9166666865348816, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 238.08334350585938, + "completion_length": 399.7083435058594, "epoch": 0.494, - "grad_norm": 9.710857369040516, - "kl": 0.6513671875, + "grad_norm": 19.752623374452337, + "kl": 2.17578125, "learning_rate": 6.374054580489873e-07, - "loss": 0.0909, - "reward": 2.973971128463745, - "reward_std": 0.03539864718914032, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019084418890997767, - "rewards/tag_count_reward": 1.0, + "loss": 0.1944, + "reward": 2.5966198444366455, + "reward_std": 0.4636584371328354, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02317184768617153, + "rewards/tag_count_reward": 0.9322916865348816, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 253.95833587646484, + "completion_length": 465.91668701171875, "epoch": 0.495, - "grad_norm": 20.3697808953947, - "kl": 2.3671875, + "grad_norm": 14.16827974955118, + "kl": 1.6328125, "learning_rate": 6.358640479194451e-07, - "loss": 0.5815, - "reward": 2.7025917768478394, - "reward_std": 0.2823432832956314, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.007477694656699896, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.3459, + "reward": 2.6907471418380737, + "reward_std": 0.4236067831516266, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.0332112442702055, + "rewards/tag_count_reward": 0.9322916865348816, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 295.0416717529297, + "completion_length": 587.9791870117188, "epoch": 0.496, - "grad_norm": 9.430645547787067, - "kl": 1.99609375, + "grad_norm": 28.995423975163437, + "kl": 3.40625, "learning_rate": 6.343215915635761e-07, - "loss": 0.2919, - "reward": 2.7567089796066284, - "reward_std": 0.38885729759931564, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.014124338689725846, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.5617, + "reward": 2.512778878211975, + "reward_std": 0.5804081857204437, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.028887868858873844, + "rewards/tag_count_reward": 0.9166666865348816, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 243.0416717529297, + "completion_length": 411.7083435058594, "epoch": 0.497, - "grad_norm": 12.06986867897201, - "kl": 1.90234375, + "grad_norm": 14.922976729616561, + "kl": 2.7109375, "learning_rate": 6.327781077757241e-07, - "loss": 0.2432, - "reward": 2.8464730978012085, - "reward_std": 0.181410051882267, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.005957435816526413, - "rewards/tag_count_reward": 0.984375, + "loss": 0.3736, + "reward": 2.570963978767395, + "reward_std": 0.578457772731781, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.019313913770020008, + "rewards/tag_count_reward": 0.9166666865348816, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 243.25, + "completion_length": 570.1875, "epoch": 0.498, - "grad_norm": 15.21081882375541, - "kl": 1.72265625, + "grad_norm": 12.22816448398453, + "kl": 3.7890625, "learning_rate": 6.31233615362752e-07, - "loss": 0.4659, - "reward": 2.9344961643218994, - "reward_std": 0.20611581206321716, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.006476166658103466, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.7994, + "reward": 2.4899239540100098, + "reward_std": 0.45974400639533997, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.0222289408557117, + "rewards/tag_count_reward": 0.8802083432674408, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 236.875, + "completion_length": 541.2083435058594, "epoch": 0.499, - "grad_norm": 8.580027665495455, - "kl": 1.15625, + "grad_norm": 24.07992840689996, + "kl": 3.2890625, "learning_rate": 6.296881331438126e-07, - "loss": 0.0374, - "reward": 2.717370390892029, - "reward_std": 0.08062501531094313, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01179626164957881, - "rewards/tag_count_reward": 1.0, + "loss": 0.6263, + "reward": 2.4488918781280518, + "reward_std": 0.6255036890506744, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02680276893079281, + "rewards/tag_count_reward": 0.8854166865348816, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 199.1875, + "completion_length": 539.4375, "epoch": 0.5, - "grad_norm": 8.003095596136852, - "kl": 1.033203125, + "grad_norm": 13.625300381502933, + "kl": 3.3359375, "learning_rate": 6.281416799501187e-07, - "loss": 0.1252, - "reward": 2.9793566465377808, - "reward_std": 0.03495909832417965, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.00849070237018168, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3615, + "reward": 2.6264405250549316, + "reward_std": 0.5198075622320175, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01939287781715393, + "rewards/tag_count_reward": 0.8750000298023224, "step": 500 }, { "clip_ratio": 0.0, - "completion_length": 243.5416717529297, + "completion_length": 527.1041870117188, "epoch": 0.501, - "grad_norm": 15.495719367664487, - "kl": 3.625, + "grad_norm": 14.175570746863878, + "kl": 4.4375, "learning_rate": 6.265942746247146e-07, - "loss": 0.7939, - "reward": 2.899778366088867, - "reward_std": 0.22089383751153946, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.008207551087252796, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.5509, + "reward": 2.34303081035614, + "reward_std": 0.5519561469554901, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.021552613005042076, + "rewards/tag_count_reward": 0.8854166865348816, "step": 501 }, { "clip_ratio": 0.0, - "completion_length": 198.7291717529297, + "completion_length": 698.0625305175781, "epoch": 0.502, - "grad_norm": 9.58883536803665, - "kl": 2.13671875, + "grad_norm": 11.766159336622879, + "kl": 5.6875, "learning_rate": 6.25045936022246e-07, - "loss": 0.2912, - "reward": 2.9464685916900635, - "reward_std": 0.10652142949402332, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006656510988250375, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9051, + "reward": 2.2460073232650757, + "reward_std": 0.7623308002948761, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.019617782905697823, + "rewards/tag_count_reward": 0.765625, "step": 502 }, { "clip_ratio": 0.0, - "completion_length": 156.31250762939453, + "completion_length": 680.4375305175781, "epoch": 0.503, - "grad_norm": 11.330982642401041, - "kl": 1.15234375, + "grad_norm": 29.510968471399966, + "kl": 7.453125, "learning_rate": 6.2349668300873e-07, - "loss": 0.0429, - "reward": 2.9774378538131714, - "reward_std": 0.06712529435753822, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.0034648407017812133, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0744, + "reward": 2.1472885608673096, + "reward_std": 0.7113551497459412, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.03847557958215475, + "rewards/tag_count_reward": 0.7552083432674408, "step": 503 }, { "clip_ratio": 0.0, - "completion_length": 241.31251525878906, + "completion_length": 477.1041717529297, "epoch": 0.504, - "grad_norm": 19.100889677451153, - "kl": 1.17578125, + "grad_norm": 14.23791373020675, + "kl": 3.9609375, "learning_rate": 6.219465344613258e-07, - "loss": 0.0373, - "reward": 2.930669903755188, - "reward_std": 0.17007755488157272, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015510922763496637, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.6783, + "reward": 2.5481714010238647, + "reward_std": 0.5729511976242065, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.012592657934874296, + "rewards/tag_count_reward": 0.8593750298023224, "step": 504 }, { "clip_ratio": 0.0, - "completion_length": 283.875, + "completion_length": 497.35418701171875, "epoch": 0.505, - "grad_norm": 35.665695552105845, - "kl": 3.421875, + "grad_norm": 11.708756841965316, + "kl": 3.3359375, "learning_rate": 6.203955092681039e-07, - "loss": 0.5468, - "reward": 2.5979902744293213, - "reward_std": 0.3358563035726547, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.013120918069034815, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.4639, + "reward": 2.479733467102051, + "reward_std": 0.38708843290805817, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.022002640645951033, + "rewards/tag_count_reward": 0.890625, "step": 505 }, { "clip_ratio": 0.0, - "completion_length": 239.7291717529297, + "completion_length": 681.0625305175781, "epoch": 0.506, - "grad_norm": 15.324747307886929, - "kl": 2.390625, + "grad_norm": 18.2185049016805, + "kl": 5.5390625, "learning_rate": 6.188436263278172e-07, - "loss": 0.1762, - "reward": 2.9514931440353394, - "reward_std": 0.08562670648097992, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.025937482248991728, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.7, + "reward": 2.064489424228668, + "reward_std": 0.6310321092605591, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022316260263323784, + "rewards/tag_count_reward": 0.7812500298023224, "step": 506 }, { "clip_ratio": 0.0, - "completion_length": 229.43750762939453, + "completion_length": 550.75, "epoch": 0.507, - "grad_norm": 14.586062141440776, - "kl": 1.859375, + "grad_norm": 17.7803570241784, + "kl": 3.546875, "learning_rate": 6.172909045496694e-07, - "loss": 0.159, - "reward": 2.9485890865325928, - "reward_std": 0.10438758321106434, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009744220646098256, - "rewards/tag_count_reward": 1.0, + "loss": 0.8123, + "reward": 2.2492624521255493, + "reward_std": 0.5181691646575928, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02330716885626316, + "rewards/tag_count_reward": 0.8697916865348816, "step": 507 }, { "clip_ratio": 0.0, - "completion_length": 299.79168701171875, + "completion_length": 807.4375305175781, "epoch": 0.508, - "grad_norm": 28.42312546961311, - "kl": 5.1484375, + "grad_norm": 42.53879608641054, + "kl": 6.71875, "learning_rate": 6.157373628530852e-07, - "loss": 1.7724, - "reward": 2.5468300580978394, - "reward_std": 0.6340313255786896, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9444445371627808, - "rewards/repetition_penalty_reward": -0.017406176775693893, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.9312, + "reward": 2.1224422454833984, + "reward_std": 0.8008854687213898, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.06505792587995529, + "rewards/tag_count_reward": 0.75, "step": 508 }, { "clip_ratio": 0.0, - "completion_length": 369.37501525878906, + "completion_length": 703.625, "epoch": 0.509, - "grad_norm": 11.108283336927975, - "kl": 2.603515625, + "grad_norm": 38.46406571699496, + "kl": 5.875, "learning_rate": 6.141830201674802e-07, - "loss": 0.323, - "reward": 2.7940465211868286, - "reward_std": 0.31403081864118576, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01671730587258935, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.0649, + "reward": 2.162044405937195, + "reward_std": 0.7470089793205261, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9097223281860352, + "rewards/repetition_penalty_reward": -0.04976117052137852, + "rewards/tag_count_reward": 0.78125, "step": 509 }, { "clip_ratio": 0.0, - "completion_length": 248.5416717529297, + "completion_length": 537.3541870117188, "epoch": 0.51, - "grad_norm": 8.988814072021315, - "kl": 0.73828125, + "grad_norm": 23.200007837390707, + "kl": 2.9609375, "learning_rate": 6.126278954320294e-07, - "loss": -0.0089, - "reward": 2.9368520975112915, - "reward_std": 0.10330374073237181, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02148128766566515, - "rewards/tag_count_reward": 1.0, + "loss": 0.8415, + "reward": 2.5392757654190063, + "reward_std": 0.7050909399986267, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.02148817665874958, + "rewards/tag_count_reward": 0.9010416865348816, "step": 510 }, { "clip_ratio": 0.0, - "completion_length": 287.6458435058594, + "completion_length": 605.9375152587891, "epoch": 0.511, - "grad_norm": 24.516374990056164, - "kl": 3.3515625, + "grad_norm": 12.217834862993339, + "kl": 4.390625, "learning_rate": 6.11072007595437e-07, - "loss": 0.7743, - "reward": 2.771681785583496, - "reward_std": 0.39937661588191986, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01477636443451047, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.5492, + "reward": 2.281681537628174, + "reward_std": 0.5939317345619202, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.018665821757167578, + "rewards/tag_count_reward": 0.8072916865348816, "step": 511 }, { "clip_ratio": 0.0, - "completion_length": 338.9583435058594, + "completion_length": 659.0000305175781, "epoch": 0.512, - "grad_norm": 12.845715409034057, - "kl": 2.421875, + "grad_norm": 11.142187913544113, + "kl": 5.203125, "learning_rate": 6.095153756157051e-07, - "loss": 0.2996, - "reward": 2.6392208337783813, - "reward_std": 0.3684597611427307, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.03265418764203787, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8536, + "reward": 2.404225468635559, + "reward_std": 0.7626966536045074, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.01591357495635748, + "rewards/tag_count_reward": 0.8020833730697632, "step": 512 }, { "clip_ratio": 0.0, - "completion_length": 179.43750762939453, + "completion_length": 482.1875, "epoch": 0.513, - "grad_norm": 10.355067695132892, - "kl": 0.60546875, + "grad_norm": 27.609945341526714, + "kl": 2.7734375, "learning_rate": 6.079580184599032e-07, - "loss": 0.0553, - "reward": 2.7439173460006714, - "reward_std": 0.009601960889995098, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006082800100557506, - "rewards/tag_count_reward": 1.0, + "loss": 0.5479, + "reward": 2.167789936065674, + "reward_std": 0.3920954018831253, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.019710074178874493, + "rewards/tag_count_reward": 0.9375000298023224, "step": 513 }, { "clip_ratio": 0.0, - "completion_length": 201.06250762939453, + "completion_length": 602.0208435058594, "epoch": 0.514, - "grad_norm": 21.140279810086266, - "kl": 0.875, + "grad_norm": 21.276603242655675, + "kl": 4.65625, "learning_rate": 6.06399955103937e-07, - "loss": 0.1497, - "reward": 2.872864007949829, - "reward_std": 0.17378759384155273, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012552707456052303, - "rewards/tag_count_reward": 0.9895833432674408, - "step": 514 - }, + "loss": 0.9838, + "reward": 2.233444333076477, + "reward_std": 0.626254141330719, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03218065481632948, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 514 + }, { "clip_ratio": 0.0, - "completion_length": 222.89583587646484, + "completion_length": 548.7708587646484, "epoch": 0.515, - "grad_norm": 8.518894089016033, - "kl": 0.859375, + "grad_norm": 8.99915341715262, + "kl": 2.984375, "learning_rate": 6.048412045323164e-07, - "loss": 0.1076, - "reward": 2.821071743965149, - "reward_std": 0.133408279158175, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012261879397556186, - "rewards/tag_count_reward": 1.0, + "loss": 0.5247, + "reward": 2.4330430030822754, + "reward_std": 0.4966660887002945, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02702661231160164, + "rewards/tag_count_reward": 0.9114583432674408, "step": 515 }, { "clip_ratio": 0.0, - "completion_length": 256.4583435058594, + "completion_length": 607.3125305175781, "epoch": 0.516, - "grad_norm": 28.685109752500917, - "kl": 1.56640625, + "grad_norm": 12.744716558501036, + "kl": 4.21875, "learning_rate": 6.032817857379256e-07, - "loss": 0.7068, - "reward": 2.4452799558639526, - "reward_std": 0.2884073406457901, - "rewards/accuracy_reward": 0.5000000149011612, + "loss": 0.6478, + "reward": 2.4581196308135986, + "reward_std": 0.674341470003128, + "rewards/accuracy_reward": 0.6458333730697632, "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.004372848430648446, - "rewards/tag_count_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.017574850469827652, + "rewards/tag_count_reward": 0.8437500298023224, "step": 516 }, { "clip_ratio": 0.0, - "completion_length": 234.7916717529297, + "completion_length": 550.0833587646484, "epoch": 0.517, - "grad_norm": 15.289700434803649, - "kl": 0.98046875, + "grad_norm": 25.040001378142637, + "kl": 3.984375, "learning_rate": 6.017217177217899e-07, - "loss": 0.4358, - "reward": 2.956493854522705, - "reward_std": 0.11499326303601265, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012256061629159376, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8487, + "reward": 2.251497983932495, + "reward_std": 0.6527323424816132, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.015863300301134586, + "rewards/tag_count_reward": 0.8229166865348816, "step": 517 }, { "clip_ratio": 0.0, - "completion_length": 272.62501525878906, + "completion_length": 413.8333435058594, "epoch": 0.518, - "grad_norm": 28.694566994806465, - "kl": 1.251953125, + "grad_norm": 25.626709792783128, + "kl": 2.48046875, "learning_rate": 6.001610194928464e-07, - "loss": 0.3619, - "reward": 2.802891492843628, - "reward_std": 0.35522156208753586, + "loss": 0.4396, + "reward": 2.7058017253875732, + "reward_std": 0.5824707746505737, "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.023497527465224266, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.019892807584255934, + "rewards/tag_count_reward": 0.9270833730697632, "step": 518 }, { "clip_ratio": 0.0, - "completion_length": 228.52084350585938, + "completion_length": 741.0833435058594, "epoch": 0.519, - "grad_norm": 13.235531254693765, - "kl": 0.88671875, + "grad_norm": 59.3566513287147, + "kl": 8.140625, "learning_rate": 5.985997100677103e-07, - "loss": 0.1233, - "reward": 2.961024522781372, - "reward_std": 0.11631786031648517, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0077256623189896345, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.4268, + "reward": 2.335380792617798, + "reward_std": 0.8169020414352417, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03441094420850277, + "rewards/tag_count_reward": 0.7864583432674408, "step": 519 }, { "clip_ratio": 0.0, - "completion_length": 218.7916717529297, + "completion_length": 692.6875, "epoch": 0.52, - "grad_norm": 18.065847926507352, - "kl": 0.845703125, + "grad_norm": 30.718619323707678, + "kl": 7.515625, "learning_rate": 5.97037808470444e-07, - "loss": 0.1008, - "reward": 2.703016996383667, - "reward_std": 0.10459034889936447, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.012260687537491322, - "rewards/tag_count_reward": 1.0, + "loss": 0.9893, + "reward": 1.9578059911727905, + "reward_std": 0.5474497377872467, + "rewards/accuracy_reward": 0.2083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.014416326768696308, + "rewards/tag_count_reward": 0.7916666865348816, "step": 520 }, { "clip_ratio": 0.0, - "completion_length": 217.58334350585938, + "completion_length": 486.4166717529297, "epoch": 0.521, - "grad_norm": 11.234681019700515, - "kl": 2.013671875, + "grad_norm": 18.357835870055244, + "kl": 4.421875, "learning_rate": 5.954753337323259e-07, - "loss": 0.5183, - "reward": 2.977494239807129, - "reward_std": 0.06380407209508121, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006880956469103694, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4358, + "reward": 2.387876033782959, + "reward_std": 0.5994940996170044, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02184621151536703, + "rewards/tag_count_reward": 0.8750000298023224, "step": 521 }, { "clip_ratio": 0.0, - "completion_length": 240.70834350585938, + "completion_length": 589.25, "epoch": 0.522, - "grad_norm": 18.54791145428405, - "kl": 2.15234375, + "grad_norm": 23.01587053197016, + "kl": 4.734375, "learning_rate": 5.939123048916173e-07, - "loss": 0.1077, - "reward": 2.5097007751464844, - "reward_std": 0.23574757669121027, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.012868763180449605, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.7259, + "reward": 2.393532395362854, + "reward_std": 0.7403541803359985, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.028342580422759056, + "rewards/tag_count_reward": 0.8177083730697632, "step": 522 }, { "clip_ratio": 0.0, - "completion_length": 290.8125, + "completion_length": 502.4791717529297, "epoch": 0.523, - "grad_norm": 17.49485293791556, - "kl": 2.328125, + "grad_norm": 81.64058877460842, + "kl": 4.53125, "learning_rate": 5.923487409933315e-07, - "loss": 0.1349, - "reward": 2.8130398988723755, - "reward_std": 0.31947530806064606, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.022029752377420664, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9372, + "reward": 2.3321489691734314, + "reward_std": 0.47031281888484955, + "rewards/accuracy_reward": 0.4791666679084301, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.018545696511864662, + "rewards/tag_count_reward": 0.90625, "step": 523 }, { "clip_ratio": 0.0, - "completion_length": 237.8541717529297, + "completion_length": 536.5416870117188, "epoch": 0.524, - "grad_norm": 17.787314011672997, - "kl": 2.02734375, + "grad_norm": 15.605773200090558, + "kl": 4.109375, "learning_rate": 5.907846610890011e-07, - "loss": 0.1161, - "reward": 2.4981160163879395, - "reward_std": 0.3354267030954361, - "rewards/accuracy_reward": 0.5208333358168602, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.00882860948331654, - "rewards/tag_count_reward": 1.0, + "loss": 0.5794, + "reward": 2.2709691524505615, + "reward_std": 0.644355833530426, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01375302067026496, + "rewards/tag_count_reward": 0.7916666865348816, "step": 524 }, { "clip_ratio": 0.0, - "completion_length": 285.4791717529297, + "completion_length": 655.5833740234375, "epoch": 0.525, - "grad_norm": 35.4918268606938, - "kl": 5.34375, + "grad_norm": 13.665522547941404, + "kl": 5.3125, "learning_rate": 5.892200842364462e-07, - "loss": 0.4311, - "reward": 2.6550995111465454, - "reward_std": 0.2949841320514679, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.008094974793493748, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.839, + "reward": 2.3515175580978394, + "reward_std": 0.7731278240680695, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.028690868988633156, + "rewards/tag_count_reward": 0.7760416865348816, "step": 525 }, { "clip_ratio": 0.0, - "completion_length": 304.75000762939453, + "completion_length": 641.8750305175781, "epoch": 0.526, - "grad_norm": 39.105218698701265, - "kl": 5.265625, + "grad_norm": 9.492149650000716, + "kl": 3.7890625, "learning_rate": 5.87655029499542e-07, - "loss": 0.825, - "reward": 2.717597246170044, - "reward_std": 0.3586873412132263, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011569550260901451, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.7068, + "reward": 2.473028063774109, + "reward_std": 0.6603610515594482, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.018291576765477657, + "rewards/tag_count_reward": 0.7760416865348816, "step": 526 }, { "clip_ratio": 0.0, - "completion_length": 339.8541717529297, + "completion_length": 498.0625, "epoch": 0.527, - "grad_norm": 19.06114029118238, - "kl": 3.1953125, + "grad_norm": 9.861486616032105, + "kl": 3.2421875, "learning_rate": 5.860895159479864e-07, - "loss": 0.5097, - "reward": 2.7959665060043335, - "reward_std": 0.4172126352787018, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01653355499729514, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5474, + "reward": 2.4325687885284424, + "reward_std": 0.5291797816753387, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.020556333474814892, + "rewards/tag_count_reward": 0.8489583432674408, "step": 527 }, { "clip_ratio": 0.0, - "completion_length": 410.3541717529297, + "completion_length": 562.8958587646484, "epoch": 0.528, - "grad_norm": 34.14598480795745, - "kl": 6.296875, + "grad_norm": 11.54211464068912, + "kl": 4.3984375, "learning_rate": 5.845235626570683e-07, - "loss": 1.0668, - "reward": 2.751347064971924, - "reward_std": 0.5215412527322769, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.024694829247891903, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.6108, + "reward": 2.3765957355499268, + "reward_std": 0.6778049767017365, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01229327404871583, + "rewards/tag_count_reward": 0.8541666865348816, "step": 528 }, { "clip_ratio": 0.0, - "completion_length": 223.1041717529297, + "completion_length": 580.6041870117188, "epoch": 0.529, - "grad_norm": 15.651918860665688, - "kl": 1.99609375, + "grad_norm": 12.470353874580706, + "kl": 4.125, "learning_rate": 5.829571887074343e-07, - "loss": 0.221, - "reward": 2.7265334129333496, - "reward_std": 0.03349946439266205, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016522385645657778, - "rewards/tag_count_reward": 1.0, + "loss": 0.5239, + "reward": 2.4933345317840576, + "reward_std": 0.509730190038681, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.02229050174355507, + "rewards/tag_count_reward": 0.8281250298023224, "step": 529 }, { "clip_ratio": 0.0, - "completion_length": 167.25000762939453, + "completion_length": 833.7917175292969, "epoch": 0.53, - "grad_norm": 15.103469836062416, - "kl": 0.630859375, + "grad_norm": 18.321092863583786, + "kl": 6.75, "learning_rate": 5.813904131848564e-07, - "loss": 0.0216, - "reward": 2.768776774406433, - "reward_std": 0.0753321188967675, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.002056604775134474, - "rewards/tag_count_reward": 1.0, + "loss": 0.6788, + "reward": 1.967248022556305, + "reward_std": 0.6098497807979584, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.0327520202845335, + "rewards/tag_count_reward": 0.7291666865348816, "step": 530 }, { "clip_ratio": 0.0, - "completion_length": 218.7291717529297, + "completion_length": 697.2916870117188, "epoch": 0.531, - "grad_norm": 12.407668176927533, - "kl": 1.283203125, + "grad_norm": 19.33500544989958, + "kl": 5.546875, "learning_rate": 5.798232551800002e-07, - "loss": 0.3535, - "reward": 2.8210554122924805, - "reward_std": 0.2488305892329663, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.007069613056955859, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.6624, + "reward": 2.311483144760132, + "reward_std": 0.6170333027839661, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.018378185108304024, + "rewards/tag_count_reward": 0.8229166865348816, "step": 531 }, { "clip_ratio": 0.0, - "completion_length": 211.45833587646484, + "completion_length": 734.2916870117188, "epoch": 0.532, - "grad_norm": 10.723579203108498, - "kl": 1.46875, + "grad_norm": 16.207591684417004, + "kl": 4.796875, "learning_rate": 5.78255733788191e-07, - "loss": 0.1478, - "reward": 2.821176052093506, - "reward_std": 0.1518668793141842, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.005212899297475815, - "rewards/tag_count_reward": 1.0, + "loss": 0.8369, + "reward": 2.042443633079529, + "reward_std": 0.6627461314201355, + "rewards/accuracy_reward": 0.354166679084301, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02873703371733427, + "rewards/tag_count_reward": 0.7447916865348816, "step": 532 }, { "clip_ratio": 0.0, - "completion_length": 253.9166717529297, + "completion_length": 670.3333435058594, "epoch": 0.533, - "grad_norm": 13.649785123659296, - "kl": 1.8046875, + "grad_norm": 17.149503798992292, + "kl": 5.671875, "learning_rate": 5.766878681091828e-07, - "loss": 0.3093, - "reward": 2.8249999284744263, - "reward_std": 0.23385293036699295, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.010069485229905695, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.8688, + "reward": 1.991923749446869, + "reward_std": 0.7662231922149658, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.04279853031039238, + "rewards/tag_count_reward": 0.7500000298023224, "step": 533 }, { "clip_ratio": 0.0, - "completion_length": 351.875, + "completion_length": 579.8125, "epoch": 0.534, - "grad_norm": 91.30404609649949, - "kl": 1.4453125, + "grad_norm": 18.63845179074038, + "kl": 4.59375, "learning_rate": 5.751196772469237e-07, - "loss": 0.6259, - "reward": 2.515122175216675, - "reward_std": 0.44669362902641296, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.05779457651078701, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.7095, + "reward": 2.073606848716736, + "reward_std": 0.7093684077262878, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.023615523241460323, + "rewards/tag_count_reward": 0.7500000298023224, "step": 534 }, { "clip_ratio": 0.0, - "completion_length": 279.8333435058594, + "completion_length": 618.0, "epoch": 0.535, - "grad_norm": 12.469064496821428, - "kl": 1.833984375, + "grad_norm": 26.379982268973194, + "kl": 4.984375, "learning_rate": 5.735511803093248e-07, - "loss": 0.2155, - "reward": 2.643766164779663, - "reward_std": 0.2279389649629593, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017692290246486664, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.8534, + "reward": 2.1693702936172485, + "reward_std": 0.578357994556427, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.0389630775898695, + "rewards/tag_count_reward": 0.8333333432674408, "step": 535 }, { "clip_ratio": 0.0, - "completion_length": 183.7916717529297, + "completion_length": 630.9791870117188, "epoch": 0.536, - "grad_norm": 7.278871796827916, - "kl": 1.4609375, + "grad_norm": 16.107255305572483, + "kl": 3.328125, "learning_rate": 5.71982396408026e-07, - "loss": 0.0929, - "reward": 2.954980731010437, - "reward_std": 0.10164909064769745, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.003352693049237132, - "rewards/tag_count_reward": 1.0, + "loss": 0.6713, + "reward": 2.3328845500946045, + "reward_std": 0.762814462184906, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.024754411540925503, + "rewards/tag_count_reward": 0.8437500298023224, "step": 536 }, { "clip_ratio": 0.0, - "completion_length": 254.56251525878906, + "completion_length": 644.5416870117188, "epoch": 0.537, - "grad_norm": 17.11322209348403, - "kl": 2.0234375, + "grad_norm": 9.431209455223057, + "kl": 4.125, "learning_rate": 5.704133446581642e-07, - "loss": 0.4898, - "reward": 2.7544987201690674, - "reward_std": 0.38861820101737976, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.014598728157579899, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.5649, + "reward": 2.1833406686782837, + "reward_std": 0.6518435776233673, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.023256704211235046, + "rewards/tag_count_reward": 0.796875, "step": 537 }, { "clip_ratio": 0.0, - "completion_length": 246.6666717529297, + "completion_length": 521.5625, "epoch": 0.538, - "grad_norm": 10.182630326500368, - "kl": 1.3203125, + "grad_norm": 13.691614334173481, + "kl": 2.828125, "learning_rate": 5.688440441781398e-07, - "loss": 0.0894, - "reward": 2.9222214221954346, - "reward_std": 0.15060079097747803, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.022222970612347126, - "rewards/tag_count_reward": 1.0, + "loss": 0.532, + "reward": 2.5351574420928955, + "reward_std": 0.5137946009635925, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02213435433804989, + "rewards/tag_count_reward": 0.8906250298023224, "step": 538 }, { "clip_ratio": 0.0, - "completion_length": 270.75001525878906, + "completion_length": 504.20835876464844, "epoch": 0.539, - "grad_norm": 20.32453119258816, - "kl": 2.0, + "grad_norm": 15.514611157640976, + "kl": 3.296875, "learning_rate": 5.672745140893839e-07, - "loss": 0.5416, - "reward": 2.902275562286377, - "reward_std": 0.2899784669280052, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.014391135657206178, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4842, + "reward": 2.3555127382278442, + "reward_std": 0.6762471795082092, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.017751268576830626, + "rewards/tag_count_reward": 0.8593750298023224, "step": 539 }, { "clip_ratio": 0.0, - "completion_length": 260.75000762939453, + "completion_length": 500.2083435058594, "epoch": 0.54, - "grad_norm": 13.19486639531855, - "kl": 2.56640625, + "grad_norm": 33.71631385555809, + "kl": 4.0390625, "learning_rate": 5.657047735161255e-07, - "loss": 0.6075, - "reward": 2.875817060470581, - "reward_std": 0.34979934245347977, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.013072141446173191, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.5623, + "reward": 2.4260315895080566, + "reward_std": 0.7543806433677673, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02014908567070961, + "rewards/tag_count_reward": 0.8489583730697632, "step": 540 }, { "clip_ratio": 0.0, - "completion_length": 165.8541717529297, + "completion_length": 645.2083435058594, "epoch": 0.541, - "grad_norm": 13.628328393223272, - "kl": 1.26953125, + "grad_norm": 18.05553096101935, + "kl": 5.75, "learning_rate": 5.641348415851577e-07, - "loss": 0.1194, - "reward": 2.8456040620803833, - "reward_std": 0.21230873791500926, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008562608039937913, - "rewards/tag_count_reward": 1.0, + "loss": 0.9497, + "reward": 2.2081546783447266, + "reward_std": 0.5377081632614136, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.017539918422698975, + "rewards/tag_count_reward": 0.8229166865348816, "step": 541 }, { "clip_ratio": 0.0, - "completion_length": 385.04168701171875, + "completion_length": 739.2291870117188, "epoch": 0.542, - "grad_norm": 19.8048077223596, - "kl": 4.6484375, + "grad_norm": 16.273564979773667, + "kl": 7.109375, "learning_rate": 5.625647374256061e-07, - "loss": 1.0099, - "reward": 2.7794227600097656, - "reward_std": 0.4676542580127716, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.022660567425191402, - "rewards/tag_count_reward": 0.9270833730697632, + "loss": 0.902, + "reward": 2.1837064027786255, + "reward_std": 0.6955267190933228, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.014210469089448452, + "rewards/tag_count_reward": 0.71875, "step": 542 }, { "clip_ratio": 0.0, - "completion_length": 335.3958435058594, + "completion_length": 586.3125, "epoch": 0.543, - "grad_norm": 14.871106518703135, - "kl": 5.125, + "grad_norm": 13.748732846527533, + "kl": 3.7109375, "learning_rate": 5.60994480168694e-07, - "loss": 0.8748, - "reward": 2.7120354175567627, - "reward_std": 0.27602626266889274, - "rewards/accuracy_reward": 0.7708333432674408, + "loss": 0.8548, + "reward": 2.5483158826828003, + "reward_std": 0.593627005815506, + "rewards/accuracy_reward": 0.7083333432674408, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017131428699940443, - "rewards/tag_count_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.024600773118436337, + "rewards/tag_count_reward": 0.8645833432674408, "step": 543 }, { "clip_ratio": 0.0, - "completion_length": 263.25, + "completion_length": 609.5416870117188, "epoch": 0.544, - "grad_norm": 17.444782869377377, - "kl": 2.8671875, + "grad_norm": 17.36578790885452, + "kl": 6.140625, "learning_rate": 5.594240889475106e-07, - "loss": 0.4992, - "reward": 2.700100898742676, - "reward_std": 0.21643656492233276, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.016912923892959952, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.0892, + "reward": 2.431919813156128, + "reward_std": 0.8149698972702026, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.021205355413258076, + "rewards/tag_count_reward": 0.8072916865348816, "step": 544 }, { "clip_ratio": 0.0, - "completion_length": 492.1458435058594, + "completion_length": 527.9791870117188, "epoch": 0.545, - "grad_norm": 47.200814678697526, - "kl": 10.1875, + "grad_norm": 30.417556701818675, + "kl": 2.8984375, "learning_rate": 5.578535828967777e-07, - "loss": 1.9236, - "reward": 2.4887442588806152, - "reward_std": 0.4649422764778137, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.01472798828035593, - "rewards/tag_count_reward": 0.8645833432674408, + "loss": 0.8559, + "reward": 2.590558648109436, + "reward_std": 0.6466452777385712, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.030969200655817986, + "rewards/tag_count_reward": 0.8854166865348816, "step": 545 }, { "clip_ratio": 0.0, - "completion_length": 357.2916793823242, + "completion_length": 633.8958435058594, "epoch": 0.546, - "grad_norm": 25.60601139042577, - "kl": 4.89453125, + "grad_norm": 15.846035378270587, + "kl": 6.0234375, "learning_rate": 5.562829811526154e-07, - "loss": 0.6718, - "reward": 2.618017077445984, - "reward_std": 0.30866929423063993, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.013927504420280457, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 1.0087, + "reward": 2.4130187034606934, + "reward_std": 0.680559515953064, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01580093428492546, + "rewards/tag_count_reward": 0.8177083432674408, "step": 546 }, { "clip_ratio": 0.0, - "completion_length": 257.00001525878906, + "completion_length": 432.8958435058594, "epoch": 0.547, - "grad_norm": 10.89262608607717, - "kl": 1.59375, + "grad_norm": 11.403437149970372, + "kl": 1.345703125, "learning_rate": 5.547123028523106e-07, - "loss": 0.1743, - "reward": 2.878835439682007, - "reward_std": 0.20555399113800377, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010053388774394989, - "rewards/tag_count_reward": 1.0, + "loss": 0.0869, + "reward": 2.532930016517639, + "reward_std": 0.4439847320318222, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.03130619879812002, + "rewards/tag_count_reward": 0.9739583432674408, "step": 547 }, { "clip_ratio": 0.0, - "completion_length": 353.125, + "completion_length": 646.3750305175781, "epoch": 0.548, - "grad_norm": 81.79931127751958, - "kl": 5.52734375, + "grad_norm": 67.38569099061115, + "kl": 7.3125, "learning_rate": 5.531415671340826e-07, - "loss": 1.3182, - "reward": 2.794838070869446, - "reward_std": 0.42619338631629944, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.022870427928864956, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.0126, + "reward": 2.4480226039886475, + "reward_std": 0.7663466334342957, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02767193131148815, + "rewards/tag_count_reward": 0.8229166865348816, "step": 548 }, { "clip_ratio": 0.0, - "completion_length": 171.06250762939453, + "completion_length": 534.4166717529297, "epoch": 0.549, - "grad_norm": 13.574923566249973, - "kl": 2.1015625, + "grad_norm": 15.126469736221692, + "kl": 3.61328125, "learning_rate": 5.515707931368507e-07, - "loss": 0.227, - "reward": 2.964742660522461, - "reward_std": 0.09593780152499676, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.004007585579529405, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.4266, + "reward": 2.385190963745117, + "reward_std": 0.47373223304748535, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01932289730757475, + "rewards/tag_count_reward": 0.9114583432674408, "step": 549 }, { "clip_ratio": 0.0, - "completion_length": 378.39583587646484, + "completion_length": 622.2916870117188, "epoch": 0.55, - "grad_norm": 15.789594589167345, - "kl": 4.54296875, + "grad_norm": 16.89430814413906, + "kl": 5.453125, "learning_rate": 5.5e-07, - "loss": 0.681, - "reward": 2.8109264373779297, - "reward_std": 0.19634315185248852, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006781912874430418, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.6763, + "reward": 2.388463854789734, + "reward_std": 0.5397979617118835, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.017786113545298576, + "rewards/tag_count_reward": 0.8437500298023224, "step": 550 }, { "clip_ratio": 0.0, - "completion_length": 443.2083435058594, + "completion_length": 533.1666717529297, "epoch": 0.551, - "grad_norm": 34.739161486159645, - "kl": 5.828125, + "grad_norm": 13.91558589732855, + "kl": 4.0078125, "learning_rate": 5.484292068631494e-07, - "loss": 1.0555, - "reward": 2.6288487911224365, - "reward_std": 0.539858877658844, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.027401148341596127, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.6629, + "reward": 2.265714168548584, + "reward_std": 0.6671868860721588, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.020744211971759796, + "rewards/tag_count_reward": 0.8697916865348816, "step": 551 }, { "clip_ratio": 0.0, - "completion_length": 487.81251525878906, + "completion_length": 573.5625305175781, "epoch": 0.552, - "grad_norm": 22.178292603511963, - "kl": 3.3125, + "grad_norm": 16.883404148942905, + "kl": 5.0, "learning_rate": 5.468584328659172e-07, - "loss": 0.8108, - "reward": 2.6969659328460693, - "reward_std": 0.5279301404953003, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.033936976455152035, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.8272, + "reward": 2.377629041671753, + "reward_std": 0.6626934707164764, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.021676572039723396, + "rewards/tag_count_reward": 0.84375, "step": 552 }, { "clip_ratio": 0.0, - "completion_length": 305.875, + "completion_length": 641.9583435058594, "epoch": 0.553, - "grad_norm": 27.40506165255743, - "kl": 3.984375, + "grad_norm": 12.510249676603848, + "kl": 6.515625, "learning_rate": 5.452876971476896e-07, - "loss": 0.9763, - "reward": 2.798345685005188, - "reward_std": 0.42504626512527466, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.017626761458814144, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 1.1594, + "reward": 2.224699914455414, + "reward_std": 0.7095433175563812, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.020091742277145386, + "rewards/tag_count_reward": 0.8072916865348816, "step": 553 }, { "clip_ratio": 0.0, - "completion_length": 368.1458435058594, + "completion_length": 621.0000305175781, "epoch": 0.554, - "grad_norm": 9.328097603916511, - "kl": 4.71875, + "grad_norm": 20.15924923331073, + "kl": 4.90625, "learning_rate": 5.437170188473847e-07, - "loss": 0.7962, - "reward": 2.7200623750686646, - "reward_std": 0.40101030468940735, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017784900963306427, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 1.009, + "reward": 2.4554017782211304, + "reward_std": 0.6323766112327576, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.025501138530671597, + "rewards/tag_count_reward": 0.8489583432674408, "step": 554 }, { "clip_ratio": 0.0, - "completion_length": 377.5208435058594, + "completion_length": 516.3125305175781, "epoch": 0.555, - "grad_norm": 11.797943211156122, - "kl": 2.82421875, + "grad_norm": 15.439703900925965, + "kl": 3.703125, "learning_rate": 5.421464171032224e-07, - "loss": 0.3458, - "reward": 2.746278762817383, - "reward_std": 0.31177210807800293, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.021082570776343346, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.6462, + "reward": 2.5977327823638916, + "reward_std": 0.5676628202199936, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.016850699670612812, + "rewards/tag_count_reward": 0.9062500298023224, "step": 555 }, { "clip_ratio": 0.0, - "completion_length": 339.5, + "completion_length": 483.5625, "epoch": 0.556, - "grad_norm": 22.44042981813086, - "kl": 4.0234375, + "grad_norm": 11.490664107960336, + "kl": 3.9921875, "learning_rate": 5.405759110524894e-07, - "loss": 0.9891, - "reward": 2.7945016622543335, - "reward_std": 0.4282144755125046, - "rewards/accuracy_reward": 0.8958333432674408, + "loss": 0.5193, + "reward": 2.5783601999282837, + "reward_std": 0.6397126764059067, + "rewards/accuracy_reward": 0.7708333730697632, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016262172255665064, - "rewards/tag_count_reward": 0.9427083730697632, + "rewards/repetition_penalty_reward": -0.0292786480858922, + "rewards/tag_count_reward": 0.8645833432674408, "step": 556 }, { "clip_ratio": 0.0, - "completion_length": 334.6041717529297, + "completion_length": 516.6041870117188, "epoch": 0.557, - "grad_norm": 21.49959702815894, - "kl": 4.484375, + "grad_norm": 27.558157304060057, + "kl": 4.078125, "learning_rate": 5.390055198313061e-07, - "loss": 0.7577, - "reward": 2.7817689180374146, - "reward_std": 0.2094714716076851, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01337004592642188, - "rewards/tag_count_reward": 0.9687500298023224, - "step": 557 - }, + "loss": 0.7408, + "reward": 2.3512425422668457, + "reward_std": 0.5882950127124786, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.025493742898106575, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 557 + }, { "clip_ratio": 0.0, - "completion_length": 268.64583587646484, + "completion_length": 540.9166870117188, "epoch": 0.558, - "grad_norm": 16.990527988299593, - "kl": 1.78515625, + "grad_norm": 22.15871149302614, + "kl": 5.25, "learning_rate": 5.37435262574394e-07, - "loss": 0.2315, - "reward": 2.7632081508636475, - "reward_std": 0.22523664683103561, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.011097445152699947, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.7217, + "reward": 2.273875594139099, + "reward_std": 0.604903370141983, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.01605505309998989, + "rewards/tag_count_reward": 0.8177083432674408, "step": 558 }, { "clip_ratio": 0.0, - "completion_length": 225.52084350585938, + "completion_length": 564.1875305175781, "epoch": 0.559, - "grad_norm": 16.669848540805553, - "kl": 1.767578125, + "grad_norm": 20.132907345221618, + "kl": 5.90625, "learning_rate": 5.358651584148423e-07, - "loss": 0.0992, - "reward": 2.88400936126709, - "reward_std": 0.2095352467149496, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.018768414855003357, - "rewards/tag_count_reward": 1.0, + "loss": 0.975, + "reward": 2.391770362854004, + "reward_std": 0.7469497919082642, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.0248964074999094, + "rewards/tag_count_reward": 0.8333333432674408, "step": 559 }, { "clip_ratio": 0.0, - "completion_length": 265.77083587646484, + "completion_length": 602.7500305175781, "epoch": 0.56, - "grad_norm": 13.352376304592537, - "kl": 2.4765625, + "grad_norm": 20.768809175821836, + "kl": 5.4296875, "learning_rate": 5.342952264838747e-07, - "loss": 0.4526, - "reward": 2.6482033729553223, - "reward_std": 0.3250310868024826, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.014990968629717827, - "rewards/tag_count_reward": 0.96875, + "loss": 0.7266, + "reward": 1.9830502271652222, + "reward_std": 0.5142233371734619, + "rewards/accuracy_reward": 0.2500000111758709, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.02563036046922207, + "rewards/tag_count_reward": 0.7864583432674408, "step": 560 }, { "clip_ratio": 0.0, - "completion_length": 304.9791717529297, + "completion_length": 522.9583435058594, "epoch": 0.561, - "grad_norm": 12.99918228178363, - "kl": 2.23828125, + "grad_norm": 17.71687219992085, + "kl": 6.890625, "learning_rate": 5.32725485910616e-07, - "loss": 0.4029, - "reward": 2.4955302476882935, - "reward_std": 0.47989992797374725, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.021830917336046696, - "rewards/tag_count_reward": 0.96875, + "loss": 0.4285, + "reward": 2.2837836742401123, + "reward_std": 0.7183063626289368, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9583333134651184, + "rewards/repetition_penalty_reward": -0.02350800298154354, + "rewards/tag_count_reward": 0.8072916865348816, "step": 561 }, { "clip_ratio": 0.0, - "completion_length": 181.27084350585938, + "completion_length": 533.6666870117188, "epoch": 0.562, - "grad_norm": 13.401389295348176, - "kl": 1.15625, + "grad_norm": 17.98874232563595, + "kl": 2.96875, "learning_rate": 5.311559558218603e-07, - "loss": 0.1771, - "reward": 2.8899264335632324, - "reward_std": 0.13301172037608922, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005907091952394694, - "rewards/tag_count_reward": 1.0, + "loss": 0.7934, + "reward": 2.4511380195617676, + "reward_std": 0.5958435237407684, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.021084393840283155, + "rewards/tag_count_reward": 0.8958333432674408, "step": 562 }, { "clip_ratio": 0.0, - "completion_length": 270.1458435058594, + "completion_length": 678.0625305175781, "epoch": 0.563, - "grad_norm": 17.20728237283863, - "kl": 3.1015625, + "grad_norm": 20.342890063735087, + "kl": 6.2265625, "learning_rate": 5.295866553418358e-07, - "loss": 0.5065, - "reward": 2.905181646347046, - "reward_std": 0.2545322924852371, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014957254752516747, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8234, + "reward": 2.3168532848358154, + "reward_std": 0.6717122793197632, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.025160694494843483, + "rewards/tag_count_reward": 0.7864583730697632, "step": 563 }, { "clip_ratio": 0.0, - "completion_length": 275.3541793823242, + "completion_length": 617.2500305175781, "epoch": 0.564, - "grad_norm": 15.944772105410507, - "kl": 1.859375, + "grad_norm": 16.061198883154272, + "kl": 3.953125, "learning_rate": 5.28017603591974e-07, - "loss": 0.505, - "reward": 2.901887059211731, - "reward_std": 0.23889102041721344, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01998799294233322, - "rewards/tag_count_reward": 0.984375, + "loss": 0.9804, + "reward": 2.5708781480789185, + "reward_std": 0.6792239546775818, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.024608048610389233, + "rewards/tag_count_reward": 0.8593750298023224, "step": 564 }, { "clip_ratio": 0.0, - "completion_length": 418.3333435058594, + "completion_length": 500.29168701171875, "epoch": 0.565, - "grad_norm": 14.320653686242203, - "kl": 3.46875, + "grad_norm": 17.72408984099671, + "kl": 2.89453125, "learning_rate": 5.264488196906752e-07, - "loss": 0.6924, - "reward": 2.709797978401184, - "reward_std": 0.46882903575897217, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0349939139559865, - "rewards/tag_count_reward": 0.953125, + "loss": 0.7968, + "reward": 2.518498420715332, + "reward_std": 0.6503982543945312, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.024904441088438034, + "rewards/tag_count_reward": 0.9114583432674408, "step": 565 }, { "clip_ratio": 0.0, - "completion_length": 268.81250762939453, + "completion_length": 576.8958435058594, "epoch": 0.566, - "grad_norm": 13.487209487558582, - "kl": 2.025390625, + "grad_norm": 22.26454438337593, + "kl": 4.609375, "learning_rate": 5.248803227530763e-07, - "loss": 0.0756, - "reward": 2.934495449066162, - "reward_std": 0.0947490967810154, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9652778804302216, - "rewards/repetition_penalty_reward": -0.02557390369474888, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0678, + "reward": 2.483291506767273, + "reward_std": 0.4923284649848938, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.020180770196020603, + "rewards/tag_count_reward": 0.8437500298023224, "step": 566 }, { "clip_ratio": 0.0, - "completion_length": 228.95833587646484, + "completion_length": 555.2708587646484, "epoch": 0.567, - "grad_norm": 12.720588845203377, - "kl": 1.7890625, + "grad_norm": 15.919462093451902, + "kl": 5.5703125, "learning_rate": 5.233121318908173e-07, - "loss": 0.1153, - "reward": 2.925658345222473, - "reward_std": 0.13863864541053772, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010105773340910673, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7808, + "reward": 2.2207025289535522, + "reward_std": 0.6656961143016815, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9375001192092896, + "rewards/repetition_penalty_reward": -0.018881036899983883, + "rewards/tag_count_reward": 0.8645833432674408, "step": 567 }, { "clip_ratio": 0.0, - "completion_length": 230.25000762939453, + "completion_length": 642.1041870117188, "epoch": 0.568, - "grad_norm": 18.333229344716603, - "kl": 2.6640625, + "grad_norm": 21.902991875998683, + "kl": 6.234375, "learning_rate": 5.21744266211809e-07, - "loss": 0.3118, - "reward": 2.899394392967224, - "reward_std": 0.14636608306318521, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.013800207525491714, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.7828, + "reward": 2.2312402725219727, + "reward_std": 0.6396611332893372, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023968255147337914, + "rewards/tag_count_reward": 0.7968750298023224, "step": 568 }, { "clip_ratio": 0.0, - "completion_length": 278.7708435058594, + "completion_length": 493.2708435058594, "epoch": 0.569, - "grad_norm": 15.950186419162394, - "kl": 3.1904296875, + "grad_norm": 11.796721143909934, + "kl": 2.9140625, "learning_rate": 5.2017674482e-07, - "loss": 0.7446, - "reward": 2.886800169944763, - "reward_std": 0.25502097187563777, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02465830370783806, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.4568, + "reward": 2.5521020889282227, + "reward_std": 0.6523496210575104, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.02081457804888487, + "rewards/tag_count_reward": 0.8854166865348816, "step": 569 }, { "clip_ratio": 0.0, - "completion_length": 181.3125, + "completion_length": 563.8750305175781, "epoch": 0.57, - "grad_norm": 24.901491000705754, - "kl": 1.87109375, + "grad_norm": 16.743725013071696, + "kl": 4.6015625, "learning_rate": 5.186095868151436e-07, - "loss": 0.2866, - "reward": 2.9956284761428833, - "reward_std": 0.0075892198365181684, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004371511400677264, - "rewards/tag_count_reward": 1.0, + "loss": 0.542, + "reward": 2.138919949531555, + "reward_std": 0.6218420267105103, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.017330123111605644, + "rewards/tag_count_reward": 0.8020833432674408, "step": 570 }, { "clip_ratio": 0.0, - "completion_length": 264.5416717529297, + "completion_length": 593.0625305175781, "epoch": 0.571, - "grad_norm": 15.665007769940441, - "kl": 1.9375, + "grad_norm": 31.64391481670669, + "kl": 5.453125, "learning_rate": 5.170428112925659e-07, - "loss": 0.2378, - "reward": 2.651398181915283, - "reward_std": 0.19297663122415543, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.017004678025841713, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.6923, + "reward": 2.3365062475204468, + "reward_std": 0.6305623352527618, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.02460480574518442, + "rewards/tag_count_reward": 0.8333333432674408, "step": 571 }, { "clip_ratio": 0.0, - "completion_length": 220.79167938232422, + "completion_length": 431.81251525878906, "epoch": 0.572, - "grad_norm": 14.843994300972396, - "kl": 3.4765625, + "grad_norm": 18.846581957559952, + "kl": 1.83203125, "learning_rate": 5.154764373429315e-07, - "loss": 0.4761, - "reward": 2.8788846731185913, - "reward_std": 0.21930089266970754, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011740590212866664, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3648, + "reward": 2.5100269317626953, + "reward_std": 0.38545116782188416, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.022959282621741295, + "rewards/tag_count_reward": 0.9218750298023224, "step": 572 }, { "clip_ratio": 0.0, - "completion_length": 188.9375, + "completion_length": 553.2291717529297, "epoch": 0.573, - "grad_norm": 24.84769278457942, - "kl": 3.54296875, + "grad_norm": 13.121698712408023, + "kl": 3.3359375, "learning_rate": 5.139104840520135e-07, - "loss": 0.2086, - "reward": 2.535950183868408, - "reward_std": 0.09902881435118616, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005716606043279171, - "rewards/tag_count_reward": 1.0, + "loss": 0.7169, + "reward": 2.331916570663452, + "reward_std": 0.6915982961654663, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.030930647626519203, + "rewards/tag_count_reward": 0.8697916865348816, "step": 573 }, { "clip_ratio": 0.0, - "completion_length": 253.7291717529297, + "completion_length": 730.5625, "epoch": 0.574, - "grad_norm": 20.225766295368967, - "kl": 4.0625, + "grad_norm": 35.573664674103505, + "kl": 7.890625, "learning_rate": 5.123449705004581e-07, - "loss": 0.5296, - "reward": 2.9398797750473022, - "reward_std": 0.09285355894826353, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016717405407689512, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8329, + "reward": 1.8582618832588196, + "reward_std": 0.7164104580879211, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.018474191427230835, + "rewards/tag_count_reward": 0.703125, "step": 574 }, { "clip_ratio": 0.0, - "completion_length": 207.1666717529297, + "completion_length": 706.9166870117188, "epoch": 0.575, - "grad_norm": 26.81042849414736, - "kl": 3.109375, + "grad_norm": 12.890226305523278, + "kl": 5.65625, "learning_rate": 5.107799157635538e-07, - "loss": 0.0434, - "reward": 2.853146553039551, - "reward_std": 0.21280575916171074, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.013172881212085485, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.7742, + "reward": 2.287819743156433, + "reward_std": 0.8128792345523834, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.02641639020293951, + "rewards/tag_count_reward": 0.7656250298023224, "step": 575 }, { "clip_ratio": 0.0, - "completion_length": 290.3541717529297, + "completion_length": 678.9166870117188, "epoch": 0.576, - "grad_norm": 47.214348733471816, - "kl": 7.8515625, + "grad_norm": 22.0467723806393, + "kl": 6.984375, "learning_rate": 5.09215338910999e-07, - "loss": 0.8872, - "reward": 2.853019595146179, - "reward_std": 0.2147191883996129, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.013300020480528474, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8976, + "reward": 2.1274478435516357, + "reward_std": 0.8857446014881134, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.016649481374770403, + "rewards/tag_count_reward": 0.7552083432674408, "step": 576 }, { "clip_ratio": 0.0, - "completion_length": 267.9583435058594, + "completion_length": 843.8333435058594, "epoch": 0.577, - "grad_norm": 19.616859953874243, - "kl": 4.65625, + "grad_norm": 24.290171025234606, + "kl": 7.203125, "learning_rate": 5.076512590066685e-07, - "loss": 0.7547, - "reward": 2.877570629119873, - "reward_std": 0.2527960389852524, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007846252992749214, - "rewards/tag_count_reward": 0.96875, + "loss": 0.9397, + "reward": 1.9734002351760864, + "reward_std": 0.8574240803718567, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.012710887007415295, + "rewards/tag_count_reward": 0.6458333432674408, "step": 577 }, { "clip_ratio": 0.0, - "completion_length": 206.7916717529297, + "completion_length": 601.8958435058594, "epoch": 0.578, - "grad_norm": 14.247435042678376, - "kl": 2.6015625, + "grad_norm": 13.84698671695797, + "kl": 4.265625, "learning_rate": 5.060876951083828e-07, - "loss": 0.371, - "reward": 2.9510765075683594, - "reward_std": 0.10527589311823249, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007257002405822277, - "rewards/tag_count_reward": 1.0, + "loss": 0.6815, + "reward": 2.2862409353256226, + "reward_std": 0.7043590843677521, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01757870987057686, + "rewards/tag_count_reward": 0.796875, "step": 578 }, { "clip_ratio": 0.0, - "completion_length": 227.7916717529297, + "completion_length": 473.2083435058594, "epoch": 0.579, - "grad_norm": 11.166406455014867, - "kl": 1.0, + "grad_norm": 21.25841736364304, + "kl": 2.7265625, "learning_rate": 5.045246662676741e-07, - "loss": 0.0528, - "reward": 2.8006995916366577, - "reward_std": 0.2522887587547302, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011800445383414626, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4115, + "reward": 2.3248329162597656, + "reward_std": 0.6839044392108917, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022389397025108337, + "rewards/tag_count_reward": 0.8750000298023224, "step": 579 }, { "clip_ratio": 0.0, - "completion_length": 200.1875, + "completion_length": 506.2708435058594, "epoch": 0.58, - "grad_norm": 8.98992050750106, - "kl": 1.09765625, + "grad_norm": 13.768352502526005, + "kl": 2.55859375, "learning_rate": 5.02962191529556e-07, - "loss": 0.1204, - "reward": 2.78869366645813, - "reward_std": 0.15213570836931467, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.009917512070387602, - "rewards/tag_count_reward": 1.0, + "loss": 0.4207, + "reward": 2.34650194644928, + "reward_std": 0.4788215011358261, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.03023417294025421, + "rewards/tag_count_reward": 0.890625, "step": 580 }, { "clip_ratio": 0.0, - "completion_length": 188.39583587646484, + "completion_length": 509.14585876464844, "epoch": 0.581, - "grad_norm": 12.699860575377372, - "kl": 1.31640625, + "grad_norm": 31.447352351340843, + "kl": 2.5546875, "learning_rate": 5.014002899322896e-07, - "loss": 0.1168, - "reward": 2.986881971359253, - "reward_std": 0.030311900656670332, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.006173709291033447, - "rewards/tag_count_reward": 1.0, + "loss": 0.7482, + "reward": 2.202039122581482, + "reward_std": 0.689550518989563, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02712748385965824, + "rewards/tag_count_reward": 0.8750000298023224, "step": 581 }, { "clip_ratio": 0.0, - "completion_length": 330.2083435058594, + "completion_length": 636.8750305175781, "epoch": 0.582, - "grad_norm": 13.418631640585215, - "kl": 3.21875, + "grad_norm": 31.988630155115366, + "kl": 5.734375, "learning_rate": 4.998389805071536e-07, - "loss": 0.7679, - "reward": 2.8146709203720093, - "reward_std": 0.3184046745300293, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.018662618473172188, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.9076, + "reward": 2.169505000114441, + "reward_std": 0.861674964427948, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.028411878272891045, + "rewards/tag_count_reward": 0.7812500298023224, "step": 582 }, { "clip_ratio": 0.0, - "completion_length": 189.7291717529297, + "completion_length": 605.1666870117188, "epoch": 0.583, - "grad_norm": 17.513603027134494, - "kl": 1.390625, + "grad_norm": 18.313268799229025, + "kl": 3.3671875, "learning_rate": 4.982782822782101e-07, - "loss": 0.1662, - "reward": 2.7447941303253174, - "reward_std": 0.007499830797314644, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005205954425036907, - "rewards/tag_count_reward": 1.0, + "loss": 0.8556, + "reward": 2.253504157066345, + "reward_std": 0.5843206644058228, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.020801476202905178, + "rewards/tag_count_reward": 0.8854166865348816, "step": 583 }, { "clip_ratio": 0.0, - "completion_length": 209.89584350585938, + "completion_length": 519.5000305175781, "epoch": 0.584, - "grad_norm": 10.693227268542563, - "kl": 1.9375, + "grad_norm": 12.893361447903395, + "kl": 5.90625, "learning_rate": 4.967182142620745e-07, - "loss": 0.3712, - "reward": 2.943057417869568, - "reward_std": 0.16701282560825348, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.003123341826722026, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.5758, + "reward": 2.2402533292770386, + "reward_std": 0.9014249742031097, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.020163366571068764, + "rewards/tag_count_reward": 0.7604166865348816, "step": 584 }, { "clip_ratio": 0.0, - "completion_length": 296.0416717529297, + "completion_length": 507.9583435058594, "epoch": 0.585, - "grad_norm": 11.800433043925521, - "kl": 2.80859375, + "grad_norm": 2061.671060113234, + "kl": 9.4453125, "learning_rate": 4.951587954676837e-07, - "loss": 0.4711, - "reward": 2.604295015335083, - "reward_std": 0.4741365760564804, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.018968880642205477, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.0213, + "reward": 2.4675891399383545, + "reward_std": 0.6850857138633728, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02546628937125206, + "rewards/tag_count_reward": 0.9166666865348816, "step": 585 }, { "clip_ratio": 0.0, - "completion_length": 239.9791717529297, + "completion_length": 554.5208587646484, "epoch": 0.586, - "grad_norm": 19.90426066364908, - "kl": 2.0703125, + "grad_norm": 19.335025088394126, + "kl": 3.296875, "learning_rate": 4.93600044896063e-07, - "loss": 0.1971, - "reward": 2.8646950721740723, - "reward_std": 0.28305116295814514, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015513446182012558, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.7192, + "reward": 2.120868682861328, + "reward_std": 0.5526419132947922, + "rewards/accuracy_reward": 0.2916666679084301, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.01802036538720131, + "rewards/tag_count_reward": 0.875, "step": 586 }, { "clip_ratio": 0.0, - "completion_length": 206.7291717529297, + "completion_length": 629.6041870117188, "epoch": 0.587, - "grad_norm": 11.30082738056735, - "kl": 1.890625, + "grad_norm": 18.146859923722392, + "kl": 4.8671875, "learning_rate": 4.920419815400968e-07, - "loss": 0.2445, - "reward": 2.9491833448410034, - "reward_std": 0.13919522892683744, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012622236623428762, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.886, + "reward": 2.283421516418457, + "reward_std": 0.7014163732528687, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02039794623851776, + "rewards/tag_count_reward": 0.7968750298023224, "step": 587 }, { "clip_ratio": 0.0, - "completion_length": 248.83333587646484, + "completion_length": 870.6041870117188, "epoch": 0.588, - "grad_norm": 14.481732702815943, - "kl": 2.0859375, + "grad_norm": 40.04439172717602, + "kl": 9.53125, "learning_rate": 4.904846243842949e-07, - "loss": 0.4443, - "reward": 2.9367491006851196, - "reward_std": 0.1736038289964199, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.014640030916780233, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8823, + "reward": 1.9214681386947632, + "reward_std": 0.8858973979949951, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.0368652418255806, + "rewards/tag_count_reward": 0.6875000298023224, "step": 588 }, { "clip_ratio": 0.0, - "completion_length": 266.9583435058594, + "completion_length": 655.1250305175781, "epoch": 0.589, - "grad_norm": 8.14033231581376, - "kl": 2.140625, + "grad_norm": 38.61615762624294, + "kl": 6.953125, "learning_rate": 4.88927992404563e-07, - "loss": 0.1238, - "reward": 2.8683533668518066, - "reward_std": 0.19320962950587273, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.018799440003931522, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.9102, + "reward": 2.024886429309845, + "reward_std": 0.7311058044433594, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.028933134861290455, + "rewards/tag_count_reward": 0.7760416865348816, "step": 589 }, { "clip_ratio": 0.0, - "completion_length": 189.4791717529297, + "completion_length": 536.8333435058594, "epoch": 0.59, - "grad_norm": 9.330085471394412, - "kl": 0.703125, + "grad_norm": 18.369342242727228, + "kl": 4.1875, "learning_rate": 4.873721045679706e-07, - "loss": 0.004, - "reward": 2.7432258129119873, - "reward_std": 0.006511852843686938, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006774259149096906, - "rewards/tag_count_reward": 1.0, + "loss": 0.7111, + "reward": 2.163462996482849, + "reward_std": 0.677480012178421, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.020564796403050423, + "rewards/tag_count_reward": 0.8645833432674408, "step": 590 }, { "clip_ratio": 0.0, - "completion_length": 286.1041717529297, + "completion_length": 424.18751525878906, "epoch": 0.591, - "grad_norm": 11.366357415570771, - "kl": 2.39453125, + "grad_norm": 16.75415987323558, + "kl": 2.5, "learning_rate": 4.858169798325198e-07, - "loss": 0.2903, - "reward": 2.826495051383972, - "reward_std": 0.12824256252497435, + "loss": 0.3791, + "reward": 2.6984031200408936, + "reward_std": 0.4382772147655487, "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01204665843397379, - "rewards/tag_count_reward": 0.984375, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.018610700964927673, + "rewards/tag_count_reward": 0.9114583730697632, "step": 591 }, { "clip_ratio": 0.0, - "completion_length": 236.95834350585938, + "completion_length": 541.2083435058594, "epoch": 0.592, - "grad_norm": 7.941124920798613, - "kl": 1.2890625, + "grad_norm": 29.77449375477728, + "kl": 5.0546875, "learning_rate": 4.842626371469149e-07, - "loss": 0.0808, - "reward": 2.9577300548553467, - "reward_std": 0.06351701728999615, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.021436726208776236, - "rewards/tag_count_reward": 1.0, + "loss": 0.6774, + "reward": 2.1399881839752197, + "reward_std": 0.769228607416153, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.0197342149913311, + "rewards/tag_count_reward": 0.7708333432674408, "step": 592 }, { "clip_ratio": 0.0, - "completion_length": 274.62501525878906, + "completion_length": 536.8125305175781, "epoch": 0.593, - "grad_norm": 10.32011046498739, - "kl": 0.673828125, + "grad_norm": 18.37611006786208, + "kl": 2.578125, "learning_rate": 4.827090954503308e-07, - "loss": 0.0601, - "reward": 2.9369728565216064, - "reward_std": 0.10752777382731438, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.021360446698963642, - "rewards/tag_count_reward": 1.0, + "loss": 0.5711, + "reward": 2.4687711000442505, + "reward_std": 0.6630526483058929, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02428449969738722, + "rewards/tag_count_reward": 0.8750000298023224, "step": 593 }, { "clip_ratio": 0.0, - "completion_length": 369.56251525878906, + "completion_length": 586.25, "epoch": 0.594, - "grad_norm": 34.24917664394145, - "kl": 3.078125, + "grad_norm": 14.12630099692924, + "kl": 4.84375, "learning_rate": 4.811563736721829e-07, - "loss": 0.4633, - "reward": 2.649326801300049, - "reward_std": 0.40142597258090973, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.019076048163697124, - "rewards/tag_count_reward": 0.953125, + "loss": 0.7276, + "reward": 2.257407784461975, + "reward_std": 0.7551295161247253, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.030786780640482903, + "rewards/tag_count_reward": 0.7395833730697632, "step": 594 }, { "clip_ratio": 0.0, - "completion_length": 432.95835876464844, + "completion_length": 642.9583435058594, "epoch": 0.595, - "grad_norm": 13.823892166306113, - "kl": 4.06640625, + "grad_norm": 21.92582517717831, + "kl": 4.5703125, "learning_rate": 4.79604490731896e-07, - "loss": 0.6464, - "reward": 2.4132511615753174, - "reward_std": 0.4310075640678406, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017304460052400827, - "rewards/tag_count_reward": 0.9375, + "loss": 1.0746, + "reward": 2.405961036682129, + "reward_std": 0.8549763560295105, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.012441786471754313, + "rewards/tag_count_reward": 0.7656250298023224, "step": 595 }, { "clip_ratio": 0.0, - "completion_length": 218.33334350585938, + "completion_length": 626.7500305175781, "epoch": 0.596, - "grad_norm": 18.01717614262785, - "kl": 1.89453125, + "grad_norm": 17.21164988814367, + "kl": 4.7109375, "learning_rate": 4.780534655386743e-07, - "loss": 0.4039, - "reward": 2.9618523120880127, - "reward_std": 0.12083648517727852, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005161744426004589, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8805, + "reward": 2.197944164276123, + "reward_std": 0.7794502377510071, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.020805937238037586, + "rewards/tag_count_reward": 0.8020833432674408, "step": 596 }, { "clip_ratio": 0.0, - "completion_length": 278.1458435058594, + "completion_length": 835.3958435058594, "epoch": 0.597, - "grad_norm": 1117.3329602623837, - "kl": 10.1015625, + "grad_norm": 15.129508307858261, + "kl": 5.84375, "learning_rate": 4.7650331699127013e-07, - "loss": 0.4104, - "reward": 2.8861132860183716, - "reward_std": 0.22613783925771713, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02013701805844903, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8528, + "reward": 2.195970058441162, + "reward_std": 0.8639613389968872, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01930786296725273, + "rewards/tag_count_reward": 0.6875000298023224, "step": 597 }, { "clip_ratio": 0.0, - "completion_length": 244.7291717529297, + "completion_length": 786.5208435058594, "epoch": 0.598, - "grad_norm": 14.100839255560855, - "kl": 2.19921875, + "grad_norm": 11.69120320460148, + "kl": 5.765625, "learning_rate": 4.749540639777539e-07, - "loss": 0.5039, - "reward": 2.716684103012085, - "reward_std": 0.10242344066500664, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005538215569686145, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.0647, + "reward": 2.122739553451538, + "reward_std": 0.8401265442371368, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.014413274824619293, + "rewards/tag_count_reward": 0.6927083432674408, "step": 598 }, { "clip_ratio": 0.0, - "completion_length": 240.875, + "completion_length": 647.9375305175781, "epoch": 0.599, - "grad_norm": 10.89913117945693, - "kl": 1.80859375, + "grad_norm": 56.51129938563486, + "kl": 5.953125, "learning_rate": 4.7340572537528547e-07, - "loss": 0.1403, - "reward": 2.7366116046905518, - "reward_std": 0.2655755281448364, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.01165237883105874, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.9006, + "reward": 1.922993242740631, + "reward_std": 0.7214177250862122, + "rewards/accuracy_reward": 0.2500000074505806, + "rewards/reasoning_steps_reward": 0.888888955116272, + "rewards/repetition_penalty_reward": -0.023187357001006603, + "rewards/tag_count_reward": 0.8072916865348816, "step": 599 }, { "clip_ratio": 0.0, - "completion_length": 297.5208435058594, + "completion_length": 613.0833587646484, "epoch": 0.6, - "grad_norm": 17.020007989261867, - "kl": 2.60546875, + "grad_norm": 19.124513938177042, + "kl": 4.421875, "learning_rate": 4.7185832004988133e-07, - "loss": 0.4497, - "reward": 2.815277934074402, - "reward_std": 0.4092426598072052, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01631933473981917, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8773, + "reward": 2.302823781967163, + "reward_std": 0.793424665927887, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.03745407238602638, + "rewards/tag_count_reward": 0.8125, "step": 600 }, { "clip_ratio": 0.0, - "completion_length": 247.58334350585938, + "completion_length": 573.4375305175781, "epoch": 0.601, - "grad_norm": 11.989076278081575, - "kl": 2.4765625, + "grad_norm": 21.633820801259493, + "kl": 5.6875, "learning_rate": 4.703118668561875e-07, - "loss": 0.3905, - "reward": 2.8410815000534058, - "reward_std": 0.23662713915109634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.011349301552399993, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8408, + "reward": 1.9621968269348145, + "reward_std": 0.7473039925098419, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9166666567325592, + "rewards/repetition_penalty_reward": -0.016969827935099602, + "rewards/tag_count_reward": 0.7708333432674408, "step": 601 }, { "clip_ratio": 0.0, - "completion_length": 321.4375, + "completion_length": 409.9583435058594, "epoch": 0.602, - "grad_norm": 25.393481717105253, - "kl": 3.34375, + "grad_norm": 21.57642847298448, + "kl": 2.21875, "learning_rate": 4.68766384637248e-07, - "loss": 0.6337, - "reward": 2.6751890182495117, - "reward_std": 0.374066099524498, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02446372713893652, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.595, + "reward": 2.3019765615463257, + "reward_std": 0.5564347207546234, + "rewards/accuracy_reward": 0.4166666679084301, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.01573185622692108, + "rewards/tag_count_reward": 0.9218750298023224, "step": 602 }, { "clip_ratio": 0.0, - "completion_length": 266.25001525878906, + "completion_length": 656.0000305175781, "epoch": 0.603, - "grad_norm": 21.292989818375567, - "kl": 3.890625, + "grad_norm": 18.78803423208506, + "kl": 4.390625, "learning_rate": 4.672218922242759e-07, - "loss": 0.7378, - "reward": 2.934574007987976, - "reward_std": 0.12263527151662856, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00813442698563449, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8124, + "reward": 2.2104203701019287, + "reward_std": 0.60920250415802, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.0170103432610631, + "rewards/tag_count_reward": 0.796875, "step": 603 }, { "clip_ratio": 0.0, - "completion_length": 191.5, + "completion_length": 704.3541870117188, "epoch": 0.604, - "grad_norm": 19.357742696467227, - "kl": 2.70703125, + "grad_norm": 17.09263062702093, + "kl": 5.625, "learning_rate": 4.656784084364238e-07, - "loss": 0.1731, - "reward": 2.9714030027389526, - "reward_std": 0.0844070240855217, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.004291372140869498, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8064, + "reward": 1.9496487379074097, + "reward_std": 0.6896158158779144, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.017365206964313984, + "rewards/tag_count_reward": 0.7031250298023224, "step": 604 }, { "clip_ratio": 0.0, - "completion_length": 301.06251525878906, + "completion_length": 709.5416870117188, "epoch": 0.605, - "grad_norm": 23.383245507946857, - "kl": 5.5, + "grad_norm": 14.18186552493172, + "kl": 5.40625, "learning_rate": 4.641359520805548e-07, - "loss": 0.7556, - "reward": 2.7268714904785156, - "reward_std": 0.24603672325611115, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0057675400748848915, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.9465, + "reward": 1.9041990041732788, + "reward_std": 0.5497189462184906, + "rewards/accuracy_reward": 0.25000000558793545, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.014204025734215975, + "rewards/tag_count_reward": 0.7447916865348816, "step": 605 }, { "clip_ratio": 0.0, - "completion_length": 220.8541717529297, + "completion_length": 644.1875, "epoch": 0.606, - "grad_norm": 51.81074352447238, - "kl": 4.3515625, + "grad_norm": 21.61369420565558, + "kl": 4.0625, "learning_rate": 4.6259454195101267e-07, - "loss": 0.3715, - "reward": 2.490087389945984, - "reward_std": 0.2846665307879448, - "rewards/accuracy_reward": 0.5416666865348816, + "loss": 1.0074, + "reward": 2.424658179283142, + "reward_std": 0.7420503497123718, + "rewards/accuracy_reward": 0.6458333730697632, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008176573319360614, - "rewards/tag_count_reward": 0.9635416865348816, + "rewards/repetition_penalty_reward": -0.021522408351302147, + "rewards/tag_count_reward": 0.8072916865348816, "step": 606 }, { "clip_ratio": 0.0, - "completion_length": 246.64584350585938, + "completion_length": 675.5833740234375, "epoch": 0.607, - "grad_norm": 16.03826543431405, - "kl": 3.34375, + "grad_norm": 20.01514537964165, + "kl": 4.484375, "learning_rate": 4.6105419682939316e-07, - "loss": 0.4905, - "reward": 2.8872032165527344, - "reward_std": 0.27019158005714417, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.013838549610227346, - "rewards/tag_count_reward": 0.984375, + "loss": 0.94, + "reward": 2.2794888019561768, + "reward_std": 0.6631845533847809, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.015650033485144377, + "rewards/tag_count_reward": 0.7604166865348816, "step": 607 }, { "clip_ratio": 0.0, - "completion_length": 249.68751525878906, + "completion_length": 588.3125, "epoch": 0.608, - "grad_norm": 36.26601254592886, - "kl": 3.1796875, + "grad_norm": 15.233743739571594, + "kl": 3.671875, "learning_rate": 4.59514935484316e-07, - "loss": 0.9279, - "reward": 2.8424474000930786, - "reward_std": 0.4830077439546585, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.011719382368028164, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.6811, + "reward": 2.3253376483917236, + "reward_std": 0.7875647842884064, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.013204073999077082, + "rewards/tag_count_reward": 0.7968750298023224, "step": 608 }, { "clip_ratio": 0.0, - "completion_length": 290.5625, + "completion_length": 443.0625, "epoch": 0.609, - "grad_norm": 13.581435606171588, - "kl": 2.203125, + "grad_norm": 23.918832923297472, + "kl": 2.390625, "learning_rate": 4.579767766711944e-07, - "loss": 0.5108, - "reward": 2.6821742057800293, - "reward_std": 0.28868842124938965, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.017478578723967075, - "rewards/tag_count_reward": 0.984375, + "loss": 0.5455, + "reward": 2.240389823913574, + "reward_std": 0.6115424335002899, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02349921502172947, + "rewards/tag_count_reward": 0.8958333432674408, "step": 609 }, { "clip_ratio": 0.0, - "completion_length": 178.4791717529297, + "completion_length": 501.1458435058594, "epoch": 0.61, - "grad_norm": 36.41572435981373, - "kl": 1.65625, + "grad_norm": 21.038617061450985, + "kl": 4.1015625, "learning_rate": 4.5643973913200837e-07, - "loss": 0.3989, - "reward": 2.4863682985305786, - "reward_std": 0.04183072363957763, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0032150641782209277, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.809, + "reward": 2.464809775352478, + "reward_std": 0.7636359930038452, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9374999701976776, + "rewards/repetition_penalty_reward": -0.019565261900424957, + "rewards/tag_count_reward": 0.8593750298023224, "step": 610 }, { "clip_ratio": 0.0, - "completion_length": 217.33334350585938, + "completion_length": 681.8125, "epoch": 0.611, - "grad_norm": 29.34643208701461, - "kl": 1.607421875, + "grad_norm": 166.2589762047824, + "kl": 12.0625, "learning_rate": 4.549038415950751e-07, - "loss": 0.3667, - "reward": 2.6305577754974365, - "reward_std": 0.39583562314510345, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.006595249753445387, - "rewards/tag_count_reward": 0.984375, + "loss": 1.4371, + "reward": 1.9376187920570374, + "reward_std": 0.9399848580360413, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.013770201243460178, + "rewards/tag_count_reward": 0.6875000298023224, "step": 611 }, { "clip_ratio": 0.0, - "completion_length": 213.08334350585938, + "completion_length": 600.0625152587891, "epoch": 0.612, - "grad_norm": 9.597333907981147, - "kl": 2.51171875, + "grad_norm": 63.22955290170651, + "kl": 7.67578125, "learning_rate": 4.5336910277482155e-07, - "loss": 0.4284, - "reward": 2.7900279760360718, - "reward_std": 0.23067965358495712, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.010319311637431383, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 1.1131, + "reward": 2.076914668083191, + "reward_std": 0.6576991975307465, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.8472222685813904, + "rewards/repetition_penalty_reward": -0.02030777558684349, + "rewards/tag_count_reward": 0.8541666865348816, "step": 612 }, { "clip_ratio": 0.0, - "completion_length": 250.1875, + "completion_length": 442.7291717529297, "epoch": 0.613, - "grad_norm": 19.759163541178832, - "kl": 2.89453125, + "grad_norm": 41.71612091835444, + "kl": 4.90625, "learning_rate": 4.51835541371556e-07, - "loss": 0.4515, - "reward": 2.8175108432769775, - "reward_std": 0.2531072050333023, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.01755856117233634, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.4885, + "reward": 2.2476359605789185, + "reward_std": 0.6959330439567566, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.01972527615725994, + "rewards/tag_count_reward": 0.8854166865348816, "step": 613 }, { "clip_ratio": 0.0, - "completion_length": 285.6666717529297, + "completion_length": 576.3750305175781, "epoch": 0.614, - "grad_norm": 19.129528656799383, - "kl": 3.359375, + "grad_norm": 22.018133081081007, + "kl": 4.546875, "learning_rate": 4.503031760712397e-07, - "loss": 0.5804, - "reward": 2.614456057548523, - "reward_std": 0.39149369299411774, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014016230590641499, - "rewards/tag_count_reward": 0.96875, + "loss": 0.813, + "reward": 2.5740822553634644, + "reward_std": 0.7949818074703217, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01619557524099946, + "rewards/tag_count_reward": 0.8541666865348816, "step": 614 }, { "clip_ratio": 0.0, - "completion_length": 227.50000762939453, + "completion_length": 558.2916870117188, "epoch": 0.615, - "grad_norm": 110.24006783167859, - "kl": 1.7734375, + "grad_norm": 20.90403864764217, + "kl": 4.046875, "learning_rate": 4.4877202554526084e-07, - "loss": 0.6454, - "reward": 2.9104275703430176, - "reward_std": 0.28398814611136913, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.006238935049623251, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8765, + "reward": 2.31480872631073, + "reward_std": 0.6609582602977753, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.018524575512856245, + "rewards/tag_count_reward": 0.875, "step": 615 }, { "clip_ratio": 0.0, - "completion_length": 183.45834350585938, + "completion_length": 571.6666870117188, "epoch": 0.616, - "grad_norm": 5.53691882563287, - "kl": 0.7333984375, + "grad_norm": 20.967646246131107, + "kl": 5.28125, "learning_rate": 4.4724210845020494e-07, - "loss": -0.0601, - "reward": 2.9226795434951782, - "reward_std": 0.16225393116474152, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.016556608956307173, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.7339, + "reward": 2.366945743560791, + "reward_std": 0.5788512378931046, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.014998776838183403, + "rewards/tag_count_reward": 0.8125, "step": 616 }, { "clip_ratio": 0.0, - "completion_length": 297.12501525878906, + "completion_length": 597.75, "epoch": 0.617, - "grad_norm": 19.376352962370834, - "kl": 2.564453125, + "grad_norm": 62.998128656761025, + "kl": 8.015625, "learning_rate": 4.457134434276293e-07, - "loss": 0.3272, - "reward": 2.608876943588257, - "reward_std": 0.23755387589335442, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.021331378258764744, - "rewards/tag_count_reward": 0.984375, + "loss": 1.0981, + "reward": 2.1459895372390747, + "reward_std": 0.7351883053779602, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.011996725108474493, + "rewards/tag_count_reward": 0.7552083432674408, "step": 617 }, { "clip_ratio": 0.0, - "completion_length": 239.20833587646484, + "completion_length": 578.9583435058594, "epoch": 0.618, - "grad_norm": 20.5961575097037, - "kl": 2.0703125, + "grad_norm": 35.2952233783846, + "kl": 7.296875, "learning_rate": 4.441860491038345e-07, - "loss": 0.3409, - "reward": 2.7382986545562744, - "reward_std": 0.2788648307323456, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015173627529293299, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.1757, + "reward": 1.9162747263908386, + "reward_std": 0.7140241265296936, + "rewards/accuracy_reward": 0.2708333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.029905788600444794, + "rewards/tag_count_reward": 0.7656250298023224, "step": 618 }, { "clip_ratio": 0.0, - "completion_length": 320.81250762939453, + "completion_length": 412.0208435058594, "epoch": 0.619, - "grad_norm": 11.404867137908473, - "kl": 3.390625, + "grad_norm": 20.183860058369124, + "kl": 3.5234375, "learning_rate": 4.4265994408963867e-07, - "loss": 0.4104, - "reward": 2.680772304534912, - "reward_std": 0.288493387401104, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9513889253139496, - "rewards/repetition_penalty_reward": -0.025824937503784895, - "rewards/tag_count_reward": 0.984375, + "loss": 0.6553, + "reward": 2.6022950410842896, + "reward_std": 0.690807044506073, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.01749663334339857, + "rewards/tag_count_reward": 0.8697916865348816, "step": 619 }, { "clip_ratio": 0.0, - "completion_length": 191.0625, + "completion_length": 551.7083587646484, "epoch": 0.62, - "grad_norm": 8.921413334254904, - "kl": 0.720703125, + "grad_norm": 30.2741526195306, + "kl": 3.6640625, "learning_rate": 4.4113514698014953e-07, - "loss": 0.0473, - "reward": 2.860893130302429, - "reward_std": 0.15492321690544486, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007162411697208881, - "rewards/tag_count_reward": 1.0, + "loss": 1.2213, + "reward": 2.3724182844161987, + "reward_std": 0.7322799563407898, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.021679001860320568, + "rewards/tag_count_reward": 0.8802083730697632, "step": 620 }, { "clip_ratio": 0.0, - "completion_length": 196.33334350585938, + "completion_length": 477.89585876464844, "epoch": 0.621, - "grad_norm": 12.938101775630177, - "kl": 1.12890625, + "grad_norm": 23.62241565191626, + "kl": 4.26171875, "learning_rate": 4.3961167635453876e-07, - "loss": 0.2863, - "reward": 2.7922120094299316, - "reward_std": 0.23985068500041962, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00466301734559238, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.6591, + "reward": 2.3674203753471375, + "reward_std": 0.6224862933158875, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.012787933927029371, + "rewards/tag_count_reward": 0.7968750298023224, "step": 621 }, { "clip_ratio": 0.0, - "completion_length": 195.4166717529297, + "completion_length": 589.2500305175781, "epoch": 0.622, - "grad_norm": 8.78381099654674, - "kl": 1.044921875, + "grad_norm": 26.80506804197189, + "kl": 4.28125, "learning_rate": 4.3808955077581546e-07, - "loss": 0.1121, - "reward": 2.7049620151519775, - "reward_std": 0.1331772692501545, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.006843709939857945, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.3034, + "reward": 2.1128806471824646, + "reward_std": 0.6486698687076569, + "rewards/accuracy_reward": 0.37500002048909664, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.01906377449631691, + "rewards/tag_count_reward": 0.8333333432674408, "step": 622 }, { "clip_ratio": 0.0, - "completion_length": 305.93751525878906, + "completion_length": 591.5416870117188, "epoch": 0.623, - "grad_norm": 18.024063332931444, - "kl": 3.3515625, + "grad_norm": 17.029635227087233, + "kl": 4.71875, "learning_rate": 4.365687887905988e-07, - "loss": 0.9923, - "reward": 2.729868769645691, - "reward_std": 0.3502872884273529, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013186943717300892, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.7684, + "reward": 2.364261269569397, + "reward_std": 0.7807703912258148, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.017683234997093678, + "rewards/tag_count_reward": 0.8125000298023224, "step": 623 }, { "clip_ratio": 0.0, - "completion_length": 230.0416717529297, + "completion_length": 657.8541870117188, "epoch": 0.624, - "grad_norm": 22.21612200768272, - "kl": 1.173828125, + "grad_norm": 29.359681860338085, + "kl": 8.28125, "learning_rate": 4.350494089288943e-07, - "loss": 0.2699, - "reward": 2.818644881248474, - "reward_std": 0.1639364566653967, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.01121642580255866, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.1389, + "reward": 2.2420893907546997, + "reward_std": 0.8185697495937347, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01485508494079113, + "rewards/tag_count_reward": 0.7083333432674408, "step": 624 }, { "clip_ratio": 0.0, - "completion_length": 211.58334350585938, + "completion_length": 572.8958435058594, "epoch": 0.625, - "grad_norm": 30.292390094152662, - "kl": 1.142578125, + "grad_norm": 25.429883504952183, + "kl": 5.6875, "learning_rate": 4.3353142970386557e-07, - "loss": 0.5253, - "reward": 2.723633885383606, - "reward_std": 0.0684351809322834, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.003796677221544087, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8027, + "reward": 2.140592336654663, + "reward_std": 0.777917891740799, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.019129890482872725, + "rewards/tag_count_reward": 0.7708333730697632, "step": 625 }, { "clip_ratio": 0.0, - "completion_length": 271.25000762939453, + "completion_length": 713.5833435058594, "epoch": 0.626, - "grad_norm": 15.560128866147947, - "kl": 1.47265625, + "grad_norm": 43.92929389982015, + "kl": 8.28125, "learning_rate": 4.3201486961161093e-07, - "loss": 0.48, - "reward": 2.889008641242981, - "reward_std": 0.29097793996334076, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9583333432674408, - "rewards/repetition_penalty_reward": -0.02244984172284603, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.2015, + "reward": 1.720135509967804, + "reward_std": 0.6745887994766235, + "rewards/accuracy_reward": 0.2083333358168602, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.012503455393016338, + "rewards/tag_count_reward": 0.6145833730697632, "step": 626 }, { "clip_ratio": 0.0, - "completion_length": 333.12501525878906, + "completion_length": 645.3125, "epoch": 0.627, - "grad_norm": 16.81159573541499, - "kl": 3.177734375, + "grad_norm": 57.87651440683306, + "kl": 8.28125, "learning_rate": 4.304997471309361e-07, - "loss": 0.9209, - "reward": 2.849704384803772, - "reward_std": 0.31951260194182396, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.023559655528515577, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.0589, + "reward": 2.1112887859344482, + "reward_std": 0.8265305161476135, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.010239105205982924, + "rewards/tag_count_reward": 0.6979166865348816, "step": 627 }, { "clip_ratio": 0.0, - "completion_length": 293.5833435058594, + "completion_length": 648.2916870117188, "epoch": 0.628, - "grad_norm": 25.729946050578672, - "kl": 4.7890625, + "grad_norm": 50.008615375469205, + "kl": 5.703125, "learning_rate": 4.2898608072313045e-07, - "loss": 0.5227, - "reward": 2.8756821155548096, - "reward_std": 0.18366234004497528, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.009734605671837926, - "rewards/tag_count_reward": 0.96875, + "loss": 1.1558, + "reward": 2.0761906504631042, + "reward_std": 0.8007937371730804, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9027778208255768, + "rewards/repetition_penalty_reward": -0.02450390998274088, + "rewards/tag_count_reward": 0.8020833432674408, "step": 628 }, { "clip_ratio": 0.0, - "completion_length": 347.1666717529297, + "completion_length": 510.45835876464844, "epoch": 0.629, - "grad_norm": 25.350419085014476, - "kl": 5.5859375, + "grad_norm": 29.38809691299576, + "kl": 3.390625, "learning_rate": 4.2747388883174154e-07, - "loss": 1.6447, - "reward": 2.836197018623352, - "reward_std": 0.3806745111942291, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.023177933879196644, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.6864, + "reward": 2.3009127378463745, + "reward_std": 0.7014258801937103, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.009851326234638691, + "rewards/tag_count_reward": 0.7760416865348816, "step": 629 }, { "clip_ratio": 0.0, - "completion_length": 246.04167938232422, + "completion_length": 722.8750305175781, "epoch": 0.63, - "grad_norm": 11.735981031222638, - "kl": 4.046875, + "grad_norm": 15.997709703522245, + "kl": 5.765625, "learning_rate": 4.2596318988235037e-07, - "loss": 0.4885, - "reward": 2.7899720668792725, - "reward_std": 0.27955499291419983, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.006902923574671149, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.7532, + "reward": 1.911007285118103, + "reward_std": 0.7613692283630371, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.8958333730697632, + "rewards/repetition_penalty_reward": -0.021284431219100952, + "rewards/tag_count_reward": 0.6822916865348816, "step": 630 }, { "clip_ratio": 0.0, - "completion_length": 293.56250762939453, + "completion_length": 539.125, "epoch": 0.631, - "grad_norm": 36.50088350603043, - "kl": 4.72265625, + "grad_norm": 21.914178804717245, + "kl": 3.2734375, "learning_rate": 4.2445400228234687e-07, - "loss": 0.7818, - "reward": 2.609186053276062, - "reward_std": 0.22628825157880783, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9305556118488312, - "rewards/repetition_penalty_reward": -0.00886953080771491, - "rewards/tag_count_reward": 0.9375, + "loss": 0.7349, + "reward": 2.5217862129211426, + "reward_std": 0.7204074263572693, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.018144560046494007, + "rewards/tag_count_reward": 0.8385416865348816, "step": 631 }, { "clip_ratio": 0.0, - "completion_length": 233.95834350585938, + "completion_length": 582.5000152587891, "epoch": 0.632, - "grad_norm": 22.39501795697153, - "kl": 3.5546875, + "grad_norm": 23.574151149835284, + "kl": 4.3125, "learning_rate": 4.2294634442070553e-07, - "loss": 0.5166, - "reward": 2.8717983961105347, - "reward_std": 0.3161999434232712, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.965277761220932, - "rewards/repetition_penalty_reward": -0.010145986452698708, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.776, + "reward": 2.2236552238464355, + "reward_std": 0.7735611796379089, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.014192146249115467, + "rewards/tag_count_reward": 0.7656250298023224, "step": 632 }, { "clip_ratio": 0.0, - "completion_length": 385.7916793823242, + "completion_length": 606.8125305175781, "epoch": 0.633, - "grad_norm": 12.617361952005716, - "kl": 4.21484375, + "grad_norm": 27.523457434836338, + "kl": 3.375, "learning_rate": 4.214402346677619e-07, - "loss": 0.4706, - "reward": 2.61710786819458, - "reward_std": 0.16445695608854294, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9583333134651184, - "rewards/repetition_penalty_reward": -0.02872568927705288, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.8245, + "reward": 2.0943539142608643, + "reward_std": 0.7719534635543823, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.020229644142091274, + "rewards/tag_count_reward": 0.78125, "step": 633 }, { "clip_ratio": 0.0, - "completion_length": 269.56250762939453, + "completion_length": 468.04168701171875, "epoch": 0.634, - "grad_norm": 18.152474881472962, - "kl": 2.9453125, + "grad_norm": 26.042764464725323, + "kl": 3.5234375, "learning_rate": 4.1993569137498776e-07, - "loss": 0.4038, - "reward": 2.7271751165390015, - "reward_std": 0.1621021293103695, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.019352862611413002, - "rewards/tag_count_reward": 0.96875, + "loss": 0.6013, + "reward": 2.314329147338867, + "reward_std": 0.48445817828178406, + "rewards/accuracy_reward": 0.541666679084301, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.01205990044400096, + "rewards/tag_count_reward": 0.8541666865348816, "step": 634 }, { "clip_ratio": 0.0, - "completion_length": 203.14584350585938, + "completion_length": 585.9791717529297, "epoch": 0.635, - "grad_norm": 13.11573795878282, - "kl": 1.796875, + "grad_norm": 110.89920440001275, + "kl": 5.375, "learning_rate": 4.1843273287476854e-07, - "loss": 0.5947, - "reward": 2.9767191410064697, - "reward_std": 0.07769604562781751, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0024476193939335644, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.2961, + "reward": 2.3673198223114014, + "reward_std": 0.7202720046043396, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.014624850358814001, + "rewards/tag_count_reward": 0.8333333432674408, "step": 635 }, { "clip_ratio": 0.0, - "completion_length": 215.6041717529297, + "completion_length": 543.1250152587891, "epoch": 0.636, - "grad_norm": 27.107359805462256, - "kl": 1.96484375, + "grad_norm": 168.91845413040815, + "kl": 6.78125, "learning_rate": 4.1693137748017915e-07, - "loss": 0.2203, - "reward": 2.589609146118164, - "reward_std": 0.37453170120716095, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.005877024494111538, - "rewards/tag_count_reward": 0.984375, + "loss": 0.9638, + "reward": 2.172168493270874, + "reward_std": 0.7465826869010925, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.01880389917641878, + "rewards/tag_count_reward": 0.8437500298023224, "step": 636 }, { "clip_ratio": 0.0, - "completion_length": 214.1041717529297, + "completion_length": 367.12501525878906, "epoch": 0.637, - "grad_norm": 13.651598677743008, - "kl": 2.33203125, + "grad_norm": 14.167427077773384, + "kl": 1.734375, "learning_rate": 4.15431643484761e-07, - "loss": 0.4742, - "reward": 2.738916754722595, - "reward_std": 0.23451564460992813, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009347157087177038, - "rewards/tag_count_reward": 0.984375, + "loss": 0.2434, + "reward": 2.3709245920181274, + "reward_std": 0.41279859840869904, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.011019973549991846, + "rewards/tag_count_reward": 0.9166666865348816, "step": 637 }, { "clip_ratio": 0.0, - "completion_length": 197.14584350585938, + "completion_length": 409.0208435058594, "epoch": 0.638, - "grad_norm": 22.122302955740754, - "kl": 1.2890625, + "grad_norm": 44.523191376668905, + "kl": 2.34765625, "learning_rate": 4.1393354916230005e-07, - "loss": 0.2579, - "reward": 2.7617021799087524, - "reward_std": 0.07898123748600483, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009131280705332756, - "rewards/tag_count_reward": 1.0, + "loss": 0.6225, + "reward": 2.353254556655884, + "reward_std": 0.5010079145431519, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.025217789225280285, + "rewards/tag_count_reward": 0.9062500298023224, "step": 638 }, { "clip_ratio": 0.0, - "completion_length": 180.7291717529297, + "completion_length": 375.56251525878906, "epoch": 0.639, - "grad_norm": 20.38519258881025, - "kl": 1.2578125, + "grad_norm": 19.368462137693786, + "kl": 1.640625, "learning_rate": 4.124371127666024e-07, - "loss": -0.0172, - "reward": 2.857728123664856, - "reward_std": 0.15181594900786877, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010327422874979675, - "rewards/tag_count_reward": 1.0, + "loss": 0.4301, + "reward": 2.260585904121399, + "reward_std": 0.4507751762866974, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.017191945109516382, + "rewards/tag_count_reward": 0.9375000298023224, "step": 639 }, { "clip_ratio": 0.0, - "completion_length": 277.83333587646484, + "completion_length": 527.1666870117188, "epoch": 0.64, - "grad_norm": 13.884187169941764, - "kl": 1.521484375, + "grad_norm": 23.408643386653363, + "kl": 5.40625, "learning_rate": 4.1094235253127374e-07, - "loss": 0.2174, - "reward": 2.9081408977508545, - "reward_std": 0.15698403120040894, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.03109535202383995, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0619, + "reward": 2.11304247379303, + "reward_std": 0.6343182921409607, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.017165867146104574, + "rewards/tag_count_reward": 0.8385416865348816, "step": 640 }, { "clip_ratio": 0.0, - "completion_length": 234.2916717529297, + "completion_length": 338.7083435058594, "epoch": 0.641, - "grad_norm": 13.908439261285777, - "kl": 1.587890625, + "grad_norm": 26.252165148333535, + "kl": 2.26953125, "learning_rate": 4.0944928666949527e-07, - "loss": 0.4798, - "reward": 2.9004958868026733, - "reward_std": 0.22105778683908284, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.014434898155741394, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0696, + "reward": 2.4866459369659424, + "reward_std": 0.29595404863357544, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.020298474468290806, + "rewards/tag_count_reward": 0.9583333730697632, "step": 641 }, { "clip_ratio": 0.0, - "completion_length": 183.43750762939453, + "completion_length": 613.9166717529297, "epoch": 0.642, - "grad_norm": 9.584206218129793, - "kl": 1.345703125, + "grad_norm": 83.06728366905014, + "kl": 9.09375, "learning_rate": 4.079579333738039e-07, - "loss": 0.1575, - "reward": 2.9864131212234497, - "reward_std": 0.0265450244769454, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008378781378269196, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.4448, + "reward": 2.021218776702881, + "reward_std": 0.7841091454029083, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.011767241638153791, + "rewards/tag_count_reward": 0.7968750298023224, "step": 642 }, { "clip_ratio": 0.0, - "completion_length": 176.6041717529297, + "completion_length": 517.4375, "epoch": 0.643, - "grad_norm": 23.52917028340049, - "kl": 0.841796875, + "grad_norm": 29.352168555788648, + "kl": 4.1328125, "learning_rate": 4.064683108158685e-07, - "loss": 0.1134, - "reward": 2.9705183506011963, - "reward_std": 0.07729751616716385, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008648322662338614, - "rewards/tag_count_reward": 1.0, + "loss": 0.8538, + "reward": 2.1417415142059326, + "reward_std": 0.7165846824645996, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.019716893322765827, + "rewards/tag_count_reward": 0.8697916865348816, "step": 643 }, { "clip_ratio": 0.0, - "completion_length": 271.02083587646484, + "completion_length": 472.8958435058594, "epoch": 0.644, - "grad_norm": 12.937708426834615, - "kl": 2.349609375, + "grad_norm": 27.512095159522406, + "kl": 5.0078125, "learning_rate": 4.0498043714627006e-07, - "loss": 0.6234, - "reward": 2.929720401763916, - "reward_std": 0.17625097930431366, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.021668408066034317, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4818, + "reward": 2.178748071193695, + "reward_std": 0.5007181763648987, + "rewards/accuracy_reward": 0.3958333544433117, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.01569645293056965, + "rewards/tag_count_reward": 0.8541666865348816, "step": 644 }, { "clip_ratio": 0.0, - "completion_length": 287.3958435058594, + "completion_length": 487.25001525878906, "epoch": 0.645, - "grad_norm": 25.38978245016162, - "kl": 2.60546875, + "grad_norm": 42.088759648308134, + "kl": 2.484375, "learning_rate": 4.034943304942796e-07, - "loss": 0.4295, - "reward": 2.627605676651001, - "reward_std": 0.35428738594055176, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.028644328005611897, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.9438, + "reward": 2.6129757165908813, + "reward_std": 0.6440701186656952, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.02244096901267767, + "rewards/tag_count_reward": 0.9270833730697632, "step": 645 }, { "clip_ratio": 0.0, - "completion_length": 308.70833587646484, + "completion_length": 506.22918701171875, "epoch": 0.646, - "grad_norm": 38.49421100811256, - "kl": 5.03515625, + "grad_norm": 15.525366461402383, + "kl": 3.5234375, "learning_rate": 4.020100089676376e-07, - "loss": 1.2125, - "reward": 2.8713592290878296, - "reward_std": 0.3377022026106715, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.008849205216392875, - "rewards/tag_count_reward": 0.9427083432674408, - "step": 646 + "loss": 0.5343, + "reward": 2.1641972064971924, + "reward_std": 0.6387200355529785, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9374999701976776, + "rewards/repetition_penalty_reward": -0.018094514962285757, + "rewards/tag_count_reward": 0.8697916865348816, + "step": 646 }, { "clip_ratio": 0.0, - "completion_length": 331.3958435058594, + "completion_length": 676.6875305175781, "epoch": 0.647, - "grad_norm": 21.111310783784766, - "kl": 5.59375, + "grad_norm": 27.736362696423306, + "kl": 5.53125, "learning_rate": 4.005274906523336e-07, - "loss": 0.9957, - "reward": 2.7880390882492065, - "reward_std": 0.5258179903030396, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.014044209383428097, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 1.0089, + "reward": 2.418774127960205, + "reward_std": 0.8770462274551392, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.022198081016540527, + "rewards/tag_count_reward": 0.8020833730697632, "step": 647 }, { "clip_ratio": 0.0, - "completion_length": 281.7916717529297, + "completion_length": 477.10418701171875, "epoch": 0.648, - "grad_norm": 16.277405752370488, - "kl": 2.203125, + "grad_norm": 32.594519011116475, + "kl": 3.15625, "learning_rate": 3.9904679361238526e-07, - "loss": 0.4532, - "reward": 2.8440529108047485, - "reward_std": 0.2617475986480713, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.017058377619832754, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8488, + "reward": 2.172785520553589, + "reward_std": 0.6095466017723083, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.014714594930410385, + "rewards/tag_count_reward": 0.875, "step": 648 }, { "clip_ratio": 0.0, - "completion_length": 218.1041717529297, + "completion_length": 466.7083435058594, "epoch": 0.649, - "grad_norm": 23.55767104856182, - "kl": 2.2734375, + "grad_norm": 16.053982100692465, + "kl": 2.6982421875, "learning_rate": 3.975679358896189e-07, - "loss": 0.3329, - "reward": 2.662079930305481, - "reward_std": 0.16216762736439705, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.015003470703959465, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.6344, + "reward": 2.492353320121765, + "reward_std": 0.48535051196813583, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014591319020837545, + "rewards/tag_count_reward": 0.9166666865348816, "step": 649 }, { "clip_ratio": 0.0, - "completion_length": 212.5625, + "completion_length": 421.4583435058594, "epoch": 0.65, - "grad_norm": 12.544705263837713, - "kl": 2.47265625, + "grad_norm": 33.807152135246525, + "kl": 4.578125, "learning_rate": 3.9609093550344907e-07, - "loss": 0.2773, - "reward": 2.8979930877685547, - "reward_std": 0.23362043499946594, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.015201434027403593, - "rewards/tag_count_reward": 0.96875, + "loss": 0.9576, + "reward": 2.364701509475708, + "reward_std": 0.6279256045818329, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.01377071999013424, + "rewards/tag_count_reward": 0.9270833432674408, "step": 650 }, { "clip_ratio": 0.0, - "completion_length": 223.77084350585938, + "completion_length": 430.08335876464844, "epoch": 0.651, - "grad_norm": 19.01086719912534, - "kl": 2.83203125, + "grad_norm": 54.92633243016837, + "kl": 4.2021484375, "learning_rate": 3.946158104506594e-07, - "loss": 0.4572, - "reward": 2.7025800943374634, - "reward_std": 0.19257907569408417, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.017906030640006065, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7082, + "reward": 2.6497297286987305, + "reward_std": 0.48640553653240204, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02214530110359192, + "rewards/tag_count_reward": 0.9427083432674408, "step": 651 }, { "clip_ratio": 0.0, - "completion_length": 261.8541717529297, + "completion_length": 449.62501525878906, "epoch": 0.652, - "grad_norm": 33.44497585161576, - "kl": 3.27734375, + "grad_norm": 338.42582974401023, + "kl": 9.8125, "learning_rate": 3.931425787051832e-07, - "loss": 0.6085, - "reward": 2.73846697807312, - "reward_std": 0.39070405066013336, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009797110687941313, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.7215, + "reward": 2.6626009941101074, + "reward_std": 0.6703141033649445, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.019690793938934803, + "rewards/tag_count_reward": 0.8906250298023224, "step": 652 }, { "clip_ratio": 0.0, - "completion_length": 212.54167938232422, + "completion_length": 304.85418701171875, "epoch": 0.653, - "grad_norm": 10.682019544464067, - "kl": 1.396484375, + "grad_norm": 17.319403464457796, + "kl": 1.7109375, "learning_rate": 3.9167125821788416e-07, - "loss": 0.1746, - "reward": 2.9806525707244873, - "reward_std": 0.039536988362669945, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012403292581439018, - "rewards/tag_count_reward": 1.0, + "loss": 0.14, + "reward": 2.6247986555099487, + "reward_std": 0.37955524772405624, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.015826416201889515, + "rewards/tag_count_reward": 0.9739583432674408, "step": 653 }, { "clip_ratio": 0.0, - "completion_length": 273.87501525878906, + "completion_length": 333.5, "epoch": 0.654, - "grad_norm": 330.54915303351294, - "kl": 5.328125, + "grad_norm": 73.877312813324, + "kl": 1.705078125, "learning_rate": 3.902018669163384e-07, - "loss": 1.0207, - "reward": 2.7675833702087402, - "reward_std": 0.3268594592809677, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013666700106114149, - "rewards/tag_count_reward": 0.96875, + "loss": 0.4122, + "reward": 2.905174732208252, + "reward_std": 0.26645080000162125, + "rewards/accuracy_reward": 0.9583333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01843650173395872, + "rewards/tag_count_reward": 0.9791666865348816, "step": 654 }, { "clip_ratio": 0.0, - "completion_length": 240.54167938232422, + "completion_length": 458.52085876464844, "epoch": 0.655, - "grad_norm": 16.73346798516969, - "kl": 2.0, + "grad_norm": 87.77289847817504, + "kl": 6.890625, "learning_rate": 3.8873442270461485e-07, - "loss": 0.2908, - "reward": 2.911833167076111, - "reward_std": 0.19429031014442444, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.015250249183736742, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.0085, + "reward": 2.5801607370376587, + "reward_std": 0.7309716641902924, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.02574214804917574, + "rewards/tag_count_reward": 0.8697916865348816, "step": 655 }, { "clip_ratio": 0.0, - "completion_length": 247.77083587646484, + "completion_length": 542.7500305175781, "epoch": 0.656, - "grad_norm": 28.668584194587563, - "kl": 2.9296875, + "grad_norm": 104.02522400010055, + "kl": 8.09375, "learning_rate": 3.872689434630585e-07, - "loss": 0.6955, - "reward": 2.647436022758484, - "reward_std": 0.24594880640506744, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.0157586089335382, - "rewards/tag_count_reward": 0.96875, + "loss": 1.0897, + "reward": 2.465412139892578, + "reward_std": 0.540164515376091, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.010282414965331554, + "rewards/tag_count_reward": 0.8437500298023224, "step": 656 }, { "clip_ratio": 0.0, - "completion_length": 191.39583587646484, + "completion_length": 510.3958435058594, "epoch": 0.657, - "grad_norm": 22.55407883517572, - "kl": 2.0, + "grad_norm": 43.13318047502928, + "kl": 6.53125, "learning_rate": 3.8580544704807117e-07, - "loss": 0.539, - "reward": 2.672939658164978, - "reward_std": 0.25764209032058716, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007616016548126936, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8464, + "reward": 2.118070363998413, + "reward_std": 0.7317387461662292, + "rewards/accuracy_reward": 0.3541666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02255462296307087, + "rewards/tag_count_reward": 0.8489583432674408, "step": 657 }, { "clip_ratio": 0.0, - "completion_length": 239.08333587646484, + "completion_length": 491.0833435058594, "epoch": 0.658, - "grad_norm": 12.558003660661395, - "kl": 3.34375, + "grad_norm": 40.435901700616085, + "kl": 2.390625, "learning_rate": 3.843439512918949e-07, - "loss": 0.5763, - "reward": 2.6935726404190063, - "reward_std": 0.22167278081178665, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013024728745222092, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.7348, + "reward": 2.421813726425171, + "reward_std": 0.6320691108703613, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.017422514967620373, + "rewards/tag_count_reward": 0.9114583432674408, "step": 658 }, { "clip_ratio": 0.0, - "completion_length": 218.27083587646484, + "completion_length": 508.5000305175781, "epoch": 0.659, - "grad_norm": 16.248296621714427, - "kl": 1.09375, + "grad_norm": 13.800630290866199, + "kl": 2.7109375, "learning_rate": 3.8288447400239443e-07, - "loss": 0.1499, - "reward": 2.909277319908142, - "reward_std": 0.18255577981472015, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.014333693077787757, - "rewards/tag_count_reward": 1.0, + "loss": 0.4474, + "reward": 2.462868094444275, + "reward_std": 0.6122282147407532, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.016298661939799786, + "rewards/tag_count_reward": 0.8958333432674408, "step": 659 }, { "clip_ratio": 0.0, - "completion_length": 185.18750762939453, + "completion_length": 857.5000305175781, "epoch": 0.66, - "grad_norm": 17.425186923798993, - "kl": 3.4765625, + "grad_norm": 26.939008770933444, + "kl": 8.09375, "learning_rate": 3.8142703296283953e-07, - "loss": 0.3213, - "reward": 2.5550224781036377, - "reward_std": 0.3512069210410118, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007477572187781334, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.9327, + "reward": 1.7486651539802551, + "reward_std": 0.8026353716850281, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/reasoning_steps_reward": 0.8750000298023224, + "rewards/repetition_penalty_reward": -0.027376560494303703, + "rewards/tag_count_reward": 0.6302083432674408, "step": 660 }, { "clip_ratio": 0.0, - "completion_length": 224.18750762939453, + "completion_length": 504.16668701171875, "epoch": 0.661, - "grad_norm": 18.315073516908974, - "kl": 2.140625, + "grad_norm": 40.18446601488726, + "kl": 2.203125, "learning_rate": 3.7997164593168983e-07, - "loss": 0.1232, - "reward": 2.8803622722625732, - "reward_std": 0.22212205175310373, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013734814245253801, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8515, + "reward": 2.3131070137023926, + "reward_std": 0.6172126531600952, + "rewards/accuracy_reward": 0.479166679084301, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.02543477900326252, + "rewards/tag_count_reward": 0.9010416865348816, "step": 661 }, { "clip_ratio": 0.0, - "completion_length": 253.89583587646484, + "completion_length": 404.04168701171875, "epoch": 0.662, - "grad_norm": 20.413530808454787, - "kl": 5.0, + "grad_norm": 19.64806111061822, + "kl": 1.80078125, "learning_rate": 3.785183306423767e-07, - "loss": 1.0135, - "reward": 2.907068371772766, - "reward_std": 0.2130483016371727, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013070642948150635, - "rewards/tag_count_reward": 0.96875, + "loss": 0.3147, + "reward": 2.165276527404785, + "reward_std": 0.5021775662899017, + "rewards/accuracy_reward": 0.2708333358168602, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.018751220777630806, + "rewards/tag_count_reward": 0.9479166865348816, "step": 662 }, { "clip_ratio": 0.0, - "completion_length": 363.3333435058594, + "completion_length": 457.5416717529297, "epoch": 0.663, - "grad_norm": 28.110438341217158, - "kl": 4.53125, + "grad_norm": 55.11379027399269, + "kl": 2.46875, "learning_rate": 3.7706710480308835e-07, - "loss": 0.2439, - "reward": 2.542426824569702, - "reward_std": 0.2915441542863846, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.03396210819482803, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.9405, + "reward": 2.3307788372039795, + "reward_std": 0.5423199832439423, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.014707335270941257, + "rewards/tag_count_reward": 0.9010416865348816, "step": 663 }, { "clip_ratio": 0.0, - "completion_length": 263.375, + "completion_length": 376.9583435058594, "epoch": 0.664, - "grad_norm": 18.803823307299016, - "kl": 1.517578125, + "grad_norm": 46.90834721507952, + "kl": 3.6171875, "learning_rate": 3.7561798609655373e-07, - "loss": 0.3597, - "reward": 2.8734437227249146, - "reward_std": 0.2446470558643341, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.029334094375371933, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7765, + "reward": 2.5231001377105713, + "reward_std": 0.43632885813713074, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.018566494807600975, + "rewards/tag_count_reward": 0.9375, "step": 664 }, { "clip_ratio": 0.0, - "completion_length": 239.9166717529297, + "completion_length": 472.875, "epoch": 0.665, - "grad_norm": 229.9136474356772, - "kl": 5.53125, + "grad_norm": 36.36032137924947, + "kl": 4.66015625, "learning_rate": 3.7417099217982686e-07, - "loss": 0.4498, - "reward": 2.7615668773651123, - "reward_std": 0.2331508807837963, - "rewards/accuracy_reward": 0.8333333432674408, + "loss": 0.6785, + "reward": 2.34568190574646, + "reward_std": 0.5171235352754593, + "rewards/accuracy_reward": 0.5, "rewards/reasoning_steps_reward": 0.9583333730697632, - "rewards/repetition_penalty_reward": -0.009266480803489685, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.013693147338926792, + "rewards/tag_count_reward": 0.9010416865348816, "step": 665 }, { "clip_ratio": 0.0, - "completion_length": 247.83334350585938, + "completion_length": 389.18751525878906, "epoch": 0.666, - "grad_norm": 25.788270483666004, - "kl": 2.671875, + "grad_norm": 43.19617147385512, + "kl": 4.390625, "learning_rate": 3.72726140684072e-07, - "loss": 0.7629, - "reward": 2.7527973651885986, - "reward_std": 0.142390388995409, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012827732600271702, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.7594, + "reward": 2.619339108467102, + "reward_std": 0.5449521988630295, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.0160775538533926, + "rewards/tag_count_reward": 0.9270833432674408, "step": 666 }, { "clip_ratio": 0.0, - "completion_length": 200.85417938232422, + "completion_length": 399.2708435058594, "epoch": 0.667, - "grad_norm": 28.765632678179347, - "kl": 1.6875, + "grad_norm": 34.057601735761196, + "kl": 3.2890625, "learning_rate": 3.712834492143487e-07, - "loss": 0.6986, - "reward": 2.9394389390945435, - "reward_std": 0.18052342534065247, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011949990293942392, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.504, + "reward": 2.495935320854187, + "reward_std": 0.37501704692840576, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01448146067559719, + "rewards/tag_count_reward": 0.9270833432674408, "step": 667 }, { "clip_ratio": 0.0, - "completion_length": 183.2083396911621, + "completion_length": 516.7500305175781, "epoch": 0.668, - "grad_norm": 17.24380263113382, - "kl": 1.57421875, + "grad_norm": 69.82503772312313, + "kl": 5.609375, "learning_rate": 3.6984293534939737e-07, - "loss": 0.1997, - "reward": 2.961165189743042, - "reward_std": 0.0693398155272007, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.012793133384548128, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9785, + "reward": 2.399042248725891, + "reward_std": 0.7354680299758911, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.015888389199972153, + "rewards/tag_count_reward": 0.8593750298023224, "step": 668 }, { "clip_ratio": 0.0, - "completion_length": 294.6666679382324, + "completion_length": 462.50001525878906, "epoch": 0.669, - "grad_norm": 24.20000464035394, - "kl": 6.28125, + "grad_norm": 32.586128695642, + "kl": 4.359375, "learning_rate": 3.6840461664142444e-07, - "loss": 0.9215, - "reward": 2.829747200012207, - "reward_std": 0.33862260504974984, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.012266828838619404, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.7185, + "reward": 2.410151481628418, + "reward_std": 0.44753579795360565, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.016932022757828236, + "rewards/tag_count_reward": 0.8854166865348816, "step": 669 }, { "clip_ratio": 0.0, - "completion_length": 133.2916717529297, + "completion_length": 551.4375305175781, "epoch": 0.67, - "grad_norm": 25.745600615046822, - "kl": 1.2890625, + "grad_norm": 22.702344571228725, + "kl": 3.390625, "learning_rate": 3.6696851061588994e-07, - "loss": 0.209, - "reward": 2.9985584020614624, - "reward_std": 0.003719109285157174, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0014416724734473974, - "rewards/tag_count_reward": 1.0, + "loss": 0.7687, + "reward": 2.3318662643432617, + "reward_std": 0.6909976303577423, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.017092150636017323, + "rewards/tag_count_reward": 0.890625, "step": 670 }, { "clip_ratio": 0.0, - "completion_length": 209.1041717529297, + "completion_length": 565.4791870117188, "epoch": 0.671, - "grad_norm": 17.317208426562, - "kl": 3.9609375, + "grad_norm": 27.618008409549773, + "kl": 4.2265625, "learning_rate": 3.655346347712922e-07, - "loss": 0.7982, - "reward": 2.812960624694824, - "reward_std": 0.3401038944721222, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009956196416169405, - "rewards/tag_count_reward": 0.96875, + "loss": 0.8779, + "reward": 2.0886669158935547, + "reward_std": 0.6510388255119324, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.017235863022506237, + "rewards/tag_count_reward": 0.8697916865348816, "step": 671 }, { "clip_ratio": 0.0, - "completion_length": 187.6666717529297, + "completion_length": 487.18751525878906, "epoch": 0.672, - "grad_norm": 14.899636842613678, - "kl": 2.6875, + "grad_norm": 24.355953329836403, + "kl": 2.40625, "learning_rate": 3.641030065789562e-07, - "loss": 0.3843, - "reward": 2.968170642852783, - "reward_std": 0.06798989698290825, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01620438229292631, - "rewards/tag_count_reward": 0.984375, + "loss": 0.5358, + "reward": 2.412584662437439, + "reward_std": 0.6757764220237732, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.021443082485347986, + "rewards/tag_count_reward": 0.9062500298023224, "step": 672 }, { "clip_ratio": 0.0, - "completion_length": 196.5416717529297, + "completion_length": 558.0833435058594, "epoch": 0.673, - "grad_norm": 20.216555829124548, - "kl": 3.40234375, + "grad_norm": 45.77951130491503, + "kl": 3.328125, "learning_rate": 3.6267364348281946e-07, - "loss": 0.3351, - "reward": 2.758676290512085, - "reward_std": 0.16276206448674202, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.010421007638797164, - "rewards/tag_count_reward": 0.984375, + "loss": 1.0366, + "reward": 2.355557918548584, + "reward_std": 0.6195343136787415, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02465049270540476, + "rewards/tag_count_reward": 0.859375, "step": 673 }, { "clip_ratio": 0.0, - "completion_length": 259.7291717529297, + "completion_length": 529.1666870117188, "epoch": 0.674, - "grad_norm": 35.3127536043053, - "kl": 4.703125, + "grad_norm": 41.84743305028243, + "kl": 4.65625, "learning_rate": 3.612465628992203e-07, - "loss": 0.6993, - "reward": 2.643582344055176, - "reward_std": 0.4248330295085907, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9652778208255768, - "rewards/repetition_penalty_reward": -0.009195396210998297, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.8446, + "reward": 2.305051267147064, + "reward_std": 0.6280795484781265, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.010920950677245855, + "rewards/tag_count_reward": 0.8229166865348816, "step": 674 }, { "clip_ratio": 0.0, - "completion_length": 468.4791793823242, + "completion_length": 442.2083435058594, "epoch": 0.675, - "grad_norm": 47.27997872268839, - "kl": 9.9140625, + "grad_norm": 27.763872293665937, + "kl": 2.62890625, "learning_rate": 3.5982178221668533e-07, - "loss": 0.964, - "reward": 2.546661853790283, - "reward_std": 0.35572879761457443, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.02625508955679834, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.7363, + "reward": 2.520777702331543, + "reward_std": 0.6029552221298218, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.019152968656271696, + "rewards/tag_count_reward": 0.9218750298023224, "step": 675 }, { "clip_ratio": 0.0, - "completion_length": 325.2083435058594, + "completion_length": 626.0833435058594, "epoch": 0.676, - "grad_norm": 24.14163451574894, - "kl": 5.296875, + "grad_norm": 146.14695704648105, + "kl": 11.421875, "learning_rate": 3.5839931879571725e-07, - "loss": 0.6163, - "reward": 2.7341156005859375, - "reward_std": 0.16252686642110348, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019356681033968925, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 1.492, + "reward": 1.9430179595947266, + "reward_std": 0.49505507946014404, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.027468254789710045, + "rewards/tag_count_reward": 0.7552083432674408, "step": 676 }, { "clip_ratio": 0.0, - "completion_length": 228.58334350585938, + "completion_length": 448.0000305175781, "epoch": 0.677, - "grad_norm": 34.103361260594454, - "kl": 4.5546875, + "grad_norm": 147.52034576763307, + "kl": 6.8095703125, "learning_rate": 3.5697918996858443e-07, - "loss": 0.7747, - "reward": 2.8681647777557373, - "reward_std": 0.3863070458173752, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.008571414276957512, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.9345, + "reward": 2.3936002254486084, + "reward_std": 0.4429845064878464, + "rewards/accuracy_reward": 0.5208333358168602, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.010913772508502007, + "rewards/tag_count_reward": 0.9114583432674408, "step": 677 }, { "clip_ratio": 0.0, - "completion_length": 382.4791793823242, + "completion_length": 512.9166870117188, "epoch": 0.678, - "grad_norm": 17.719591756631562, - "kl": 6.5546875, + "grad_norm": 140.08402872570105, + "kl": 7.609375, "learning_rate": 3.555614130391079e-07, - "loss": 0.9253, - "reward": 2.684828758239746, - "reward_std": 0.43789561837911606, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.025240877643227577, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 1.0663, + "reward": 2.3663707971572876, + "reward_std": 0.6558624804019928, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02772645093500614, + "rewards/tag_count_reward": 0.9010416865348816, "step": 678 }, { "clip_ratio": 0.0, - "completion_length": 144.70833587646484, + "completion_length": 417.0, "epoch": 0.679, - "grad_norm": 35.78746577788138, - "kl": 2.5859375, + "grad_norm": 96.73929298933032, + "kl": 4.5390625, "learning_rate": 3.5414600528245266e-07, - "loss": 0.4426, - "reward": 2.9731714725494385, - "reward_std": 0.08279756363481283, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0059953355230391026, - "rewards/tag_count_reward": 1.0, + "loss": 1.0437, + "reward": 2.23331880569458, + "reward_std": 0.6037196218967438, + "rewards/accuracy_reward": 0.37500002048909664, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02883412316441536, + "rewards/tag_count_reward": 0.9218750298023224, "step": 679 }, { "clip_ratio": 0.0, - "completion_length": 310.81251525878906, + "completion_length": 357.5625, "epoch": 0.68, - "grad_norm": 20.293549329794388, - "kl": 4.1875, + "grad_norm": 21.200219203477406, + "kl": 2.416015625, "learning_rate": 3.5273298394491515e-07, - "loss": 0.5765, - "reward": 2.8984086513519287, - "reward_std": 0.2387396264821291, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0199942234903574, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.5258, + "reward": 2.639436721801758, + "reward_std": 0.38805626332759857, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.009868907742202282, + "rewards/tag_count_reward": 0.9479166865348816, "step": 680 }, { "clip_ratio": 0.0, - "completion_length": 265.4583435058594, + "completion_length": 387.7708435058594, "epoch": 0.681, - "grad_norm": 18.503894824696935, - "kl": 2.9453125, + "grad_norm": 14.11167765417183, + "kl": 1.080078125, "learning_rate": 3.513223662437147e-07, - "loss": 0.4423, - "reward": 2.6786974668502808, - "reward_std": 0.27390020340681076, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.020955480635166168, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.1954, + "reward": 2.5773390531539917, + "reward_std": 0.42192623019218445, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02856375463306904, + "rewards/tag_count_reward": 0.9739583432674408, "step": 681 }, { "clip_ratio": 0.0, - "completion_length": 192.08333587646484, + "completion_length": 463.89585876464844, "epoch": 0.682, - "grad_norm": 13.6250208174273, - "kl": 3.3984375, + "grad_norm": 35.11880079033104, + "kl": 5.19921875, "learning_rate": 3.4991416936678276e-07, - "loss": 0.365, - "reward": 2.9281957149505615, - "reward_std": 0.15829423069953918, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01798485405743122, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.666, + "reward": 2.4289733171463013, + "reward_std": 0.7871748507022858, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.025887836702167988, + "rewards/tag_count_reward": 0.8645833432674408, "step": 682 }, { "clip_ratio": 0.0, - "completion_length": 309.2708435058594, + "completion_length": 596.7500305175781, "epoch": 0.683, - "grad_norm": 22.480980323456404, - "kl": 3.72265625, + "grad_norm": 21.947154239119367, + "kl": 5.1796875, "learning_rate": 3.4850841047255364e-07, - "loss": 0.8954, - "reward": 2.78403639793396, - "reward_std": 0.29784230701625347, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0249915374442935, - "rewards/tag_count_reward": 0.96875, + "loss": 0.7009, + "reward": 1.906419038772583, + "reward_std": 0.5553164780139923, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.015455981716513634, + "rewards/tag_count_reward": 0.7968750298023224, "step": 683 }, { "clip_ratio": 0.0, - "completion_length": 212.00000762939453, + "completion_length": 661.0208435058594, "epoch": 0.684, - "grad_norm": 25.610640191700295, - "kl": 2.3046875, + "grad_norm": 25.598616282466537, + "kl": 4.953125, "learning_rate": 3.471051066897562e-07, - "loss": 0.4334, - "reward": 2.870127558708191, - "reward_std": 0.36344006657600403, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.011816964950412512, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.9532, + "reward": 2.2370306253433228, + "reward_std": 0.6496289968490601, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.016441638581454754, + "rewards/tag_count_reward": 0.8437500298023224, "step": 684 }, { "clip_ratio": 0.0, - "completion_length": 210.95833587646484, + "completion_length": 464.70835876464844, "epoch": 0.685, - "grad_norm": 44.54358735643805, - "kl": 2.5390625, + "grad_norm": 49.95217737848169, + "kl": 1.669921875, "learning_rate": 3.45704275117204e-07, - "loss": 0.8758, - "reward": 2.9013065099716187, - "reward_std": 0.28338224440813065, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.020568530075252056, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.614, + "reward": 2.6332918405532837, + "reward_std": 0.49084220826625824, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01948598213493824, + "rewards/tag_count_reward": 0.9166666865348816, "step": 685 }, { "clip_ratio": 0.0, - "completion_length": 229.77084350585938, + "completion_length": 686.3958435058594, "epoch": 0.686, - "grad_norm": 27.89292051800001, - "kl": 1.57421875, + "grad_norm": 68.70749191055191, + "kl": 5.984375, "learning_rate": 3.4430593282358777e-07, - "loss": 0.3594, - "reward": 2.9172399044036865, - "reward_std": 0.18947718292474747, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008107327623292804, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9271, + "reward": 2.089155912399292, + "reward_std": 0.5732367038726807, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.013274761848151684, + "rewards/tag_count_reward": 0.7760416865348816, "step": 686 }, { "clip_ratio": 0.0, - "completion_length": 183.45833587646484, + "completion_length": 432.2083435058594, "epoch": 0.687, - "grad_norm": 38.1675063592923, - "kl": 1.880859375, + "grad_norm": 32.08822044000933, + "kl": 1.79296875, "learning_rate": 3.429100968472668e-07, - "loss": 0.3984, - "reward": 2.9289207458496094, - "reward_std": 0.22132886946201324, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008579338667914271, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5082, + "reward": 2.5415300130844116, + "reward_std": 0.48894260823726654, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.017497885040938854, + "rewards/tag_count_reward": 0.9687500298023224, "step": 687 }, { "clip_ratio": 0.0, - "completion_length": 233.31250762939453, + "completion_length": 427.12501525878906, "epoch": 0.688, - "grad_norm": 22.28480531591192, - "kl": 1.06640625, + "grad_norm": 44.25186835660776, + "kl": 1.40625, "learning_rate": 3.4151678419606233e-07, - "loss": 0.1798, - "reward": 2.9338208436965942, - "reward_std": 0.12233167886734009, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02104026358574629, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6032, + "reward": 2.457394003868103, + "reward_std": 0.4042099863290787, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.020036617293953896, + "rewards/tag_count_reward": 0.9427083730697632, "step": 688 }, { "clip_ratio": 0.0, - "completion_length": 222.58334350585938, + "completion_length": 475.31251525878906, "epoch": 0.689, - "grad_norm": 22.933358746670084, - "kl": 1.31640625, + "grad_norm": 29.033085916995276, + "kl": 3.84375, "learning_rate": 3.4012601184704904e-07, - "loss": 0.3647, - "reward": 2.8137935400009155, - "reward_std": 0.3461058437824249, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.02822032803669572, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.7725, + "reward": 2.554749846458435, + "reward_std": 0.610569179058075, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.023375309072434902, + "rewards/tag_count_reward": 0.890625, "step": 689 }, { "clip_ratio": 0.0, - "completion_length": 237.7291717529297, + "completion_length": 494.12501525878906, "epoch": 0.69, - "grad_norm": 35.38373342659696, - "kl": 2.28125, + "grad_norm": 57.99082028977846, + "kl": 5.2578125, "learning_rate": 3.387377967463493e-07, - "loss": 0.387, - "reward": 2.8465791940689087, - "reward_std": 0.25619735568761826, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.009323742240667343, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.5999, + "reward": 2.3892362117767334, + "reward_std": 0.5195634961128235, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.020485990215092897, + "rewards/tag_count_reward": 0.875, "step": 690 }, { "clip_ratio": 0.0, - "completion_length": 232.52084350585938, + "completion_length": 491.29168701171875, "epoch": 0.691, - "grad_norm": 35.4982337022001, - "kl": 3.1171875, + "grad_norm": 63.801944338558975, + "kl": 6.546875, "learning_rate": 3.3735215580892575e-07, - "loss": 0.8813, - "reward": 2.897489309310913, - "reward_std": 0.3297596722841263, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008760684635490179, - "rewards/tag_count_reward": 0.96875, + "loss": 0.8708, + "reward": 2.57023286819458, + "reward_std": 0.5704822838306427, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.020044888369739056, + "rewards/tag_count_reward": 0.8958333730697632, "step": 691 }, { "clip_ratio": 0.0, - "completion_length": 232.43751525878906, + "completion_length": 438.83335876464844, "epoch": 0.692, - "grad_norm": 14.562418522027901, - "kl": 2.16015625, + "grad_norm": 63.66620073917318, + "kl": 4.0234375, "learning_rate": 3.359691059183761e-07, - "loss": 0.3739, - "reward": 2.8865208625793457, - "reward_std": 0.20953704416751862, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012784908525645733, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.7175, + "reward": 2.389517307281494, + "reward_std": 0.6298649907112122, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03756603505462408, + "rewards/tag_count_reward": 0.9062500298023224, "step": 692 }, { "clip_ratio": 0.0, - "completion_length": 287.2291717529297, + "completion_length": 559.5416870117188, "epoch": 0.693, - "grad_norm": 15.062301756160279, - "kl": 4.609375, + "grad_norm": 261.1873431552047, + "kl": 10.6875, "learning_rate": 3.3458866392672694e-07, - "loss": 0.8824, - "reward": 2.4654624462127686, - "reward_std": 0.3091539740562439, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015440445393323898, - "rewards/tag_count_reward": 0.953125, + "loss": 1.6707, + "reward": 2.32285612821579, + "reward_std": 0.4213666617870331, + "rewards/accuracy_reward": 0.5000000111758709, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.019157718401402235, + "rewards/tag_count_reward": 0.8489583432674408, "step": 693 }, { "clip_ratio": 0.0, - "completion_length": 291.00001525878906, + "completion_length": 548.9375, "epoch": 0.694, - "grad_norm": 18.68029379562651, - "kl": 5.3984375, + "grad_norm": 30.493282980019504, + "kl": 5.34375, "learning_rate": 3.3321084665422803e-07, - "loss": 1.1877, - "reward": 2.7637146711349487, - "reward_std": 0.369149349629879, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.015799180371686816, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 1.2128, + "reward": 2.4720669984817505, + "reward_std": 0.7172423005104065, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.015780417248606682, + "rewards/tag_count_reward": 0.8906250298023224, "step": 694 }, { "clip_ratio": 0.0, - "completion_length": 298.1041717529297, + "completion_length": 527.9375305175781, "epoch": 0.695, - "grad_norm": 41.216959763632985, - "kl": 7.44921875, + "grad_norm": 32.828395091508476, + "kl": 5.03125, "learning_rate": 3.3183567088914833e-07, - "loss": 0.9188, - "reward": 2.7015944719314575, - "reward_std": 0.37430499494075775, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.029308347031474113, - "rewards/tag_count_reward": 0.953125, + "loss": 0.8333, + "reward": 2.6700429916381836, + "reward_std": 0.5373901128768921, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.02092924155294895, + "rewards/tag_count_reward": 0.8854166865348816, "step": 695 }, { "clip_ratio": 0.0, - "completion_length": 201.45834350585938, + "completion_length": 374.8333435058594, "epoch": 0.696, - "grad_norm": 48.80303297301862, - "kl": 3.796875, + "grad_norm": 13.216131597812279, + "kl": 2.4765625, "learning_rate": 3.3046315338757026e-07, - "loss": 0.7273, - "reward": 2.922324538230896, - "reward_std": 0.22493132948875427, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008231179555878043, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.3144, + "reward": 2.609209179878235, + "reward_std": 0.655095100402832, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01926326658576727, + "rewards/tag_count_reward": 0.9270833432674408, "step": 696 }, { "clip_ratio": 0.0, - "completion_length": 246.4166717529297, + "completion_length": 318.6666717529297, "epoch": 0.697, - "grad_norm": 27.69572426512447, - "kl": 2.818359375, + "grad_norm": 10.118006800111877, + "kl": 0.5234375, "learning_rate": 3.290933108731866e-07, - "loss": 0.5321, - "reward": 2.9189170598983765, - "reward_std": 0.13302714005112648, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02899962943047285, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0809, + "reward": 2.3212504386901855, + "reward_std": 0.5412976741790771, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.020763473585247993, + "rewards/tag_count_reward": 0.9739583432674408, "step": 697 }, { "clip_ratio": 0.0, - "completion_length": 326.4166717529297, + "completion_length": 742.3541870117188, "epoch": 0.698, - "grad_norm": 39.039263843388184, - "kl": 8.53125, + "grad_norm": 28.054949929765485, + "kl": 5.546875, "learning_rate": 3.2772616003709616e-07, - "loss": 1.1547, - "reward": 2.771924376487732, - "reward_std": 0.4464236795902252, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.019742392003536224, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.7595, + "reward": 2.193854570388794, + "reward_std": 0.8353064060211182, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.035312142223119736, + "rewards/tag_count_reward": 0.7916666865348816, "step": 698 }, { "clip_ratio": 0.0, - "completion_length": 214.6666717529297, + "completion_length": 452.37501525878906, "epoch": 0.699, - "grad_norm": 27.315748130046792, - "kl": 6.96875, + "grad_norm": 41.658117016977116, + "kl": 2.1875, "learning_rate": 3.263617175376001e-07, - "loss": 1.3311, - "reward": 2.88124680519104, - "reward_std": 0.3213895112276077, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0076421641279011965, + "loss": 0.8256, + "reward": 2.7566497325897217, + "reward_std": 0.49461202323436737, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.01418376062065363, "rewards/tag_count_reward": 0.9375, "step": 699 }, { "clip_ratio": 0.0, - "completion_length": 242.1041717529297, + "completion_length": 663.5000305175781, "epoch": 0.7, - "grad_norm": 30.13695479194309, - "kl": 4.8125, + "grad_norm": 26.592270906723318, + "kl": 6.0625, "learning_rate": 3.250000000000001e-07, - "loss": 0.6518, - "reward": 2.8072937726974487, - "reward_std": 0.35539232194423676, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.010414544492959976, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.7409, + "reward": 2.101960301399231, + "reward_std": 0.7690182626247406, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02824808470904827, + "rewards/tag_count_reward": 0.7760416865348816, "step": 700 }, { "clip_ratio": 0.0, - "completion_length": 275.18751525878906, + "completion_length": 576.25, "epoch": 0.701, - "grad_norm": 57.102694694394614, - "kl": 8.1875, + "grad_norm": 19.522315074592626, + "kl": 4.1015625, "learning_rate": 3.2364102401639423e-07, - "loss": 1.1402, - "reward": 2.658607006072998, - "reward_std": 0.33723458647727966, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01153182820416987, - "rewards/tag_count_reward": 0.96875, + "loss": 0.6651, + "reward": 2.648020029067993, + "reward_std": 0.46788084506988525, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.009966201148927212, + "rewards/tag_count_reward": 0.8593750298023224, "step": 701 }, { "clip_ratio": 0.0, - "completion_length": 274.6666717529297, + "completion_length": 593.4375305175781, "epoch": 0.702, - "grad_norm": 24.993627500634673, - "kl": 5.859375, + "grad_norm": 21.19308541158298, + "kl": 4.4375, "learning_rate": 3.222848061454764e-07, - "loss": 0.4616, - "reward": 2.7014336585998535, - "reward_std": 0.29814160987734795, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.013844262808561325, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.8651, + "reward": 2.0901803970336914, + "reward_std": 0.6151553392410278, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.020930853206664324, + "rewards/tag_count_reward": 0.8333333730697632, "step": 702 }, { "clip_ratio": 0.0, - "completion_length": 311.7916717529297, + "completion_length": 448.5416717529297, "epoch": 0.703, - "grad_norm": 13.658419931098718, - "kl": 5.109375, + "grad_norm": 14.569396100243361, + "kl": 2.4140625, "learning_rate": 3.209313629123329e-07, - "loss": 1.1121, - "reward": 2.5931508541107178, - "reward_std": 0.2883653864264488, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.014488131739199162, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.3438, + "reward": 2.424591898918152, + "reward_std": 0.4449867308139801, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.025061040185391903, + "rewards/tag_count_reward": 0.9010416865348816, "step": 703 }, { "clip_ratio": 0.0, - "completion_length": 286.50001525878906, + "completion_length": 571.1458435058594, "epoch": 0.704, - "grad_norm": 17.458361775204484, - "kl": 5.40625, + "grad_norm": 40.600173749644895, + "kl": 6.4453125, "learning_rate": 3.195807108082429e-07, - "loss": 1.0745, - "reward": 2.8795833587646484, - "reward_std": 0.3267674595117569, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01624994818121195, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.9898, + "reward": 2.4354113340377808, + "reward_std": 0.6632784008979797, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.014241641853004694, + "rewards/tag_count_reward": 0.8385416865348816, "step": 704 }, { "clip_ratio": 0.0, - "completion_length": 182.75000762939453, + "completion_length": 480.1458435058594, "epoch": 0.705, - "grad_norm": 11.711973709598922, - "kl": 1.76953125, + "grad_norm": 39.53208480397733, + "kl": 3.4765625, "learning_rate": 3.182328662904756e-07, - "loss": 0.226, - "reward": 2.9404876232147217, - "reward_std": 0.1566546568647027, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017845831695012748, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7597, + "reward": 2.5596699714660645, + "reward_std": 0.6091938018798828, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.02366339974105358, + "rewards/tag_count_reward": 0.9166666865348816, "step": 705 }, { "clip_ratio": 0.0, - "completion_length": 209.12500762939453, + "completion_length": 376.5416717529297, "epoch": 0.706, - "grad_norm": 15.475544333478009, - "kl": 1.09765625, + "grad_norm": 26.171445195886303, + "kl": 2.21484375, "learning_rate": 3.168878457820915e-07, - "loss": 0.1107, - "reward": 2.833465337753296, - "reward_std": 0.23035762272775173, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.013757005101069808, - "rewards/tag_count_reward": 1.0, + "loss": 0.5183, + "reward": 2.515265941619873, + "reward_std": 0.4705822169780731, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.017720218747854233, + "rewards/tag_count_reward": 0.921875, "step": 706 }, { "clip_ratio": 0.0, - "completion_length": 190.7916717529297, + "completion_length": 421.41668701171875, "epoch": 0.707, - "grad_norm": 30.261955790190083, - "kl": 1.587890625, + "grad_norm": 23.58547734465505, + "kl": 5.1875, "learning_rate": 3.155456656717408e-07, - "loss": 0.6059, - "reward": 2.9305397272109985, - "reward_std": 0.184548458782956, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012168700341135263, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.9749, + "reward": 2.6543229818344116, + "reward_std": 0.67611363530159, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.010607585310935974, + "rewards/tag_count_reward": 0.9010416865348816, "step": 707 }, { "clip_ratio": 0.0, - "completion_length": 171.77083587646484, + "completion_length": 420.1458435058594, "epoch": 0.708, - "grad_norm": 46.76863103072219, - "kl": 1.796875, + "grad_norm": 22.53505575124777, + "kl": 3.265625, "learning_rate": 3.142063423134644e-07, - "loss": 0.3905, - "reward": 2.7071508169174194, - "reward_std": 0.27239881455898285, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006390861934050918, - "rewards/tag_count_reward": 0.984375, + "loss": 0.5001, + "reward": 2.434324622154236, + "reward_std": 0.42964955419301987, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.01706432970240712, + "rewards/tag_count_reward": 0.9166666865348816, "step": 708 }, { "clip_ratio": 0.0, - "completion_length": 195.39584350585938, + "completion_length": 604.5000305175781, "epoch": 0.709, - "grad_norm": 27.546992507279487, - "kl": 1.5, + "grad_norm": 100.60447189912114, + "kl": 10.71875, "learning_rate": 3.1286989202649503e-07, - "loss": 0.5333, - "reward": 2.760728597640991, - "reward_std": 0.4179042801260948, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.003160247695632279, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.2098, + "reward": 2.214650869369507, + "reward_std": 0.7704206705093384, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.021460428833961487, + "rewards/tag_count_reward": 0.7916666865348816, "step": 709 }, { "clip_ratio": 0.0, - "completion_length": 274.3333435058594, + "completion_length": 441.5208435058594, "epoch": 0.71, - "grad_norm": 15.112928229835997, - "kl": 3.35546875, + "grad_norm": 62.885652673494626, + "kl": 4.6171875, "learning_rate": 3.115363310950578e-07, - "loss": 0.4492, - "reward": 2.8428107500076294, - "reward_std": 0.2803904265165329, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.014828108134679496, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.6795, + "reward": 2.4361919164657593, + "reward_std": 0.6633725464344025, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.022141442634165287, + "rewards/tag_count_reward": 0.9166666865348816, "step": 710 }, { "clip_ratio": 0.0, - "completion_length": 220.85417938232422, + "completion_length": 505.9791717529297, "epoch": 0.711, - "grad_norm": 18.336454114210635, - "kl": 2.736328125, + "grad_norm": 91.4316669194455, + "kl": 6.65625, "learning_rate": 3.102056757681715e-07, - "loss": 0.9938, - "reward": 2.889613389968872, - "reward_std": 0.35467652045190334, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.972222238779068, - "rewards/repetition_penalty_reward": -0.00969224888831377, - "rewards/tag_count_reward": 0.96875, + "loss": 1.2926, + "reward": 2.3233230113983154, + "reward_std": 0.7938812673091888, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9375001192092896, + "rewards/repetition_penalty_reward": -0.01001034933142364, + "rewards/tag_count_reward": 0.8541666865348816, "step": 711 }, { "clip_ratio": 0.0, - "completion_length": 208.1666717529297, + "completion_length": 497.7708435058594, "epoch": 0.712, - "grad_norm": 10.884752088251568, - "kl": 2.52734375, + "grad_norm": 49.94435657649251, + "kl": 5.265625, "learning_rate": 3.0887794225945143e-07, - "loss": 0.3575, - "reward": 2.788315773010254, - "reward_std": 0.3212761878967285, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.013767710886895657, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.7182, + "reward": 2.6127805709838867, + "reward_std": 0.5191345363855362, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.019164070021361113, + "rewards/tag_count_reward": 0.875, "step": 712 }, { "clip_ratio": 0.0, - "completion_length": 191.06250762939453, + "completion_length": 408.9791717529297, "epoch": 0.713, - "grad_norm": 19.476817090586856, - "kl": 2.1015625, + "grad_norm": 24.539952113509884, + "kl": 3.46875, "learning_rate": 3.075531467469116e-07, - "loss": 0.5788, - "reward": 2.681349277496338, - "reward_std": 0.17629835568368435, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0044149053283035755, - "rewards/tag_count_reward": 0.984375, + "loss": 0.5752, + "reward": 2.349856376647949, + "reward_std": 0.4715754985809326, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023407643660902977, + "rewards/tag_count_reward": 0.9218750298023224, "step": 713 }, { "clip_ratio": 0.0, - "completion_length": 155.89584350585938, + "completion_length": 506.7708435058594, "epoch": 0.714, - "grad_norm": 19.725672306294406, - "kl": 0.9453125, + "grad_norm": 32.90722497302991, + "kl": 4.640625, "learning_rate": 3.062313053727671e-07, - "loss": -0.0363, - "reward": 2.8258572816848755, - "reward_std": 0.13794720731675625, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.009212223580107093, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8249, + "reward": 2.2763147354125977, + "reward_std": 0.7002745866775513, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9513889849185944, + "rewards/repetition_penalty_reward": -0.013615832198411226, + "rewards/tag_count_reward": 0.859375, "step": 714 }, { "clip_ratio": 0.0, - "completion_length": 252.02084350585938, + "completion_length": 362.0833435058594, "epoch": 0.715, - "grad_norm": 18.642567269799745, - "kl": 2.453125, + "grad_norm": 27.678515846159396, + "kl": 2.14453125, "learning_rate": 3.0491243424323783e-07, - "loss": 0.7051, - "reward": 2.7655842304229736, - "reward_std": 0.3113892078399658, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.008721279678866267, - "rewards/tag_count_reward": 0.96875, + "loss": 0.3263, + "reward": 2.8031435012817383, + "reward_std": 0.45686857402324677, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023245446383953094, + "rewards/tag_count_reward": 0.9375, "step": 715 }, { "clip_ratio": 0.0, - "completion_length": 198.18750762939453, + "completion_length": 466.6666717529297, "epoch": 0.716, - "grad_norm": 22.296443764491606, - "kl": 2.53125, + "grad_norm": 29.560245962070923, + "kl": 3.8515625, "learning_rate": 3.0359654942835247e-07, - "loss": 0.4787, - "reward": 2.5452027320861816, - "reward_std": 0.12316333223134279, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010353040648624301, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8846, + "reward": 2.3151395320892334, + "reward_std": 0.7553818225860596, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.03381876181811094, + "rewards/tag_count_reward": 0.8906250298023224, "step": 716 }, { "clip_ratio": 0.0, - "completion_length": 241.47917938232422, + "completion_length": 677.5625, "epoch": 0.717, - "grad_norm": 25.195280591175802, - "kl": 2.359375, + "grad_norm": 28.56711039404196, + "kl": 6.859375, "learning_rate": 3.02283666961752e-07, - "loss": 0.8437, - "reward": 2.6863746643066406, - "reward_std": 0.39484353363513947, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.00806977367028594, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.8948, + "reward": 2.116614818572998, + "reward_std": 0.61385178565979, + "rewards/accuracy_reward": 0.3750000223517418, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.018802037462592125, + "rewards/tag_count_reward": 0.8020833730697632, "step": 717 }, { "clip_ratio": 0.0, - "completion_length": 202.12500762939453, + "completion_length": 651.3125305175781, "epoch": 0.718, - "grad_norm": 19.15570911102889, - "kl": 1.5703125, + "grad_norm": 28.83560352945601, + "kl": 6.46875, "learning_rate": 3.0097380284049523e-07, - "loss": 0.4466, - "reward": 2.789478302001953, - "reward_std": 0.3083754926919937, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016077382490038872, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.0283, + "reward": 2.1972588300704956, + "reward_std": 0.6690528243780136, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.014546706806868315, + "rewards/tag_count_reward": 0.7812500298023224, "step": 718 }, { "clip_ratio": 0.0, - "completion_length": 147.02083587646484, + "completion_length": 389.4375, "epoch": 0.719, - "grad_norm": 7.290032040915521, - "kl": 0.650390625, + "grad_norm": 10.934491674221926, + "kl": 1.875, "learning_rate": 2.996669730248628e-07, - "loss": 0.057, - "reward": 2.9914984703063965, - "reward_std": 0.006619289051741362, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008501835342030972, - "rewards/tag_count_reward": 1.0, + "loss": 0.2297, + "reward": 2.1007025241851807, + "reward_std": 0.48380589485168457, + "rewards/accuracy_reward": 0.2291666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.017353057861328125, + "rewards/tag_count_reward": 0.9166666865348816, "step": 719 }, { "clip_ratio": 0.0, - "completion_length": 236.75000762939453, + "completion_length": 394.1041717529297, "epoch": 0.72, - "grad_norm": 21.655585010252846, - "kl": 3.65625, + "grad_norm": 29.82240224098748, + "kl": 3.19921875, "learning_rate": 2.9836319343816397e-07, - "loss": 1.1954, - "reward": 2.663915514945984, - "reward_std": 0.261481836438179, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.007959766313433647, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.6818, + "reward": 2.698520064353943, + "reward_std": 0.6881579607725143, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.011549403425306082, + "rewards/tag_count_reward": 0.9114583432674408, "step": 720 }, { "clip_ratio": 0.0, - "completion_length": 176.6666717529297, + "completion_length": 467.0208435058594, "epoch": 0.721, - "grad_norm": 18.460592446878213, - "kl": 2.16796875, + "grad_norm": 25.761812478587363, + "kl": 3.8984375, "learning_rate": 2.9706247996654134e-07, - "loss": 0.6556, - "reward": 2.9740225076675415, - "reward_std": 0.0827195099554956, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005144236842170358, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5726, + "reward": 2.4670268297195435, + "reward_std": 0.6666984856128693, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.0173482783138752, + "rewards/tag_count_reward": 0.8802083432674408, "step": 721 }, { "clip_ratio": 0.0, - "completion_length": 175.6875, + "completion_length": 459.50001525878906, "epoch": 0.722, - "grad_norm": 55.28102311952479, - "kl": 3.1328125, + "grad_norm": 66.05822501836576, + "kl": 4.71875, "learning_rate": 2.9576484845877793e-07, - "loss": 0.9524, - "reward": 2.95867919921875, - "reward_std": 0.13475285191088915, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004862489411607385, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7149, + "reward": 2.672752857208252, + "reward_std": 0.6598548293113708, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.02516369242221117, + "rewards/tag_count_reward": 0.9062500298023224, "step": 722 }, { "clip_ratio": 0.0, - "completion_length": 149.18750762939453, + "completion_length": 469.5, "epoch": 0.723, - "grad_norm": 9.624153647037245, - "kl": 0.716796875, + "grad_norm": 33.50765624110453, + "kl": 3.296875, "learning_rate": 2.944703147261046e-07, - "loss": -0.0301, - "reward": 2.648527979850769, - "reward_std": 0.1572950854897499, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.004249830497428775, - "rewards/tag_count_reward": 1.0, + "loss": 1.0137, + "reward": 2.1091710329055786, + "reward_std": 0.5909655094146729, + "rewards/accuracy_reward": 0.2916666716337204, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.008884491166099906, + "rewards/tag_count_reward": 0.8958333432674408, "step": 723 }, { "clip_ratio": 0.0, - "completion_length": 166.02084350585938, + "completion_length": 475.9583435058594, "epoch": 0.724, - "grad_norm": 22.691419813179106, - "kl": 3.0859375, + "grad_norm": 32.701475311916404, + "kl": 4.078125, "learning_rate": 2.931788945420058e-07, - "loss": 0.5976, - "reward": 2.933652400970459, - "reward_std": 0.16886768210679293, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009055959060788155, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2, + "reward": 2.4817965030670166, + "reward_std": 0.5851520895957947, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.012995375785976648, + "rewards/tag_count_reward": 0.890625, "step": 724 }, { "clip_ratio": 0.0, - "completion_length": 188.7916717529297, + "completion_length": 554.7083435058594, "epoch": 0.725, - "grad_norm": 13.514562580053095, - "kl": 1.40625, + "grad_norm": 32.79842010966518, + "kl": 5.91796875, "learning_rate": 2.918906036420294e-07, - "loss": 0.0894, - "reward": 2.874236822128296, - "reward_std": 0.1563171287998557, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.014652209356427193, - "rewards/tag_count_reward": 1.0, + "loss": 0.8106, + "reward": 2.432934522628784, + "reward_std": 0.6692648828029633, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01671838667243719, + "rewards/tag_count_reward": 0.859375, "step": 725 }, { "clip_ratio": 0.0, - "completion_length": 181.5416717529297, + "completion_length": 572.5208435058594, "epoch": 0.726, - "grad_norm": 19.574932456068673, - "kl": 1.78515625, + "grad_norm": 34.07001019061104, + "kl": 4.34375, "learning_rate": 2.9060545772359305e-07, - "loss": 0.1352, - "reward": 2.7464375495910645, - "reward_std": 0.10333237610757351, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.012243114179000258, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.985, + "reward": 2.4105674028396606, + "reward_std": 0.5044368803501129, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02866880688816309, + "rewards/tag_count_reward": 0.8906250298023224, "step": 726 }, { "clip_ratio": 0.0, - "completion_length": 219.18750762939453, + "completion_length": 485.41668701171875, "epoch": 0.727, - "grad_norm": 28.005859333493035, - "kl": 4.5703125, + "grad_norm": 22.090198544561794, + "kl": 3.53125, "learning_rate": 2.893234724457946e-07, - "loss": 0.9252, - "reward": 2.6594338417053223, - "reward_std": 0.5057697296142578, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01070529012940824, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.662, + "reward": 2.4937909841537476, + "reward_std": 0.6231936812400818, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.027042274363338947, + "rewards/tag_count_reward": 0.875, "step": 727 }, { "clip_ratio": 0.0, - "completion_length": 324.77083587646484, + "completion_length": 596.9375, "epoch": 0.728, - "grad_norm": 69.40275566790582, - "kl": 10.6875, + "grad_norm": 39.74856940380198, + "kl": 5.5, "learning_rate": 2.8804466342921987e-07, - "loss": 1.6972, - "reward": 2.6688969135284424, - "reward_std": 0.3911566957831383, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02381149446591735, - "rewards/tag_count_reward": 0.921875, + "loss": 1.4125, + "reward": 2.3083548545837402, + "reward_std": 0.7532355189323425, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9513888657093048, + "rewards/repetition_penalty_reward": -0.02324253274127841, + "rewards/tag_count_reward": 0.8177083432674408, "step": 728 }, { "clip_ratio": 0.0, - "completion_length": 341.12500762939453, + "completion_length": 644.2708587646484, "epoch": 0.729, - "grad_norm": 92.2139665613306, - "kl": 11.234375, + "grad_norm": 27.585966675541705, + "kl": 5.046875, "learning_rate": 2.86769046255753e-07, - "loss": 1.2349, - "reward": 2.3806169033050537, - "reward_std": 0.3950771391391754, - "rewards/accuracy_reward": 0.479166679084301, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.016952670644968748, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.8454, + "reward": 2.0349258184432983, + "reward_std": 0.7431914508342743, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.024102029390633106, + "rewards/tag_count_reward": 0.7604166865348816, "step": 729 }, { "clip_ratio": 0.0, - "completion_length": 202.20833587646484, + "completion_length": 416.8958435058594, "epoch": 0.73, - "grad_norm": 14.657545952316154, - "kl": 2.8046875, + "grad_norm": 38.15694283580282, + "kl": 1.8203125, "learning_rate": 2.854966364683872e-07, - "loss": 0.2399, - "reward": 2.8786327838897705, - "reward_std": 0.20367477275431156, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010256112553179264, - "rewards/tag_count_reward": 1.0, + "loss": 0.4531, + "reward": 2.5842431783676147, + "reward_std": 0.4230002462863922, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.021659548394382, + "rewards/tag_count_reward": 0.9322916865348816, "step": 730 }, { "clip_ratio": 0.0, - "completion_length": 206.7708396911621, + "completion_length": 522.5416870117188, "epoch": 0.731, - "grad_norm": 47.20076188385478, - "kl": 6.546875, + "grad_norm": 19.973765241704356, + "kl": 3.59375, "learning_rate": 2.842274495710335e-07, - "loss": 0.7475, - "reward": 2.650895357131958, - "reward_std": 0.21683883713558316, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0053546574199572206, - "rewards/tag_count_reward": 0.96875, + "loss": 0.5592, + "reward": 2.3725064992904663, + "reward_std": 0.5484490990638733, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.016382555477321148, + "rewards/tag_count_reward": 0.8750000298023224, "step": 731 }, { "clip_ratio": 0.0, - "completion_length": 188.1875114440918, + "completion_length": 775.3125305175781, "epoch": 0.732, - "grad_norm": 11.319949330820803, - "kl": 2.47265625, + "grad_norm": 19.54218927587623, + "kl": 8.140625, "learning_rate": 2.829615010283344e-07, - "loss": 0.4432, - "reward": 2.9654629230499268, - "reward_std": 0.073213592171669, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.010231582447886467, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.0839, + "reward": 2.0963175296783447, + "reward_std": 0.8241714835166931, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.06340477429330349, + "rewards/tag_count_reward": 0.7291666865348816, "step": 732 }, { "clip_ratio": 0.0, - "completion_length": 377.68751525878906, + "completion_length": 618.8958435058594, "epoch": 0.733, - "grad_norm": 46.72803847262272, - "kl": 10.59375, + "grad_norm": 15.706934322980523, + "kl": 5.859375, "learning_rate": 2.8169880626547283e-07, - "loss": 1.259, - "reward": 2.65623676776886, - "reward_std": 0.5035720467567444, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.024318894371390343, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.8039, + "reward": 2.311804175376892, + "reward_std": 0.7484097182750702, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.033682181499898434, + "rewards/tag_count_reward": 0.7760416865348816, "step": 733 }, { "clip_ratio": 0.0, - "completion_length": 213.6041717529297, + "completion_length": 640.8541870117188, "epoch": 0.734, - "grad_norm": 14.152390796800312, - "kl": 1.921875, + "grad_norm": 45.104210914901735, + "kl": 4.265625, "learning_rate": 2.8043938066798645e-07, - "loss": 0.3338, - "reward": 2.94937801361084, - "reward_std": 0.13634144980460405, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01416354114189744, - "rewards/tag_count_reward": 0.984375, + "loss": 1.1408, + "reward": 2.3691216707229614, + "reward_std": 0.8749706745147705, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.01976742222905159, + "rewards/tag_count_reward": 0.8125, "step": 734 }, { "clip_ratio": 0.0, - "completion_length": 236.9375, + "completion_length": 556.1041717529297, "epoch": 0.735, - "grad_norm": 18.291230494979676, - "kl": 3.9453125, + "grad_norm": 25.06133634060821, + "kl": 4.1171875, "learning_rate": 2.791832395815782e-07, - "loss": 1.0227, - "reward": 2.734309196472168, - "reward_std": 0.31506381928920746, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.012218464049510658, - "rewards/tag_count_reward": 0.96875, + "loss": 0.901, + "reward": 2.317103087902069, + "reward_std": 0.6717338263988495, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.030119193717837334, + "rewards/tag_count_reward": 0.8333333730697632, "step": 735 }, { "clip_ratio": 0.0, - "completion_length": 267.25001525878906, + "completion_length": 541.4166870117188, "epoch": 0.736, - "grad_norm": 15.578549850826635, - "kl": 4.984375, + "grad_norm": 70.06964895356546, + "kl": 4.8828125, "learning_rate": 2.7793039831193133e-07, - "loss": 1.1073, - "reward": 2.935491442680359, - "reward_std": 0.14980192482471466, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.014161322731524706, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.6714, + "reward": 2.38431978225708, + "reward_std": 0.4595172256231308, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9791666567325592, + "rewards/repetition_penalty_reward": -0.016721924766898155, + "rewards/tag_count_reward": 0.8802083432674408, "step": 736 }, { "clip_ratio": 0.0, - "completion_length": 152.7291717529297, + "completion_length": 563.3541870117188, "epoch": 0.737, - "grad_norm": 28.48324056384496, - "kl": 2.75390625, + "grad_norm": 37.75513345672942, + "kl": 3.3359375, "learning_rate": 2.766808721245211e-07, - "loss": 0.463, - "reward": 2.7014344930648804, - "reward_std": 0.11277389200404286, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005162854678928852, - "rewards/tag_count_reward": 0.984375, + "loss": 1.1112, + "reward": 2.4378621578216553, + "reward_std": 0.59513920545578, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.016999011393636465, + "rewards/tag_count_reward": 0.8645833432674408, "step": 737 }, { "clip_ratio": 0.0, - "completion_length": 154.33333587646484, + "completion_length": 531.875, "epoch": 0.738, - "grad_norm": 22.130402817391822, - "kl": 1.7890625, + "grad_norm": 18.56617973802945, + "kl": 4.125, "learning_rate": 2.7543467624442956e-07, - "loss": 0.3, - "reward": 2.958582043647766, - "reward_std": 0.10669095907360315, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.008431715425103903, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8568, + "reward": 2.246809482574463, + "reward_std": 0.6611096858978271, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.017079456709325314, + "rewards/tag_count_reward": 0.875, "step": 738 }, { "clip_ratio": 0.0, - "completion_length": 166.43750381469727, + "completion_length": 520.2708435058594, "epoch": 0.739, - "grad_norm": 29.8265950358636, - "kl": 1.8671875, + "grad_norm": 23.972621042263274, + "kl": 3.859375, "learning_rate": 2.741918258561607e-07, - "loss": 0.2552, - "reward": 2.8668748140335083, - "reward_std": 0.13983655767515302, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00812540017068386, - "rewards/tag_count_reward": 1.0, + "loss": 1.0541, + "reward": 2.4821611642837524, + "reward_std": 0.4908117651939392, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.03520001098513603, + "rewards/tag_count_reward": 0.9062500298023224, "step": 739 }, { "clip_ratio": 0.0, - "completion_length": 173.2291717529297, + "completion_length": 527.1458587646484, "epoch": 0.74, - "grad_norm": 18.620822989023786, - "kl": 3.53125, + "grad_norm": 40.196471457608176, + "kl": 6.765625, "learning_rate": 2.729523361034538e-07, - "loss": 0.8902, - "reward": 2.9260356426239014, - "reward_std": 0.2500705784186721, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.002783867996186018, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2053, + "reward": 2.3264917135238647, + "reward_std": 0.6097930297255516, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.022466725669801235, + "rewards/tag_count_reward": 0.8281250298023224, "step": 740 }, { "clip_ratio": 0.0, - "completion_length": 216.47917938232422, + "completion_length": 493.04168701171875, "epoch": 0.741, - "grad_norm": 16.147072718339448, - "kl": 1.9609375, + "grad_norm": 29.927610585853564, + "kl": 4.03515625, "learning_rate": 2.717162220891007e-07, - "loss": 0.2391, - "reward": 2.9077422618865967, - "reward_std": 0.18657968193292618, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.01934122107923031, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.9692, + "reward": 2.016554594039917, + "reward_std": 0.4730721116065979, + "rewards/accuracy_reward": 0.16666667722165585, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.023376007564365864, + "rewards/tag_count_reward": 0.921875, "step": 741 }, { "clip_ratio": 0.0, - "completion_length": 273.7916717529297, + "completion_length": 512.9583435058594, "epoch": 0.742, - "grad_norm": 16.89279253923722, - "kl": 4.5703125, + "grad_norm": 25.41926623083724, + "kl": 4.2734375, "learning_rate": 2.7048349887476037e-07, - "loss": 0.6698, - "reward": 2.8869292736053467, - "reward_std": 0.2661277502775192, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.019320933148264885, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 1.0409, + "reward": 2.418197512626648, + "reward_std": 0.6901432275772095, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03492756187915802, + "rewards/tag_count_reward": 0.8697916865348816, "step": 742 }, { "clip_ratio": 0.0, - "completion_length": 120.85416793823242, + "completion_length": 648.3958435058594, "epoch": 0.743, - "grad_norm": 17.51515110675139, - "kl": 2.1171875, + "grad_norm": 28.016549982017455, + "kl": 6.734375, "learning_rate": 2.692541814807763e-07, - "loss": 0.1561, - "reward": 2.983708143234253, - "reward_std": 0.04747849889099598, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.0024030021158978343, - "rewards/tag_count_reward": 1.0, + "loss": 1.1588, + "reward": 2.3356810808181763, + "reward_std": 0.7242786288261414, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.021957868244498968, + "rewards/tag_count_reward": 0.8229166865348816, "step": 743 }, { "clip_ratio": 0.0, - "completion_length": 384.1666717529297, + "completion_length": 528.0000152587891, "epoch": 0.744, - "grad_norm": 42.90871362781308, - "kl": 7.0703125, + "grad_norm": 93.21543758888738, + "kl": 9.375, "learning_rate": 2.6802828488599294e-07, - "loss": 0.5937, - "reward": 2.2971469163894653, - "reward_std": 0.32202285528182983, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.029241953045129776, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 1.1425, + "reward": 2.218702793121338, + "reward_std": 0.743184506893158, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.02608906291425228, + "rewards/tag_count_reward": 0.8489583432674408, "step": 744 }, { "clip_ratio": 0.0, - "completion_length": 294.64583587646484, + "completion_length": 522.5833587646484, "epoch": 0.745, - "grad_norm": 62.51860865543721, - "kl": 7.3984375, + "grad_norm": 29.592772740929842, + "kl": 5.4375, "learning_rate": 2.6680582402757324e-07, - "loss": 0.5642, - "reward": 2.505735754966736, - "reward_std": 0.507663369178772, - "rewards/accuracy_reward": 0.625, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.018569822888821363, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.8376, + "reward": 2.1133170127868652, + "reward_std": 0.4543229639530182, + "rewards/accuracy_reward": 0.27083333395421505, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.020363647490739822, + "rewards/tag_count_reward": 0.890625, "step": 745 }, { "clip_ratio": 0.0, - "completion_length": 213.12501525878906, + "completion_length": 537.7083435058594, "epoch": 0.746, - "grad_norm": 14.701319777324137, - "kl": 5.1875, + "grad_norm": 52.43685846686619, + "kl": 7.9375, "learning_rate": 2.655868138008171e-07, - "loss": 1.2155, - "reward": 2.8672006130218506, - "reward_std": 0.277056522667408, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007799285929650068, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 1.3022, + "reward": 2.560097575187683, + "reward_std": 0.6881845593452454, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.01802760176360607, + "rewards/tag_count_reward": 0.8697916865348816, "step": 746 }, { "clip_ratio": 0.0, - "completion_length": 171.62500762939453, + "completion_length": 491.4583435058594, "epoch": 0.747, - "grad_norm": 24.416347333819463, - "kl": 4.4765625, + "grad_norm": 42.17070883402169, + "kl": 4.8125, "learning_rate": 2.6437126905897967e-07, - "loss": 0.5074, - "reward": 2.8408756256103516, - "reward_std": 0.2884572371840477, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.013291162671521306, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.9389, + "reward": 2.4418286085128784, + "reward_std": 0.479451060295105, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.023449244908988476, + "rewards/tag_count_reward": 0.9166666865348816, "step": 747 }, { "clip_ratio": 0.0, - "completion_length": 140.02083587646484, + "completion_length": 495.89585876464844, "epoch": 0.748, - "grad_norm": 38.84889999453808, - "kl": 2.07421875, + "grad_norm": 42.23947257797948, + "kl": 3.734375, "learning_rate": 2.631592046130896e-07, - "loss": 0.3744, - "reward": 2.7368489503860474, - "reward_std": 0.028997281100600958, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.006206809193827212, - "rewards/tag_count_reward": 1.0, + "loss": 0.8408, + "reward": 2.2848896980285645, + "reward_std": 0.6534326076507568, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.029346639290452003, + "rewards/tag_count_reward": 0.8697916865348816, "step": 748 }, { "clip_ratio": 0.0, - "completion_length": 171.2083396911621, + "completion_length": 520.9166717529297, "epoch": 0.749, - "grad_norm": 20.950568177980344, - "kl": 3.0390625, + "grad_norm": 28.13462040337534, + "kl": 4.9921875, "learning_rate": 2.6195063523177e-07, - "loss": 0.881, - "reward": 2.9372819662094116, - "reward_std": 0.2130552427843213, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.003690255805850029, - "rewards/tag_count_reward": 0.96875, + "loss": 0.6576, + "reward": 2.5075308084487915, + "reward_std": 0.4372696727514267, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.020247011445462704, + "rewards/tag_count_reward": 0.8958333730697632, "step": 749 }, { "clip_ratio": 0.0, - "completion_length": 127.35417175292969, + "completion_length": 550.9791870117188, "epoch": 0.75, - "grad_norm": 22.815278928317912, - "kl": 2.11328125, + "grad_norm": 51.58149030204615, + "kl": 4.734375, "learning_rate": 2.6074557564105724e-07, - "loss": 0.4686, - "reward": 2.968356490135193, - "reward_std": 0.10191531106829643, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0038657764671370387, - "rewards/tag_count_reward": 1.0, + "loss": 1.2626, + "reward": 2.304089069366455, + "reward_std": 0.65184286236763, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.010147055611014366, + "rewards/tag_count_reward": 0.8489583730697632, "step": 750 }, { "clip_ratio": 0.0, - "completion_length": 129.50000381469727, + "completion_length": 414.5208435058594, "epoch": 0.751, - "grad_norm": 15.567302000111946, - "kl": 3.1875, + "grad_norm": 25.50420977958888, + "kl": 2.85546875, "learning_rate": 2.595440405242222e-07, - "loss": 0.6828, - "reward": 2.9434677362442017, - "reward_std": 0.16990657895803452, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0027129139052703977, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.4946, + "reward": 2.3329780101776123, + "reward_std": 0.4813043922185898, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01424429938197136, + "rewards/tag_count_reward": 0.9166666865348816, "step": 751 }, { "clip_ratio": 0.0, - "completion_length": 217.64583587646484, + "completion_length": 549.7708587646484, "epoch": 0.752, - "grad_norm": 12.41339243387379, - "kl": 4.640625, + "grad_norm": 30.257524874531967, + "kl": 4.296875, "learning_rate": 2.583460445215911e-07, - "loss": 0.5962, - "reward": 2.921668767929077, - "reward_std": 0.17934391275048256, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015831364318728447, - "rewards/tag_count_reward": 1.0, + "loss": 1.0795, + "reward": 2.672580599784851, + "reward_std": 0.6382189244031906, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.013183359056711197, + "rewards/tag_count_reward": 0.8802083432674408, "step": 752 }, { "clip_ratio": 0.0, - "completion_length": 181.58334350585938, + "completion_length": 409.50001525878906, "epoch": 0.753, - "grad_norm": 27.910440784105973, - "kl": 5.27734375, + "grad_norm": 42.99616520034882, + "kl": 3.15625, "learning_rate": 2.571516022303671e-07, - "loss": 0.4888, - "reward": 2.95553195476532, - "reward_std": 0.11562120169401169, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008009851910173893, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.8373, + "reward": 2.759161591529846, + "reward_std": 0.7119235694408417, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02382473973557353, + "rewards/tag_count_reward": 0.921875, "step": 753 }, { "clip_ratio": 0.0, - "completion_length": 267.5208435058594, + "completion_length": 536.5625305175781, "epoch": 0.754, - "grad_norm": 22.261054871996944, - "kl": 5.046875, + "grad_norm": 31.379188639299347, + "kl": 5.7109375, "learning_rate": 2.5596072820445254e-07, - "loss": 0.9985, - "reward": 2.60988712310791, - "reward_std": 0.22960419952869415, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.020321237854659557, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.1123, + "reward": 2.451164484024048, + "reward_std": 0.6144693195819855, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.909722238779068, + "rewards/repetition_penalty_reward": -0.01584968389943242, + "rewards/tag_count_reward": 0.8697916865348816, "step": 754 }, { "clip_ratio": 0.0, - "completion_length": 111.52083587646484, + "completion_length": 489.0416717529297, "epoch": 0.755, - "grad_norm": 16.218760674533854, - "kl": 0.984375, + "grad_norm": 45.223755625235505, + "kl": 4.796875, "learning_rate": 2.547734369542718e-07, - "loss": 0.1248, - "reward": 2.7425146102905273, - "reward_std": 0.02554816461633891, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0005409659206634387, - "rewards/tag_count_reward": 1.0, + "loss": 1.2234, + "reward": 2.4944331645965576, + "reward_std": 0.610102117061615, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.012511319015175104, + "rewards/tag_count_reward": 0.8958333432674408, "step": 755 }, { "clip_ratio": 0.0, - "completion_length": 157.3541717529297, + "completion_length": 443.12501525878906, "epoch": 0.756, - "grad_norm": 14.794892527909738, - "kl": 3.4296875, + "grad_norm": 43.56858310727776, + "kl": 4.9921875, "learning_rate": 2.5358974294659373e-07, - "loss": 0.4376, - "reward": 2.9546356201171875, - "reward_std": 0.13463160302489996, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008906134404242039, - "rewards/tag_count_reward": 0.984375, + "loss": 1.0932, + "reward": 2.5396214723587036, + "reward_std": 0.7032516896724701, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.014198230113834143, + "rewards/tag_count_reward": 0.8802083432674408, "step": 756 }, { "clip_ratio": 0.0, - "completion_length": 231.68751525878906, + "completion_length": 380.25, "epoch": 0.757, - "grad_norm": 17.520783547440633, - "kl": 3.45703125, + "grad_norm": 42.833747394340975, + "kl": 3.90625, "learning_rate": 2.5240966060435674e-07, - "loss": 0.7316, - "reward": 2.9272828102111816, - "reward_std": 0.13057884108275175, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010217505972832441, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.4364, + "reward": 2.670251488685608, + "reward_std": 0.3203464448451996, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.013776272535324097, + "rewards/tag_count_reward": 0.9270833432674408, "step": 757 }, { "clip_ratio": 0.0, - "completion_length": 180.68750762939453, + "completion_length": 520.7291870117188, "epoch": 0.758, - "grad_norm": 19.025131213173587, - "kl": 4.859375, + "grad_norm": 103.32021322129948, + "kl": 6.125, "learning_rate": 2.512332043064913e-07, - "loss": 0.7966, - "reward": 2.6720521450042725, - "reward_std": 0.21184147894382477, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.005031284177675843, - "rewards/tag_count_reward": 0.96875, + "loss": 1.2798, + "reward": 2.463586449623108, + "reward_std": 0.6297429800033569, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017316261306405067, + "rewards/tag_count_reward": 0.890625, "step": 758 }, { "clip_ratio": 0.0, - "completion_length": 166.87500762939453, + "completion_length": 603.5625305175781, "epoch": 0.759, - "grad_norm": 18.736246550285085, - "kl": 3.59375, + "grad_norm": 43.65057021190212, + "kl": 6.9375, "learning_rate": 2.5006038838774647e-07, - "loss": 0.5299, - "reward": 2.764945387840271, - "reward_std": 0.29092290811240673, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.004151897912379354, - "rewards/tag_count_reward": 0.984375, + "loss": 0.9189, + "reward": 2.158547341823578, + "reward_std": 0.6411450803279877, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.009855579119175673, + "rewards/tag_count_reward": 0.8489583730697632, "step": 759 }, { "clip_ratio": 0.0, - "completion_length": 172.3541717529297, + "completion_length": 617.6041870117188, "epoch": 0.76, - "grad_norm": 12.657785398168317, - "kl": 3.48046875, + "grad_norm": 19.541751124761845, + "kl": 6.046875, "learning_rate": 2.488912271385139e-07, - "loss": 0.5394, - "reward": 2.6646286249160767, - "reward_std": 0.21880869567394257, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007246227003633976, - "rewards/tag_count_reward": 0.984375, + "loss": 1.0373, + "reward": 2.354681611061096, + "reward_std": 0.6235771775245667, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.025526808109134436, + "rewards/tag_count_reward": 0.796875, "step": 760 }, { "clip_ratio": 0.0, - "completion_length": 150.52083587646484, + "completion_length": 486.4166717529297, "epoch": 0.761, - "grad_norm": 13.324455559920084, - "kl": 2.853515625, + "grad_norm": 39.557769432826085, + "kl": 2.8046875, "learning_rate": 2.4772573480465445e-07, - "loss": 0.3322, - "reward": 2.746537208557129, - "reward_std": 0.2090878188610077, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0034628245048224926, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8268, + "reward": 2.4871546030044556, + "reward_std": 0.6703044772148132, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.024998143315315247, + "rewards/tag_count_reward": 0.921875, "step": 761 }, { "clip_ratio": 0.0, - "completion_length": 131.1041717529297, + "completion_length": 558.5208435058594, "epoch": 0.762, - "grad_norm": 14.591735378716425, - "kl": 2.1796875, + "grad_norm": 37.29197408055863, + "kl": 4.4140625, "learning_rate": 2.465639255873246e-07, - "loss": 0.278, - "reward": 2.765766978263855, - "reward_std": 0.07774607161991298, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005066295387223363, - "rewards/tag_count_reward": 1.0, + "loss": 0.9776, + "reward": 2.1103891134262085, + "reward_std": 0.7344604879617691, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.02502751164138317, + "rewards/tag_count_reward": 0.8020833432674408, "step": 762 }, { "clip_ratio": 0.0, - "completion_length": 144.83333587646484, + "completion_length": 556.3333435058594, "epoch": 0.763, - "grad_norm": 42.22171491104914, - "kl": 1.578125, + "grad_norm": 17.379113373562127, + "kl": 4.255859375, "learning_rate": 2.454058136428027e-07, - "loss": 0.0699, - "reward": 2.538090944290161, - "reward_std": 0.282714419066906, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0035757378209382296, - "rewards/tag_count_reward": 1.0, + "loss": 0.5297, + "reward": 2.3058613538742065, + "reward_std": 0.5592197477817535, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.017055293079465628, + "rewards/tag_count_reward": 0.8229166865348816, "step": 763 }, { "clip_ratio": 0.0, - "completion_length": 223.79167938232422, + "completion_length": 404.56251525878906, "epoch": 0.764, - "grad_norm": 13.865044383760187, - "kl": 3.650390625, + "grad_norm": 18.161864885556103, + "kl": 1.474609375, "learning_rate": 2.4425141308231765e-07, - "loss": 0.8248, - "reward": 2.9157071113586426, - "reward_std": 0.19353636540472507, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011376385111361742, - "rewards/tag_count_reward": 0.96875, + "loss": 0.2815, + "reward": 2.35969877243042, + "reward_std": 0.4401056468486786, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.023981940932571888, + "rewards/tag_count_reward": 0.953125, "step": 764 }, { "clip_ratio": 0.0, - "completion_length": 198.52084350585938, + "completion_length": 388.0, "epoch": 0.765, - "grad_norm": 14.395945516425703, - "kl": 1.84375, + "grad_norm": 35.807830777637626, + "kl": 2.65625, "learning_rate": 2.4310073797187573e-07, - "loss": 0.229, - "reward": 2.5803310871124268, - "reward_std": 0.3961055725812912, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.023835754953324795, - "rewards/tag_count_reward": 1.0, + "loss": 0.7565, + "reward": 2.547232747077942, + "reward_std": 0.6375119686126709, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.013531276490539312, + "rewards/tag_count_reward": 0.921875, "step": 765 }, { "clip_ratio": 0.0, - "completion_length": 190.1875, + "completion_length": 414.5416717529297, "epoch": 0.766, - "grad_norm": 18.081841371989597, - "kl": 3.3984375, + "grad_norm": 31.125919596419223, + "kl": 1.38671875, "learning_rate": 2.4195380233209006e-07, - "loss": 0.7971, - "reward": 2.9633615016937256, - "reward_std": 0.11096163839101791, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.007124634925276041, - "rewards/tag_count_reward": 0.984375, + "loss": 0.481, + "reward": 2.5880796909332275, + "reward_std": 0.547327809035778, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.021295433398336172, + "rewards/tag_count_reward": 0.9427083432674408, "step": 766 }, { "clip_ratio": 0.0, - "completion_length": 298.0208435058594, + "completion_length": 451.41668701171875, "epoch": 0.767, - "grad_norm": 25.22808097654846, - "kl": 4.943359375, + "grad_norm": 36.219759589516954, + "kl": 4.25, "learning_rate": 2.408106201380097e-07, - "loss": 0.495, - "reward": 2.532307267189026, - "reward_std": 0.2690594419836998, - "rewards/accuracy_reward": 0.6041666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.012831668369472027, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.8591, + "reward": 2.333495259284973, + "reward_std": 0.6193402707576752, + "rewards/accuracy_reward": 0.583333358168602, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.015463168267160654, + "rewards/tag_count_reward": 0.8489583432674408, "step": 767 }, { "clip_ratio": 0.0, - "completion_length": 238.64584350585938, + "completion_length": 520.5833435058594, "epoch": 0.768, - "grad_norm": 14.197644611931855, - "kl": 4.2421875, + "grad_norm": 23.45639672156471, + "kl": 4.4765625, "learning_rate": 2.3967120531894857e-07, - "loss": 1.0039, - "reward": 2.6744498014450073, - "reward_std": 0.436637282371521, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.014786453917622566, - "rewards/tag_count_reward": 0.953125, + "loss": 0.9129, + "reward": 2.30954110622406, + "reward_std": 0.6106710433959961, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.016847790218889713, + "rewards/tag_count_reward": 0.8750000298023224, "step": 768 }, { "clip_ratio": 0.0, - "completion_length": 243.81251525878906, + "completion_length": 551.25, "epoch": 0.769, - "grad_norm": 9.36349937077964, - "kl": 4.328125, + "grad_norm": 129.78473682307904, + "kl": 6.140625, "learning_rate": 2.38535571758317e-07, - "loss": 0.7109, - "reward": 2.884034276008606, - "reward_std": 0.22643481940031052, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011799137573689222, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 1.3243, + "reward": 2.448180079460144, + "reward_std": 0.7538703083992004, + "rewards/accuracy_reward": 0.6875000149011612, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02404231671243906, + "rewards/tag_count_reward": 0.8333333432674408, "step": 769 }, { "clip_ratio": 0.0, - "completion_length": 287.4375, + "completion_length": 569.8750305175781, "epoch": 0.77, - "grad_norm": 16.449473053498554, - "kl": 4.1875, + "grad_norm": 35.971720291998714, + "kl": 7.40625, "learning_rate": 2.374037332934512e-07, - "loss": 0.6688, - "reward": 2.753445863723755, - "reward_std": 0.38689200580120087, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012179049663245678, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 1.0446, + "reward": 2.1608505249023438, + "reward_std": 0.7265328466892242, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.028385651297867298, + "rewards/tag_count_reward": 0.828125, "step": 770 }, { "clip_ratio": 0.0, - "completion_length": 131.00000381469727, + "completion_length": 474.7916717529297, "epoch": 0.771, - "grad_norm": 29.20972356383704, - "kl": 1.6953125, + "grad_norm": 78.42716292437864, + "kl": 4.375, "learning_rate": 2.36275703715446e-07, - "loss": 0.0826, - "reward": 2.8022760152816772, - "reward_std": 0.1071137166582048, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005015602451749146, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.6855, + "reward": 2.4102360010147095, + "reward_std": 0.627646267414093, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.016847367398440838, + "rewards/tag_count_reward": 0.9270833730697632, "step": 771 }, { "clip_ratio": 0.0, - "completion_length": 203.7291717529297, + "completion_length": 517.9166870117188, "epoch": 0.772, - "grad_norm": 22.356575620780674, - "kl": 2.3984375, + "grad_norm": 51.33839598106081, + "kl": 7.234375, "learning_rate": 2.3515149676898552e-07, - "loss": 0.2179, - "reward": 2.8789279460906982, - "reward_std": 0.21045194566249847, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01169718848541379, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9322, + "reward": 2.0547146797180176, + "reward_std": 0.7165104746818542, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.8819445371627808, + "rewards/repetition_penalty_reward": -0.03035500179976225, + "rewards/tag_count_reward": 0.8697916865348816, "step": 772 }, { "clip_ratio": 0.0, - "completion_length": 250.02083587646484, + "completion_length": 662.8541870117188, "epoch": 0.773, - "grad_norm": 18.592500928883528, - "kl": 3.36328125, + "grad_norm": 99.24123358538509, + "kl": 10.1875, "learning_rate": 2.3403112615217693e-07, - "loss": 0.7522, - "reward": 2.6700769662857056, - "reward_std": 0.0987859012093395, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.015686861937865615, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 1.2164, + "reward": 2.322015166282654, + "reward_std": 0.7576204538345337, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.014790416695177555, + "rewards/tag_count_reward": 0.7812500298023224, "step": 773 }, { "clip_ratio": 0.0, - "completion_length": 216.9166717529297, + "completion_length": 555.0208587646484, "epoch": 0.774, - "grad_norm": 15.152721490251723, - "kl": 1.828125, + "grad_norm": 55.815576719277246, + "kl": 6.765625, "learning_rate": 2.3291460551638237e-07, - "loss": 0.6006, - "reward": 2.916693687438965, - "reward_std": 0.21100225299596786, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019070114940404892, - "rewards/tag_count_reward": 0.984375, + "loss": 1.1251, + "reward": 2.1742671728134155, + "reward_std": 0.6895886063575745, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.020177372731268406, + "rewards/tag_count_reward": 0.8333333432674408, "step": 774 }, { "clip_ratio": 0.0, - "completion_length": 226.75000762939453, + "completion_length": 501.3333435058594, "epoch": 0.775, - "grad_norm": 22.24137323003984, - "kl": 3.21484375, + "grad_norm": 21.026258662130076, + "kl": 5.171875, "learning_rate": 2.3180194846605364e-07, - "loss": 0.5179, - "reward": 2.751786470413208, - "reward_std": 0.45633381605148315, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008630172349512577, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.8011, + "reward": 2.2563339471817017, + "reward_std": 0.6224390119314194, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.011027187574654818, + "rewards/tag_count_reward": 0.8645833730697632, "step": 775 }, { "clip_ratio": 0.0, - "completion_length": 238.75, + "completion_length": 488.3958435058594, "epoch": 0.776, - "grad_norm": 26.753612353431073, - "kl": 4.921875, + "grad_norm": 40.72839840915295, + "kl": 2.578125, "learning_rate": 2.306931685585657e-07, - "loss": 0.6384, - "reward": 2.8042763471603394, - "reward_std": 0.35237380862236023, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.020376557484269142, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.791, + "reward": 2.5573089122772217, + "reward_std": 0.6792595684528351, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.04164944589138031, + "rewards/tag_count_reward": 0.9114583432674408, "step": 776 }, { "clip_ratio": 0.0, - "completion_length": 176.77084350585938, + "completion_length": 478.1875, "epoch": 0.777, - "grad_norm": 9.906183452338604, - "kl": 1.126953125, + "grad_norm": 65.73064893150305, + "kl": 3.2578125, "learning_rate": 2.2958827930405162e-07, - "loss": 0.0839, - "reward": 2.9708948135375977, - "reward_std": 0.044292759615927935, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.008272158447653055, - "rewards/tag_count_reward": 1.0, + "loss": 1.0533, + "reward": 2.6781049966812134, + "reward_std": 0.6686598658561707, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.01633965945802629, + "rewards/tag_count_reward": 0.8750000298023224, "step": 777 }, { "clip_ratio": 0.0, - "completion_length": 324.6875, + "completion_length": 374.79168701171875, "epoch": 0.778, - "grad_norm": 17.830842087547587, - "kl": 4.6875, + "grad_norm": 12.77197238349156, + "kl": 2.05078125, "learning_rate": 2.2848729416523859e-07, - "loss": 0.7073, - "reward": 2.4572025537490845, - "reward_std": 0.4395640790462494, - "rewards/accuracy_reward": 0.5208333432674408, + "loss": 0.2598, + "reward": 2.4885555505752563, + "reward_std": 0.43929314613342285, + "rewards/accuracy_reward": 0.5625000298023224, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.0150197958573699, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.014916833024471998, + "rewards/tag_count_reward": 0.9687500298023224, "step": 778 }, { "clip_ratio": 0.0, - "completion_length": 203.27084350585938, + "completion_length": 520.2291870117188, "epoch": 0.779, - "grad_norm": 17.619753875907666, - "kl": 1.75390625, + "grad_norm": 27.96030488624158, + "kl": 6.6875, "learning_rate": 2.2739022655728277e-07, - "loss": 0.594, - "reward": 2.946708559989929, - "reward_std": 0.13191527011804283, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016833133762702346, - "rewards/tag_count_reward": 0.984375, + "loss": 0.9628, + "reward": 2.069424092769623, + "reward_std": 0.6286317706108093, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.012173243798315525, + "rewards/tag_count_reward": 0.8385416865348816, "step": 779 }, { "clip_ratio": 0.0, - "completion_length": 136.89583587646484, + "completion_length": 430.22918701171875, "epoch": 0.78, - "grad_norm": 35.192253885531564, - "kl": 1.58203125, + "grad_norm": 29.26274656267414, + "kl": 3.59375, "learning_rate": 2.2629708984760706e-07, - "loss": 0.2184, - "reward": 2.8248159885406494, - "reward_std": 0.14497528970241547, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0033091583172790706, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.9216, + "reward": 2.4278910160064697, + "reward_std": 0.5140875577926636, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.018289764411747456, + "rewards/tag_count_reward": 0.9322916865348816, "step": 780 }, { "clip_ratio": 0.0, - "completion_length": 150.7708396911621, + "completion_length": 520.8750152587891, "epoch": 0.781, - "grad_norm": 13.382902330343864, - "kl": 2.125, + "grad_norm": 27.923966049196657, + "kl": 4.5703125, "learning_rate": 2.2520789735573704e-07, - "loss": 0.2202, - "reward": 2.9909991025924683, - "reward_std": 0.00852905202191323, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009000916528748348, - "rewards/tag_count_reward": 1.0, + "loss": 1.121, + "reward": 2.480145573616028, + "reward_std": 0.6819994747638702, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.02679906040430069, + "rewards/tag_count_reward": 0.8541666865348816, "step": 781 }, { "clip_ratio": 0.0, - "completion_length": 196.4375114440918, + "completion_length": 503.1875305175781, "epoch": 0.782, - "grad_norm": 20.432170938851172, - "kl": 1.765625, + "grad_norm": 33.004171238913635, + "kl": 2.75, "learning_rate": 2.2412266235313973e-07, - "loss": 0.5266, - "reward": 2.968478202819824, - "reward_std": 0.09098697081208229, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010688654729165137, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7836, + "reward": 2.555114269256592, + "reward_std": 0.6732204258441925, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.04210797697305679, + "rewards/tag_count_reward": 0.9166666865348816, "step": 782 }, { "clip_ratio": 0.0, - "completion_length": 207.62501525878906, + "completion_length": 543.2291870117188, "epoch": 0.783, - "grad_norm": 16.22037756932636, - "kl": 3.2734375, + "grad_norm": 26.670810792645383, + "kl": 3.9765625, "learning_rate": 2.230413980630609e-07, - "loss": 0.6805, - "reward": 2.9210058450698853, - "reward_std": 0.16372934356331825, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.007813675329089165, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8157, + "reward": 2.3876798152923584, + "reward_std": 0.6481176614761353, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.02725088130682707, + "rewards/tag_count_reward": 0.8593750298023224, "step": 783 }, { "clip_ratio": 0.0, - "completion_length": 144.4375, + "completion_length": 454.14585876464844, "epoch": 0.784, - "grad_norm": 18.901786788594798, - "kl": 1.79296875, + "grad_norm": 20.273867549895126, + "kl": 2.8828125, "learning_rate": 2.2196411766036487e-07, - "loss": 0.1775, - "reward": 2.5054566860198975, - "reward_std": 0.21673375321552157, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0032239637803286314, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3284, + "reward": 2.4754650592803955, + "reward_std": 0.5723298937082291, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03495161887258291, + "rewards/tag_count_reward": 0.90625, "step": 784 }, { "clip_ratio": 0.0, - "completion_length": 178.02083587646484, + "completion_length": 726.125, "epoch": 0.785, - "grad_norm": 18.543389088207263, - "kl": 2.4609375, + "grad_norm": 29.95305237028232, + "kl": 8.2578125, "learning_rate": 2.2089083427137329e-07, - "loss": 0.1686, - "reward": 2.797831892967224, - "reward_std": 0.22630199790000916, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.007723640068434179, - "rewards/tag_count_reward": 1.0, + "loss": 1.0536, + "reward": 2.2507245540618896, + "reward_std": 0.7002788186073303, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.025317177176475525, + "rewards/tag_count_reward": 0.7552083432674408, "step": 785 }, { "clip_ratio": 0.0, - "completion_length": 275.25001525878906, + "completion_length": 456.6875, "epoch": 0.786, - "grad_norm": 30.974844901718203, - "kl": 4.4375, + "grad_norm": 23.330969926542757, + "kl": 3.6953125, "learning_rate": 2.1982156097370557e-07, - "loss": 1.4646, - "reward": 2.854225993156433, - "reward_std": 0.39963753521442413, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01903794752433896, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.851, + "reward": 2.6652588844299316, + "reward_std": 0.6001283526420593, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.015296765603125095, + "rewards/tag_count_reward": 0.8958333730697632, "step": 786 }, { "clip_ratio": 0.0, - "completion_length": 255.4791717529297, + "completion_length": 439.56251525878906, "epoch": 0.787, - "grad_norm": 28.634847436695246, - "kl": 3.4296875, + "grad_norm": 41.78445904580422, + "kl": 3.1875, "learning_rate": 2.1875631079611956e-07, - "loss": 0.5931, - "reward": 2.887614369392395, - "reward_std": 0.15608347207307816, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01863575167953968, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.9046, + "reward": 2.517007827758789, + "reward_std": 0.5808334052562714, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.019450515508651733, + "rewards/tag_count_reward": 0.8906250298023224, "step": 787 }, { "clip_ratio": 0.0, - "completion_length": 298.9166717529297, + "completion_length": 500.0208435058594, "epoch": 0.788, - "grad_norm": 58.22264849159361, - "kl": 9.0, + "grad_norm": 21.94070893015738, + "kl": 3.8828125, "learning_rate": 2.1769509671835223e-07, - "loss": 1.3216, - "reward": 2.867523670196533, - "reward_std": 0.3909846842288971, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.016157003468833864, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.8589, + "reward": 2.501682996749878, + "reward_std": 0.7997144758701324, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.022622703574597836, + "rewards/tag_count_reward": 0.8854166865348816, "step": 788 }, { "clip_ratio": 0.0, - "completion_length": 177.93750762939453, + "completion_length": 356.10418701171875, "epoch": 0.789, - "grad_norm": 17.57635390493844, - "kl": 3.3359375, + "grad_norm": 18.917515789374068, + "kl": 4.484375, "learning_rate": 2.166379316709625e-07, - "loss": 0.3588, - "reward": 2.5191460847854614, - "reward_std": 0.3168012648820877, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.013840071391314268, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4355, + "reward": 2.2824082374572754, + "reward_std": 0.5984006226062775, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.019675294868648052, + "rewards/tag_count_reward": 0.8854166865348816, "step": 789 }, { "clip_ratio": 0.0, - "completion_length": 181.31250762939453, + "completion_length": 632.2083740234375, "epoch": 0.79, - "grad_norm": 49.708814225578905, - "kl": 6.96875, + "grad_norm": 16.91140019655358, + "kl": 6.6875, "learning_rate": 2.1558482853517253e-07, - "loss": 0.612, - "reward": 2.935931444168091, - "reward_std": 0.13722094893455505, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010249054757878184, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.2443, + "reward": 2.2729358673095703, + "reward_std": 0.853611171245575, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.8819445073604584, + "rewards/repetition_penalty_reward": -0.025675359182059765, + "rewards/tag_count_reward": 0.8333333432674408, "step": 790 }, { "clip_ratio": 0.0, - "completion_length": 198.77084350585938, + "completion_length": 376.31251525878906, "epoch": 0.791, - "grad_norm": 14.310785168079715, - "kl": 3.25, + "grad_norm": 20.048000622022776, + "kl": 1.310546875, "learning_rate": 2.1453580014271203e-07, - "loss": 0.2853, - "reward": 2.839933753013611, - "reward_std": 0.24727932829409838, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.010760855628177524, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.3778, + "reward": 2.2707479000091553, + "reward_std": 0.3341696485877037, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.0191827230155468, + "rewards/tag_count_reward": 0.984375, "step": 791 }, { "clip_ratio": 0.0, - "completion_length": 167.12500762939453, + "completion_length": 371.6458435058594, "epoch": 0.792, - "grad_norm": 35.116816044627456, - "kl": 3.578125, + "grad_norm": 19.6502337749718, + "kl": 1.71484375, "learning_rate": 2.134908592756607e-07, - "loss": 0.7321, - "reward": 2.9144824743270874, - "reward_std": 0.26326968893408775, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.005656404755427502, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.4382, + "reward": 2.6625298261642456, + "reward_std": 0.537158876657486, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02670634165406227, + "rewards/tag_count_reward": 0.9739583730697632, "step": 792 }, { "clip_ratio": 0.0, - "completion_length": 125.64583587646484, + "completion_length": 515.7291870117188, "epoch": 0.793, - "grad_norm": 28.157215116049354, - "kl": 2.60546875, + "grad_norm": 28.434148981976, + "kl": 6.421875, "learning_rate": 2.124500186662932e-07, - "loss": 0.3993, - "reward": 2.903472065925598, - "reward_std": 0.2684580236673355, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0010419311583973467, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8134, + "reward": 2.500126600265503, + "reward_std": 0.7610350847244263, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.013762431219220161, + "rewards/tag_count_reward": 0.8541666865348816, "step": 793 }, { "clip_ratio": 0.0, - "completion_length": 198.18750762939453, + "completion_length": 621.7500305175781, "epoch": 0.794, - "grad_norm": 21.04750447916934, - "kl": 4.0859375, + "grad_norm": 34.773569674299296, + "kl": 7.453125, "learning_rate": 2.1141329099692406e-07, - "loss": 0.8452, - "reward": 2.729486346244812, - "reward_std": 0.3937009274959564, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008360920706763864, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.1273, + "reward": 2.032840847969055, + "reward_std": 0.881601870059967, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.034867677837610245, + "rewards/tag_count_reward": 0.7968750298023224, "step": 794 }, { "clip_ratio": 0.0, - "completion_length": 128.58333587646484, + "completion_length": 517.0000152587891, "epoch": 0.795, - "grad_norm": 24.98864576662068, - "kl": 2.890625, + "grad_norm": 30.664837556455563, + "kl": 5.78125, "learning_rate": 2.1038068889975259e-07, - "loss": 0.2467, - "reward": 2.9079995155334473, - "reward_std": 0.21143119782209396, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.001722982560750097, - "rewards/tag_count_reward": 1.0, + "loss": 1.0339, + "reward": 2.5728070735931396, + "reward_std": 0.7294317185878754, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.024415286257863045, + "rewards/tag_count_reward": 0.8541666865348816, "step": 795 }, { "clip_ratio": 0.0, - "completion_length": 151.8958396911621, + "completion_length": 454.56251525878906, "epoch": 0.796, - "grad_norm": 15.081799606350472, - "kl": 2.640625, + "grad_norm": 22.153680697625163, + "kl": 5.1015625, "learning_rate": 2.0935222495670968e-07, - "loss": 0.3069, - "reward": 2.9890072345733643, - "reward_std": 0.022960615810006857, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005784486711490899, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0399, + "reward": 2.4211684465408325, + "reward_std": 0.7883006036281586, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.01285938685759902, + "rewards/tag_count_reward": 0.8854166865348816, "step": 796 }, { "clip_ratio": 0.0, - "completion_length": 510.72918701171875, + "completion_length": 558.0000152587891, "epoch": 0.797, - "grad_norm": 31.638153140272287, - "kl": 9.40625, + "grad_norm": 44.01821977708549, + "kl": 5.890625, "learning_rate": 2.0832791169930363e-07, - "loss": 1.098, - "reward": 2.2927438020706177, - "reward_std": 0.29085440188646317, - "rewards/accuracy_reward": 0.4375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.030172881670296192, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 1.2008, + "reward": 2.6672359704971313, + "reward_std": 0.5852415859699249, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222983837128, + "rewards/repetition_penalty_reward": -0.02894441783428192, + "rewards/tag_count_reward": 0.8697916865348816, "step": 797 }, { "clip_ratio": 0.0, - "completion_length": 201.06250762939453, + "completion_length": 579.6458435058594, "epoch": 0.798, - "grad_norm": 27.628692921933613, - "kl": 1.95703125, + "grad_norm": 23.974305974531344, + "kl": 5.125, "learning_rate": 2.0730776160846853e-07, - "loss": 0.2975, - "reward": 2.776444673538208, - "reward_std": 0.11451778374612331, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015222116839140654, - "rewards/tag_count_reward": 1.0, + "loss": 0.9124, + "reward": 2.1921111345291138, + "reward_std": 0.7356277704238892, + "rewards/accuracy_reward": 0.458333358168602, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.024902895092964172, + "rewards/tag_count_reward": 0.8281250298023224, "step": 798 }, { "clip_ratio": 0.0, - "completion_length": 302.37500762939453, + "completion_length": 496.81251525878906, "epoch": 0.799, - "grad_norm": 25.476498471471352, - "kl": 8.0234375, + "grad_norm": 20.99627755388401, + "kl": 4.53125, "learning_rate": 2.0629178711441115e-07, - "loss": 0.969, - "reward": 2.819660782814026, - "reward_std": 0.26262570545077324, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013672715052962303, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.7604, + "reward": 2.4410648345947266, + "reward_std": 0.5636108368635178, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.010324170347303152, + "rewards/tag_count_reward": 0.8541666865348816, "step": 799 }, { "clip_ratio": 0.0, - "completion_length": 272.3125, + "completion_length": 608.5208740234375, "epoch": 0.8, - "grad_norm": 29.10986543537435, - "kl": 6.1875, + "grad_norm": 26.72579720606878, + "kl": 5.25, "learning_rate": 2.0528000059645995e-07, - "loss": 1.6256, - "reward": 2.741499662399292, - "reward_std": 0.4798247665166855, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.010236701928079128, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.7956, + "reward": 2.119356393814087, + "reward_std": 0.5921457409858704, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03689361736178398, + "rewards/tag_count_reward": 0.8229166865348816, "step": 800 }, { "clip_ratio": 0.0, - "completion_length": 268.1041717529297, + "completion_length": 312.0, "epoch": 0.801, - "grad_norm": 19.979377164238073, - "kl": 5.4609375, + "grad_norm": 13.404486874760053, + "kl": 1.23046875, "learning_rate": 2.042724143829146e-07, - "loss": 1.1233, - "reward": 2.526451587677002, - "reward_std": 0.3320402354001999, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008270852966234088, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.077, + "reward": 2.3504269123077393, + "reward_std": 0.36202409863471985, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.014156644232571125, + "rewards/tag_count_reward": 0.9687500298023224, "step": 801 }, { "clip_ratio": 0.0, - "completion_length": 184.5, + "completion_length": 434.4583435058594, "epoch": 0.802, - "grad_norm": 20.176874273289776, - "kl": 3.5390625, + "grad_norm": 19.113136797766565, + "kl": 2.7421875, "learning_rate": 2.032690407508949e-07, - "loss": 0.4736, - "reward": 2.4679800868034363, - "reward_std": 0.06469320203177631, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0042423170525580645, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5925, + "reward": 2.4155834913253784, + "reward_std": 0.6261934638023376, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.020180439576506615, + "rewards/tag_count_reward": 0.921875, "step": 802 }, { "clip_ratio": 0.0, - "completion_length": 182.6666717529297, + "completion_length": 501.4583435058594, "epoch": 0.803, - "grad_norm": 14.182107375770432, - "kl": 2.09765625, + "grad_norm": 25.89659290548137, + "kl": 3.203125, "learning_rate": 2.0226989192619204e-07, - "loss": 0.5176, - "reward": 2.960512161254883, - "reward_std": 0.0716961994767189, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.008237778907641768, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.4875, + "reward": 2.301639437675476, + "reward_std": 0.555485874414444, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03690226562321186, + "rewards/tag_count_reward": 0.8802083730697632, "step": 803 }, { "clip_ratio": 0.0, - "completion_length": 252.9791717529297, + "completion_length": 438.1250305175781, "epoch": 0.804, - "grad_norm": 27.87760350444284, - "kl": 4.5859375, + "grad_norm": 21.314874667338287, + "kl": 2.4453125, "learning_rate": 2.0127498008311922e-07, - "loss": 0.7529, - "reward": 2.5297632217407227, - "reward_std": 0.39311040937900543, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011903538135811687, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.3104, + "reward": 2.334763288497925, + "reward_std": 0.3796348571777344, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.02287570107728243, + "rewards/tag_count_reward": 0.8854166865348816, "step": 804 }, { "clip_ratio": 0.0, - "completion_length": 222.58333587646484, + "completion_length": 421.6666717529297, "epoch": 0.805, - "grad_norm": 24.54643556307578, - "kl": 5.59375, + "grad_norm": 14.366039627616365, + "kl": 1.8720703125, "learning_rate": 2.0028431734436308e-07, - "loss": 0.9578, - "reward": 2.44782555103302, - "reward_std": 0.42135491967201233, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.008771708235144615, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.3854, + "reward": 2.546318769454956, + "reward_std": 0.3817850574851036, + "rewards/accuracy_reward": 0.645833358168602, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.026598164811730385, + "rewards/tag_count_reward": 0.9479166865348816, "step": 805 }, { "clip_ratio": 0.0, - "completion_length": 223.9791717529297, + "completion_length": 418.625, "epoch": 0.806, - "grad_norm": 13.970202190286107, - "kl": 2.7265625, + "grad_norm": 21.19577118539385, + "kl": 3.046875, "learning_rate": 1.9929791578083655e-07, - "loss": 0.4888, - "reward": 2.614449381828308, - "reward_std": 0.20439724251627922, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.012286719866096973, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.5684, + "reward": 2.6353834867477417, + "reward_std": 0.5922529548406601, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02260266337543726, + "rewards/tag_count_reward": 0.9218750298023224, "step": 806 }, { "clip_ratio": 0.0, - "completion_length": 119.50000381469727, + "completion_length": 527.6666870117188, "epoch": 0.807, - "grad_norm": 21.32991115406617, - "kl": 0.927734375, + "grad_norm": 26.76069518759677, + "kl": 5.15625, "learning_rate": 1.9831578741153155e-07, - "loss": 0.102, - "reward": 2.9977136850357056, - "reward_std": 0.005249683745205402, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.002286378585267812, - "rewards/tag_count_reward": 1.0, + "loss": 0.7895, + "reward": 2.3032405376434326, + "reward_std": 0.808287501335144, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9305556416511536, + "rewards/repetition_penalty_reward": -0.028356720693409443, + "rewards/tag_count_reward": 0.8593750298023224, "step": 807 }, { "clip_ratio": 0.0, - "completion_length": 315.0208435058594, + "completion_length": 381.8125, "epoch": 0.808, - "grad_norm": 22.143550611904363, - "kl": 5.3203125, + "grad_norm": 20.376838690762124, + "kl": 1.927734375, "learning_rate": 1.9733794420337213e-07, - "loss": 1.1402, - "reward": 2.7978882789611816, - "reward_std": 0.42497333884239197, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.02155654039233923, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.268, + "reward": 2.428946018218994, + "reward_std": 0.5701454132795334, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.027651351876556873, + "rewards/tag_count_reward": 0.9635416865348816, "step": 808 }, { "clip_ratio": 0.0, - "completion_length": 178.52084350585938, + "completion_length": 445.62501525878906, "epoch": 0.809, - "grad_norm": 31.584435156396214, - "kl": 3.53125, + "grad_norm": 20.816082501770886, + "kl": 5.140625, "learning_rate": 1.9636439807106912e-07, - "loss": 0.8084, - "reward": 2.7189764976501465, - "reward_std": 0.1829577311873436, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.008454266237094998, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8017, + "reward": 2.5268882513046265, + "reward_std": 0.7573049068450928, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.023458978161215782, + "rewards/tag_count_reward": 0.8489583730697632, "step": 809 }, { "clip_ratio": 0.0, - "completion_length": 185.875, + "completion_length": 391.81251525878906, "epoch": 0.81, - "grad_norm": 33.557908206953115, - "kl": 2.0703125, + "grad_norm": 22.694875364982547, + "kl": 2.3359375, "learning_rate": 1.9539516087697517e-07, - "loss": 0.4514, - "reward": 2.9699642658233643, - "reward_std": 0.07631884841248393, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009202503133565187, - "rewards/tag_count_reward": 1.0, + "loss": 0.5417, + "reward": 2.658096432685852, + "reward_std": 0.6354215741157532, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.0172510021366179, + "rewards/tag_count_reward": 0.953125, "step": 810 }, { "clip_ratio": 0.0, - "completion_length": 162.20833587646484, + "completion_length": 503.35418701171875, "epoch": 0.811, - "grad_norm": 14.99261027516307, - "kl": 2.44921875, + "grad_norm": 58.019855154883885, + "kl": 6.5625, "learning_rate": 1.944302444309393e-07, - "loss": 0.3815, - "reward": 2.889218807220459, - "reward_std": 0.3184753730893135, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006614618236199021, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.3334, + "reward": 2.501904606819153, + "reward_std": 0.7767859101295471, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.022400940768420696, + "rewards/tag_count_reward": 0.8645833730697632, "step": 811 }, { "clip_ratio": 0.0, - "completion_length": 131.93750762939453, + "completion_length": 516.8125, "epoch": 0.812, - "grad_norm": 17.91536017885896, - "kl": 0.775390625, + "grad_norm": 54.33176698076773, + "kl": 7.484375, "learning_rate": 1.934696604901642e-07, - "loss": 0.1524, - "reward": 2.994596838951111, - "reward_std": 0.008447684347629547, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005403144052252173, - "rewards/tag_count_reward": 1.0, + "loss": 0.8888, + "reward": 2.2534443140029907, + "reward_std": 0.7112808525562286, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.017389055341482162, + "rewards/tag_count_reward": 0.8333333730697632, "step": 812 }, { "clip_ratio": 0.0, - "completion_length": 152.33333587646484, + "completion_length": 467.00001525878906, "epoch": 0.813, - "grad_norm": 21.36824162303136, - "kl": 2.234375, + "grad_norm": 35.200107482672536, + "kl": 4.8515625, "learning_rate": 1.9251342075906179e-07, - "loss": 0.2129, - "reward": 2.843026876449585, - "reward_std": 0.20656383782625198, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011139992624521255, - "rewards/tag_count_reward": 1.0, + "loss": 0.9493, + "reward": 2.1571664810180664, + "reward_std": 0.6603484153747559, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9236112236976624, + "rewards/repetition_penalty_reward": -0.021653023548424244, + "rewards/tag_count_reward": 0.8802083432674408, "step": 813 }, { "clip_ratio": 0.0, - "completion_length": 269.64583587646484, + "completion_length": 469.1041717529297, "epoch": 0.814, - "grad_norm": 19.664060487902095, - "kl": 6.1875, + "grad_norm": 54.98157225337433, + "kl": 5.5, "learning_rate": 1.915615368891117e-07, - "loss": 1.0941, - "reward": 2.6215662956237793, - "reward_std": 0.2508995831012726, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008642320288345218, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.9186, + "reward": 2.1737260818481445, + "reward_std": 0.6402971148490906, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9166667461395264, + "rewards/repetition_penalty_reward": -0.013774096965789795, + "rewards/tag_count_reward": 0.8958333432674408, "step": 814 }, { "clip_ratio": 0.0, - "completion_length": 247.14584350585938, + "completion_length": 531.7291717529297, "epoch": 0.815, - "grad_norm": 38.90681323498606, - "kl": 5.609375, + "grad_norm": 44.84634369429172, + "kl": 6.203125, "learning_rate": 1.9061402047871833e-07, - "loss": 1.1451, - "reward": 2.9012926816940308, - "reward_std": 0.25670523568987846, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01884620380587876, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 1.2846, + "reward": 2.147634506225586, + "reward_std": 0.592269778251648, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.019032152369618416, + "rewards/tag_count_reward": 0.875, "step": 815 }, { "clip_ratio": 0.0, - "completion_length": 206.3125, + "completion_length": 393.7291717529297, "epoch": 0.816, - "grad_norm": 19.031133937790777, - "kl": 4.15625, + "grad_norm": 23.81197579521365, + "kl": 4.3671875, "learning_rate": 1.8967088307307e-07, - "loss": 1.0639, - "reward": 2.9277145862579346, - "reward_std": 0.21353978104889393, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009785510133951902, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4182, + "reward": 2.38164883852005, + "reward_std": 0.6038723587989807, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9375001192092896, + "rewards/repetition_penalty_reward": -0.019393039867281914, + "rewards/tag_count_reward": 0.9010416865348816, "step": 816 }, { "clip_ratio": 0.0, - "completion_length": 176.33333587646484, + "completion_length": 435.91668701171875, "epoch": 0.817, - "grad_norm": 23.420091630037813, - "kl": 2.9453125, + "grad_norm": 25.95495924946456, + "kl": 3.8046875, "learning_rate": 1.887321361639985e-07, - "loss": 0.3948, - "reward": 2.82608425617218, - "reward_std": 0.3052468076348305, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.010721505153924227, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.0102, + "reward": 2.579134702682495, + "reward_std": 0.5107276141643524, + "rewards/accuracy_reward": 0.7083333730697632, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.026768138632178307, + "rewards/tag_count_reward": 0.9322916865348816, "step": 817 }, { "clip_ratio": 0.0, - "completion_length": 205.45834350585938, + "completion_length": 375.68751525878906, "epoch": 0.818, - "grad_norm": 15.158650834511374, - "kl": 4.4609375, + "grad_norm": 12.048598436192929, + "kl": 1.546875, "learning_rate": 1.8779779118983867e-07, - "loss": 0.871, - "reward": 2.4508496522903442, - "reward_std": 0.1142427884042263, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.007483683060854673, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0993, + "reward": 2.702491521835327, + "reward_std": 0.4488513916730881, + "rewards/accuracy_reward": 0.7708333730697632, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.028411319479346275, + "rewards/tag_count_reward": 0.9739583432674408, "step": 818 }, { "clip_ratio": 0.0, - "completion_length": 126.72916793823242, + "completion_length": 601.5416870117188, "epoch": 0.819, - "grad_norm": 23.96962184371209, - "kl": 2.6796875, + "grad_norm": 36.93153228705177, + "kl": 4.96875, "learning_rate": 1.8686785953528922e-07, - "loss": 0.5561, - "reward": 2.7376667261123657, - "reward_std": 0.029658941086381674, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0019167900900356472, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.1533, + "reward": 2.5069422721862793, + "reward_std": 0.7914581596851349, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.02257181704044342, + "rewards/tag_count_reward": 0.8489583432674408, "step": 819 }, { "clip_ratio": 0.0, - "completion_length": 202.6875, + "completion_length": 414.00001525878906, "epoch": 0.82, - "grad_norm": 14.53483225868331, - "kl": 3.1640625, + "grad_norm": 36.676997752826466, + "kl": 2.37890625, "learning_rate": 1.8594235253127372e-07, - "loss": 0.5463, - "reward": 2.894922375679016, - "reward_std": 0.22689326852560043, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.01306393276900053, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8483, + "reward": 2.6289749145507812, + "reward_std": 0.5092899203300476, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.020330775994807482, + "rewards/tag_count_reward": 0.9479166865348816, "step": 820 }, { "clip_ratio": 0.0, - "completion_length": 220.7291717529297, + "completion_length": 424.6458435058594, "epoch": 0.821, - "grad_norm": 25.455621787871273, - "kl": 3.875, + "grad_norm": 25.786201228130313, + "kl": 1.57421875, "learning_rate": 1.850212814548031e-07, - "loss": 0.608, - "reward": 2.824221134185791, - "reward_std": 0.4408956617116928, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.014320507179945707, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.4719, + "reward": 2.536083459854126, + "reward_std": 0.5136556923389435, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.02815271820873022, + "rewards/tag_count_reward": 0.953125, "step": 821 }, { "clip_ratio": 0.0, - "completion_length": 141.2916717529297, + "completion_length": 327.3333435058594, "epoch": 0.822, - "grad_norm": 29.387848892527, - "kl": 1.8828125, + "grad_norm": 9.339930080359409, + "kl": 0.5400390625, "learning_rate": 1.8410465752883758e-07, - "loss": 0.158, - "reward": 2.59416127204895, - "reward_std": 0.365949884057045, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004797216039150953, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0109, + "reward": 2.5481141805648804, + "reward_std": 0.4344000518321991, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.026538672856986523, + "rewards/tag_count_reward": 0.984375, "step": 822 }, { "clip_ratio": 0.0, - "completion_length": 203.8958396911621, + "completion_length": 425.04168701171875, "epoch": 0.823, - "grad_norm": 23.97471757267884, - "kl": 4.498046875, + "grad_norm": 17.806285658594135, + "kl": 2.37109375, "learning_rate": 1.8319249192215055e-07, - "loss": 0.7478, - "reward": 2.911274790763855, - "reward_std": 0.23020393354818225, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01580862980335951, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6621, + "reward": 2.3081016540527344, + "reward_std": 0.5683871209621429, + "rewards/accuracy_reward": 0.4375000298023224, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.02175955567508936, + "rewards/tag_count_reward": 0.9479166865348816, "step": 823 }, { "clip_ratio": 0.0, - "completion_length": 215.0, + "completion_length": 488.16668701171875, "epoch": 0.824, - "grad_norm": 33.70496723679782, - "kl": 3.21875, + "grad_norm": 30.757926718689315, + "kl": 4.0625, "learning_rate": 1.822847957491922e-07, - "loss": 1.2428, - "reward": 2.9269388914108276, - "reward_std": 0.24015385657548904, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00535275531001389, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.0787, + "reward": 2.3749157190322876, + "reward_std": 0.5852408409118652, + "rewards/accuracy_reward": 0.5416666716337204, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.01918159332126379, + "rewards/tag_count_reward": 0.9010416865348816, "step": 824 }, { "clip_ratio": 0.0, - "completion_length": 135.89583587646484, + "completion_length": 474.3125305175781, "epoch": 0.825, - "grad_norm": 26.680007453940927, - "kl": 4.484375, + "grad_norm": 22.249676223818117, + "kl": 3.10546875, "learning_rate": 1.8138158006995363e-07, - "loss": 0.5156, - "reward": 2.9656097888946533, - "reward_std": 0.07952729985117912, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0031402936729136854, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.568, + "reward": 2.3771095275878906, + "reward_std": 0.5015990436077118, + "rewards/accuracy_reward": 0.479166679084301, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.015251755714416504, + "rewards/tag_count_reward": 0.9270833730697632, "step": 825 }, { "clip_ratio": 0.0, - "completion_length": 163.06250762939453, + "completion_length": 347.37501525878906, "epoch": 0.826, - "grad_norm": 19.47720457533648, - "kl": 2.13671875, + "grad_norm": 14.195193659523204, + "kl": 2.14453125, "learning_rate": 1.804828558898332e-07, - "loss": 0.3726, - "reward": 2.9385839700698853, - "reward_std": 0.1863657347857952, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.007596701383590698, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1007, + "reward": 2.5919097661972046, + "reward_std": 0.468966469168663, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.010520970448851585, + "rewards/tag_count_reward": 0.9635416865348816, "step": 826 }, { "clip_ratio": 0.0, - "completion_length": 154.58334350585938, + "completion_length": 450.2083435058594, "epoch": 0.827, - "grad_norm": 11.014695118224315, - "kl": 1.75, + "grad_norm": 32.764312557604384, + "kl": 4.703125, "learning_rate": 1.7958863415950112e-07, - "loss": 0.1492, - "reward": 2.7207624912261963, - "reward_std": 0.05280167330056429, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011876523029059172, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8324, + "reward": 2.222867786884308, + "reward_std": 0.6226075887680054, + "rewards/accuracy_reward": 0.3958333358168602, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.02192395133897662, + "rewards/tag_count_reward": 0.9114583730697632, "step": 827 }, { "clip_ratio": 0.0, - "completion_length": 140.7916717529297, + "completion_length": 537.3958587646484, "epoch": 0.828, - "grad_norm": 21.8008904399678, - "kl": 1.0703125, + "grad_norm": 102.68181493594267, + "kl": 7.6669921875, "learning_rate": 1.7869892577476722e-07, - "loss": 0.1613, - "reward": 2.9569069147109985, - "reward_std": 0.07939230743795633, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017051690258085728, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8764, + "reward": 2.513993501663208, + "reward_std": 0.5042714290320873, + "rewards/accuracy_reward": 0.7291666716337204, + "rewards/reasoning_steps_reward": 0.930555522441864, + "rewards/repetition_penalty_reward": -0.025937245693057775, + "rewards/tag_count_reward": 0.8802083432674408, "step": 828 }, { "clip_ratio": 0.0, - "completion_length": 137.02084350585938, + "completion_length": 428.5833435058594, "epoch": 0.829, - "grad_norm": 21.694291133908372, - "kl": 2.3671875, + "grad_norm": 25.253738055443158, + "kl": 4.703125, "learning_rate": 1.7781374157644713e-07, - "loss": 0.3124, - "reward": 2.9050207138061523, - "reward_std": 0.21275982819497585, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.004701685626059771, - "rewards/tag_count_reward": 1.0, + "loss": 0.8549, + "reward": 2.5480579137802124, + "reward_std": 0.7514591813087463, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.01444217236712575, + "rewards/tag_count_reward": 0.8958333432674408, "step": 829 }, { "clip_ratio": 0.0, - "completion_length": 191.5416717529297, + "completion_length": 319.7708435058594, "epoch": 0.83, - "grad_norm": 13.781218217476162, - "kl": 2.4609375, + "grad_norm": 43.578316518329956, + "kl": 3.408203125, "learning_rate": 1.7693309235023127e-07, - "loss": 0.6399, - "reward": 2.9440847635269165, - "reward_std": 0.1809553827624768, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.005568042630329728, - "rewards/tag_count_reward": 0.984375, + "loss": 0.2552, + "reward": 2.751266121864319, + "reward_std": 0.45656658709049225, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.014358973130583763, + "rewards/tag_count_reward": 0.9739583432674408, "step": 830 }, { "clip_ratio": 0.0, - "completion_length": 182.27084350585938, + "completion_length": 485.10418701171875, "epoch": 0.831, - "grad_norm": 105.5362423191159, - "kl": 5.046875, + "grad_norm": 27.193058696951205, + "kl": 5.515625, "learning_rate": 1.7605698882655233e-07, - "loss": 0.9119, - "reward": 2.6831743717193604, - "reward_std": 0.15842505544424057, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009534040233120322, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8536, + "reward": 2.412537455558777, + "reward_std": 0.8152902722358704, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.04058762267231941, + "rewards/tag_count_reward": 0.8906250298023224, "step": 831 }, { "clip_ratio": 0.0, - "completion_length": 271.5625, + "completion_length": 331.0208435058594, "epoch": 0.832, - "grad_norm": 21.187796762980987, - "kl": 6.2734375, + "grad_norm": 19.688526945972043, + "kl": 2.2265625, "learning_rate": 1.7518544168045524e-07, - "loss": 1.7523, - "reward": 2.7503111362457275, - "reward_std": 0.5767822861671448, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.004897281061857939, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.3667, + "reward": 2.815763831138611, + "reward_std": 0.3860451430082321, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.9791667461395264, + "rewards/repetition_penalty_reward": -0.012361295986920595, + "rewards/tag_count_reward": 0.9739583432674408, "step": 832 }, { "clip_ratio": 0.0, - "completion_length": 177.5, + "completion_length": 346.56251525878906, "epoch": 0.833, - "grad_norm": 21.585911317510547, - "kl": 2.484375, + "grad_norm": 22.298341306372343, + "kl": 2.765625, "learning_rate": 1.743184615314671e-07, - "loss": 0.3132, - "reward": 2.901798725128174, - "reward_std": 0.20072048902511597, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.018340162001550198, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.3607, + "reward": 2.7588515281677246, + "reward_std": 0.545660674571991, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02066253498196602, + "rewards/tag_count_reward": 0.953125, "step": 833 }, { "clip_ratio": 0.0, - "completion_length": 127.16667175292969, + "completion_length": 359.31251525878906, "epoch": 0.834, - "grad_norm": 13.92565607457608, - "kl": 2.7265625, + "grad_norm": 28.797542999117116, + "kl": 3.25390625, "learning_rate": 1.7345605894346726e-07, - "loss": 0.2961, - "reward": 2.987924337387085, - "reward_std": 0.02974029944743961, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005131273763254285, - "rewards/tag_count_reward": 1.0, + "loss": 0.5221, + "reward": 2.7526845932006836, + "reward_std": 0.674776554107666, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.01641272520646453, + "rewards/tag_count_reward": 0.9218750298023224, "step": 834 }, { "clip_ratio": 0.0, - "completion_length": 187.9791717529297, + "completion_length": 430.62501525878906, "epoch": 0.835, - "grad_norm": 35.53949725325245, - "kl": 3.5625, + "grad_norm": 43.496982955042476, + "kl": 3.0859375, "learning_rate": 1.7259824442455923e-07, - "loss": 0.7295, - "reward": 2.890645146369934, - "reward_std": 0.31412097811698914, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.00866050599142909, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6982, + "reward": 2.299278497695923, + "reward_std": 0.6011901348829269, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/reasoning_steps_reward": 0.9027777910232544, + "rewards/repetition_penalty_reward": -0.00974938040599227, + "rewards/tag_count_reward": 0.9062500298023224, "step": 835 }, { "clip_ratio": 0.0, - "completion_length": 144.77084350585938, + "completion_length": 519.9375152587891, "epoch": 0.836, - "grad_norm": 25.5818679698, - "kl": 4.1328125, + "grad_norm": 23.29360992129975, + "kl": 3.484375, "learning_rate": 1.7174502842694212e-07, - "loss": 0.2721, - "reward": 2.841456651687622, - "reward_std": 0.27661269158124924, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.00229342607781291, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.0119, + "reward": 2.5549784898757935, + "reward_std": 0.7014127969741821, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.021410479210317135, + "rewards/tag_count_reward": 0.8958333730697632, "step": 836 }, { "clip_ratio": 0.0, - "completion_length": 293.0833435058594, + "completion_length": 468.81251525878906, "epoch": 0.837, - "grad_norm": 14.49023544091302, - "kl": 4.375, + "grad_norm": 21.45595694285051, + "kl": 4.984375, "learning_rate": 1.7089642134678364e-07, - "loss": 0.9197, - "reward": 2.5983880758285522, - "reward_std": 0.27600688487291336, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.02487595658749342, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.8767, + "reward": 2.2903480529785156, + "reward_std": 0.6241036355495453, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.018679913599044085, + "rewards/tag_count_reward": 0.90625, "step": 837 }, { "clip_ratio": 0.0, - "completion_length": 283.5833435058594, + "completion_length": 736.0833435058594, "epoch": 0.838, - "grad_norm": 16.48418648820746, - "kl": 7.171875, + "grad_norm": 62.125475017964185, + "kl": 9.53125, "learning_rate": 1.7005243352409333e-07, - "loss": 1.0486, - "reward": 2.6211971044540405, - "reward_std": 0.38547276705503464, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.019427934661507607, - "rewards/tag_count_reward": 0.953125, + "loss": 1.49, + "reward": 2.3610810041427612, + "reward_std": 0.8324085474014282, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.020863616839051247, + "rewards/tag_count_reward": 0.7708333432674408, "step": 838 }, { "clip_ratio": 0.0, - "completion_length": 174.50000762939453, + "completion_length": 711.6041870117188, "epoch": 0.839, - "grad_norm": 71.54279627624388, - "kl": 5.09375, + "grad_norm": 51.57175499154389, + "kl": 10.3125, "learning_rate": 1.6921307524259625e-07, - "loss": 0.5844, - "reward": 2.4203994274139404, - "reward_std": 0.17448097094893456, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006684020394459367, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 1.4895, + "reward": 2.234101891517639, + "reward_std": 0.9274267852306366, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.02457870915532112, + "rewards/tag_count_reward": 0.7239583730697632, "step": 839 }, { "clip_ratio": 0.0, - "completion_length": 165.0625, + "completion_length": 431.00001525878906, "epoch": 0.84, - "grad_norm": 42.528276606227685, - "kl": 5.109375, + "grad_norm": 33.432954925810336, + "kl": 2.5546875, "learning_rate": 1.6837835672960831e-07, - "loss": 0.9171, - "reward": 2.7064452171325684, - "reward_std": 0.25300221145153046, - "rewards/accuracy_reward": 0.7500000298023224, + "loss": 0.8481, + "reward": 2.6463335752487183, + "reward_std": 0.5603546500205994, + "rewards/accuracy_reward": 0.7291666865348816, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.001888129161670804, - "rewards/tag_count_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.015124747063964605, + "rewards/tag_count_reward": 0.9322916865348816, "step": 840 }, { "clip_ratio": 0.0, - "completion_length": 209.77083587646484, + "completion_length": 512.1666717529297, "epoch": 0.841, - "grad_norm": 46.80904621643401, - "kl": 4.8046875, + "grad_norm": 66.56725483975436, + "kl": 5.359375, "learning_rate": 1.6754828815591131e-07, - "loss": 0.3505, - "reward": 2.6658207178115845, - "reward_std": 0.3700069189071655, - "rewards/accuracy_reward": 0.7291666865348816, + "loss": 1.1354, + "reward": 2.3149657249450684, + "reward_std": 0.6210701763629913, + "rewards/accuracy_reward": 0.479166679084301, "rewards/reasoning_steps_reward": 0.9722222685813904, - "rewards/repetition_penalty_reward": -0.014735077507793903, - "rewards/tag_count_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01663155946880579, + "rewards/tag_count_reward": 0.8802083730697632, "step": 841 }, { "clip_ratio": 0.0, - "completion_length": 247.58333587646484, + "completion_length": 407.85418701171875, "epoch": 0.842, - "grad_norm": 19.14288749189081, - "kl": 5.890625, + "grad_norm": 12.29872685471777, + "kl": 3.1630859375, "learning_rate": 1.6672287963562852e-07, - "loss": 1.9361, - "reward": 2.8263509273529053, - "reward_std": 0.4268178790807724, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.979166716337204, - "rewards/repetition_penalty_reward": -0.006982591236010194, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.4297, + "reward": 2.510159969329834, + "reward_std": 0.602110430598259, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01761801028624177, + "rewards/tag_count_reward": 0.9375, "step": 842 }, { "clip_ratio": 0.0, - "completion_length": 161.7291717529297, + "completion_length": 541.5, "epoch": 0.843, - "grad_norm": 16.034275324076322, - "kl": 4.4765625, + "grad_norm": 27.148373245511962, + "kl": 5.609375, "learning_rate": 1.659021412261026e-07, - "loss": 0.8985, - "reward": 2.6036722660064697, - "reward_std": 0.30704423785209656, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.009175132494419813, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.9403, + "reward": 2.315541386604309, + "reward_std": 0.7348792850971222, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.024736556224524975, + "rewards/tag_count_reward": 0.8750000298023224, "step": 843 }, { "clip_ratio": 0.0, - "completion_length": 225.14584350585938, + "completion_length": 594.3750152587891, "epoch": 0.844, - "grad_norm": 12.532828330781854, - "kl": 2.859375, + "grad_norm": 37.507439921203925, + "kl": 6.9140625, "learning_rate": 1.6508608292777203e-07, - "loss": 0.2647, - "reward": 2.612767815589905, - "reward_std": 0.3318920433521271, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.019176696427166462, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4498, + "reward": 2.3864521980285645, + "reward_std": 0.3542592525482178, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.01806183159351349, + "rewards/tag_count_reward": 0.8489583432674408, "step": 844 }, { "clip_ratio": 0.0, - "completion_length": 166.93750762939453, + "completion_length": 574.5416870117188, "epoch": 0.845, - "grad_norm": 16.00425115206657, - "kl": 2.671875, + "grad_norm": 42.479348438013346, + "kl": 4.875, "learning_rate": 1.6427471468404952e-07, - "loss": 0.7556, - "reward": 2.950788140296936, - "reward_std": 0.16109416633844376, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0058092172257602215, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2607, + "reward": 2.191675543785095, + "reward_std": 0.5457549095153809, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.018394021317362785, + "rewards/tag_count_reward": 0.8697916865348816, "step": 845 }, { "clip_ratio": 0.0, - "completion_length": 231.00000762939453, + "completion_length": 447.5, "epoch": 0.846, - "grad_norm": 27.756833086334918, - "kl": 3.03125, + "grad_norm": 32.21272755825097, + "kl": 2.6328125, "learning_rate": 1.6346804638120098e-07, - "loss": 0.4202, - "reward": 2.8793323040008545, - "reward_std": 0.2521085739135742, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.019973283633589745, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6288, + "reward": 2.359144449234009, + "reward_std": 0.7404135763645172, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9236111342906952, + "rewards/repetition_penalty_reward": -0.017591833136975765, + "rewards/tag_count_reward": 0.9114583730697632, "step": 846 }, { "clip_ratio": 0.0, - "completion_length": 129.62500762939453, + "completion_length": 384.18751525878906, "epoch": 0.847, - "grad_norm": 19.48338649800903, - "kl": 3.5625, + "grad_norm": 14.416271559769205, + "kl": 2.5927734375, "learning_rate": 1.6266608784822542e-07, - "loss": 0.5836, - "reward": 2.951775550842285, - "reward_std": 0.10108292568475008, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0048216290306299925, - "rewards/tag_count_reward": 0.984375, + "loss": 0.393, + "reward": 2.211021304130554, + "reward_std": 0.4275623857975006, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.014673332683742046, + "rewards/tag_count_reward": 0.90625, "step": 847 }, { "clip_ratio": 0.0, - "completion_length": 237.83333587646484, + "completion_length": 436.5625, "epoch": 0.848, - "grad_norm": 10.207396414525643, - "kl": 3.6796875, + "grad_norm": 14.205493656115495, + "kl": 3.384765625, "learning_rate": 1.6186884885673413e-07, - "loss": 0.5974, - "reward": 2.8975290060043335, - "reward_std": 0.20708778128027916, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.017401515040546656, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.5267, + "reward": 2.2585190534591675, + "reward_std": 0.7064023613929749, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.03314755018800497, + "rewards/tag_count_reward": 0.9166666865348816, "step": 848 }, { "clip_ratio": 0.0, - "completion_length": 121.43750762939453, + "completion_length": 496.375, "epoch": 0.849, - "grad_norm": 23.81064998912473, - "kl": 1.86328125, + "grad_norm": 47.615315210946434, + "kl": 3.21484375, "learning_rate": 1.610763391208329e-07, - "loss": 0.354, - "reward": 2.95501971244812, - "reward_std": 0.14760111132636666, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.003313728258945048, - "rewards/tag_count_reward": 1.0, + "loss": 0.7123, + "reward": 2.543179988861084, + "reward_std": 0.6331344544887543, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01584793906658888, + "rewards/tag_count_reward": 0.8854166865348816, "step": 849 }, { "clip_ratio": 0.0, - "completion_length": 258.93750762939453, + "completion_length": 330.125, "epoch": 0.85, - "grad_norm": 11.830420077282275, - "kl": 5.046875, + "grad_norm": 22.926928355727185, + "kl": 1.275390625, "learning_rate": 1.6028856829700258e-07, - "loss": 0.9731, - "reward": 2.5669829845428467, - "reward_std": 0.3018846660852432, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.01808652514591813, - "rewards/tag_count_reward": 0.953125, + "loss": 0.2793, + "reward": 2.8741886615753174, + "reward_std": 0.27669021487236023, + "rewards/accuracy_reward": 0.9166666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.011228146031498909, + "rewards/tag_count_reward": 0.9687500298023224, "step": 850 }, { "clip_ratio": 0.0, - "completion_length": 193.3125, + "completion_length": 352.6875, "epoch": 0.851, - "grad_norm": 17.378770858098157, - "kl": 4.5390625, + "grad_norm": 23.63650850429641, + "kl": 1.009765625, "learning_rate": 1.5950554598398228e-07, - "loss": 0.8706, - "reward": 2.6216492652893066, - "reward_std": 0.37398314476013184, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/reasoning_steps_reward": 0.9722222089767456, - "rewards/repetition_penalty_reward": -0.006823166157118976, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.2369, + "reward": 2.6674481630325317, + "reward_std": 0.3812839537858963, + "rewards/accuracy_reward": 0.7500000298023224, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.014843634329736233, + "rewards/tag_count_reward": 0.9739583432674408, "step": 851 }, { "clip_ratio": 0.0, - "completion_length": 178.8125, + "completion_length": 504.22918701171875, "epoch": 0.852, - "grad_norm": 20.93493604978067, - "kl": 4.1640625, + "grad_norm": 26.46847605504948, + "kl": 6.5625, "learning_rate": 1.5872728172265146e-07, - "loss": 0.5516, - "reward": 2.8940550088882446, - "reward_std": 0.23269455134868622, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006986758206039667, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.9065, + "reward": 2.415515661239624, + "reward_std": 0.7073606848716736, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.01330387475900352, + "rewards/tag_count_reward": 0.8593750298023224, "step": 852 }, { "clip_ratio": 0.0, - "completion_length": 185.8541717529297, + "completion_length": 561.5416870117188, "epoch": 0.853, - "grad_norm": 12.626834964043598, - "kl": 2.984375, + "grad_norm": 25.999454906190575, + "kl": 4.828125, "learning_rate": 1.579537849959148e-07, - "loss": 0.5635, - "reward": 2.69532310962677, - "reward_std": 0.15234212949872017, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01648262469097972, - "rewards/tag_count_reward": 0.96875, + "loss": 1.113, + "reward": 2.4573510885238647, + "reward_std": 0.7351089715957642, + "rewards/accuracy_reward": 0.6458333730697632, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.021815632469952106, + "rewards/tag_count_reward": 0.8541666865348816, "step": 853 }, { "clip_ratio": 0.0, - "completion_length": 220.25000762939453, + "completion_length": 511.2708435058594, "epoch": 0.854, - "grad_norm": 31.197803106272517, - "kl": 6.125, + "grad_norm": 32.177866129176245, + "kl": 5.859375, "learning_rate": 1.5718506522858572e-07, - "loss": 1.227, - "reward": 2.6356923580169678, - "reward_std": 0.38519713282585144, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.013613236136734486, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.9686, + "reward": 2.1545926332473755, + "reward_std": 0.6024680733680725, + "rewards/accuracy_reward": 0.3541666865348816, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.031171313486993313, + "rewards/tag_count_reward": 0.8802083730697632, "step": 854 }, { "clip_ratio": 0.0, - "completion_length": 125.12500381469727, + "completion_length": 464.5416717529297, "epoch": 0.855, - "grad_norm": 37.76658760815233, - "kl": 2.01171875, + "grad_norm": 35.98943778638736, + "kl": 2.75, "learning_rate": 1.5642113178727193e-07, - "loss": 0.0259, - "reward": 2.768761992454529, - "reward_std": 0.07164399686735123, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0020713811682071537, - "rewards/tag_count_reward": 1.0, + "loss": 0.8586, + "reward": 2.234099566936493, + "reward_std": 0.4664178192615509, + "rewards/accuracy_reward": 0.3541666679084301, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.028053374961018562, + "rewards/tag_count_reward": 0.9427083730697632, "step": 855 }, { "clip_ratio": 0.0, - "completion_length": 152.02083587646484, + "completion_length": 539.6250305175781, "epoch": 0.856, - "grad_norm": 22.162918408387117, - "kl": 2.875, + "grad_norm": 25.61587236309964, + "kl": 4.1640625, "learning_rate": 1.5566199398026147e-07, - "loss": 0.4462, - "reward": 2.750713348388672, - "reward_std": 0.07319960929453373, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007967321667820215, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0126, + "reward": 2.0876386165618896, + "reward_std": 0.7813932299613953, + "rewards/accuracy_reward": 0.3333333358168602, + "rewards/reasoning_steps_reward": 0.916666716337204, + "rewards/repetition_penalty_reward": -0.02173637691885233, + "rewards/tag_count_reward": 0.8593750298023224, "step": 856 }, { "clip_ratio": 0.0, - "completion_length": 149.3958396911621, + "completion_length": 394.68751525878906, "epoch": 0.857, - "grad_norm": 29.28036686726787, - "kl": 2.83984375, + "grad_norm": 21.20771571565506, + "kl": 2.45703125, "learning_rate": 1.5490766105740876e-07, - "loss": 0.8962, - "reward": 2.681897282600403, - "reward_std": 0.18577580153942108, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.002130626031430438, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.5834, + "reward": 2.4374274015426636, + "reward_std": 0.4593869596719742, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.019169961102306843, + "rewards/tag_count_reward": 0.9427083730697632, "step": 857 }, { "clip_ratio": 0.0, - "completion_length": 158.8333396911621, + "completion_length": 338.50001525878906, "epoch": 0.858, - "grad_norm": 12.196089812699844, - "kl": 4.1875, + "grad_norm": 11.186901565270288, + "kl": 2.41796875, "learning_rate": 1.5415814221002265e-07, - "loss": 0.922, - "reward": 2.9287012815475464, - "reward_std": 0.1960319245699793, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0035903056850656867, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.1621, + "reward": 2.769268274307251, + "reward_std": 0.46035242080688477, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.024134621024131775, + "rewards/tag_count_reward": 0.953125, "step": 858 }, { "clip_ratio": 0.0, - "completion_length": 169.7291717529297, + "completion_length": 433.9166717529297, "epoch": 0.859, - "grad_norm": 26.940056408748244, - "kl": 4.984375, + "grad_norm": 26.347652823757198, + "kl": 2.93359375, "learning_rate": 1.5341344657075354e-07, - "loss": 0.7228, - "reward": 2.85272479057312, - "reward_std": 0.36393919587135315, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.004914292600005865, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.7346, + "reward": 2.543567419052124, + "reward_std": 0.6669409871101379, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.02066876133903861, + "rewards/tag_count_reward": 0.9114583730697632, "step": 859 }, { "clip_ratio": 0.0, - "completion_length": 189.9791717529297, + "completion_length": 308.7708435058594, "epoch": 0.86, - "grad_norm": 24.86736981230482, - "kl": 4.53125, + "grad_norm": 24.14563880775655, + "kl": 1.58203125, "learning_rate": 1.5267358321348285e-07, - "loss": 0.6506, - "reward": 2.829435706138611, - "reward_std": 0.3517237454652786, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.005633883876726031, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3053, + "reward": 2.8249399662017822, + "reward_std": 0.4262206554412842, + "rewards/accuracy_reward": 0.8750000298023224, + "rewards/reasoning_steps_reward": 0.9930555820465088, + "rewards/repetition_penalty_reward": -0.01186564564704895, + "rewards/tag_count_reward": 0.96875, "step": 860 }, { "clip_ratio": 0.0, - "completion_length": 152.75000762939453, + "completion_length": 364.9583435058594, "epoch": 0.861, - "grad_norm": 39.886648633012136, - "kl": 2.96484375, + "grad_norm": 30.59682645079428, + "kl": 2.7890625, "learning_rate": 1.5193856115321224e-07, - "loss": 0.8118, - "reward": 2.919276475906372, - "reward_std": 0.22871966660022736, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011279066558927298, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4332, + "reward": 2.4405341148376465, + "reward_std": 0.4027775228023529, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.01779924053698778, + "rewards/tag_count_reward": 0.9375000298023224, "step": 861 }, { "clip_ratio": 0.0, - "completion_length": 122.52083969116211, + "completion_length": 432.06251525878906, "epoch": 0.862, - "grad_norm": 25.070662225153228, - "kl": 1.4375, + "grad_norm": 36.09673783797281, + "kl": 6.0625, "learning_rate": 1.5120838934595337e-07, - "loss": 0.2506, - "reward": 2.9113051891326904, - "reward_std": 0.12661150796338916, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005361593677662313, - "rewards/tag_count_reward": 1.0, + "loss": 0.8199, + "reward": 2.335049033164978, + "reward_std": 0.7307652831077576, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.01738166157156229, + "rewards/tag_count_reward": 0.8802083432674408, "step": 862 }, { "clip_ratio": 0.0, - "completion_length": 161.33333587646484, + "completion_length": 502.0208435058594, "epoch": 0.863, - "grad_norm": 16.885203108598695, - "kl": 2.6328125, + "grad_norm": 53.67270636598954, + "kl": 6.25, "learning_rate": 1.5048307668861947e-07, - "loss": 0.5401, - "reward": 2.9832613468170166, - "reward_std": 0.034483253955841064, - "rewards/accuracy_reward": 1.0, + "loss": 1.0301, + "reward": 2.3041592836380005, + "reward_std": 0.6167122721672058, + "rewards/accuracy_reward": 0.4375000223517418, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009794335346668959, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.017021275125443935, + "rewards/tag_count_reward": 0.8906250298023224, "step": 863 }, { "clip_ratio": 0.0, - "completion_length": 106.89583587646484, + "completion_length": 502.625, "epoch": 0.864, - "grad_norm": 28.094851879784528, - "kl": 1.30078125, + "grad_norm": 40.13186559524844, + "kl": 6.4140625, "learning_rate": 1.4976263201891613e-07, - "loss": 0.26, - "reward": 2.74881649017334, - "reward_std": 0.0035065680276602507, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0011835595942102373, - "rewards/tag_count_reward": 1.0, + "loss": 1.1658, + "reward": 2.634553909301758, + "reward_std": 0.6430298089981079, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9444445073604584, + "rewards/repetition_penalty_reward": -0.01822400465607643, + "rewards/tag_count_reward": 0.8958333432674408, "step": 864 }, { "clip_ratio": 0.0, - "completion_length": 171.20833587646484, + "completion_length": 602.0625152587891, "epoch": 0.865, - "grad_norm": 14.7101414593934, - "kl": 3.87890625, + "grad_norm": 67.05207348072643, + "kl": 9.6796875, "learning_rate": 1.4904706411523448e-07, - "loss": 0.923, - "reward": 2.7409067153930664, - "reward_std": 0.24218794656917453, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.00735730049200356, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2048, + "reward": 2.128021478652954, + "reward_std": 0.7044219970703125, + "rewards/accuracy_reward": 0.416666679084301, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.0178119083866477, + "rewards/tag_count_reward": 0.7916666865348816, "step": 865 }, { "clip_ratio": 0.0, - "completion_length": 100.04166793823242, + "completion_length": 473.0, "epoch": 0.866, - "grad_norm": 28.651268250207252, - "kl": 0.826171875, + "grad_norm": 17.39107169503413, + "kl": 5.40625, "learning_rate": 1.483363816965435e-07, - "loss": -0.0194, - "reward": 2.8398990631103516, - "reward_std": 0.2355441451072693, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.0003788656322285533, - "rewards/tag_count_reward": 1.0, + "loss": 1.0166, + "reward": 2.366263747215271, + "reward_std": 0.7000749707221985, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.024361333809792995, + "rewards/tag_count_reward": 0.8697916865348816, "step": 866 }, { "clip_ratio": 0.0, - "completion_length": 177.4166717529297, + "completion_length": 589.4166870117188, "epoch": 0.867, - "grad_norm": 25.334666510505855, - "kl": 3.86328125, + "grad_norm": 52.2532552532266, + "kl": 8.546875, "learning_rate": 1.4763059342228434e-07, - "loss": 0.6724, - "reward": 2.902491569519043, - "reward_std": 0.2566691040992737, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.015911318361759186, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.0051, + "reward": 2.1066824197769165, + "reward_std": 0.6221250742673874, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.009637102019041777, + "rewards/tag_count_reward": 0.7760416865348816, "step": 867 }, { "clip_ratio": 0.0, - "completion_length": 126.70833587646484, + "completion_length": 482.3541717529297, "epoch": 0.868, - "grad_norm": 15.226495195075463, - "kl": 1.53515625, + "grad_norm": 34.11769671109303, + "kl": 4.921875, "learning_rate": 1.469297078922642e-07, - "loss": 0.2349, - "reward": 2.692921280860901, - "reward_std": 0.21399458253290504, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004995569935999811, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8941, + "reward": 2.2735098600387573, + "reward_std": 0.3796389400959015, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.01468472508713603, + "rewards/tag_count_reward": 0.8645833432674408, "step": 868 }, { "clip_ratio": 0.0, - "completion_length": 164.5416717529297, + "completion_length": 419.0208435058594, "epoch": 0.869, - "grad_norm": 17.569476996404937, - "kl": 4.34375, + "grad_norm": 28.34872735283239, + "kl": 3.4375, "learning_rate": 1.4623373364655223e-07, - "loss": 1.0261, - "reward": 2.9051607847213745, - "reward_std": 0.30173751863185316, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0010890999692492187, - "rewards/tag_count_reward": 0.96875, + "loss": 0.9823, + "reward": 2.6034168004989624, + "reward_std": 0.6418764889240265, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.014638913795351982, + "rewards/tag_count_reward": 0.8958333730697632, "step": 869 }, { "clip_ratio": 0.0, - "completion_length": 103.43750381469727, + "completion_length": 400.56251525878906, "epoch": 0.87, - "grad_norm": 32.5215620904755, - "kl": 1.54296875, + "grad_norm": 17.236700769600052, + "kl": 2.61328125, "learning_rate": 1.4554267916537495e-07, - "loss": 0.3078, - "reward": 2.5138537883758545, - "reward_std": 0.07958317711018026, - "rewards/accuracy_reward": 0.520833333954215, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0017713196575641632, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.5795, + "reward": 2.361931324005127, + "reward_std": 0.5499020516872406, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.018277212977409363, + "rewards/tag_count_reward": 0.921875, "step": 870 }, { "clip_ratio": 0.0, - "completion_length": 157.1041717529297, + "completion_length": 570.1875152587891, "epoch": 0.871, - "grad_norm": 21.018784474334147, - "kl": 3.78125, + "grad_norm": 27.373915984285357, + "kl": 5.46875, "learning_rate": 1.448565528690129e-07, - "loss": 0.5598, - "reward": 2.936721086502075, - "reward_std": 0.14570784918032587, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005987389595247805, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2036, + "reward": 2.156299114227295, + "reward_std": 0.615426778793335, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.027728760614991188, + "rewards/tag_count_reward": 0.84375, "step": 871 }, { "clip_ratio": 0.0, - "completion_length": 136.5416717529297, + "completion_length": 488.875, "epoch": 0.872, - "grad_norm": 84.45442970177966, - "kl": 2.5, + "grad_norm": 25.43206804197954, + "kl": 3.91796875, "learning_rate": 1.4417536311769885e-07, - "loss": 0.2872, - "reward": 2.6898854970932007, - "reward_std": 0.1334502473473549, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008031183388084173, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8613, + "reward": 2.2447725534439087, + "reward_std": 0.6266459226608276, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.012171986512839794, + "rewards/tag_count_reward": 0.875, "step": 872 }, { "clip_ratio": 0.0, - "completion_length": 126.60417175292969, + "completion_length": 385.1458435058594, "epoch": 0.873, - "grad_norm": 32.52088708455719, - "kl": 2.00390625, + "grad_norm": 17.40833999493999, + "kl": 2.28125, "learning_rate": 1.4349911821151462e-07, - "loss": 0.5909, - "reward": 2.7453362941741943, - "reward_std": 0.009054683614522219, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004663640633225441, - "rewards/tag_count_reward": 1.0, + "loss": 0.3716, + "reward": 2.3771172761917114, + "reward_std": 0.5491065829992294, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.016980044543743134, + "rewards/tag_count_reward": 0.9427083730697632, "step": 873 }, { "clip_ratio": 0.0, - "completion_length": 209.27083587646484, + "completion_length": 504.31251525878906, "epoch": 0.874, - "grad_norm": 30.86699409261829, - "kl": 4.4296875, + "grad_norm": 33.796267710350925, + "kl": 3.75, "learning_rate": 1.4282782639029128e-07, - "loss": 0.7892, - "reward": 2.715803623199463, - "reward_std": 0.22722584009170532, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013363065663725138, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8301, + "reward": 2.4175636768341064, + "reward_std": 0.5949557721614838, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.02167257433757186, + "rewards/tag_count_reward": 0.890625, "step": 874 }, { "clip_ratio": 0.0, - "completion_length": 192.5625, + "completion_length": 591.6041870117188, "epoch": 0.875, - "grad_norm": 27.998930587769088, - "kl": 3.01171875, + "grad_norm": 26.046786075949313, + "kl": 7.28125, "learning_rate": 1.4216149583350755e-07, - "loss": 0.8169, - "reward": 2.8096829652786255, - "reward_std": 0.4498905539512634, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01670619542710483, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.0826, + "reward": 2.029456853866577, + "reward_std": 0.6440670937299728, + "rewards/accuracy_reward": 0.3125000111758709, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.015682112891227007, + "rewards/tag_count_reward": 0.7812500298023224, "step": 875 }, { "clip_ratio": 0.0, - "completion_length": 194.0625, + "completion_length": 409.10418701171875, "epoch": 0.876, - "grad_norm": 27.80070696245838, - "kl": 5.765625, + "grad_norm": 27.02250848782874, + "kl": 2.4375, "learning_rate": 1.4150013466019114e-07, - "loss": 1.097, - "reward": 2.511355996131897, - "reward_std": 0.3631603717803955, - "rewards/accuracy_reward": 0.5416666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00947731058113277, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5262, + "reward": 2.6342684030532837, + "reward_std": 0.5182087272405624, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.023717753123492002, + "rewards/tag_count_reward": 0.9218750298023224, "step": 876 }, { "clip_ratio": 0.0, - "completion_length": 115.77083969116211, + "completion_length": 640.5208740234375, "epoch": 0.877, - "grad_norm": 16.77731827455148, - "kl": 3.3984375, + "grad_norm": 37.146004800834625, + "kl": 6.90625, "learning_rate": 1.4084375092881917e-07, - "loss": 0.3446, - "reward": 2.954527735710144, - "reward_std": 0.14746041223406792, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.003805672749876976, - "rewards/tag_count_reward": 1.0, + "loss": 1.1224, + "reward": 2.2963308095932007, + "reward_std": 0.7272785305976868, + "rewards/accuracy_reward": 0.5833333730697632, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.024849796667695045, + "rewards/tag_count_reward": 0.8072916865348816, "step": 877 }, { "clip_ratio": 0.0, - "completion_length": 183.89583587646484, + "completion_length": 498.75001525878906, "epoch": 0.878, - "grad_norm": 28.60340870731462, - "kl": 4.0546875, + "grad_norm": 22.18515714015866, + "kl": 4.2578125, "learning_rate": 1.4019235263722034e-07, - "loss": 0.3565, - "reward": 2.547884702682495, - "reward_std": 0.23046265542507172, - "rewards/accuracy_reward": 0.583333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.014615323394536972, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8769, + "reward": 2.144127130508423, + "reward_std": 0.4630337953567505, + "rewards/accuracy_reward": 0.3125000074505806, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.015595164615660906, + "rewards/tag_count_reward": 0.8958333432674408, "step": 878 }, { "clip_ratio": 0.0, - "completion_length": 128.89583587646484, + "completion_length": 486.8333435058594, "epoch": 0.879, - "grad_norm": 11.578764012610502, - "kl": 2.5, + "grad_norm": 33.22887528150052, + "kl": 4.6015625, "learning_rate": 1.395459477224772e-07, - "loss": 0.6318, - "reward": 2.962771773338318, - "reward_std": 0.12896279990673065, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0007700205314904451, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8573, + "reward": 2.4029178619384766, + "reward_std": 0.5731684267520905, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.02763769868761301, + "rewards/tag_count_reward": 0.8958333730697632, "step": 879 }, { "clip_ratio": 0.0, - "completion_length": 231.77084350585938, + "completion_length": 386.7291717529297, "epoch": 0.88, - "grad_norm": 23.99968613097257, - "kl": 6.21875, + "grad_norm": 20.990104821906048, + "kl": 1.6796875, "learning_rate": 1.3890454406082956e-07, - "loss": 0.8412, - "reward": 2.8951797485351562, - "reward_std": 0.211418267339468, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0058621446951292455, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.5235, + "reward": 2.454037666320801, + "reward_std": 0.4934305101633072, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.01644855784252286, + "rewards/tag_count_reward": 0.9635416865348816, "step": 880 }, { "clip_ratio": 0.0, - "completion_length": 132.75000381469727, + "completion_length": 502.29168701171875, "epoch": 0.881, - "grad_norm": 23.06908278689687, - "kl": 2.6015625, + "grad_norm": 36.364235182054, + "kl": 5.3828125, "learning_rate": 1.3826814946757888e-07, - "loss": 0.2221, - "reward": 2.974642038345337, - "reward_std": 0.07654392649419606, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0045247445232234895, - "rewards/tag_count_reward": 1.0, + "loss": 1.4712, + "reward": 2.4933007955551147, + "reward_std": 0.7101709246635437, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.9513889253139496, + "rewards/repetition_penalty_reward": -0.015379873104393482, + "rewards/tag_count_reward": 0.8697916865348816, "step": 881 }, { "clip_ratio": 0.0, - "completion_length": 189.95833587646484, + "completion_length": 520.25, "epoch": 0.882, - "grad_norm": 29.701403745153087, - "kl": 3.8203125, + "grad_norm": 60.16390554825615, + "kl": 6.0546875, "learning_rate": 1.3763677169699217e-07, - "loss": 0.7141, - "reward": 2.961447238922119, - "reward_std": 0.10944340378046036, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010775103932246566, - "rewards/tag_count_reward": 1.0, + "loss": 1.0879, + "reward": 2.3464200496673584, + "reward_std": 0.5781585574150085, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.019899526610970497, + "rewards/tag_count_reward": 0.8802083730697632, "step": 882 }, { "clip_ratio": 0.0, - "completion_length": 166.37500762939453, + "completion_length": 499.5625, "epoch": 0.883, - "grad_norm": 10.500130101863714, - "kl": 3.609375, + "grad_norm": 34.74857681904962, + "kl": 6.4375, "learning_rate": 1.370104184422085e-07, - "loss": 0.7028, - "reward": 2.9743770360946655, - "reward_std": 0.07862749975174665, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004789595492184162, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5873, + "reward": 2.2807507514953613, + "reward_std": 0.5849172174930573, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9166667759418488, + "rewards/repetition_penalty_reward": -0.01091603422537446, + "rewards/tag_count_reward": 0.8541666865348816, "step": 883 }, { "clip_ratio": 0.0, - "completion_length": 150.33333587646484, + "completion_length": 448.31251525878906, "epoch": 0.884, - "grad_norm": 36.24133319749277, - "kl": 2.107421875, + "grad_norm": 24.793424614985756, + "kl": 3.4609375, "learning_rate": 1.3638909733514452e-07, - "loss": 0.3025, - "reward": 2.828453540802002, - "reward_std": 0.12964597344398499, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010088222101330757, - "rewards/tag_count_reward": 0.984375, + "loss": 1.0009, + "reward": 2.423251986503601, + "reward_std": 0.6239519119262695, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01598423975519836, + "rewards/tag_count_reward": 0.9322916865348816, "step": 884 }, { "clip_ratio": 0.0, - "completion_length": 230.27083587646484, + "completion_length": 541.0, "epoch": 0.885, - "grad_norm": 22.269081646087574, - "kl": 5.16015625, + "grad_norm": 40.71564310689436, + "kl": 6.109375, "learning_rate": 1.3577281594640182e-07, - "loss": 1.218, - "reward": 2.6230573654174805, - "reward_std": 0.329606831073761, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01235929410904646, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.9699, + "reward": 2.234649658203125, + "reward_std": 0.5764244198799133, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.01708643836900592, + "rewards/tag_count_reward": 0.8697916865348816, "step": 885 }, { "clip_ratio": 0.0, - "completion_length": 153.27083587646484, + "completion_length": 400.8541717529297, "epoch": 0.886, - "grad_norm": 15.575223174176452, - "kl": 4.0234375, + "grad_norm": 26.123391575885496, + "kl": 3.5625, "learning_rate": 1.351615817851748e-07, - "loss": 0.9782, - "reward": 2.962851643562317, - "reward_std": 0.12105419836007059, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005898476054426283, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8407, + "reward": 2.6832687854766846, + "reward_std": 0.4722274839878082, + "rewards/accuracy_reward": 0.8333333730697632, + "rewards/reasoning_steps_reward": 0.9236111640930176, + "rewards/repetition_penalty_reward": -0.021592404693365097, + "rewards/tag_count_reward": 0.9479166865348816, "step": 886 }, { "clip_ratio": 0.0, - "completion_length": 150.31250762939453, + "completion_length": 527.3333587646484, "epoch": 0.887, - "grad_norm": 22.806258820652328, - "kl": 3.21875, + "grad_norm": 20.992724819526305, + "kl": 5.24609375, "learning_rate": 1.345554022991586e-07, - "loss": 0.5622, - "reward": 2.9878013134002686, - "reward_std": 0.03211684059351683, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.005254437914118171, - "rewards/tag_count_reward": 1.0, + "loss": 1.196, + "reward": 2.380396842956543, + "reward_std": 0.6401159465312958, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.011964202858507633, + "rewards/tag_count_reward": 0.8437500298023224, "step": 887 }, { "clip_ratio": 0.0, - "completion_length": 136.75000762939453, + "completion_length": 429.3541717529297, "epoch": 0.888, - "grad_norm": 35.896160880496254, - "kl": 1.390625, + "grad_norm": 18.876114754116053, + "kl": 2.08984375, "learning_rate": 1.3395428487445914e-07, - "loss": 0.0584, - "reward": 2.88995099067688, - "reward_std": 0.13492857944220304, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005882482917513698, - "rewards/tag_count_reward": 1.0, + "loss": 0.5497, + "reward": 2.315842390060425, + "reward_std": 0.41791072487831116, + "rewards/accuracy_reward": 0.3958333432674408, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.014018761925399303, + "rewards/tag_count_reward": 0.9479166865348816, "step": 888 }, { "clip_ratio": 0.0, - "completion_length": 143.70833587646484, + "completion_length": 433.9166717529297, "epoch": 0.889, - "grad_norm": 16.437086506839755, - "kl": 3.0703125, + "grad_norm": 20.1828507834608, + "kl": 4.0234375, "learning_rate": 1.3335823683550237e-07, - "loss": 0.4026, - "reward": 2.930903196334839, - "reward_std": 0.22038672864437103, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.006596800638362765, - "rewards/tag_count_reward": 1.0, + "loss": 0.7386, + "reward": 2.623020648956299, + "reward_std": 0.6058576703071594, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.979166716337204, + "rewards/repetition_penalty_reward": -0.017604444175958633, + "rewards/tag_count_reward": 0.8906250298023224, "step": 889 }, { "clip_ratio": 0.0, - "completion_length": 167.9583396911621, + "completion_length": 498.7500305175781, "epoch": 0.89, - "grad_norm": 25.89335890558197, - "kl": 2.35546875, + "grad_norm": 21.280761697718606, + "kl": 4.3125, "learning_rate": 1.3276726544494571e-07, - "loss": 0.8407, - "reward": 2.929213762283325, - "reward_std": 0.24252951191738248, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.003078056062804535, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.8813, + "reward": 2.311787486076355, + "reward_std": 0.6657915413379669, + "rewards/accuracy_reward": 0.5, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.019809790886938572, + "rewards/tag_count_reward": 0.8593750298023224, "step": 890 }, { "clip_ratio": 0.0, - "completion_length": 173.93750762939453, + "completion_length": 408.9166717529297, "epoch": 0.891, - "grad_norm": 15.594278792333219, - "kl": 2.7265625, + "grad_norm": 15.5213911574011, + "kl": 3.2626953125, "learning_rate": 1.3218137790358892e-07, - "loss": 0.7233, - "reward": 2.9212726354599, - "reward_std": 0.2542204111814499, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0058107099030166864, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.3428, + "reward": 2.777226448059082, + "reward_std": 0.3445788323879242, + "rewards/accuracy_reward": 0.8958333432674408, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.016176452860236168, + "rewards/tag_count_reward": 0.9322916865348816, "step": 891 }, { "clip_ratio": 0.0, - "completion_length": 210.91667938232422, + "completion_length": 547.1458435058594, "epoch": 0.892, - "grad_norm": 27.83941373714299, - "kl": 4.734375, + "grad_norm": 26.69864519239987, + "kl": 5.2734375, "learning_rate": 1.316005813502869e-07, - "loss": 0.943, - "reward": 2.8897125720977783, - "reward_std": 0.2741171419620514, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.009593124967068434, - "rewards/tag_count_reward": 0.96875, + "loss": 0.735, + "reward": 2.280866265296936, + "reward_std": 0.5884718000888824, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.017744951881468296, + "rewards/tag_count_reward": 0.875, "step": 892 }, { "clip_ratio": 0.0, - "completion_length": 250.39584350585938, + "completion_length": 429.68751525878906, "epoch": 0.893, - "grad_norm": 14.003466715724374, - "kl": 5.703125, + "grad_norm": 16.8784751889796, + "kl": 2.828125, "learning_rate": 1.3102488286186234e-07, - "loss": 1.2256, - "reward": 2.8108887672424316, - "reward_std": 0.4131583273410797, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.017236345447599888, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.5288, + "reward": 2.408605933189392, + "reward_std": 0.48363907635211945, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9583333432674408, + "rewards/repetition_penalty_reward": -0.023685818538069725, + "rewards/tag_count_reward": 0.9322916865348816, "step": 893 }, { "clip_ratio": 0.0, - "completion_length": 128.06250381469727, + "completion_length": 437.5416717529297, "epoch": 0.894, - "grad_norm": 18.975324903166904, - "kl": 4.109375, + "grad_norm": 41.80416425701382, + "kl": 3.0625, "learning_rate": 1.3045428945301953e-07, - "loss": 0.4723, - "reward": 2.9647419452667236, - "reward_std": 0.11661456897854805, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.002271933015435934, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8179, + "reward": 2.665311813354492, + "reward_std": 0.4915819466114044, + "rewards/accuracy_reward": 0.7916666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.01177146751433611, + "rewards/tag_count_reward": 0.9270833730697632, "step": 894 }, { "clip_ratio": 0.0, - "completion_length": 102.85416793823242, + "completion_length": 358.5833435058594, "epoch": 0.895, - "grad_norm": 19.22314589312988, - "kl": 0.9296875, + "grad_norm": 47.78557902231892, + "kl": 2.76171875, "learning_rate": 1.2988880807625927e-07, - "loss": 0.026, - "reward": 2.8115921020507812, - "reward_std": 0.11343610286712646, - "rewards/accuracy_reward": 0.8125, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0009078612783923745, - "rewards/tag_count_reward": 1.0, + "loss": 0.5157, + "reward": 2.5233113765716553, + "reward_std": 0.5640779733657837, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.025299932807683945, + "rewards/tag_count_reward": 0.9375000298023224, "step": 895 }, { "clip_ratio": 0.0, - "completion_length": 121.81250381469727, + "completion_length": 535.6666870117188, "epoch": 0.896, - "grad_norm": 14.909968929847974, - "kl": 3.6328125, + "grad_norm": 27.290533216671655, + "kl": 4.2578125, "learning_rate": 1.2932844562179352e-07, - "loss": 0.3571, - "reward": 2.9651137590408325, - "reward_std": 0.11284186551347375, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.003636407549493015, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6981, + "reward": 2.0957794189453125, + "reward_std": 0.5598824918270111, + "rewards/accuracy_reward": 0.25, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.015331756323575974, + "rewards/tag_count_reward": 0.8958333432674408, "step": 896 }, { "clip_ratio": 0.0, - "completion_length": 119.31250381469727, + "completion_length": 384.68751525878906, "epoch": 0.897, - "grad_norm": 11.559902531534766, - "kl": 2.07421875, + "grad_norm": 8.938743613588342, + "kl": 1.4921875, "learning_rate": 1.2877320891746201e-07, - "loss": 0.1839, - "reward": 2.874724864959717, - "reward_std": 0.25377359986305237, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005483619868755341, - "rewards/tag_count_reward": 0.984375, + "loss": 0.2616, + "reward": 2.667781949043274, + "reward_std": 0.4907715171575546, + "rewards/accuracy_reward": 0.75, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.026662585325539112, + "rewards/tag_count_reward": 0.9583333432674408, "step": 897 }, { "clip_ratio": 0.0, - "completion_length": 146.8541717529297, + "completion_length": 413.6041717529297, "epoch": 0.898, - "grad_norm": 14.893119945620496, - "kl": 2.51171875, + "grad_norm": 44.44288512911277, + "kl": 1.8125, "learning_rate": 1.2822310472864885e-07, - "loss": 0.5267, - "reward": 2.8828548192977905, - "reward_std": 0.27301620692014694, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0025617413921281695, - "rewards/tag_count_reward": 0.96875, + "loss": 0.7351, + "reward": 2.583755135536194, + "reward_std": 0.557906985282898, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01867562346160412, + "rewards/tag_count_reward": 0.9427083730697632, "step": 898 }, { "clip_ratio": 0.0, - "completion_length": 172.64583587646484, + "completion_length": 339.2708435058594, "epoch": 0.899, - "grad_norm": 16.937675711865598, - "kl": 1.474609375, + "grad_norm": 12.549931479255974, + "kl": 2.23046875, "learning_rate": 1.2767813975819983e-07, - "loss": 0.5063, - "reward": 2.908220887184143, - "reward_std": 0.17426241002976894, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.006709729554131627, - "rewards/tag_count_reward": 0.984375, + "loss": 0.3723, + "reward": 2.7536635398864746, + "reward_std": 0.3352846037596464, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.010225379839539528, + "rewards/tag_count_reward": 0.9375, "step": 899 }, { "clip_ratio": 0.0, - "completion_length": 247.06251525878906, + "completion_length": 467.1250305175781, "epoch": 0.9, - "grad_norm": 18.201158395631065, - "kl": 5.53125, + "grad_norm": 22.420504928111246, + "kl": 3.375, "learning_rate": 1.2713832064634125e-07, - "loss": 0.8157, - "reward": 2.7290115356445312, - "reward_std": 0.38931135833263397, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.01404411718249321, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.8531, + "reward": 2.0020930767059326, + "reward_std": 0.47625498473644257, + "rewards/accuracy_reward": 0.1875, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.020476454868912697, + "rewards/tag_count_reward": 0.8906250298023224, "step": 900 }, { "clip_ratio": 0.0, - "completion_length": 178.2291717529297, + "completion_length": 603.0833587646484, "epoch": 0.901, - "grad_norm": 15.01359459673891, - "kl": 3.75390625, + "grad_norm": 32.67875227205643, + "kl": 5.296875, "learning_rate": 1.2660365397059856e-07, - "loss": 0.3921, - "reward": 2.927525281906128, - "reward_std": 0.1414708192460239, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.020391544792801142, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 1.2488, + "reward": 2.2659354209899902, + "reward_std": 0.6388919800519943, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02573126833885908, + "rewards/tag_count_reward": 0.8541666865348816, "step": 901 }, { "clip_ratio": 0.0, - "completion_length": 209.9166717529297, + "completion_length": 458.7916717529297, "epoch": 0.902, - "grad_norm": 32.156129495383254, - "kl": 5.53125, + "grad_norm": 31.91653489837797, + "kl": 2.703125, "learning_rate": 1.260741462457165e-07, - "loss": 0.8491, - "reward": 2.69127094745636, - "reward_std": 0.28871599584817886, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.011854101438075304, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.0144, + "reward": 2.4180402755737305, + "reward_std": 0.6183659136295319, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.017723857890814543, + "rewards/tag_count_reward": 0.9218750298023224, "step": 902 }, { "clip_ratio": 0.0, - "completion_length": 239.45833587646484, + "completion_length": 526.8750305175781, "epoch": 0.903, - "grad_norm": 22.028287505574752, - "kl": 5.46875, + "grad_norm": 29.282295286202803, + "kl": 6.421875, "learning_rate": 1.2554980392357956e-07, - "loss": 1.1037, - "reward": 2.8771215677261353, - "reward_std": 0.2805362120270729, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013503541238605976, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 1.0086, + "reward": 2.2422985434532166, + "reward_std": 0.8072032630443573, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.8888889253139496, + "rewards/repetition_penalty_reward": -0.0163820618763566, + "rewards/tag_count_reward": 0.8489583432674408, "step": 903 }, { "clip_ratio": 0.0, - "completion_length": 194.9166717529297, + "completion_length": 478.37501525878906, "epoch": 0.904, - "grad_norm": 15.906195983404757, - "kl": 3.234375, + "grad_norm": 33.228944520213055, + "kl": 6.609375, "learning_rate": 1.2503063339313356e-07, - "loss": 0.9723, - "reward": 2.9084733724594116, - "reward_std": 0.21682792901992798, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013401664793491364, - "rewards/tag_count_reward": 0.984375, + "loss": 1.1511, + "reward": 2.098447561264038, + "reward_std": 0.6330851316452026, + "rewards/accuracy_reward": 0.3541666716337204, + "rewards/reasoning_steps_reward": 0.902777761220932, + "rewards/repetition_penalty_reward": -0.012663647066801786, + "rewards/tag_count_reward": 0.8541666865348816, "step": 904 }, { "clip_ratio": 0.0, - "completion_length": 141.5416717529297, + "completion_length": 458.87501525878906, "epoch": 0.905, - "grad_norm": 20.623217546254978, - "kl": 3.21875, + "grad_norm": 36.169535665899474, + "kl": 4.171875, "learning_rate": 1.2451664098030743e-07, - "loss": 0.3314, - "reward": 2.682216167449951, - "reward_std": 0.22532377392053604, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005283877719193697, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7981, + "reward": 2.4604378938674927, + "reward_std": 0.7671001553535461, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9375000596046448, + "rewards/repetition_penalty_reward": -0.0291455565020442, + "rewards/tag_count_reward": 0.8854166865348816, "step": 905 }, { "clip_ratio": 0.0, - "completion_length": 175.3541717529297, + "completion_length": 404.79168701171875, "epoch": 0.906, - "grad_norm": 12.536119866878812, - "kl": 2.2421875, + "grad_norm": 17.708470010127638, + "kl": 3.55078125, "learning_rate": 1.2400783294793668e-07, - "loss": 0.1491, - "reward": 2.7514487504959106, - "reward_std": 0.33865298330783844, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008968084119260311, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.9617, + "reward": 2.5592939853668213, + "reward_std": 0.7362368702888489, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.015358910895884037, + "rewards/tag_count_reward": 0.9010416865348816, "step": 906 }, { "clip_ratio": 0.0, - "completion_length": 138.70833587646484, + "completion_length": 607.6250305175781, "epoch": 0.907, - "grad_norm": 18.205293974930512, - "kl": 2.30078125, + "grad_norm": 52.06808756849091, + "kl": 9.578125, "learning_rate": 1.235042154956865e-07, - "loss": 0.6393, - "reward": 2.6954036951065063, - "reward_std": 0.1861693114042282, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005985207157209516, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.4479, + "reward": 2.2677348852157593, + "reward_std": 0.8661213219165802, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.03261231165379286, + "rewards/tag_count_reward": 0.8072916865348816, "step": 907 }, { "clip_ratio": 0.0, - "completion_length": 174.8541717529297, + "completion_length": 379.8958435058594, "epoch": 0.908, - "grad_norm": 32.7904733738129, - "kl": 4.3125, + "grad_norm": 14.881992614761662, + "kl": 2.5, "learning_rate": 1.2300579475997657e-07, - "loss": 0.4095, - "reward": 2.7134212255477905, - "reward_std": 0.24464604258537292, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.008801199961453676, - "rewards/tag_count_reward": 1.0, + "loss": 0.3101, + "reward": 2.3961899280548096, + "reward_std": 0.6012940406799316, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.023949017748236656, + "rewards/tag_count_reward": 0.9270833432674408, "step": 908 }, { "clip_ratio": 0.0, - "completion_length": 131.06250381469727, + "completion_length": 573.8541870117188, "epoch": 0.909, - "grad_norm": 70.54995439269887, - "kl": 2.6640625, + "grad_norm": 96.3349004340528, + "kl": 8.25, "learning_rate": 1.2251257681390645e-07, - "loss": 0.3804, - "reward": 2.871721625328064, - "reward_std": 0.17339074867777526, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0032783974311314523, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.2975, + "reward": 2.3617441654205322, + "reward_std": 0.7564932107925415, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.01151987537741661, + "rewards/tag_count_reward": 0.8177083432674408, "step": 909 }, { "clip_ratio": 0.0, - "completion_length": 155.20833587646484, + "completion_length": 523.9791717529297, "epoch": 0.91, - "grad_norm": 16.345354518188547, - "kl": 0.681640625, + "grad_norm": 36.959308180317855, + "kl": 6.234375, "learning_rate": 1.220245676671809e-07, - "loss": 0.0943, - "reward": 2.994496464729309, - "reward_std": 0.005014357331674546, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005503674881765619, - "rewards/tag_count_reward": 1.0, + "loss": 1.2606, + "reward": 2.20367431640625, + "reward_std": 0.9045920073986053, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9097222685813904, + "rewards/repetition_penalty_reward": -0.039381190203130245, + "rewards/tag_count_reward": 0.8125000298023224, "step": 910 }, { "clip_ratio": 0.0, - "completion_length": 243.14583587646484, + "completion_length": 556.875, "epoch": 0.911, - "grad_norm": 20.81139865383649, - "kl": 5.5, + "grad_norm": 16.456009421461072, + "kl": 5.59375, "learning_rate": 1.2154177326603763e-07, - "loss": 1.4644, - "reward": 2.9363512992858887, - "reward_std": 0.16245082393288612, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01156552229076624, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 1.3297, + "reward": 2.312328815460205, + "reward_std": 0.7721385657787323, + "rewards/accuracy_reward": 0.5625000149011612, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.022740643471479416, + "rewards/tag_count_reward": 0.8489583432674408, "step": 911 }, { "clip_ratio": 0.0, - "completion_length": 213.91667938232422, + "completion_length": 409.29168701171875, "epoch": 0.912, - "grad_norm": 26.713488214415484, - "kl": 4.4375, + "grad_norm": 18.335739930280326, + "kl": 2.6767578125, "learning_rate": 1.2106419949317388e-07, - "loss": 1.2611, - "reward": 2.937116265296936, - "reward_std": 0.1820701863616705, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0142727205529809, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.6476, + "reward": 2.470385789871216, + "reward_std": 0.5638497471809387, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01746157230809331, + "rewards/tag_count_reward": 0.953125, "step": 912 }, { "clip_ratio": 0.0, - "completion_length": 129.64583587646484, + "completion_length": 632.8958435058594, "epoch": 0.913, - "grad_norm": 15.866527936231261, - "kl": 2.33203125, + "grad_norm": 23.440878703155796, + "kl": 6.265625, "learning_rate": 1.2059185216767543e-07, - "loss": 0.1861, - "reward": 2.950699210166931, - "reward_std": 0.10356529708951712, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007634232984855771, - "rewards/tag_count_reward": 1.0, + "loss": 1.0253, + "reward": 2.0628278851509094, + "reward_std": 0.9162045121192932, + "rewards/accuracy_reward": 0.4166666716337204, + "rewards/reasoning_steps_reward": 0.9027778506278992, + "rewards/repetition_penalty_reward": -0.04307497665286064, + "rewards/tag_count_reward": 0.7864583432674408, "step": 913 }, { "clip_ratio": 0.0, - "completion_length": 191.9791717529297, + "completion_length": 622.4583435058594, "epoch": 0.914, - "grad_norm": 12.837726592288698, - "kl": 3.15234375, + "grad_norm": 42.16437419261308, + "kl": 6.59375, "learning_rate": 1.2012473704494537e-07, - "loss": 0.5129, - "reward": 2.9354971647262573, - "reward_std": 0.16008879989385605, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.014155736193060875, - "rewards/tag_count_reward": 0.984375, + "loss": 1.123, + "reward": 1.9104456305503845, + "reward_std": 0.520677238702774, + "rewards/accuracy_reward": 0.18750000558793545, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.020110088400542736, + "rewards/tag_count_reward": 0.8125000298023224, "step": 914 }, { "clip_ratio": 0.0, - "completion_length": 118.31250762939453, + "completion_length": 379.04168701171875, "epoch": 0.915, - "grad_norm": 59.43081430847798, - "kl": 3.3515625, + "grad_norm": 108.02105225090547, + "kl": 3.39453125, "learning_rate": 1.1966285981663407e-07, - "loss": 0.2685, - "reward": 2.857953429222107, - "reward_std": 0.3225581496953964, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/reasoning_steps_reward": 0.9791666567325592, - "rewards/repetition_penalty_reward": -0.001421499764546752, - "rewards/tag_count_reward": 0.984375, + "loss": 0.3631, + "reward": 2.5404409170150757, + "reward_std": 0.3071303367614746, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.020322874188423157, + "rewards/tag_count_reward": 0.9635416865348816, "step": 915 }, { "clip_ratio": 0.0, - "completion_length": 164.5416717529297, + "completion_length": 573.6041870117188, "epoch": 0.916, - "grad_norm": 19.346070518262938, - "kl": 3.1328125, + "grad_norm": 30.21063587734105, + "kl": 5.234375, "learning_rate": 1.1920622611056974e-07, - "loss": 0.4283, - "reward": 2.742331385612488, - "reward_std": 0.013704338576644659, - "rewards/accuracy_reward": 0.75, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007668603677302599, - "rewards/tag_count_reward": 1.0, + "loss": 0.9573, + "reward": 1.9745216369628906, + "reward_std": 0.8713274002075195, + "rewards/accuracy_reward": 0.291666679084301, + "rewards/reasoning_steps_reward": 0.881944477558136, + "rewards/repetition_penalty_reward": -0.042839540168643, + "rewards/tag_count_reward": 0.8437500298023224, "step": 916 }, { "clip_ratio": 0.0, - "completion_length": 165.9166717529297, + "completion_length": 568.0833587646484, "epoch": 0.917, - "grad_norm": 27.393496780373123, - "kl": 2.6328125, + "grad_norm": 43.80723570465828, + "kl": 4.2109375, "learning_rate": 1.1875484149069004e-07, - "loss": 0.1922, - "reward": 2.9763206243515015, - "reward_std": 0.05182175524532795, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9861111044883728, - "rewards/repetition_penalty_reward": -0.009790432115551084, - "rewards/tag_count_reward": 1.0, + "loss": 1.1291, + "reward": 2.45370614528656, + "reward_std": 0.8172959387302399, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.020252287853509188, + "rewards/tag_count_reward": 0.8906250298023224, "step": 917 }, { "clip_ratio": 0.0, - "completion_length": 254.83333587646484, + "completion_length": 400.7083435058594, "epoch": 0.918, - "grad_norm": 30.8077404302773, - "kl": 4.890625, + "grad_norm": 20.756190324375613, + "kl": 1.0966796875, "learning_rate": 1.1830871145697412e-07, - "loss": 0.9281, - "reward": 2.809490919113159, - "reward_std": 0.29925965517759323, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013425803277641535, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.2955, + "reward": 2.608530044555664, + "reward_std": 0.4455796778202057, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.028622763231396675, + "rewards/tag_count_reward": 0.9635416865348816, "step": 918 }, { "clip_ratio": 0.0, - "completion_length": 119.08333969116211, + "completion_length": 432.9791717529297, "epoch": 0.919, - "grad_norm": 13.318507557481427, - "kl": 2.6875, + "grad_norm": 11.62915536443853, + "kl": 2.140625, "learning_rate": 1.1786784144537563e-07, - "loss": 0.4178, - "reward": 2.9554386138916016, - "reward_std": 0.10260855103842914, - "rewards/accuracy_reward": 0.9583333432674408, + "loss": 0.4186, + "reward": 2.5469974279403687, + "reward_std": 0.4763101041316986, + "rewards/accuracy_reward": 0.6250000149011612, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.002894609933719039, - "rewards/tag_count_reward": 1.0, + "rewards/repetition_penalty_reward": -0.015502599067986012, + "rewards/tag_count_reward": 0.9375000298023224, "step": 919 }, { "clip_ratio": 0.0, - "completion_length": 109.25000381469727, + "completion_length": 507.95835876464844, "epoch": 0.92, - "grad_norm": 30.43071689947138, - "kl": 1.0625, + "grad_norm": 15.992892533041532, + "kl": 3.59375, "learning_rate": 1.1743223682775649e-07, - "loss": 0.1924, - "reward": 2.9567044973373413, - "reward_std": 0.09984519309364259, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0016288914484903216, - "rewards/tag_count_reward": 1.0, + "loss": 0.7643, + "reward": 2.407273054122925, + "reward_std": 0.6685203611850739, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.019810385070741177, + "rewards/tag_count_reward": 0.8854166865348816, "step": 920 }, { "clip_ratio": 0.0, - "completion_length": 227.64583587646484, + "completion_length": 349.81251525878906, "epoch": 0.921, - "grad_norm": 20.864742344354564, - "kl": 4.5703125, + "grad_norm": 35.49834021420722, + "kl": 2.08203125, "learning_rate": 1.1700190291182158e-07, - "loss": 0.7158, - "reward": 2.8328511714935303, - "reward_std": 0.3466986119747162, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016107321251183748, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.499, + "reward": 2.5659878253936768, + "reward_std": 0.5623100101947784, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.00866501871496439, + "rewards/tag_count_reward": 0.9427083432674408, "step": 921 }, { "clip_ratio": 0.0, - "completion_length": 204.58334350585938, + "completion_length": 441.7083435058594, "epoch": 0.922, - "grad_norm": 24.01904801016492, - "kl": 2.63671875, + "grad_norm": 14.606033932622575, + "kl": 2.08203125, "learning_rate": 1.1657684494105386e-07, - "loss": 0.5346, - "reward": 2.7911062240600586, - "reward_std": 0.3030080050230026, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005768726579844952, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4224, + "reward": 2.6236190795898438, + "reward_std": 0.5365782380104065, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.01526999520137906, + "rewards/tag_count_reward": 0.9375, "step": 922 }, { "clip_ratio": 0.0, - "completion_length": 107.10416793823242, + "completion_length": 544.25, "epoch": 0.923, - "grad_norm": 13.873181163070214, - "kl": 1.31640625, + "grad_norm": 32.63063888698561, + "kl": 5.8515625, "learning_rate": 1.1615706809465051e-07, - "loss": 0.204, - "reward": 2.9981677532196045, - "reward_std": 0.0036169840022921562, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0018323095282539725, - "rewards/tag_count_reward": 1.0, + "loss": 0.8629, + "reward": 2.486325263977051, + "reward_std": 0.6139254868030548, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.02756360173225403, + "rewards/tag_count_reward": 0.8958333730697632, "step": 923 }, { "clip_ratio": 0.0, - "completion_length": 255.97917938232422, + "completion_length": 347.3125, "epoch": 0.924, - "grad_norm": 17.75492481290831, - "kl": 3.80078125, + "grad_norm": 12.704180107443388, + "kl": 1.7890625, "learning_rate": 1.1574257748745986e-07, - "loss": 1.2383, - "reward": 2.8372085094451904, - "reward_std": 0.34979885816574097, - "rewards/accuracy_reward": 0.8958333730697632, + "loss": 0.2841, + "reward": 2.5728049278259277, + "reward_std": 0.43000659346580505, + "rewards/accuracy_reward": 0.6458333432674408, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.020430447533726692, - "rewards/tag_count_reward": 0.96875, + "rewards/repetition_penalty_reward": -0.024417342618107796, + "rewards/tag_count_reward": 0.9583333432674408, "step": 924 }, { "clip_ratio": 0.0, - "completion_length": 131.5625, + "completion_length": 413.12501525878906, "epoch": 0.925, - "grad_norm": 13.301133324276973, - "kl": 3.6328125, + "grad_norm": 18.841011447480344, + "kl": 3.2578125, "learning_rate": 1.1533337816991931e-07, - "loss": 0.5305, - "reward": 2.9647101163864136, - "reward_std": 0.11879212036728859, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0023037675418891013, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.7064, + "reward": 2.5212587118148804, + "reward_std": 0.42140892148017883, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/reasoning_steps_reward": 0.92361119389534, + "rewards/repetition_penalty_reward": -0.011727516539394855, + "rewards/tag_count_reward": 0.9427083730697632, "step": 925 }, { "clip_ratio": 0.0, - "completion_length": 210.6666717529297, + "completion_length": 442.25, "epoch": 0.926, - "grad_norm": 13.721553154273964, - "kl": 3.953125, + "grad_norm": 16.753619228197998, + "kl": 3.5546875, "learning_rate": 1.1492947512799328e-07, - "loss": 0.6834, - "reward": 2.6775180101394653, - "reward_std": 0.2965303361415863, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.01171836769208312, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.6144, + "reward": 2.417970061302185, + "reward_std": 0.5506798624992371, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.02300213649868965, + "rewards/tag_count_reward": 0.9270833432674408, "step": 926 }, { "clip_ratio": 0.0, - "completion_length": 262.4791717529297, + "completion_length": 532.6250305175781, "epoch": 0.927, - "grad_norm": 24.665188409302505, - "kl": 6.5, + "grad_norm": 47.35016742807312, + "kl": 7.84375, "learning_rate": 1.1453087328311299e-07, - "loss": 1.3918, - "reward": 2.713929772377014, - "reward_std": 0.5867751240730286, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.015237070620059967, - "rewards/tag_count_reward": 0.9375, + "loss": 1.1422, + "reward": 2.0104929208755493, + "reward_std": 0.6963983774185181, + "rewards/accuracy_reward": 0.29166667722165585, + "rewards/reasoning_steps_reward": 0.9305555820465088, + "rewards/repetition_penalty_reward": -0.02422943152487278, + "rewards/tag_count_reward": 0.8125000298023224, "step": 927 }, { "clip_ratio": 0.0, - "completion_length": 222.58334350585938, + "completion_length": 385.4791717529297, "epoch": 0.928, - "grad_norm": 35.147021714127156, - "kl": 3.5234375, + "grad_norm": 15.095495398152355, + "kl": 2.072265625, "learning_rate": 1.1413757749211602e-07, - "loss": 0.7085, - "reward": 2.8975621461868286, - "reward_std": 0.18751079589128494, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.01910449331626296, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.4393, + "reward": 2.454028844833374, + "reward_std": 0.44756019115448, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.01645728573203087, + "rewards/tag_count_reward": 0.9427083432674408, "step": 928 }, { "clip_ratio": 0.0, - "completion_length": 178.95834350585938, + "completion_length": 614.6041870117188, "epoch": 0.929, - "grad_norm": 13.37565812508893, - "kl": 3.578125, + "grad_norm": 31.056679659798007, + "kl": 7.28125, "learning_rate": 1.137495925471875e-07, - "loss": 0.5187, - "reward": 2.696104645729065, - "reward_std": 0.14250043034553528, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010492784902453423, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 1.1384, + "reward": 2.323223352432251, + "reward_std": 0.7905822396278381, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.015318467281758785, + "rewards/tag_count_reward": 0.8385416865348816, "step": 929 }, { "clip_ratio": 0.0, - "completion_length": 160.8125, + "completion_length": 451.43751525878906, "epoch": 0.93, - "grad_norm": 14.009655440289228, - "kl": 2.16015625, + "grad_norm": 24.05252138370831, + "kl": 3.65625, "learning_rate": 1.1336692317580158e-07, - "loss": 0.233, - "reward": 2.7521756887435913, - "reward_std": 0.10116894543170929, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.006504909601062536, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.8455, + "reward": 2.514081120491028, + "reward_std": 0.6175527423620224, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.0275856489315629, + "rewards/tag_count_reward": 0.9375000298023224, "step": 930 }, { "clip_ratio": 0.0, - "completion_length": 202.45834350585938, + "completion_length": 414.43751525878906, "epoch": 0.931, - "grad_norm": 30.09084663207652, - "kl": 3.7578125, + "grad_norm": 27.33191623105398, + "kl": 2.1875, "learning_rate": 1.1298957404066381e-07, - "loss": 0.4317, - "reward": 2.8043735027313232, - "reward_std": 0.1662554070353508, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.015071007888764143, - "rewards/tag_count_reward": 1.0, + "loss": 0.3232, + "reward": 2.5895215272903442, + "reward_std": 0.4988251328468323, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.02506194543093443, + "rewards/tag_count_reward": 0.9479166865348816, "step": 931 }, { "clip_ratio": 0.0, - "completion_length": 170.00000762939453, + "completion_length": 502.9583435058594, "epoch": 0.932, - "grad_norm": 14.637110383229217, - "kl": 3.4765625, + "grad_norm": 30.75771927110975, + "kl": 4.39453125, "learning_rate": 1.1261754973965422e-07, - "loss": 0.4827, - "reward": 2.9095075130462646, - "reward_std": 0.2833403870463371, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.010631326586008072, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.7145, + "reward": 2.1934012174606323, + "reward_std": 0.5984681844711304, + "rewards/accuracy_reward": 0.3750000149011612, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.02534892037510872, + "rewards/tag_count_reward": 0.8645833432674408, "step": 932 }, { "clip_ratio": 0.0, - "completion_length": 153.2291717529297, + "completion_length": 458.54168701171875, "epoch": 0.933, - "grad_norm": 24.38745182963303, - "kl": 2.90234375, + "grad_norm": 26.112800067792758, + "kl": 4.203125, "learning_rate": 1.1225085480577158e-07, - "loss": 0.5059, - "reward": 2.9632582664489746, - "reward_std": 0.1094448952935636, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.00896388478577137, - "rewards/tag_count_reward": 1.0, + "loss": 0.8496, + "reward": 2.2411762475967407, + "reward_std": 0.7032366394996643, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9444445371627808, + "rewards/repetition_penalty_reward": -0.01576836034655571, + "rewards/tag_count_reward": 0.8958333730697632, "step": 933 }, { "clip_ratio": 0.0, - "completion_length": 183.87500762939453, + "completion_length": 424.37501525878906, "epoch": 0.934, - "grad_norm": 117.82594335717765, - "kl": 5.140625, + "grad_norm": 44.09196385541136, + "kl": 2.703125, "learning_rate": 1.1188949370707787e-07, - "loss": 0.7521, - "reward": 2.9202250242233276, - "reward_std": 0.23685725033283234, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.025955677730962634, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.9393, + "reward": 2.4043623208999634, + "reward_std": 0.672376275062561, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.010568303987383842, + "rewards/tag_count_reward": 0.9010416865348816, "step": 934 }, { "clip_ratio": 0.0, - "completion_length": 183.45833587646484, + "completion_length": 421.3958435058594, "epoch": 0.935, - "grad_norm": 52.593303676070704, - "kl": 5.859375, + "grad_norm": 19.863825505537683, + "kl": 2.9765625, "learning_rate": 1.1153347084664419e-07, - "loss": 0.8439, - "reward": 2.4936734437942505, - "reward_std": 0.31284280866384506, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.004590651718899608, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.5384, + "reward": 2.456182837486267, + "reward_std": 0.5422267615795135, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.021247888915240765, + "rewards/tag_count_reward": 0.9427083432674408, "step": 935 }, { "clip_ratio": 0.0, - "completion_length": 238.72917938232422, + "completion_length": 487.3541717529297, "epoch": 0.936, - "grad_norm": 160.3168459815195, - "kl": 7.33203125, + "grad_norm": 49.22541738740071, + "kl": 3.1875, "learning_rate": 1.1118279056249653e-07, - "loss": 1.0721, - "reward": 2.8759334087371826, - "reward_std": 0.25604357570409775, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.019899989711120725, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.1499, + "reward": 2.2859641313552856, + "reward_std": 0.4807792901992798, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.01785538112744689, + "rewards/tag_count_reward": 0.9010416865348816, "step": 936 }, { "clip_ratio": 0.0, - "completion_length": 122.35417175292969, + "completion_length": 476.3958435058594, "epoch": 0.937, - "grad_norm": 18.979195779541058, - "kl": 3.109375, + "grad_norm": 50.529982193331755, + "kl": 3.078125, "learning_rate": 1.1083745712756364e-07, - "loss": 0.5911, - "reward": 2.960405707359314, - "reward_std": 0.1244276762008667, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.004872129182331264, - "rewards/tag_count_reward": 1.0, + "loss": 0.9369, + "reward": 2.6942487955093384, + "reward_std": 0.6124294996261597, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.019293093122541904, + "rewards/tag_count_reward": 0.9218750298023224, "step": 937 }, { "clip_ratio": 0.0, - "completion_length": 209.9166717529297, + "completion_length": 583.0625, "epoch": 0.938, - "grad_norm": 23.109559357121526, - "kl": 6.09375, + "grad_norm": 23.005166702704713, + "kl": 5.2421875, "learning_rate": 1.1049747474962444e-07, - "loss": 0.7386, - "reward": 2.9225902557373047, - "reward_std": 0.16978863440454006, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9861111342906952, - "rewards/repetition_penalty_reward": -0.011437664739787579, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8491, + "reward": 2.30727219581604, + "reward_std": 0.6647425889968872, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9652778506278992, + "rewards/repetition_penalty_reward": -0.022589108906686306, + "rewards/tag_count_reward": 0.8854166865348816, "step": 938 }, { "clip_ratio": 0.0, - "completion_length": 273.00000762939453, + "completion_length": 426.4583435058594, "epoch": 0.939, - "grad_norm": 30.685100278960217, - "kl": 6.265625, + "grad_norm": 27.108706168312672, + "kl": 3.7734375, "learning_rate": 1.1016284757125685e-07, - "loss": 1.1889, - "reward": 2.681376338005066, - "reward_std": 0.3497362285852432, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.016540437005460262, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.644, + "reward": 2.582595705986023, + "reward_std": 0.6667671203613281, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.023307234048843384, + "rewards/tag_count_reward": 0.9114583432674408, "step": 939 }, { "clip_ratio": 0.0, - "completion_length": 187.4166717529297, + "completion_length": 606.9791870117188, "epoch": 0.94, - "grad_norm": 31.36335890057412, - "kl": 4.75, + "grad_norm": 35.88000566575318, + "kl": 7.203125, "learning_rate": 1.0983357966978745e-07, - "loss": 0.8188, - "reward": 2.6121866703033447, - "reward_std": 0.34030066430568695, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/reasoning_steps_reward": 0.9652777910232544, - "rewards/repetition_penalty_reward": -0.004132928268518299, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2855, + "reward": 2.064675807952881, + "reward_std": 0.6436410248279572, + "rewards/accuracy_reward": 0.3125000149011612, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.02039373479783535, + "rewards/tag_count_reward": 0.828125, "step": 940 }, { "clip_ratio": 0.0, - "completion_length": 223.62500762939453, + "completion_length": 449.1041717529297, "epoch": 0.941, - "grad_norm": 23.62792087072924, - "kl": 6.953125, + "grad_norm": 26.56932442747977, + "kl": 2.328125, "learning_rate": 1.0950967505724175e-07, - "loss": 1.173, - "reward": 2.66205096244812, - "reward_std": 0.25784512609243393, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.009824155364185572, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.7233, + "reward": 2.446874976158142, + "reward_std": 0.4883129894733429, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.03229183237999678, + "rewards/tag_count_reward": 0.9583333730697632, "step": 941 }, { "clip_ratio": 0.0, - "completion_length": 191.33334350585938, + "completion_length": 334.3541717529297, "epoch": 0.942, - "grad_norm": 23.918695510171, - "kl": 4.546875, + "grad_norm": 10.161335311358066, + "kl": 1.244140625, "learning_rate": 1.0919113768029517e-07, - "loss": 1.4448, - "reward": 2.920333504676819, - "reward_std": 0.26459308760240674, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.005013965186662972, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0637, + "reward": 2.7936389446258545, + "reward_std": 0.438838854432106, + "rewards/accuracy_reward": 0.875, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.02580568566918373, + "rewards/tag_count_reward": 0.9791666865348816, "step": 942 }, { "clip_ratio": 0.0, - "completion_length": 161.02083587646484, + "completion_length": 391.22918701171875, "epoch": 0.943, - "grad_norm": 23.272337338628162, - "kl": 5.625, + "grad_norm": 16.415006349713707, + "kl": 2.619140625, "learning_rate": 1.0887797142022521e-07, - "loss": 0.7776, - "reward": 2.7125717401504517, - "reward_std": 0.11573184933513403, - "rewards/accuracy_reward": 0.7291666716337204, + "loss": 0.2111, + "reward": 2.3882477283477783, + "reward_std": 0.3951050788164139, + "rewards/accuracy_reward": 0.458333358168602, "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006178448325954378, - "rewards/tag_count_reward": 0.9895833432674408, + "rewards/repetition_penalty_reward": -0.018002384342253208, + "rewards/tag_count_reward": 0.9479166865348816, "step": 943 }, { "clip_ratio": 0.0, - "completion_length": 185.62500762939453, + "completion_length": 540.5208435058594, "epoch": 0.944, - "grad_norm": 28.51300750571186, - "kl": 3.4375, + "grad_norm": 30.14237606691049, + "kl": 5.453125, "learning_rate": 1.0857018009286381e-07, - "loss": 0.7053, - "reward": 2.8887192010879517, - "reward_std": 0.2877109870314598, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.005378077970817685, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 1.0364, + "reward": 2.2670759558677673, + "reward_std": 0.7407488822937012, + "rewards/accuracy_reward": 0.4791666716337204, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.022854735143482685, + "rewards/tag_count_reward": 0.8385416865348816, "step": 944 }, { "clip_ratio": 0.0, - "completion_length": 260.3333435058594, + "completion_length": 409.10418701171875, "epoch": 0.945, - "grad_norm": 18.47913231650329, - "kl": 6.78125, + "grad_norm": 28.58154188768852, + "kl": 3.96484375, "learning_rate": 1.0826776744855121e-07, - "loss": 1.2381, - "reward": 2.847598075866699, - "reward_std": 0.2717164810746908, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.013513012323528528, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.5507, + "reward": 2.29867160320282, + "reward_std": 0.5912670195102692, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.01556460838764906, + "rewards/tag_count_reward": 0.9322916865348816, "step": 945 }, { "clip_ratio": 0.0, - "completion_length": 153.95833587646484, + "completion_length": 407.93751525878906, "epoch": 0.946, - "grad_norm": 16.559175444345, - "kl": 3.71875, + "grad_norm": 19.249242221629466, + "kl": 2.23828125, "learning_rate": 1.0797073717209013e-07, - "loss": 0.4592, - "reward": 2.831281542778015, - "reward_std": 0.14959253906272352, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012468677829019725, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.555, + "reward": 2.5011874437332153, + "reward_std": 0.40425705909729004, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9444444179534912, + "rewards/repetition_penalty_reward": -0.02659040503203869, + "rewards/tag_count_reward": 0.9583333432674408, "step": 946 }, { "clip_ratio": 0.0, - "completion_length": 154.2291717529297, + "completion_length": 505.66668701171875, "epoch": 0.947, - "grad_norm": 22.59018713395405, - "kl": 4.8359375, + "grad_norm": 73.25982352915128, + "kl": 6.1875, "learning_rate": 1.0767909288270063e-07, - "loss": 0.7058, - "reward": 2.967753767967224, - "reward_std": 0.08515576831996441, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009676821297034621, - "rewards/tag_count_reward": 0.984375, + "loss": 1.2229, + "reward": 2.4074935913085938, + "reward_std": 0.7739560008049011, + "rewards/accuracy_reward": 0.6041666716337204, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.024798161815851927, + "rewards/tag_count_reward": 0.8906250298023224, "step": 947 }, { "clip_ratio": 0.0, - "completion_length": 144.8958396911621, + "completion_length": 349.8333435058594, "epoch": 0.948, - "grad_norm": 19.68741227294964, - "kl": 2.609375, + "grad_norm": 24.365433217819515, + "kl": 2.26953125, "learning_rate": 1.0739283813397639e-07, - "loss": 0.1983, - "reward": 2.7137997150421143, - "reward_std": 0.07767349923960865, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.010158742778003216, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1764, + "reward": 2.4521753787994385, + "reward_std": 0.42572128772735596, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.025255252607166767, + "rewards/tag_count_reward": 0.984375, "step": 948 }, { "clip_ratio": 0.0, - "completion_length": 271.20833587646484, + "completion_length": 552.0625, "epoch": 0.949, - "grad_norm": 185.34350183374306, - "kl": 11.0, + "grad_norm": 46.55689685946224, + "kl": 7.375, "learning_rate": 1.0711197641384115e-07, - "loss": 1.8731, - "reward": 2.88392174243927, - "reward_std": 0.24545861780643463, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666269302368, - "rewards/repetition_penalty_reward": -0.022328334860503674, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 1.0546, + "reward": 2.4559671878814697, + "reward_std": 0.6184609234333038, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.012783022597432137, + "rewards/tag_count_reward": 0.8437500298023224, "step": 949 }, { "clip_ratio": 0.0, - "completion_length": 142.9166717529297, + "completion_length": 614.4166870117188, "epoch": 0.95, - "grad_norm": 23.31966228949802, - "kl": 2.9296875, + "grad_norm": 35.5385607782762, + "kl": 9.46875, "learning_rate": 1.068365111445064e-07, - "loss": 0.886, - "reward": 2.6903563737869263, - "reward_std": 0.20661196112632751, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.002352070529013872, - "rewards/tag_count_reward": 0.984375, + "loss": 1.4734, + "reward": 2.2201212644577026, + "reward_std": 0.7953130602836609, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.03508709650486708, + "rewards/tag_count_reward": 0.7760416865348816, "step": 950 }, { "clip_ratio": 0.0, - "completion_length": 259.2083435058594, + "completion_length": 607.0416870117188, "epoch": 0.951, - "grad_norm": 29.22653439594017, - "kl": 5.1328125, + "grad_norm": 44.814962262317266, + "kl": 10.71875, "learning_rate": 1.0656644568242946e-07, - "loss": 1.0962, - "reward": 2.8103487491607666, - "reward_std": 0.34712648391723633, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.010831902734935284, - "rewards/tag_count_reward": 0.953125, + "loss": 1.4213, + "reward": 2.336472511291504, + "reward_std": 0.7723419368267059, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9513888955116272, + "rewards/repetition_penalty_reward": -0.01595810428261757, + "rewards/tag_count_reward": 0.7760416865348816, "step": 951 }, { "clip_ratio": 0.0, - "completion_length": 233.1666717529297, + "completion_length": 468.31251525878906, "epoch": 0.952, - "grad_norm": 21.36233014410026, - "kl": 3.796875, + "grad_norm": 25.864907881139306, + "kl": 3.4375, "learning_rate": 1.063017833182728e-07, - "loss": 0.7021, - "reward": 2.89937686920166, - "reward_std": 0.31642352789640427, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012081567663699389, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.2212, + "reward": 2.3151695728302, + "reward_std": 0.5797133445739746, + "rewards/accuracy_reward": 0.4583333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.014691710472106934, + "rewards/tag_count_reward": 0.9062500298023224, "step": 952 }, { "clip_ratio": 0.0, - "completion_length": 223.4166717529297, + "completion_length": 622.8125305175781, "epoch": 0.953, - "grad_norm": 28.07523655503276, - "kl": 5.6640625, + "grad_norm": 38.335448681356254, + "kl": 9.015625, "learning_rate": 1.0604252727686379e-07, - "loss": 0.9329, - "reward": 2.666787624359131, - "reward_std": 0.1808232143521309, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.015504146460443735, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.011, + "reward": 2.2985631823539734, + "reward_std": 0.8530721068382263, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9166666865348816, + "rewards/repetition_penalty_reward": -0.024353576824069023, + "rewards/tag_count_reward": 0.8020833432674408, "step": 953 }, { "clip_ratio": 0.0, - "completion_length": 138.97916793823242, + "completion_length": 426.62501525878906, "epoch": 0.954, - "grad_norm": 25.359527226689746, - "kl": 2.15625, + "grad_norm": 44.054481520111416, + "kl": 3.671875, "learning_rate": 1.0578868071715544e-07, - "loss": 0.2706, - "reward": 2.947670102119446, - "reward_std": 0.12771600298583508, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.003719085594639182, - "rewards/tag_count_reward": 1.0, + "loss": 1.0606, + "reward": 2.355110287666321, + "reward_std": 0.5248502939939499, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.019889703020453453, + "rewards/tag_count_reward": 0.9375, "step": 954 }, { "clip_ratio": 0.0, - "completion_length": 175.20834350585938, + "completion_length": 394.50001525878906, "epoch": 0.955, - "grad_norm": 17.67847040280524, - "kl": 2.4296875, + "grad_norm": 17.701509278594894, + "kl": 2.38671875, "learning_rate": 1.0554024673218806e-07, - "loss": 0.7327, - "reward": 2.4575377702713013, - "reward_std": 0.14129899349063635, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006003861082717776, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4845, + "reward": 2.6396692991256714, + "reward_std": 0.647905558347702, + "rewards/accuracy_reward": 0.7708333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.020053054206073284, + "rewards/tag_count_reward": 0.9166666865348816, "step": 955 }, { "clip_ratio": 0.0, - "completion_length": 293.125, + "completion_length": 387.5208435058594, "epoch": 0.956, - "grad_norm": 34.44650995181923, - "kl": 7.984375, + "grad_norm": 22.09702269576327, + "kl": 1.2958984375, "learning_rate": 1.0529722834905125e-07, - "loss": 1.4457, - "reward": 2.820043206214905, - "reward_std": 0.40987710654735565, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 0.9652778506278992, - "rewards/repetition_penalty_reward": -0.02544297743588686, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.3797, + "reward": 2.641054391860962, + "reward_std": 0.46035097539424896, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9861111044883728, + "rewards/repetition_penalty_reward": -0.016931693069636822, + "rewards/tag_count_reward": 0.9635416865348816, "step": 956 }, { "clip_ratio": 0.0, - "completion_length": 171.27084350585938, + "completion_length": 429.41668701171875, "epoch": 0.957, - "grad_norm": 15.39881639267103, - "kl": 2.2421875, + "grad_norm": 34.01762179013479, + "kl": 1.96875, "learning_rate": 1.0505962852884739e-07, - "loss": 0.8714, - "reward": 2.4747426509857178, - "reward_std": 0.08102941373363137, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004424213548190892, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.6316, + "reward": 2.5664961338043213, + "reward_std": 0.6158457398414612, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.965277761220932, + "rewards/repetition_penalty_reward": -0.02378181181848049, + "rewards/tag_count_reward": 0.9375000298023224, "step": 957 }, { "clip_ratio": 0.0, - "completion_length": 133.83333587646484, + "completion_length": 401.2916717529297, "epoch": 0.958, - "grad_norm": 28.773620253588224, - "kl": 2.162109375, + "grad_norm": 27.1346809126327, + "kl": 2.7109375, "learning_rate": 1.0482745016665526e-07, - "loss": 0.0939, - "reward": 2.5594468116760254, - "reward_std": 0.14345418638549745, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008261657494585961, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7956, + "reward": 2.382103443145752, + "reward_std": 0.40817123651504517, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.013729824218899012, + "rewards/tag_count_reward": 0.9375, "step": 958 }, { "clip_ratio": 0.0, - "completion_length": 186.89583587646484, + "completion_length": 464.29168701171875, "epoch": 0.959, - "grad_norm": 13.066598915418979, - "kl": 3.9453125, + "grad_norm": 27.609867845288534, + "kl": 4.34375, "learning_rate": 1.0460069609149496e-07, - "loss": 0.7554, - "reward": 2.622069835662842, - "reward_std": 0.36788778752088547, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008138408418744802, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 1.1173, + "reward": 2.4054399728775024, + "reward_std": 0.4486614912748337, + "rewards/accuracy_reward": 0.5625, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.011226738337427378, + "rewards/tag_count_reward": 0.8750000298023224, "step": 959 }, { "clip_ratio": 0.0, - "completion_length": 157.06250381469727, + "completion_length": 582.2500305175781, "epoch": 0.96, - "grad_norm": 43.87158683349949, - "kl": 1.921875, + "grad_norm": 34.491132802450544, + "kl": 5.03515625, "learning_rate": 1.0437936906629334e-07, - "loss": -0.0233, - "reward": 2.5156420469284058, - "reward_std": 0.23959733545780182, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.012135828146710992, - "rewards/tag_count_reward": 1.0, + "loss": 0.922, + "reward": 2.4019153118133545, + "reward_std": 0.6779120862483978, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.021695845760405064, + "rewards/tag_count_reward": 0.8750000298023224, "step": 960 }, { "clip_ratio": 0.0, - "completion_length": 196.68750762939453, + "completion_length": 404.8125, "epoch": 0.961, - "grad_norm": 24.430757254695383, - "kl": 2.5859375, + "grad_norm": 23.15463113511402, + "kl": 4.71875, "learning_rate": 1.0416347178785039e-07, - "loss": 0.6334, - "reward": 2.5439319610595703, - "reward_std": 0.27213358879089355, - "rewards/accuracy_reward": 0.5833333432674408, + "loss": 0.7217, + "reward": 2.643435001373291, + "reward_std": 0.5447419583797455, + "rewards/accuracy_reward": 0.7500000298023224, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.006415329407900572, - "rewards/tag_count_reward": 0.9739583432674408, + "rewards/repetition_penalty_reward": -0.021495724096894264, + "rewards/tag_count_reward": 0.921875, "step": 961 }, { "clip_ratio": 0.0, - "completion_length": 167.14584350585938, + "completion_length": 386.6458435058594, "epoch": 0.962, - "grad_norm": 22.677870950871217, - "kl": 2.0703125, + "grad_norm": 14.191588671125146, + "kl": 2.1640625, "learning_rate": 1.0395300688680625e-07, - "loss": 0.3558, - "reward": 2.6690218448638916, - "reward_std": 0.14844887610524893, - "rewards/accuracy_reward": 0.6875, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.011533594224601984, - "rewards/tag_count_reward": 1.0, + "loss": 0.459, + "reward": 2.233402729034424, + "reward_std": 0.43939197063446045, + "rewards/accuracy_reward": 0.3333333544433117, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.0165973836556077, + "rewards/tag_count_reward": 0.9583333432674408, "step": 962 }, { "clip_ratio": 0.0, - "completion_length": 207.04167938232422, + "completion_length": 684.1458435058594, "epoch": 0.963, - "grad_norm": 17.17148201882875, - "kl": 4.0703125, + "grad_norm": 66.27238140783334, + "kl": 8.921875, "learning_rate": 1.0374797692760933e-07, - "loss": 0.8874, - "reward": 2.7260847091674805, - "reward_std": 0.31574415415525436, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013498795684427023, - "rewards/tag_count_reward": 0.96875, + "loss": 1.2039, + "reward": 2.404576539993286, + "reward_std": 0.6691045165061951, + "rewards/accuracy_reward": 0.6250000298023224, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.025979137048125267, + "rewards/tag_count_reward": 0.8125000298023224, "step": 963 }, { "clip_ratio": 0.0, - "completion_length": 186.9791717529297, + "completion_length": 517.4375, "epoch": 0.964, - "grad_norm": 21.1862237671307, - "kl": 5.3984375, + "grad_norm": 72.11734538341996, + "kl": 6.40625, "learning_rate": 1.0354838440848501e-07, - "loss": 0.833, - "reward": 2.7419806718826294, - "reward_std": 0.2854643166065216, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008019302738830447, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.7886, + "reward": 2.2568776607513428, + "reward_std": 0.48433394730091095, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.017428037710487843, + "rewards/tag_count_reward": 0.8645833730697632, "step": 964 }, { "clip_ratio": 0.0, - "completion_length": 149.97916793823242, + "completion_length": 414.0833435058594, "epoch": 0.965, - "grad_norm": 22.995558856592712, - "kl": 2.37109375, + "grad_norm": 22.298889854844614, + "kl": 3.390625, "learning_rate": 1.0335423176140511e-07, - "loss": 0.4009, - "reward": 2.7100484371185303, - "reward_std": 0.11190294893458486, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008701751008629799, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.4795, + "reward": 2.5144201517105103, + "reward_std": 0.6362143456935883, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.958333432674408, + "rewards/repetition_penalty_reward": -0.016829868778586388, + "rewards/tag_count_reward": 0.9270833432674408, "step": 965 }, { "clip_ratio": 0.0, - "completion_length": 170.31250762939453, + "completion_length": 511.43751525878906, "epoch": 0.966, - "grad_norm": 18.415192657973584, - "kl": 3.5390625, + "grad_norm": 21.828332739197077, + "kl": 5.1015625, "learning_rate": 1.0316552135205837e-07, - "loss": 0.5884, - "reward": 2.7861911058425903, - "reward_std": 0.22223767638206482, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.007211708463728428, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.8053, + "reward": 2.537219762802124, + "reward_std": 0.5350492745637894, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.020071997307240963, + "rewards/tag_count_reward": 0.8906250298023224, "step": 966 }, { "clip_ratio": 0.0, - "completion_length": 150.2291717529297, + "completion_length": 437.9166717529297, "epoch": 0.967, - "grad_norm": 18.97339697692922, - "kl": 2.16796875, + "grad_norm": 36.06941047437207, + "kl": 4.046875, "learning_rate": 1.029822554798216e-07, - "loss": 0.2062, - "reward": 2.7153319120407104, - "reward_std": 0.10120345279574394, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.00689036282710731, - "rewards/tag_count_reward": 1.0, + "loss": 0.7437, + "reward": 2.4883724451065063, + "reward_std": 0.756380021572113, + "rewards/accuracy_reward": 0.6875, + "rewards/reasoning_steps_reward": 0.8958333432674408, + "rewards/repetition_penalty_reward": -0.016835974529385567, + "rewards/tag_count_reward": 0.9218750298023224, "step": 967 }, { "clip_ratio": 0.0, - "completion_length": 168.95833587646484, + "completion_length": 361.47918701171875, "epoch": 0.968, - "grad_norm": 23.139989721755683, - "kl": 2.734375, + "grad_norm": 21.352797175646803, + "kl": 1.361328125, "learning_rate": 1.0280443637773163e-07, - "loss": 0.2468, - "reward": 2.5719692707061768, - "reward_std": 0.22161518782377243, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0044197289971634746, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.2428, + "reward": 2.3535938262939453, + "reward_std": 0.34989143908023834, + "rewards/accuracy_reward": 0.4166666865348816, + "rewards/reasoning_steps_reward": 0.9861111342906952, + "rewards/repetition_penalty_reward": -0.017933969385921955, + "rewards/tag_count_reward": 0.96875, "step": 968 }, { "clip_ratio": 0.0, - "completion_length": 119.70833587646484, + "completion_length": 397.4583435058594, "epoch": 0.969, - "grad_norm": 23.708554437246207, - "kl": 1.310546875, + "grad_norm": 35.203792265802775, + "kl": 2.890625, "learning_rate": 1.0263206621245807e-07, - "loss": 0.1927, - "reward": 2.9963735342025757, - "reward_std": 0.007283590966835618, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.003626560210250318, - "rewards/tag_count_reward": 1.0, + "loss": 0.7547, + "reward": 2.354074716567993, + "reward_std": 0.3742067515850067, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.9722222089767456, + "rewards/repetition_penalty_reward": -0.00877267774194479, + "rewards/tag_count_reward": 0.953125, "step": 969 }, { "clip_ratio": 0.0, - "completion_length": 110.43750381469727, + "completion_length": 422.56251525878906, "epoch": 0.97, - "grad_norm": 15.716559088076599, - "kl": 1.26171875, + "grad_norm": 38.23995456003063, + "kl": 4.59375, "learning_rate": 1.0246514708427701e-07, - "loss": 0.1234, - "reward": 2.9908241033554077, - "reward_std": 0.02806683164089918, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.0022316277027130127, - "rewards/tag_count_reward": 1.0, + "loss": 0.5101, + "reward": 2.6607662439346313, + "reward_std": 0.5710070729255676, + "rewards/accuracy_reward": 0.8125, + "rewards/reasoning_steps_reward": 0.9305556118488312, + "rewards/repetition_penalty_reward": -0.014581102412194014, + "rewards/tag_count_reward": 0.9322916865348816, "step": 970 }, { "clip_ratio": 0.0, - "completion_length": 154.18750762939453, + "completion_length": 518.6875305175781, "epoch": 0.971, - "grad_norm": 31.25866739625826, - "kl": 4.4375, + "grad_norm": 21.69225757308708, + "kl": 5.203125, "learning_rate": 1.0230368102704531e-07, - "loss": 0.364, - "reward": 2.7656902074813843, - "reward_std": 0.14860772341489792, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.005143190152011812, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.8592, + "reward": 2.31989324092865, + "reward_std": 0.6834602952003479, + "rewards/accuracy_reward": 0.5000000298023224, + "rewards/reasoning_steps_reward": 0.9583334028720856, + "rewards/repetition_penalty_reward": -0.018648307770490646, + "rewards/tag_count_reward": 0.8802083432674408, "step": 971 }, { "clip_ratio": 0.0, - "completion_length": 192.4375114440918, + "completion_length": 512.9375, "epoch": 0.972, - "grad_norm": 28.904384807238532, - "kl": 3.4375, + "grad_norm": 35.530778404533244, + "kl": 6.0859375, "learning_rate": 1.0214767000817596e-07, - "loss": 0.5271, - "reward": 2.4722615480422974, - "reward_std": 0.20743612898513675, - "rewards/accuracy_reward": 0.5, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.012113594682887197, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8544, + "reward": 2.3701701164245605, + "reward_std": 0.8559737205505371, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.8958334028720856, + "rewards/repetition_penalty_reward": -0.02045489940792322, + "rewards/tag_count_reward": 0.8697916865348816, "step": 972 }, { "clip_ratio": 0.0, - "completion_length": 173.64583587646484, + "completion_length": 524.1666870117188, "epoch": 0.973, - "grad_norm": 23.001143093088743, - "kl": 1.83203125, + "grad_norm": 30.66580558458817, + "kl": 8.59375, "learning_rate": 1.01997115928614e-07, - "loss": 0.6738, - "reward": 2.9292643070220947, - "reward_std": 0.22146177664399147, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.00823573803063482, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.0968, + "reward": 2.2603888511657715, + "reward_std": 0.8733722567558289, + "rewards/accuracy_reward": 0.5833333432674408, + "rewards/reasoning_steps_reward": 0.8888888955116272, + "rewards/repetition_penalty_reward": -0.019125062506645918, + "rewards/tag_count_reward": 0.8072916865348816, "step": 973 }, { "clip_ratio": 0.0, - "completion_length": 181.31250762939453, + "completion_length": 598.7916870117188, "epoch": 0.974, - "grad_norm": 19.240370365972588, - "kl": 4.34375, + "grad_norm": 21.256888216031047, + "kl": 6.8125, "learning_rate": 1.0185202062281336e-07, - "loss": 0.8258, - "reward": 2.476091980934143, - "reward_std": 0.23327933996915817, - "rewards/accuracy_reward": 0.5000000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008283115457743406, - "rewards/tag_count_reward": 0.984375, + "loss": 1.205, + "reward": 2.3526848554611206, + "reward_std": 0.7947600483894348, + "rewards/accuracy_reward": 0.6250000149011612, + "rewards/reasoning_steps_reward": 0.9305555522441864, + "rewards/repetition_penalty_reward": -0.02578743826597929, + "rewards/tag_count_reward": 0.8229166865348816, "step": 974 }, { "clip_ratio": 0.0, - "completion_length": 138.68750762939453, + "completion_length": 607.625, "epoch": 0.975, - "grad_norm": 16.12256513879947, - "kl": 1.3359375, + "grad_norm": 19.685066419282403, + "kl": 6.890625, "learning_rate": 1.017123858587145e-07, - "loss": 0.0018, - "reward": 2.9240121841430664, - "reward_std": 0.11036281287670135, - "rewards/accuracy_reward": 0.9375, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.013487806543707848, - "rewards/tag_count_reward": 1.0, + "loss": 0.9204, + "reward": 2.2289880514144897, + "reward_std": 0.5278165340423584, + "rewards/accuracy_reward": 0.5208333432674408, + "rewards/reasoning_steps_reward": 0.9236111044883728, + "rewards/repetition_penalty_reward": -0.027956443838775158, + "rewards/tag_count_reward": 0.8125000298023224, "step": 975 }, { "clip_ratio": 0.0, - "completion_length": 229.58334350585938, + "completion_length": 585.8541870117188, "epoch": 0.976, - "grad_norm": 25.59908578584172, - "kl": 2.84375, + "grad_norm": 21.89405265517972, + "kl": 7.140625, "learning_rate": 1.0157821333772304e-07, - "loss": 0.5046, - "reward": 2.8402005434036255, - "reward_std": 0.2616690397262573, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.02438283059746027, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 1.0002, + "reward": 2.272312641143799, + "reward_std": 0.7230339646339417, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.9375, + "rewards/repetition_penalty_reward": -0.01935403887182474, + "rewards/tag_count_reward": 0.7916666865348816, "step": 976 }, { "clip_ratio": 0.0, - "completion_length": 161.5625, + "completion_length": 517.0833435058594, "epoch": 0.977, - "grad_norm": 15.156582710575625, - "kl": 2.4765625, + "grad_norm": 19.225554130007215, + "kl": 4.0234375, "learning_rate": 1.014495046946888e-07, - "loss": 0.5223, - "reward": 2.6937586069107056, - "reward_std": 0.22372080385684967, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.004158198833465576, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.8685, + "reward": 2.6582794189453125, + "reward_std": 0.5991591513156891, + "rewards/accuracy_reward": 0.8125000298023224, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.022276088129729033, + "rewards/tag_count_reward": 0.8958333432674408, "step": 977 }, { "clip_ratio": 0.0, - "completion_length": 131.3333396911621, + "completion_length": 436.8333435058594, "epoch": 0.978, - "grad_norm": 19.95336191594788, - "kl": 2.28125, + "grad_norm": 40.49511138880158, + "kl": 2.71875, "learning_rate": 1.013262614978859e-07, - "loss": 0.4245, - "reward": 2.9381306171417236, - "reward_std": 0.16794702410697937, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0045778031926602125, - "rewards/tag_count_reward": 0.984375, + "loss": 0.8484, + "reward": 2.702500820159912, + "reward_std": 0.6443986296653748, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.024929875042289495, + "rewards/tag_count_reward": 0.9218750298023224, "step": 978 }, { "clip_ratio": 0.0, - "completion_length": 202.2291717529297, + "completion_length": 469.5416717529297, "epoch": 0.979, - "grad_norm": 63.80616029492511, - "kl": 5.71875, + "grad_norm": 31.345278883427696, + "kl": 3.375, "learning_rate": 1.0120848524899386e-07, - "loss": 0.7564, - "reward": 2.6426069736480713, - "reward_std": 0.3232828974723816, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.02058764244429767, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.9379, + "reward": 2.366294264793396, + "reward_std": 0.646638810634613, + "rewards/accuracy_reward": 0.5416666865348816, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.0278029702603817, + "rewards/tag_count_reward": 0.8802083730697632, "step": 979 }, { "clip_ratio": 0.0, - "completion_length": 176.20833587646484, + "completion_length": 448.4583435058594, "epoch": 0.98, - "grad_norm": 29.931509545270455, - "kl": 4.4921875, + "grad_norm": 14.372103163574602, + "kl": 2.703125, "learning_rate": 1.0109617738307911e-07, - "loss": 0.518, - "reward": 2.8856232166290283, - "reward_std": 0.20550773665308952, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.008474118541926146, - "rewards/tag_count_reward": 0.984375, + "loss": 0.4298, + "reward": 2.4362382888793945, + "reward_std": 0.5597494542598724, + "rewards/accuracy_reward": 0.6041666865348816, + "rewards/reasoning_steps_reward": 0.9583333730697632, + "rewards/repetition_penalty_reward": -0.03251180611550808, + "rewards/tag_count_reward": 0.9062500298023224, "step": 980 }, { "clip_ratio": 0.0, - "completion_length": 199.8125114440918, + "completion_length": 372.0208435058594, "epoch": 0.981, - "grad_norm": 12.049356025407363, - "kl": 3.90625, + "grad_norm": 40.416913288237026, + "kl": 2.2578125, "learning_rate": 1.0098933926857752e-07, - "loss": 0.7866, - "reward": 2.598093628883362, - "reward_std": 0.2267475724220276, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.009545378154143691, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.4458, + "reward": 2.7070990800857544, + "reward_std": 0.5603702366352081, + "rewards/accuracy_reward": 0.8541666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.018595370464026928, + "rewards/tag_count_reward": 0.9270833432674408, "step": 981 }, { "clip_ratio": 0.0, - "completion_length": 120.95833587646484, + "completion_length": 503.1666717529297, "epoch": 0.982, - "grad_norm": 27.18175342411228, - "kl": 1.578125, + "grad_norm": 23.377929734927985, + "kl": 4.28125, "learning_rate": 1.0088797220727779e-07, - "loss": 0.3885, - "reward": 2.975519061088562, - "reward_std": 0.08119437424466014, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.0019114137976430357, - "rewards/tag_count_reward": 0.984375, + "loss": 1.1209, + "reward": 2.3728084564208984, + "reward_std": 0.7684433162212372, + "rewards/accuracy_reward": 0.5625000298023224, + "rewards/reasoning_steps_reward": 0.951388955116272, + "rewards/repetition_penalty_reward": -0.03170543722808361, + "rewards/tag_count_reward": 0.8906250298023224, "step": 982 }, { "clip_ratio": 0.0, - "completion_length": 184.7291717529297, + "completion_length": 457.6250305175781, "epoch": 0.983, - "grad_norm": 19.585757372934424, - "kl": 3.6875, + "grad_norm": 23.345141824922493, + "kl": 3.3046875, "learning_rate": 1.007920774343056e-07, - "loss": 0.8981, - "reward": 2.4708478450775146, - "reward_std": 0.18791627511382103, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0031104900408536196, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.7212, + "reward": 2.3101412057876587, + "reward_std": 0.5889628529548645, + "rewards/accuracy_reward": 0.4375, + "rewards/reasoning_steps_reward": 0.9930555522441864, + "rewards/repetition_penalty_reward": -0.016247691586613655, + "rewards/tag_count_reward": 0.8958333432674408, "step": 983 }, { "clip_ratio": 0.0, - "completion_length": 414.7500305175781, + "completion_length": 370.7708435058594, "epoch": 0.984, - "grad_norm": 36.82289952047221, - "kl": 13.46875, + "grad_norm": 42.40361638319742, + "kl": 2.703125, "learning_rate": 1.0070165611810855e-07, - "loss": 1.5306, - "reward": 2.630509853363037, - "reward_std": 0.5585455447435379, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/reasoning_steps_reward": 0.9513888955116272, - "rewards/repetition_penalty_reward": -0.0292125903069973, - "rewards/tag_count_reward": 0.8750000298023224, + "loss": 0.4711, + "reward": 2.4439754486083984, + "reward_std": 0.4780399203300476, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.02130233682692051, + "rewards/tag_count_reward": 0.9583333432674408, "step": 984 }, { "clip_ratio": 0.0, - "completion_length": 241.9791717529297, + "completion_length": 569.6666870117188, "epoch": 0.985, - "grad_norm": 35.39470018023907, - "kl": 4.65625, + "grad_norm": 41.35640722099136, + "kl": 6.78125, "learning_rate": 1.0061670936044178e-07, - "loss": 0.7111, - "reward": 2.819217085838318, - "reward_std": 0.25974714756011963, - "rewards/accuracy_reward": 0.875, - "rewards/reasoning_steps_reward": 0.9861111640930176, - "rewards/repetition_penalty_reward": -0.01585243782028556, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 1.0564, + "reward": 2.4486454725265503, + "reward_std": 0.6525047123432159, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.020104597322642803, + "rewards/tag_count_reward": 0.8437500298023224, "step": 985 }, { "clip_ratio": 0.0, - "completion_length": 159.02083587646484, + "completion_length": 408.7916717529297, "epoch": 0.986, - "grad_norm": 20.355547292233112, - "kl": 3.6875, + "grad_norm": 47.381447307488855, + "kl": 4.13671875, "learning_rate": 1.005372381963547e-07, - "loss": 0.6173, - "reward": 2.9158129692077637, - "reward_std": 0.19321970641613007, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.006062169559299946, - "rewards/tag_count_reward": 0.984375, + "loss": 0.6591, + "reward": 2.723273992538452, + "reward_std": 0.3431188315153122, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/reasoning_steps_reward": 0.9861111640930176, + "rewards/repetition_penalty_reward": -0.023253954481333494, + "rewards/tag_count_reward": 0.9270833730697632, "step": 986 }, { "clip_ratio": 0.0, - "completion_length": 115.25000762939453, + "completion_length": 401.1041717529297, "epoch": 0.987, - "grad_norm": 22.48069388624414, - "kl": 1.26171875, + "grad_norm": 64.30330810008361, + "kl": 2.298828125, "learning_rate": 1.0046324359417842e-07, - "loss": 0.2333, - "reward": 2.9987306594848633, - "reward_std": 0.0036900914274156094, - "rewards/accuracy_reward": 1.0, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.0012692762538790703, - "rewards/tag_count_reward": 1.0, + "loss": 0.4446, + "reward": 2.5345929861068726, + "reward_std": 0.383854431565851, + "rewards/accuracy_reward": 0.625, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.026170868426561356, + "rewards/tag_count_reward": 0.9635416865348816, "step": 987 }, { "clip_ratio": 0.0, - "completion_length": 198.43750762939453, + "completion_length": 508.54168701171875, "epoch": 0.988, - "grad_norm": 37.62411796081249, - "kl": 5.484375, + "grad_norm": 18.399449871940522, + "kl": 5.25, "learning_rate": 1.0039472645551372e-07, - "loss": 0.6127, - "reward": 2.7406944036483765, - "reward_std": 0.15007019485346973, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007569643668830395, - "rewards/tag_count_reward": 0.984375, + "loss": 0.7395, + "reward": 2.3243101835250854, + "reward_std": 0.6121802628040314, + "rewards/accuracy_reward": 0.520833358168602, + "rewards/reasoning_steps_reward": 0.9375000298023224, + "rewards/repetition_penalty_reward": -0.035064928233623505, + "rewards/tag_count_reward": 0.9010416865348816, "step": 988 }, { "clip_ratio": 0.0, - "completion_length": 193.83334350585938, + "completion_length": 360.4583435058594, "epoch": 0.989, - "grad_norm": 36.862522493898595, - "kl": 4.125, + "grad_norm": 23.549397740992113, + "kl": 1.0419921875, "learning_rate": 1.0033168761522048e-07, - "loss": 0.7709, - "reward": 2.7877026796340942, - "reward_std": 0.25491707026958466, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.007436337182298303, - "rewards/tag_count_reward": 0.96875, + "loss": 0.3115, + "reward": 2.6984081268310547, + "reward_std": 0.3867932856082916, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 1.0, + "rewards/repetition_penalty_reward": -0.015133682638406754, + "rewards/tag_count_reward": 0.984375, "step": 989 }, { "clip_ratio": 0.0, - "completion_length": 213.5, + "completion_length": 415.16668701171875, "epoch": 0.99, - "grad_norm": 40.38773921540298, - "kl": 5.453125, + "grad_norm": 33.19104469693633, + "kl": 4.4765625, "learning_rate": 1.002741278414069e-07, - "loss": 0.8915, - "reward": 2.741626739501953, - "reward_std": 0.21591387689113617, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.008373422781005502, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.6084, + "reward": 2.583367347717285, + "reward_std": 0.5392884910106659, + "rewards/accuracy_reward": 0.7083333432674408, + "rewards/reasoning_steps_reward": 0.9722222685813904, + "rewards/repetition_penalty_reward": -0.019063206389546394, + "rewards/tag_count_reward": 0.9218750298023224, "step": 990 }, { "clip_ratio": 0.0, - "completion_length": 146.08333587646484, + "completion_length": 623.3125305175781, "epoch": 0.991, - "grad_norm": 25.569056900081627, - "kl": 2.359375, + "grad_norm": 80.63884599695989, + "kl": 9.921875, "learning_rate": 1.0022204783542078e-07, - "loss": 0.3153, - "reward": 2.8671200275421143, - "reward_std": 0.2562591452151537, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007880084216594696, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 1.1105, + "reward": 2.0245230197906494, + "reward_std": 0.5311869978904724, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/reasoning_steps_reward": 0.9097222089767456, + "rewards/repetition_penalty_reward": -0.010199269745498896, + "rewards/tag_count_reward": 0.7916666865348816, "step": 991 }, { "clip_ratio": 0.0, - "completion_length": 162.08334350585938, + "completion_length": 506.25, "epoch": 0.992, - "grad_norm": 20.113193839806563, - "kl": 4.046875, + "grad_norm": 18.011334199256787, + "kl": 5.390625, "learning_rate": 1.0017544823184055e-07, - "loss": 0.7609, - "reward": 2.935148596763611, - "reward_std": 0.21449988335371017, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.007559798192232847, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.9891, + "reward": 2.471301794052124, + "reward_std": 0.5606184005737305, + "rewards/accuracy_reward": 0.6458333432674408, + "rewards/reasoning_steps_reward": 0.944444477558136, + "rewards/repetition_penalty_reward": -0.014809360727667809, + "rewards/tag_count_reward": 0.8958333730697632, "step": 992 }, { "clip_ratio": 0.0, - "completion_length": 240.9166717529297, + "completion_length": 409.43751525878906, "epoch": 0.993, - "grad_norm": 23.61159514070626, - "kl": 4.5078125, + "grad_norm": 28.657341564218928, + "kl": 1.66796875, "learning_rate": 1.001343295984676e-07, - "loss": 0.713, - "reward": 2.528616428375244, - "reward_std": 0.4445534348487854, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.01652263104915619, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.5813, + "reward": 2.359801173210144, + "reward_std": 0.4328533709049225, + "rewards/accuracy_reward": 0.4375000149011612, + "rewards/reasoning_steps_reward": 0.972222238779068, + "rewards/repetition_penalty_reward": -0.01867112284526229, + "rewards/tag_count_reward": 0.9687500298023224, "step": 993 }, { "clip_ratio": 0.0, - "completion_length": 156.89583587646484, + "completion_length": 437.9583435058594, "epoch": 0.994, - "grad_norm": 27.514186250534927, - "kl": 4.1015625, + "grad_norm": 14.15639088003171, + "kl": 2.4453125, "learning_rate": 1.0009869243631952e-07, - "loss": 0.5387, - "reward": 2.8712995052337646, - "reward_std": 0.18820101767778397, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.00890898797661066, - "rewards/tag_count_reward": 0.984375, + "loss": 0.6178, + "reward": 2.6296534538269043, + "reward_std": 0.6427464187145233, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652777910232544, + "rewards/repetition_penalty_reward": -0.012707796646282077, + "rewards/tag_count_reward": 0.9479166865348816, "step": 994 }, { "clip_ratio": 0.0, - "completion_length": 219.625, + "completion_length": 422.2083435058594, "epoch": 0.995, - "grad_norm": 19.66417015610052, - "kl": 4.46875, + "grad_norm": 21.464668161928536, + "kl": 2.0234375, "learning_rate": 1.0006853717962393e-07, - "loss": 0.6133, - "reward": 2.1904207468032837, - "reward_std": 0.24258776009082794, - "rewards/accuracy_reward": 0.2291666716337204, + "loss": 0.5771, + "reward": 2.529154658317566, + "reward_std": 0.3804011940956116, + "rewards/accuracy_reward": 0.6041666865348816, "rewards/reasoning_steps_reward": 0.9930555820465088, - "rewards/repetition_penalty_reward": -0.016176520846784115, - "rewards/tag_count_reward": 0.984375, + "rewards/repetition_penalty_reward": -0.015984368277713656, + "rewards/tag_count_reward": 0.9479166865348816, "step": 995 }, { "clip_ratio": 0.0, - "completion_length": 425.7291717529297, + "completion_length": 450.04168701171875, "epoch": 0.996, - "grad_norm": 39.987687422071296, - "kl": 8.375, + "grad_norm": 29.84614498881874, + "kl": 3.0390625, "learning_rate": 1.000438641958131e-07, - "loss": 0.9354, - "reward": 2.5608410835266113, - "reward_std": 0.423677533864975, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02249239105731249, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.9596, + "reward": 2.2574844360351562, + "reward_std": 0.5413917303085327, + "rewards/accuracy_reward": 0.375, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.015085038263350725, + "rewards/tag_count_reward": 0.9322916865348816, "step": 996 }, { "clip_ratio": 0.0, - "completion_length": 309.02083587646484, + "completion_length": 411.0208435058594, "epoch": 0.997, - "grad_norm": 20.137548571031942, - "kl": 5.15625, + "grad_norm": 22.404250638364193, + "kl": 4.38671875, "learning_rate": 1.0002467378551954e-07, - "loss": 1.1284, - "reward": 2.8604397773742676, - "reward_std": 0.37873193621635437, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/reasoning_steps_reward": 0.9930555522441864, - "rewards/repetition_penalty_reward": -0.018032606225460768, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.5886, + "reward": 2.3560396432876587, + "reward_std": 0.5860812664031982, + "rewards/accuracy_reward": 0.4791666865348816, + "rewards/reasoning_steps_reward": 0.9791666865348816, + "rewards/repetition_penalty_reward": -0.013752035796642303, + "rewards/tag_count_reward": 0.9114583730697632, "step": 997 }, { "clip_ratio": 0.0, - "completion_length": 254.20834350585938, + "completion_length": 406.2083435058594, "epoch": 0.998, - "grad_norm": 21.26568232294322, - "kl": 2.04296875, + "grad_norm": 14.675909635758037, + "kl": 4.03125, "learning_rate": 1.0001096618257236e-07, - "loss": 0.487, - "reward": 2.869611978530884, - "reward_std": 0.303193174302578, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.026221534237265587, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.5068, + "reward": 2.554211378097534, + "reward_std": 0.657451719045639, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9444444477558136, + "rewards/repetition_penalty_reward": -0.025649813003838062, + "rewards/tag_count_reward": 0.9062500298023224, "step": 998 }, { "clip_ratio": 0.0, - "completion_length": 171.3541717529297, + "completion_length": 529.0416717529297, "epoch": 0.999, - "grad_norm": 21.796734154408004, - "kl": 3.17578125, + "grad_norm": 25.41791128417754, + "kl": 4.296875, "learning_rate": 1.0000274155399433e-07, - "loss": 0.5638, - "reward": 2.9349790811538696, - "reward_std": 0.21680796233704314, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/reasoning_steps_reward": 0.9791666865348816, - "rewards/repetition_penalty_reward": -0.0025211232132278383, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.7834, + "reward": 2.529424786567688, + "reward_std": 0.7073009014129639, + "rewards/accuracy_reward": 0.6875000298023224, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.019186487421393394, + "rewards/tag_count_reward": 0.8958333432674408, "step": 999 }, { "clip_ratio": 0.0, - "completion_length": 246.0625, + "completion_length": 462.625, "epoch": 1.0, - "grad_norm": 13.958698702844973, - "kl": 2.66015625, + "grad_norm": 21.030282475146205, + "kl": 3.8828125, "learning_rate": 1e-07, - "loss": 0.2868, - "reward": 2.6821523904800415, - "reward_std": 0.16277752071619034, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/reasoning_steps_reward": 1.0, - "rewards/repetition_penalty_reward": -0.02097256761044264, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 1.0549, + "reward": 2.609420418739319, + "reward_std": 0.6920914947986603, + "rewards/accuracy_reward": 0.7291666865348816, + "rewards/reasoning_steps_reward": 0.9652778208255768, + "rewards/repetition_penalty_reward": -0.012107519898563623, + "rewards/tag_count_reward": 0.9270833432674408, "step": 1000 }, { "epoch": 1.0, "step": 1000, "total_flos": 0.0, - "train_loss": 0.38664906876385796, - "train_runtime": 35802.6607, - "train_samples_per_second": 0.112, - "train_steps_per_second": 0.028 + "train_loss": 0.49418719741604583, + "train_runtime": 57748.1799, + "train_samples_per_second": 0.069, + "train_steps_per_second": 0.017 } ], "logging_steps": 1,