| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9988751406074241, |
| "eval_steps": 500, |
| "global_step": 666, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.88541984558105, |
| "epoch": 0.0014998125234345708, |
| "grad_norm": 2.0143966719522566, |
| "kl": 0.0, |
| "learning_rate": 2.9850746268656716e-07, |
| "loss": -0.123, |
| "reward": 0.39659371972084045, |
| "reward_std": 0.6371014267206192, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/reasoning_steps_reward": 0.04861111380159855, |
| "rewards/repetition_penalty_reward": -0.05305909365415573, |
| "rewards/tag_count_reward": 0.2760416716337204, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.43750381469727, |
| "epoch": 0.0029996250468691415, |
| "grad_norm": 2.260545366465926, |
| "kl": 0.0, |
| "learning_rate": 5.970149253731343e-07, |
| "loss": -0.0563, |
| "reward": 0.29565349593758583, |
| "reward_std": 0.5291136056184769, |
| "rewards/accuracy_reward": 0.07812500046566129, |
| "rewards/reasoning_steps_reward": 0.039930558297783136, |
| "rewards/repetition_penalty_reward": -0.04505832493305206, |
| "rewards/tag_count_reward": 0.2226562537252903, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.8854217529297, |
| "epoch": 0.0044994375703037125, |
| "grad_norm": 2.04474730883385, |
| "kl": 0.0003943443298339844, |
| "learning_rate": 8.955223880597015e-07, |
| "loss": -0.128, |
| "reward": 0.41914862394332886, |
| "reward_std": 0.6219020187854767, |
| "rewards/accuracy_reward": 0.13020833861082792, |
| "rewards/reasoning_steps_reward": 0.06423611287027597, |
| "rewards/repetition_penalty_reward": -0.05524375103414059, |
| "rewards/tag_count_reward": 0.2799479216337204, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.52605056762695, |
| "epoch": 0.005999250093738283, |
| "grad_norm": 2.088074149444087, |
| "kl": 0.0003809928894042969, |
| "learning_rate": 1.1940298507462686e-06, |
| "loss": -0.055, |
| "reward": 0.32894811406731606, |
| "reward_std": 0.5705550760030746, |
| "rewards/accuracy_reward": 0.08333333628252149, |
| "rewards/reasoning_steps_reward": 0.05381944729015231, |
| "rewards/repetition_penalty_reward": -0.03997551556676626, |
| "rewards/tag_count_reward": 0.2317708395421505, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 131.28125381469727, |
| "epoch": 0.0074990626171728535, |
| "grad_norm": 2.0380845412513424, |
| "kl": 0.0005826950073242188, |
| "learning_rate": 1.4925373134328358e-06, |
| "loss": -0.066, |
| "reward": 0.2540438659489155, |
| "reward_std": 0.45853011310100555, |
| "rewards/accuracy_reward": 0.05208333395421505, |
| "rewards/reasoning_steps_reward": 0.04687500232830644, |
| "rewards/repetition_penalty_reward": -0.04413322079926729, |
| "rewards/tag_count_reward": 0.19921875651925802, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 154.97917556762695, |
| "epoch": 0.008998875140607425, |
| "grad_norm": 1.891201241821577, |
| "kl": 0.0010385513305664062, |
| "learning_rate": 1.791044776119403e-06, |
| "loss": -0.0654, |
| "reward": 0.45939914882183075, |
| "reward_std": 0.5917806774377823, |
| "rewards/accuracy_reward": 0.1093750037252903, |
| "rewards/reasoning_steps_reward": 0.059027780778706074, |
| "rewards/repetition_penalty_reward": -0.05145154893398285, |
| "rewards/tag_count_reward": 0.3424479216337204, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 135.93229484558105, |
| "epoch": 0.010498687664041995, |
| "grad_norm": 2.1855192003244244, |
| "kl": 0.00994873046875, |
| "learning_rate": 2.08955223880597e-06, |
| "loss": -0.0289, |
| "reward": 0.47621314972639084, |
| "reward_std": 0.6438451856374741, |
| "rewards/accuracy_reward": 0.13020833861082792, |
| "rewards/reasoning_steps_reward": 0.03819444729015231, |
| "rewards/repetition_penalty_reward": -0.05026257690042257, |
| "rewards/tag_count_reward": 0.358072929084301, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 149.20313262939453, |
| "epoch": 0.011998500187476566, |
| "grad_norm": 1.5337300270924847, |
| "kl": 0.027801513671875, |
| "learning_rate": 2.3880597014925373e-06, |
| "loss": -0.0828, |
| "reward": 0.5498909652233124, |
| "reward_std": 0.6071374714374542, |
| "rewards/accuracy_reward": 0.1354166716337204, |
| "rewards/reasoning_steps_reward": 0.0763888917863369, |
| "rewards/repetition_penalty_reward": -0.05123751983046532, |
| "rewards/tag_count_reward": 0.3893229365348816, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 163.7291717529297, |
| "epoch": 0.013498312710911136, |
| "grad_norm": 27.976622447462567, |
| "kl": 0.797607421875, |
| "learning_rate": 2.686567164179105e-06, |
| "loss": -0.0138, |
| "reward": 0.8464770168066025, |
| "reward_std": 0.7221487462520599, |
| "rewards/accuracy_reward": 0.2708333469927311, |
| "rewards/reasoning_steps_reward": 0.09895833767950535, |
| "rewards/repetition_penalty_reward": -0.06498134508728981, |
| "rewards/tag_count_reward": 0.541666679084301, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 166.1145896911621, |
| "epoch": 0.014998125234345707, |
| "grad_norm": 173.04994011488648, |
| "kl": 4.0771484375, |
| "learning_rate": 2.9850746268656716e-06, |
| "loss": 0.0723, |
| "reward": 0.9919786900281906, |
| "reward_std": 0.7005721777677536, |
| "rewards/accuracy_reward": 0.3593750074505806, |
| "rewards/reasoning_steps_reward": 0.07291667279787362, |
| "rewards/repetition_penalty_reward": -0.08093799650669098, |
| "rewards/tag_count_reward": 0.6406250149011612, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 163.34375381469727, |
| "epoch": 0.016497937757780277, |
| "grad_norm": 745.8172627492539, |
| "kl": 13.115234375, |
| "learning_rate": 3.283582089552239e-06, |
| "loss": 0.2316, |
| "reward": 1.0233316719532013, |
| "reward_std": 0.7426625639200211, |
| "rewards/accuracy_reward": 0.385416679084301, |
| "rewards/reasoning_steps_reward": 0.045138892251998186, |
| "rewards/repetition_penalty_reward": -0.07649472542107105, |
| "rewards/tag_count_reward": 0.669270858168602, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.0729217529297, |
| "epoch": 0.01799775028121485, |
| "grad_norm": 9.77279564737439, |
| "kl": 0.47802734375, |
| "learning_rate": 3.582089552238806e-06, |
| "loss": -0.0793, |
| "reward": 1.0776500403881073, |
| "reward_std": 0.6928116679191589, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/reasoning_steps_reward": 0.10243056155741215, |
| "rewards/repetition_penalty_reward": -0.06774928979575634, |
| "rewards/tag_count_reward": 0.688802108168602, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.58855056762695, |
| "epoch": 0.01949756280464942, |
| "grad_norm": 1.9563170638007599, |
| "kl": 0.1324462890625, |
| "learning_rate": 3.8805970149253735e-06, |
| "loss": -0.005, |
| "reward": 1.2495136260986328, |
| "reward_std": 0.6537359356880188, |
| "rewards/accuracy_reward": 0.4270833432674408, |
| "rewards/reasoning_steps_reward": 0.0538194477558136, |
| "rewards/repetition_penalty_reward": -0.051701731979846954, |
| "rewards/tag_count_reward": 0.8203125149011612, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.01042366027832, |
| "epoch": 0.02099737532808399, |
| "grad_norm": 7.69562170272425, |
| "kl": 0.294189453125, |
| "learning_rate": 4.17910447761194e-06, |
| "loss": 0.0198, |
| "reward": 1.16742305457592, |
| "reward_std": 0.6400385946035385, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/reasoning_steps_reward": 0.05902778171002865, |
| "rewards/repetition_penalty_reward": -0.062177615240216255, |
| "rewards/tag_count_reward": 0.8164062798023224, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.21875762939453, |
| "epoch": 0.02249718785151856, |
| "grad_norm": 2.534410431741499, |
| "kl": 0.146728515625, |
| "learning_rate": 4.477611940298508e-06, |
| "loss": 0.0501, |
| "reward": 1.4073261320590973, |
| "reward_std": 0.645281046628952, |
| "rewards/accuracy_reward": 0.494791679084301, |
| "rewards/reasoning_steps_reward": 0.1354166716337204, |
| "rewards/repetition_penalty_reward": -0.044496800750494, |
| "rewards/tag_count_reward": 0.8216145932674408, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.89062881469727, |
| "epoch": 0.023997000374953132, |
| "grad_norm": 1.6522024022194932, |
| "kl": 0.08740234375, |
| "learning_rate": 4.7761194029850745e-06, |
| "loss": 0.004, |
| "reward": 1.378474086523056, |
| "reward_std": 0.6564009487628937, |
| "rewards/accuracy_reward": 0.4687500298023224, |
| "rewards/reasoning_steps_reward": 0.1215277798473835, |
| "rewards/repetition_penalty_reward": -0.04513707011938095, |
| "rewards/tag_count_reward": 0.8333333432674408, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 160.17708587646484, |
| "epoch": 0.0254968128983877, |
| "grad_norm": 1.2954290652113556, |
| "kl": 0.07891845703125, |
| "learning_rate": 5.074626865671642e-06, |
| "loss": 0.0088, |
| "reward": 1.3321838080883026, |
| "reward_std": 0.6731031388044357, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/reasoning_steps_reward": 0.1371527872979641, |
| "rewards/repetition_penalty_reward": -0.04976071882992983, |
| "rewards/tag_count_reward": 0.8229166865348816, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.93750381469727, |
| "epoch": 0.02699662542182227, |
| "grad_norm": 1.3681933966105373, |
| "kl": 0.05126953125, |
| "learning_rate": 5.37313432835821e-06, |
| "loss": -0.0754, |
| "reward": 1.3372644186019897, |
| "reward_std": 0.6545021533966064, |
| "rewards/accuracy_reward": 0.401041679084301, |
| "rewards/reasoning_steps_reward": 0.2013889029622078, |
| "rewards/repetition_penalty_reward": -0.062041101045906544, |
| "rewards/tag_count_reward": 0.7968750149011612, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.35937881469727, |
| "epoch": 0.028496437945256844, |
| "grad_norm": 1.1392001469038255, |
| "kl": 0.05596923828125, |
| "learning_rate": 5.671641791044776e-06, |
| "loss": -0.0216, |
| "reward": 1.352351814508438, |
| "reward_std": 0.6970622688531876, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/reasoning_steps_reward": 0.2031250074505806, |
| "rewards/repetition_penalty_reward": -0.06561699230223894, |
| "rewards/tag_count_reward": 0.7773437649011612, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.78646087646484, |
| "epoch": 0.029996250468691414, |
| "grad_norm": 1.4083775545206836, |
| "kl": 0.0611572265625, |
| "learning_rate": 5.970149253731343e-06, |
| "loss": -0.1554, |
| "reward": 1.4778603315353394, |
| "reward_std": 0.6467646211385727, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/reasoning_steps_reward": 0.2829861342906952, |
| "rewards/repetition_penalty_reward": -0.06684454903006554, |
| "rewards/tag_count_reward": 0.8242187798023224, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.3541717529297, |
| "epoch": 0.031496062992125984, |
| "grad_norm": 0.9468941237796497, |
| "kl": 0.050048828125, |
| "learning_rate": 6.2686567164179116e-06, |
| "loss": -0.1031, |
| "reward": 1.576261043548584, |
| "reward_std": 0.7107058763504028, |
| "rewards/accuracy_reward": 0.4427083432674408, |
| "rewards/reasoning_steps_reward": 0.4444444701075554, |
| "rewards/repetition_penalty_reward": -0.0830271951854229, |
| "rewards/tag_count_reward": 0.7721354365348816, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 253.48438262939453, |
| "epoch": 0.032995875515560553, |
| "grad_norm": 0.8523009396371123, |
| "kl": 0.05426025390625, |
| "learning_rate": 6.567164179104478e-06, |
| "loss": -0.0433, |
| "reward": 1.9110043048858643, |
| "reward_std": 0.7316416054964066, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/reasoning_steps_reward": 0.6388888955116272, |
| "rewards/repetition_penalty_reward": -0.08595757372677326, |
| "rewards/tag_count_reward": 0.8164062798023224, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.4323043823242, |
| "epoch": 0.03449568803899512, |
| "grad_norm": 0.8161516191732155, |
| "kl": 0.07037353515625, |
| "learning_rate": 6.865671641791045e-06, |
| "loss": -0.0343, |
| "reward": 2.085984379053116, |
| "reward_std": 0.670979842543602, |
| "rewards/accuracy_reward": 0.5625000298023224, |
| "rewards/reasoning_steps_reward": 0.7743056118488312, |
| "rewards/repetition_penalty_reward": -0.10238376632332802, |
| "rewards/tag_count_reward": 0.8515625149011612, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.39064025878906, |
| "epoch": 0.0359955005624297, |
| "grad_norm": 0.88714678428363, |
| "kl": 0.08349609375, |
| "learning_rate": 7.164179104477612e-06, |
| "loss": -0.0093, |
| "reward": 2.1416602730751038, |
| "reward_std": 0.6270937323570251, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/reasoning_steps_reward": 0.8420138955116272, |
| "rewards/repetition_penalty_reward": -0.10530160553753376, |
| "rewards/tag_count_reward": 0.8632812798023224, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 303.7708435058594, |
| "epoch": 0.03749531308586427, |
| "grad_norm": 0.7677870229716102, |
| "kl": 0.0814208984375, |
| "learning_rate": 7.46268656716418e-06, |
| "loss": -0.0311, |
| "reward": 2.169845759868622, |
| "reward_std": 0.5908740907907486, |
| "rewards/accuracy_reward": 0.4843750149011612, |
| "rewards/reasoning_steps_reward": 0.8958334028720856, |
| "rewards/repetition_penalty_reward": -0.119216812774539, |
| "rewards/tag_count_reward": 0.9088542014360428, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 300.0052185058594, |
| "epoch": 0.03899512560929884, |
| "grad_norm": 0.751193936667339, |
| "kl": 0.084228515625, |
| "learning_rate": 7.761194029850747e-06, |
| "loss": 0.0118, |
| "reward": 2.2213982343673706, |
| "reward_std": 0.5355894565582275, |
| "rewards/accuracy_reward": 0.5000000298023224, |
| "rewards/reasoning_steps_reward": 0.9461806118488312, |
| "rewards/repetition_penalty_reward": -0.1258239597082138, |
| "rewards/tag_count_reward": 0.9010416716337204, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.46875762939453, |
| "epoch": 0.04049493813273341, |
| "grad_norm": 0.817822463608519, |
| "kl": 0.0811767578125, |
| "learning_rate": 8.059701492537314e-06, |
| "loss": 0.0218, |
| "reward": 2.4470449686050415, |
| "reward_std": 0.5053400099277496, |
| "rewards/accuracy_reward": 0.6145833432674408, |
| "rewards/reasoning_steps_reward": 0.9843750298023224, |
| "rewards/repetition_penalty_reward": -0.11154878698289394, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.4166717529297, |
| "epoch": 0.04199475065616798, |
| "grad_norm": 0.8329974091920429, |
| "kl": 0.101806640625, |
| "learning_rate": 8.35820895522388e-06, |
| "loss": 0.0405, |
| "reward": 2.4431938529014587, |
| "reward_std": 0.46228019893169403, |
| "rewards/accuracy_reward": 0.614583358168602, |
| "rewards/reasoning_steps_reward": 0.9930555671453476, |
| "rewards/repetition_penalty_reward": -0.10975776612758636, |
| "rewards/tag_count_reward": 0.9453125298023224, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.47396850585938, |
| "epoch": 0.04349456317960255, |
| "grad_norm": 2.1092410850036143, |
| "kl": 0.1441650390625, |
| "learning_rate": 8.656716417910447e-06, |
| "loss": 0.0865, |
| "reward": 2.475065588951111, |
| "reward_std": 0.47668417543172836, |
| "rewards/accuracy_reward": 0.661458358168602, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.1121740210801363, |
| "rewards/tag_count_reward": 0.9309895932674408, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.27083587646484, |
| "epoch": 0.04499437570303712, |
| "grad_norm": 0.8134902475342992, |
| "kl": 0.140380859375, |
| "learning_rate": 8.955223880597016e-06, |
| "loss": 0.0415, |
| "reward": 2.441520571708679, |
| "reward_std": 0.3973531872034073, |
| "rewards/accuracy_reward": 0.6093750223517418, |
| "rewards/reasoning_steps_reward": 0.9982638955116272, |
| "rewards/repetition_penalty_reward": -0.1179413478821516, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.7916717529297, |
| "epoch": 0.046494188226471694, |
| "grad_norm": 0.8575684864473243, |
| "kl": 0.1580810546875, |
| "learning_rate": 9.253731343283582e-06, |
| "loss": 0.0852, |
| "reward": 2.5657079219818115, |
| "reward_std": 0.4555797800421715, |
| "rewards/accuracy_reward": 0.708333358168602, |
| "rewards/reasoning_steps_reward": 1.0, |
| "rewards/repetition_penalty_reward": -0.09314640611410141, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.65625762939453, |
| "epoch": 0.047994000749906264, |
| "grad_norm": 1.2887250609562435, |
| "kl": 0.23876953125, |
| "learning_rate": 9.552238805970149e-06, |
| "loss": 0.0936, |
| "reward": 2.6088263988494873, |
| "reward_std": 0.3428891524672508, |
| "rewards/accuracy_reward": 0.755208358168602, |
| "rewards/reasoning_steps_reward": 0.9982638955116272, |
| "rewards/repetition_penalty_reward": -0.06912498734891415, |
| "rewards/tag_count_reward": 0.9244791865348816, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.6197967529297, |
| "epoch": 0.049493813273340834, |
| "grad_norm": 0.998549892430521, |
| "kl": 0.164306640625, |
| "learning_rate": 9.850746268656717e-06, |
| "loss": 0.0716, |
| "reward": 2.281342178583145, |
| "reward_std": 0.3404741808772087, |
| "rewards/accuracy_reward": 0.4062500186264515, |
| "rewards/reasoning_steps_reward": 0.9965277910232544, |
| "rewards/repetition_penalty_reward": -0.06414407771080732, |
| "rewards/tag_count_reward": 0.942708358168602, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 173.95312881469727, |
| "epoch": 0.0509936257967754, |
| "grad_norm": 1.317082973370953, |
| "kl": 0.213134765625, |
| "learning_rate": 1.0149253731343284e-05, |
| "loss": 0.1399, |
| "reward": 2.370617628097534, |
| "reward_std": 0.4167383909225464, |
| "rewards/accuracy_reward": 0.5260416716337204, |
| "rewards/reasoning_steps_reward": 0.9878472536802292, |
| "rewards/repetition_penalty_reward": -0.0443130349740386, |
| "rewards/tag_count_reward": 0.9010416865348816, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 172.42187881469727, |
| "epoch": 0.05249343832020997, |
| "grad_norm": 1.1847166733741281, |
| "kl": 0.202392578125, |
| "learning_rate": 1.0447761194029851e-05, |
| "loss": 0.1261, |
| "reward": 2.3955256938934326, |
| "reward_std": 0.328448873013258, |
| "rewards/accuracy_reward": 0.5312500149011612, |
| "rewards/reasoning_steps_reward": 0.97743059694767, |
| "rewards/repetition_penalty_reward": -0.04284239187836647, |
| "rewards/tag_count_reward": 0.9296875149011612, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.6458396911621, |
| "epoch": 0.05399325084364454, |
| "grad_norm": 1.4752348010664043, |
| "kl": 0.24560546875, |
| "learning_rate": 1.074626865671642e-05, |
| "loss": 0.4191, |
| "reward": 2.3304308652877808, |
| "reward_std": 0.5351479351520538, |
| "rewards/accuracy_reward": 0.5520833507180214, |
| "rewards/reasoning_steps_reward": 0.9340278208255768, |
| "rewards/repetition_penalty_reward": -0.0515137268230319, |
| "rewards/tag_count_reward": 0.8958333432674408, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 182.2135467529297, |
| "epoch": 0.05549306336707911, |
| "grad_norm": 1.7682892736323388, |
| "kl": 0.32080078125, |
| "learning_rate": 1.1044776119402986e-05, |
| "loss": 0.402, |
| "reward": 2.4258365631103516, |
| "reward_std": 0.5576556175947189, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.89930559694767, |
| "rewards/repetition_penalty_reward": -0.038573198951780796, |
| "rewards/tag_count_reward": 0.9088541716337204, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 240.62500381469727, |
| "epoch": 0.05699287589051369, |
| "grad_norm": 1.8853983944936439, |
| "kl": 0.3203125, |
| "learning_rate": 1.1343283582089553e-05, |
| "loss": 0.655, |
| "reward": 2.015189290046692, |
| "reward_std": 0.6033422723412514, |
| "rewards/accuracy_reward": 0.3697916716337204, |
| "rewards/reasoning_steps_reward": 0.8541667312383652, |
| "rewards/repetition_penalty_reward": -0.055123341269791126, |
| "rewards/tag_count_reward": 0.8463541865348816, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.5833396911621, |
| "epoch": 0.05849268841394826, |
| "grad_norm": 3.2310358216677733, |
| "kl": 0.3486328125, |
| "learning_rate": 1.1641791044776121e-05, |
| "loss": 0.7056, |
| "reward": 2.03124737739563, |
| "reward_std": 0.8326994776725769, |
| "rewards/accuracy_reward": 0.5052083432674408, |
| "rewards/reasoning_steps_reward": 0.7847221940755844, |
| "rewards/repetition_penalty_reward": -0.052954130340367556, |
| "rewards/tag_count_reward": 0.794270858168602, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.5208511352539, |
| "epoch": 0.05999250093738283, |
| "grad_norm": 38.31384293221201, |
| "kl": 0.46484375, |
| "learning_rate": 1.1940298507462686e-05, |
| "loss": 0.6778, |
| "reward": 1.9736978709697723, |
| "reward_std": 0.8218565732240677, |
| "rewards/accuracy_reward": 0.4843750149011612, |
| "rewards/reasoning_steps_reward": 0.7552083283662796, |
| "rewards/repetition_penalty_reward": -0.013281408930197358, |
| "rewards/tag_count_reward": 0.747395858168602, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.59375762939453, |
| "epoch": 0.0614923134608174, |
| "grad_norm": 10963112.509325096, |
| "kl": 38711.625, |
| "learning_rate": 1.2238805970149255e-05, |
| "loss": 3011.3784, |
| "reward": 1.5866824984550476, |
| "reward_std": 0.7351708710193634, |
| "rewards/accuracy_reward": 0.3020833395421505, |
| "rewards/reasoning_steps_reward": 0.6545138955116272, |
| "rewards/repetition_penalty_reward": -0.01444607856683433, |
| "rewards/tag_count_reward": 0.6445312798023224, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 571.9479370117188, |
| "epoch": 0.06299212598425197, |
| "grad_norm": 1562.2702573171862, |
| "kl": 13.8759765625, |
| "learning_rate": 1.2537313432835823e-05, |
| "loss": 1.2646, |
| "reward": 1.396679848432541, |
| "reward_std": 0.764959841966629, |
| "rewards/accuracy_reward": 0.2604166753590107, |
| "rewards/reasoning_steps_reward": 0.5763889253139496, |
| "rewards/repetition_penalty_reward": -0.006532001192681491, |
| "rewards/tag_count_reward": 0.5664062649011612, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 818.1614837646484, |
| "epoch": 0.06449193850768654, |
| "grad_norm": 36.12141290169206, |
| "kl": 1.1787109375, |
| "learning_rate": 1.2835820895522388e-05, |
| "loss": 0.1815, |
| "reward": 1.0434824973344803, |
| "reward_std": 0.6087194085121155, |
| "rewards/accuracy_reward": 0.07812500279396772, |
| "rewards/reasoning_steps_reward": 0.5711805820465088, |
| "rewards/repetition_penalty_reward": -0.002958516124635935, |
| "rewards/tag_count_reward": 0.397135429084301, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 883.0052185058594, |
| "epoch": 0.06599175103112111, |
| "grad_norm": 12.05770882883392, |
| "kl": 1.087890625, |
| "learning_rate": 1.3134328358208957e-05, |
| "loss": 0.1592, |
| "reward": 0.8965069055557251, |
| "reward_std": 0.5693257004022598, |
| "rewards/accuracy_reward": 0.057291668839752674, |
| "rewards/reasoning_steps_reward": 0.508680559694767, |
| "rewards/repetition_penalty_reward": -0.0014966219605412334, |
| "rewards/tag_count_reward": 0.3320312574505806, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 978.4687805175781, |
| "epoch": 0.06749156355455568, |
| "grad_norm": 7.679672597113327, |
| "kl": 1.017578125, |
| "learning_rate": 1.3432835820895525e-05, |
| "loss": 0.0491, |
| "reward": 0.7406667172908783, |
| "reward_std": 0.41688016057014465, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.4843750223517418, |
| "rewards/repetition_penalty_reward": -0.0002187521276937332, |
| "rewards/tag_count_reward": 0.2565104253590107, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 992.9739837646484, |
| "epoch": 0.06899137607799025, |
| "grad_norm": 3776.078403846416, |
| "kl": 122.390625, |
| "learning_rate": 1.373134328358209e-05, |
| "loss": 5.2459, |
| "reward": 0.6156022548675537, |
| "reward_std": 0.45898766070604324, |
| "rewards/accuracy_reward": 0.0052083334885537624, |
| "rewards/reasoning_steps_reward": 0.3975694626569748, |
| "rewards/repetition_penalty_reward": -0.0007172302284743637, |
| "rewards/tag_count_reward": 0.2135416716337204, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1021.09375, |
| "epoch": 0.07049118860142482, |
| "grad_norm": 53.535086229947574, |
| "kl": 2.7412109375, |
| "learning_rate": 1.4029850746268658e-05, |
| "loss": 0.1119, |
| "reward": 0.6543789356946945, |
| "reward_std": 0.39667317271232605, |
| "rewards/accuracy_reward": 0.0052083334885537624, |
| "rewards/reasoning_steps_reward": 0.4670139029622078, |
| "rewards/repetition_penalty_reward": -0.00013500768363883253, |
| "rewards/tag_count_reward": 0.1822916716337204, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1006.2812652587891, |
| "epoch": 0.0719910011248594, |
| "grad_norm": 9.182715345455021, |
| "kl": 0.677734375, |
| "learning_rate": 1.4328358208955224e-05, |
| "loss": 0.0131, |
| "reward": 0.5155243277549744, |
| "reward_std": 0.3933122977614403, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.3645833507180214, |
| "rewards/repetition_penalty_reward": -0.00010070096323033795, |
| "rewards/tag_count_reward": 0.1510416716337204, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1017.1927185058594, |
| "epoch": 0.07349081364829396, |
| "grad_norm": 2.136051505149193, |
| "kl": 0.76416015625, |
| "learning_rate": 1.4626865671641792e-05, |
| "loss": 0.0258, |
| "reward": 0.5431747883558273, |
| "reward_std": 0.3834214210510254, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.3871527835726738, |
| "rewards/repetition_penalty_reward": -0.0002280306780448882, |
| "rewards/tag_count_reward": 0.1562500037252903, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1005.1770935058594, |
| "epoch": 0.07499062617172854, |
| "grad_norm": 233.5775916802967, |
| "kl": 15.7265625, |
| "learning_rate": 1.492537313432836e-05, |
| "loss": 0.6288, |
| "reward": 0.5613471269607544, |
| "reward_std": 0.40113527327775955, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.418402798473835, |
| "rewards/repetition_penalty_reward": -0.0002848635776899755, |
| "rewards/tag_count_reward": 0.1432291716337204, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 994.8333435058594, |
| "epoch": 0.0764904386951631, |
| "grad_norm": 10.304724683124707, |
| "kl": 1.4677734375, |
| "learning_rate": 1.5223880597014925e-05, |
| "loss": 0.0407, |
| "reward": 0.5884975641965866, |
| "reward_std": 0.4051181599497795, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.4427083507180214, |
| "rewards/repetition_penalty_reward": -4.4144707317173015e-05, |
| "rewards/tag_count_reward": 0.1458333395421505, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 983.5208435058594, |
| "epoch": 0.07799025121859768, |
| "grad_norm": 56.749232087158944, |
| "kl": 6.04296875, |
| "learning_rate": 1.5522388059701494e-05, |
| "loss": 0.2016, |
| "reward": 0.6742116063833237, |
| "reward_std": 0.4244392439723015, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.5104166716337204, |
| "rewards/repetition_penalty_reward": -0.0002676011572475545, |
| "rewards/tag_count_reward": 0.1640625037252903, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 972.0052185058594, |
| "epoch": 0.07949006374203224, |
| "grad_norm": 5.494493067960978, |
| "kl": 2.2734375, |
| "learning_rate": 1.582089552238806e-05, |
| "loss": 0.0395, |
| "reward": 0.6165060997009277, |
| "reward_std": 0.412445493042469, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.4565972536802292, |
| "rewards/repetition_penalty_reward": -0.00024740799563005567, |
| "rewards/tag_count_reward": 0.1601562537252903, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 997.3750152587891, |
| "epoch": 0.08098987626546682, |
| "grad_norm": 8.732055316405138, |
| "kl": 0.92333984375, |
| "learning_rate": 1.6119402985074627e-05, |
| "loss": 0.0115, |
| "reward": 0.8314266949892044, |
| "reward_std": 0.3945605829358101, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.6649305820465088, |
| "rewards/repetition_penalty_reward": -0.00017056526121450588, |
| "rewards/tag_count_reward": 0.1666666716337204, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 961.9583587646484, |
| "epoch": 0.0824896887889014, |
| "grad_norm": 14.31298550812026, |
| "kl": 1.4169921875, |
| "learning_rate": 1.6417910447761197e-05, |
| "loss": -0.0265, |
| "reward": 0.8787982016801834, |
| "reward_std": 0.3934568166732788, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.7152778208255768, |
| "rewards/repetition_penalty_reward": -0.0005421093846962322, |
| "rewards/tag_count_reward": 0.1640625037252903, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 974.8125152587891, |
| "epoch": 0.08398950131233596, |
| "grad_norm": 262.24387888017645, |
| "kl": 24.07275390625, |
| "learning_rate": 1.671641791044776e-05, |
| "loss": 0.9471, |
| "reward": 0.975603461265564, |
| "reward_std": 0.3715866580605507, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.7951389253139496, |
| "rewards/repetition_penalty_reward": -0.0005250562171568163, |
| "rewards/tag_count_reward": 0.1809895895421505, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 975.3854370117188, |
| "epoch": 0.08548931383577053, |
| "grad_norm": 38.16302080966472, |
| "kl": 6.939453125, |
| "learning_rate": 1.701492537313433e-05, |
| "loss": 0.1715, |
| "reward": 1.1071399450302124, |
| "reward_std": 0.2679474614560604, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9114583730697632, |
| "rewards/repetition_penalty_reward": -0.0009330366592621431, |
| "rewards/tag_count_reward": 0.1966145858168602, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 981.3020935058594, |
| "epoch": 0.0869891263592051, |
| "grad_norm": 2.325182725021432, |
| "kl": 0.68359375, |
| "learning_rate": 1.7313432835820894e-05, |
| "loss": -0.0424, |
| "reward": 1.130223572254181, |
| "reward_std": 0.20110024139285088, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9496527910232544, |
| "rewards/repetition_penalty_reward": -0.003022975695785135, |
| "rewards/tag_count_reward": 0.1835937574505806, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1004.2187652587891, |
| "epoch": 0.08848893888263967, |
| "grad_norm": 2.918231849637039, |
| "kl": 1.49267578125, |
| "learning_rate": 1.7611940298507464e-05, |
| "loss": 0.0337, |
| "reward": 1.1544174253940582, |
| "reward_std": 0.20146211609244347, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.014419377315789461, |
| "rewards/tag_count_reward": 0.2122395895421505, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1005.3697967529297, |
| "epoch": 0.08998875140607424, |
| "grad_norm": 1.073898600293748, |
| "kl": 0.59765625, |
| "learning_rate": 1.791044776119403e-05, |
| "loss": -0.0304, |
| "reward": 1.1241816580295563, |
| "reward_std": 0.17474635317921638, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.06592248193919659, |
| "rewards/tag_count_reward": 0.2161458358168602, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1009.3020935058594, |
| "epoch": 0.09148856392950881, |
| "grad_norm": 139.56708057657698, |
| "kl": 16.713623046875, |
| "learning_rate": 1.8208955223880598e-05, |
| "loss": 0.64, |
| "reward": 1.0292470455169678, |
| "reward_std": 0.155818872153759, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.17908628657460213, |
| "rewards/tag_count_reward": 0.2291666753590107, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1006.1718902587891, |
| "epoch": 0.09298837645294339, |
| "grad_norm": 0.6239077670049449, |
| "kl": 0.2763671875, |
| "learning_rate": 1.8507462686567165e-05, |
| "loss": -0.0267, |
| "reward": 0.9351680278778076, |
| "reward_std": 0.12725192122161388, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/reasoning_steps_reward": 0.9861111342906952, |
| "rewards/repetition_penalty_reward": -0.2918284982442856, |
| "rewards/tag_count_reward": 0.2408854179084301, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 996.7395935058594, |
| "epoch": 0.09448818897637795, |
| "grad_norm": 0.7355348323994314, |
| "kl": 0.30078125, |
| "learning_rate": 1.8805970149253735e-05, |
| "loss": 0.0326, |
| "reward": 1.057679921388626, |
| "reward_std": 0.15489091351628304, |
| "rewards/accuracy_reward": 0.0052083334885537624, |
| "rewards/reasoning_steps_reward": 0.9878472238779068, |
| "rewards/repetition_penalty_reward": -0.21271947026252747, |
| "rewards/tag_count_reward": 0.2773437574505806, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 917.3229370117188, |
| "epoch": 0.09598800149981253, |
| "grad_norm": 0.5135310040841573, |
| "kl": 0.3583984375, |
| "learning_rate": 1.9104477611940298e-05, |
| "loss": 0.0146, |
| "reward": 1.2951524555683136, |
| "reward_std": 0.2529895007610321, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/reasoning_steps_reward": 0.9340278059244156, |
| "rewards/repetition_penalty_reward": -0.051635801792144775, |
| "rewards/tag_count_reward": 0.4023437574505806, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 826.890625, |
| "epoch": 0.09748781402324709, |
| "grad_norm": 336.6953521537797, |
| "kl": 19.52685546875, |
| "learning_rate": 1.9402985074626868e-05, |
| "loss": 0.7842, |
| "reward": 1.3793235123157501, |
| "reward_std": 0.32739875465631485, |
| "rewards/accuracy_reward": 0.04687500139698386, |
| "rewards/reasoning_steps_reward": 0.852430522441864, |
| "rewards/repetition_penalty_reward": -0.02258623158559203, |
| "rewards/tag_count_reward": 0.5026041865348816, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 658.5781402587891, |
| "epoch": 0.09898762654668167, |
| "grad_norm": 6.106229371377812, |
| "kl": 0.70361328125, |
| "learning_rate": 1.9701492537313435e-05, |
| "loss": 0.1711, |
| "reward": 1.5613079369068146, |
| "reward_std": 0.4881432354450226, |
| "rewards/accuracy_reward": 0.1822916716337204, |
| "rewards/reasoning_steps_reward": 0.7309027910232544, |
| "rewards/repetition_penalty_reward": -0.013344902312383056, |
| "rewards/tag_count_reward": 0.6614583432674408, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 599.6562805175781, |
| "epoch": 0.10048743907011623, |
| "grad_norm": 3.5990895260846902, |
| "kl": 0.43603515625, |
| "learning_rate": 2e-05, |
| "loss": 0.0834, |
| "reward": 1.4973010122776031, |
| "reward_std": 0.4555082842707634, |
| "rewards/accuracy_reward": 0.1718750074505806, |
| "rewards/reasoning_steps_reward": 0.630208358168602, |
| "rewards/repetition_penalty_reward": -0.01051152846775949, |
| "rewards/tag_count_reward": 0.7057291865348816, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 480.21876525878906, |
| "epoch": 0.1019872515935508, |
| "grad_norm": 1.317667285117624, |
| "kl": 0.41748046875, |
| "learning_rate": 1.9999862464405377e-05, |
| "loss": 0.1277, |
| "reward": 1.6170935332775116, |
| "reward_std": 0.5107096880674362, |
| "rewards/accuracy_reward": 0.276041679084301, |
| "rewards/reasoning_steps_reward": 0.5694444477558136, |
| "rewards/repetition_penalty_reward": -0.013548914226703346, |
| "rewards/tag_count_reward": 0.78515625, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.32813262939453, |
| "epoch": 0.10348706411698538, |
| "grad_norm": 1.7734343075276982, |
| "kl": 0.39208984375, |
| "learning_rate": 1.9999449861404716e-05, |
| "loss": 0.1866, |
| "reward": 1.814243733882904, |
| "reward_std": 0.5948401093482971, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/reasoning_steps_reward": 0.6111111044883728, |
| "rewards/repetition_penalty_reward": -0.01691950182430446, |
| "rewards/tag_count_reward": 0.8658854365348816, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.2291717529297, |
| "epoch": 0.10498687664041995, |
| "grad_norm": 3.2645219448529943, |
| "kl": 0.3359375, |
| "learning_rate": 1.999876220234753e-05, |
| "loss": 0.3835, |
| "reward": 2.1113045811653137, |
| "reward_std": 0.6916591078042984, |
| "rewards/accuracy_reward": 0.5312500074505806, |
| "rewards/reasoning_steps_reward": 0.7447917312383652, |
| "rewards/repetition_penalty_reward": -0.02281005564145744, |
| "rewards/tag_count_reward": 0.8580729514360428, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 396.17708587646484, |
| "epoch": 0.10648668916385452, |
| "grad_norm": 3.618615404780119, |
| "kl": 0.33447265625, |
| "learning_rate": 1.999779950614934e-05, |
| "loss": 0.4845, |
| "reward": 2.033944606781006, |
| "reward_std": 0.7569809406995773, |
| "rewards/accuracy_reward": 0.4843750149011612, |
| "rewards/reasoning_steps_reward": 0.763888880610466, |
| "rewards/repetition_penalty_reward": -0.02421517251059413, |
| "rewards/tag_count_reward": 0.809895858168602, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.48438262939453, |
| "epoch": 0.10798650168728909, |
| "grad_norm": 296.41565415069374, |
| "kl": 6.908203125, |
| "learning_rate": 1.999656179929115e-05, |
| "loss": 0.9572, |
| "reward": 1.6951223015785217, |
| "reward_std": 0.7641638964414597, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/reasoning_steps_reward": 0.8385416716337204, |
| "rewards/repetition_penalty_reward": -0.017117327079176903, |
| "rewards/tag_count_reward": 0.6236979365348816, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.2343978881836, |
| "epoch": 0.10948631421072366, |
| "grad_norm": 2.6480944796648282, |
| "kl": 0.9912109375, |
| "learning_rate": 1.9995049115818706e-05, |
| "loss": 0.3052, |
| "reward": 1.2268942147493362, |
| "reward_std": 0.9322899430990219, |
| "rewards/accuracy_reward": 0.20312500558793545, |
| "rewards/reasoning_steps_reward": 0.6597222536802292, |
| "rewards/repetition_penalty_reward": -0.013557222904637456, |
| "rewards/tag_count_reward": 0.3776041716337204, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.98439025878906, |
| "epoch": 0.11098612673415822, |
| "grad_norm": 399.79727881978687, |
| "kl": 12.41015625, |
| "learning_rate": 1.9993261497341575e-05, |
| "loss": 1.299, |
| "reward": 0.8436856269836426, |
| "reward_std": 0.9532901048660278, |
| "rewards/accuracy_reward": 0.1406250037252903, |
| "rewards/reasoning_steps_reward": 0.49305559694767, |
| "rewards/repetition_penalty_reward": -0.0087449811398983, |
| "rewards/tag_count_reward": 0.2187500037252903, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.5885467529297, |
| "epoch": 0.1124859392575928, |
| "grad_norm": 11.601181346751435, |
| "kl": 4.046875, |
| "learning_rate": 1.9991198993031992e-05, |
| "loss": 0.1052, |
| "reward": 0.5184430181980133, |
| "reward_std": 0.8223861902952194, |
| "rewards/accuracy_reward": 0.08333333535119891, |
| "rewards/reasoning_steps_reward": 0.3211805745959282, |
| "rewards/repetition_penalty_reward": -0.005862575490027666, |
| "rewards/tag_count_reward": 0.11979166977107525, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 169.49480056762695, |
| "epoch": 0.11398575178102738, |
| "grad_norm": 4.174677375261219, |
| "kl": 2.41796875, |
| "learning_rate": 1.99888616596235e-05, |
| "loss": -0.0061, |
| "reward": 0.7175929397344589, |
| "reward_std": 1.0486825108528137, |
| "rewards/accuracy_reward": 0.14583334140479565, |
| "rewards/reasoning_steps_reward": 0.3246527947485447, |
| "rewards/repetition_penalty_reward": -0.008101542014628649, |
| "rewards/tag_count_reward": 0.2552083395421505, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.0677146911621, |
| "epoch": 0.11548556430446194, |
| "grad_norm": 3.753658615227732, |
| "kl": 2.3984375, |
| "learning_rate": 1.9986249561409415e-05, |
| "loss": -0.0758, |
| "reward": 0.7132928371429443, |
| "reward_std": 0.9969339072704315, |
| "rewards/accuracy_reward": 0.1614583395421505, |
| "rewards/reasoning_steps_reward": 0.3628472313284874, |
| "rewards/repetition_penalty_reward": -0.012835650937631726, |
| "rewards/tag_count_reward": 0.2018229216337204, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.9114646911621, |
| "epoch": 0.11698537682789652, |
| "grad_norm": 31.158238324524778, |
| "kl": 4.71484375, |
| "learning_rate": 1.998336277024103e-05, |
| "loss": -0.0148, |
| "reward": 1.2587463855743408, |
| "reward_std": 1.1734082400798798, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/reasoning_steps_reward": 0.5677083730697632, |
| "rewards/repetition_penalty_reward": -0.029014051891863346, |
| "rewards/tag_count_reward": 0.4075521007180214, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.31250381469727, |
| "epoch": 0.11848518935133108, |
| "grad_norm": 59.19488601013592, |
| "kl": 7.015625, |
| "learning_rate": 1.998020136552566e-05, |
| "loss": 0.0999, |
| "reward": 1.4918034672737122, |
| "reward_std": 1.0446466207504272, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/reasoning_steps_reward": 0.7187500298023224, |
| "rewards/repetition_penalty_reward": -0.07199861854314804, |
| "rewards/tag_count_reward": 0.5117187649011612, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.35939025878906, |
| "epoch": 0.11998500187476566, |
| "grad_norm": 77.22679135587332, |
| "kl": 1.4228515625, |
| "learning_rate": 1.9976765434224426e-05, |
| "loss": 0.1166, |
| "reward": 2.0578636527061462, |
| "reward_std": 1.055399090051651, |
| "rewards/accuracy_reward": 0.583333358168602, |
| "rewards/reasoning_steps_reward": 0.8281250298023224, |
| "rewards/repetition_penalty_reward": -0.06974058039486408, |
| "rewards/tag_count_reward": 0.716145858168602, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.8645935058594, |
| "epoch": 0.12148481439820022, |
| "grad_norm": 492.7397963149802, |
| "kl": 4.455078125, |
| "learning_rate": 1.9973055070849912e-05, |
| "loss": 0.5194, |
| "reward": 2.197775959968567, |
| "reward_std": 0.8521990180015564, |
| "rewards/accuracy_reward": 0.59375, |
| "rewards/reasoning_steps_reward": 0.913194477558136, |
| "rewards/repetition_penalty_reward": -0.12166857533156872, |
| "rewards/tag_count_reward": 0.8125000298023224, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 370.52083587646484, |
| "epoch": 0.1229846269216348, |
| "grad_norm": 4.669423434787285, |
| "kl": 1.5625, |
| "learning_rate": 1.996907037746352e-05, |
| "loss": 0.2641, |
| "reward": 2.129128336906433, |
| "reward_std": 0.7840546369552612, |
| "rewards/accuracy_reward": 0.5000000074505806, |
| "rewards/reasoning_steps_reward": 0.9461805522441864, |
| "rewards/repetition_penalty_reward": -0.14647945389151573, |
| "rewards/tag_count_reward": 0.8294270932674408, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.7447967529297, |
| "epoch": 0.12448443944506937, |
| "grad_norm": 20.98608213741596, |
| "kl": 0.47705078125, |
| "learning_rate": 1.9964811463672685e-05, |
| "loss": 0.2863, |
| "reward": 2.022731065750122, |
| "reward_std": 0.637829914689064, |
| "rewards/accuracy_reward": 0.3645833432674408, |
| "rewards/reasoning_steps_reward": 0.9583333432674408, |
| "rewards/repetition_penalty_reward": -0.16476891934871674, |
| "rewards/tag_count_reward": 0.864583358168602, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.2395935058594, |
| "epoch": 0.12598425196850394, |
| "grad_norm": 5.548707042279948, |
| "kl": 0.576171875, |
| "learning_rate": 1.996027844662785e-05, |
| "loss": 0.3677, |
| "reward": 2.218412935733795, |
| "reward_std": 0.7215779423713684, |
| "rewards/accuracy_reward": 0.557291679084301, |
| "rewards/reasoning_steps_reward": 0.9548611491918564, |
| "rewards/repetition_penalty_reward": -0.17525040730834007, |
| "rewards/tag_count_reward": 0.8815104365348816, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 347.31251525878906, |
| "epoch": 0.1274840644919385, |
| "grad_norm": 35.58826029321981, |
| "kl": 1.0712890625, |
| "learning_rate": 1.9955471451019264e-05, |
| "loss": 0.4448, |
| "reward": 2.138911247253418, |
| "reward_std": 0.6522012650966644, |
| "rewards/accuracy_reward": 0.4687500074505806, |
| "rewards/reasoning_steps_reward": 0.9357638955116272, |
| "rewards/repetition_penalty_reward": -0.1510193683207035, |
| "rewards/tag_count_reward": 0.8854166716337204, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.0833435058594, |
| "epoch": 0.1289838770153731, |
| "grad_norm": 10.955955280940925, |
| "kl": 1.5302734375, |
| "learning_rate": 1.995039060907352e-05, |
| "loss": 0.4797, |
| "reward": 2.054484724998474, |
| "reward_std": 0.8432440906763077, |
| "rewards/accuracy_reward": 0.463541679084301, |
| "rewards/reasoning_steps_reward": 0.8645833730697632, |
| "rewards/repetition_penalty_reward": -0.10436956211924553, |
| "rewards/tag_count_reward": 0.8307292014360428, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.32813262939453, |
| "epoch": 0.13048368953880765, |
| "grad_norm": 64784.39467186327, |
| "kl": 333.48828125, |
| "learning_rate": 1.994503606054994e-05, |
| "loss": 27.6513, |
| "reward": 2.2711276412010193, |
| "reward_std": 0.7795832455158234, |
| "rewards/accuracy_reward": 0.5885416865348816, |
| "rewards/reasoning_steps_reward": 0.9166666716337204, |
| "rewards/repetition_penalty_reward": -0.11559123173356056, |
| "rewards/tag_count_reward": 0.8815104514360428, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.98438262939453, |
| "epoch": 0.13198350206224221, |
| "grad_norm": 36649.02893473602, |
| "kl": 143.029296875, |
| "learning_rate": 1.9939407952736737e-05, |
| "loss": 15.8982, |
| "reward": 1.9791134297847748, |
| "reward_std": 0.6517055183649063, |
| "rewards/accuracy_reward": 0.3489583432674408, |
| "rewards/reasoning_steps_reward": 0.9045139402151108, |
| "rewards/repetition_penalty_reward": -0.13503596186637878, |
| "rewards/tag_count_reward": 0.860677108168602, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 329.5052185058594, |
| "epoch": 0.13348331458567678, |
| "grad_norm": 892.1216234188144, |
| "kl": 7.07421875, |
| "learning_rate": 1.9933506440446932e-05, |
| "loss": 0.984, |
| "reward": 1.9748746156692505, |
| "reward_std": 0.6786476969718933, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/reasoning_steps_reward": 0.9079861491918564, |
| "rewards/repetition_penalty_reward": -0.11670541763305664, |
| "rewards/tag_count_reward": 0.829427108168602, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.7604293823242, |
| "epoch": 0.13498312710911137, |
| "grad_norm": 49.617196414779116, |
| "kl": 2.85791015625, |
| "learning_rate": 1.992733168601413e-05, |
| "loss": 0.3242, |
| "reward": 1.9374914765357971, |
| "reward_std": 0.5130168125033379, |
| "rewards/accuracy_reward": 0.27604167349636555, |
| "rewards/reasoning_steps_reward": 0.954861119389534, |
| "rewards/repetition_penalty_reward": -0.1592968888580799, |
| "rewards/tag_count_reward": 0.8658854365348816, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.4739685058594, |
| "epoch": 0.13648293963254593, |
| "grad_norm": 11.069036826310084, |
| "kl": 0.42626953125, |
| "learning_rate": 1.9920883859288035e-05, |
| "loss": 0.3009, |
| "reward": 1.941705048084259, |
| "reward_std": 0.5944435596466064, |
| "rewards/accuracy_reward": 0.3177083395421505, |
| "rewards/reasoning_steps_reward": 0.9322917014360428, |
| "rewards/repetition_penalty_reward": -0.14032622054219246, |
| "rewards/tag_count_reward": 0.8320312649011612, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 302.9948043823242, |
| "epoch": 0.1379827521559805, |
| "grad_norm": 70.67912729974059, |
| "kl": 2.607421875, |
| "learning_rate": 1.991416313762978e-05, |
| "loss": 0.3458, |
| "reward": 1.9879169762134552, |
| "reward_std": 0.5511599630117416, |
| "rewards/accuracy_reward": 0.3385416753590107, |
| "rewards/reasoning_steps_reward": 0.9270833134651184, |
| "rewards/repetition_penalty_reward": -0.14489559456706047, |
| "rewards/tag_count_reward": 0.8671875149011612, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.0052185058594, |
| "epoch": 0.13948256467941508, |
| "grad_norm": 18.812219164179464, |
| "kl": 1.408203125, |
| "learning_rate": 1.990716970590706e-05, |
| "loss": 0.3788, |
| "reward": 2.0359848737716675, |
| "reward_std": 0.5520138740539551, |
| "rewards/accuracy_reward": 0.36979167675599456, |
| "rewards/reasoning_steps_reward": 0.9322917461395264, |
| "rewards/repetition_penalty_reward": -0.1345881223678589, |
| "rewards/tag_count_reward": 0.8684896230697632, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.4010543823242, |
| "epoch": 0.14098237720284965, |
| "grad_norm": 14.184878742271176, |
| "kl": 0.765625, |
| "learning_rate": 1.989990375648903e-05, |
| "loss": 0.324, |
| "reward": 2.120865046977997, |
| "reward_std": 0.6140560433268547, |
| "rewards/accuracy_reward": 0.427083358168602, |
| "rewards/reasoning_steps_reward": 0.9531250298023224, |
| "rewards/repetition_penalty_reward": -0.12653085589408875, |
| "rewards/tag_count_reward": 0.8671875149011612, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.76564025878906, |
| "epoch": 0.1424821897262842, |
| "grad_norm": 33.74529208746235, |
| "kl": 5.03515625, |
| "learning_rate": 1.9892365489241023e-05, |
| "loss": 0.4259, |
| "reward": 2.2041754722595215, |
| "reward_std": 0.6197474002838135, |
| "rewards/accuracy_reward": 0.5260416939854622, |
| "rewards/reasoning_steps_reward": 0.9583333730697632, |
| "rewards/repetition_penalty_reward": -0.1499912552535534, |
| "rewards/tag_count_reward": 0.8697916865348816, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 409.7604293823242, |
| "epoch": 0.1439820022497188, |
| "grad_norm": 60.09340508863677, |
| "kl": 0.78662109375, |
| "learning_rate": 1.988455511151906e-05, |
| "loss": 0.1251, |
| "reward": 2.28587007522583, |
| "reward_std": 0.5608475357294083, |
| "rewards/accuracy_reward": 0.557291679084301, |
| "rewards/reasoning_steps_reward": 0.9600694924592972, |
| "rewards/repetition_penalty_reward": -0.1312306523323059, |
| "rewards/tag_count_reward": 0.8997395932674408, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.81250762939453, |
| "epoch": 0.14548181477315336, |
| "grad_norm": 39.25906795250181, |
| "kl": 0.7392578125, |
| "learning_rate": 1.987647283816412e-05, |
| "loss": 0.2008, |
| "reward": 2.1415184140205383, |
| "reward_std": 0.593282975256443, |
| "rewards/accuracy_reward": 0.4270833432674408, |
| "rewards/reasoning_steps_reward": 0.947916716337204, |
| "rewards/repetition_penalty_reward": -0.1084816437214613, |
| "rewards/tag_count_reward": 0.8750000298023224, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.7396087646484, |
| "epoch": 0.14698162729658792, |
| "grad_norm": 23.53154353337868, |
| "kl": 1.400390625, |
| "learning_rate": 1.9868118891496268e-05, |
| "loss": 0.2105, |
| "reward": 1.9524771869182587, |
| "reward_std": 0.5782586634159088, |
| "rewards/accuracy_reward": 0.2760416716337204, |
| "rewards/reasoning_steps_reward": 0.9444444328546524, |
| "rewards/repetition_penalty_reward": -0.11045688763260841, |
| "rewards/tag_count_reward": 0.8424479514360428, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 567.171875, |
| "epoch": 0.1484814398200225, |
| "grad_norm": 32.44194296184474, |
| "kl": 3.03515625, |
| "learning_rate": 1.98594935013085e-05, |
| "loss": 0.3744, |
| "reward": 2.2206438183784485, |
| "reward_std": 0.6493653506040573, |
| "rewards/accuracy_reward": 0.520833358168602, |
| "rewards/reasoning_steps_reward": 0.9670139253139496, |
| "rewards/repetition_penalty_reward": -0.1200681421905756, |
| "rewards/tag_count_reward": 0.852864608168602, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 613.2604522705078, |
| "epoch": 0.14998125234345708, |
| "grad_norm": 10134.408469401435, |
| "kl": 115.4140625, |
| "learning_rate": 1.985059690486045e-05, |
| "loss": 6.0425, |
| "reward": 1.976808786392212, |
| "reward_std": 0.5220333635807037, |
| "rewards/accuracy_reward": 0.2604166716337204, |
| "rewards/reasoning_steps_reward": 0.9670139253139496, |
| "rewards/repetition_penalty_reward": -0.11390306055545807, |
| "rewards/tag_count_reward": 0.8632812649011612, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 638.0677337646484, |
| "epoch": 0.15148106486689164, |
| "grad_norm": 11770.662483971002, |
| "kl": 115.66796875, |
| "learning_rate": 1.9841429346871863e-05, |
| "loss": 7.1104, |
| "reward": 2.012724369764328, |
| "reward_std": 0.6818199008703232, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/reasoning_steps_reward": 0.9218750447034836, |
| "rewards/repetition_penalty_reward": -0.10706735588610172, |
| "rewards/tag_count_reward": 0.8229166865348816, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 660.0156402587891, |
| "epoch": 0.1529808773903262, |
| "grad_norm": 2359.6075026833482, |
| "kl": 29.76953125, |
| "learning_rate": 1.9831991079515836e-05, |
| "loss": 1.7284, |
| "reward": 2.195272386074066, |
| "reward_std": 0.7458173632621765, |
| "rewards/accuracy_reward": 0.5416666939854622, |
| "rewards/reasoning_steps_reward": 0.9340277910232544, |
| "rewards/repetition_penalty_reward": -0.11635957658290863, |
| "rewards/tag_count_reward": 0.8359375298023224, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 690.4166870117188, |
| "epoch": 0.1544806899137608, |
| "grad_norm": 34.92066839587673, |
| "kl": 3.017578125, |
| "learning_rate": 1.982228236241192e-05, |
| "loss": 0.2735, |
| "reward": 2.1599226593971252, |
| "reward_std": 0.7309878766536713, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/reasoning_steps_reward": 0.942708358168602, |
| "rewards/repetition_penalty_reward": -0.11742118000984192, |
| "rewards/tag_count_reward": 0.8554687798023224, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 669.7448120117188, |
| "epoch": 0.15598050243719536, |
| "grad_norm": 586.5134421102061, |
| "kl": 9.271484375, |
| "learning_rate": 1.9812303462618945e-05, |
| "loss": 0.5437, |
| "reward": 1.991296112537384, |
| "reward_std": 0.5067874565720558, |
| "rewards/accuracy_reward": 0.2916666818782687, |
| "rewards/reasoning_steps_reward": 0.9652778208255768, |
| "rewards/repetition_penalty_reward": -0.11460676416754723, |
| "rewards/tag_count_reward": 0.848958358168602, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 727.6771087646484, |
| "epoch": 0.15748031496062992, |
| "grad_norm": 398.23420182364833, |
| "kl": 10.8125, |
| "learning_rate": 1.9802054654627694e-05, |
| "loss": 0.5388, |
| "reward": 1.8793240189552307, |
| "reward_std": 0.7508310377597809, |
| "rewards/accuracy_reward": 0.3020833432674408, |
| "rewards/reasoning_steps_reward": 0.9149306267499924, |
| "rewards/repetition_penalty_reward": -0.0942003782838583, |
| "rewards/tag_count_reward": 0.7565104365348816, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 729.7760620117188, |
| "epoch": 0.15898012748406448, |
| "grad_norm": 14.580781126168798, |
| "kl": 0.92138671875, |
| "learning_rate": 1.9791536220353355e-05, |
| "loss": 0.1662, |
| "reward": 1.9787148237228394, |
| "reward_std": 0.6732726097106934, |
| "rewards/accuracy_reward": 0.3437500149011612, |
| "rewards/reasoning_steps_reward": 0.9531250149011612, |
| "rewards/repetition_penalty_reward": -0.10201445035636425, |
| "rewards/tag_count_reward": 0.7838541865348816, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 700.3698120117188, |
| "epoch": 0.16047994000749907, |
| "grad_norm": 78.39112156030271, |
| "kl": 4.37890625, |
| "learning_rate": 1.9780748449127745e-05, |
| "loss": 0.3547, |
| "reward": 1.8721507489681244, |
| "reward_std": 0.7452912926673889, |
| "rewards/accuracy_reward": 0.3072916753590107, |
| "rewards/reasoning_steps_reward": 0.9253472685813904, |
| "rewards/repetition_penalty_reward": -0.08184238523244858, |
| "rewards/tag_count_reward": 0.7213541865348816, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 751.7448120117188, |
| "epoch": 0.16197975253093364, |
| "grad_norm": 23.15048932086659, |
| "kl": 3.77734375, |
| "learning_rate": 1.976969163769137e-05, |
| "loss": 0.3573, |
| "reward": 1.7692583501338959, |
| "reward_std": 0.6871030032634735, |
| "rewards/accuracy_reward": 0.2135416753590107, |
| "rewards/reasoning_steps_reward": 0.9375000447034836, |
| "rewards/repetition_penalty_reward": -0.0770958885550499, |
| "rewards/tag_count_reward": 0.6953125149011612, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 666.0625152587891, |
| "epoch": 0.1634795650543682, |
| "grad_norm": 13.438163054952135, |
| "kl": 0.791015625, |
| "learning_rate": 1.9758366090185255e-05, |
| "loss": 0.194, |
| "reward": 2.101614534854889, |
| "reward_std": 0.779254287481308, |
| "rewards/accuracy_reward": 0.4739583432674408, |
| "rewards/reasoning_steps_reward": 0.9357639253139496, |
| "rewards/repetition_penalty_reward": -0.08805570006370544, |
| "rewards/tag_count_reward": 0.7799479365348816, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 657.4218902587891, |
| "epoch": 0.1649793775778028, |
| "grad_norm": 19.486727225701877, |
| "kl": 1.15625, |
| "learning_rate": 1.974677211814259e-05, |
| "loss": 0.2437, |
| "reward": 2.0427930653095245, |
| "reward_std": 0.7101475596427917, |
| "rewards/accuracy_reward": 0.4062500149011612, |
| "rewards/reasoning_steps_reward": 0.9270833879709244, |
| "rewards/repetition_penalty_reward": -0.11345694959163666, |
| "rewards/tag_count_reward": 0.8229166865348816, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.7968902587891, |
| "epoch": 0.16647919010123735, |
| "grad_norm": 43.9361877854642, |
| "kl": 5.32421875, |
| "learning_rate": 1.973491004048014e-05, |
| "loss": 0.3651, |
| "reward": 2.290136158466339, |
| "reward_std": 0.5547062531113625, |
| "rewards/accuracy_reward": 0.5416666716337204, |
| "rewards/reasoning_steps_reward": 0.9531249850988388, |
| "rewards/repetition_penalty_reward": -0.11481184512376785, |
| "rewards/tag_count_reward": 0.9101562798023224, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 616.4948120117188, |
| "epoch": 0.1679790026246719, |
| "grad_norm": 25.513657082760023, |
| "kl": 2.0654296875, |
| "learning_rate": 1.9722780183489477e-05, |
| "loss": 0.1826, |
| "reward": 2.2123002409934998, |
| "reward_std": 0.6505413353443146, |
| "rewards/accuracy_reward": 0.4687500149011612, |
| "rewards/reasoning_steps_reward": 0.953125, |
| "rewards/repetition_penalty_reward": -0.13665815442800522, |
| "rewards/tag_count_reward": 0.927083358168602, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 639.6198120117188, |
| "epoch": 0.16947881514810648, |
| "grad_norm": 4.0215101485261595, |
| "kl": 1.24365234375, |
| "learning_rate": 1.9710382880828028e-05, |
| "loss": 0.1482, |
| "reward": 2.214249849319458, |
| "reward_std": 0.6732968837022781, |
| "rewards/accuracy_reward": 0.5000000074505806, |
| "rewards/reasoning_steps_reward": 0.9461805522441864, |
| "rewards/repetition_penalty_reward": -0.1433890499174595, |
| "rewards/tag_count_reward": 0.9114583432674408, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 692.9062652587891, |
| "epoch": 0.17097862767154107, |
| "grad_norm": 78.14938942373506, |
| "kl": 5.59765625, |
| "learning_rate": 1.969771847350987e-05, |
| "loss": 0.4814, |
| "reward": 2.0078442692756653, |
| "reward_std": 0.8100399523973465, |
| "rewards/accuracy_reward": 0.4218750149011612, |
| "rewards/reasoning_steps_reward": 0.9253472238779068, |
| "rewards/repetition_penalty_reward": -0.0971906129270792, |
| "rewards/tag_count_reward": 0.7578125149011612, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 873.9166870117188, |
| "epoch": 0.17247844019497563, |
| "grad_norm": 390.6492744892592, |
| "kl": 20.15625, |
| "learning_rate": 1.968478730989636e-05, |
| "loss": 0.931, |
| "reward": 1.3626113533973694, |
| "reward_std": 0.49340808391571045, |
| "rewards/accuracy_reward": 0.07291666930541396, |
| "rewards/reasoning_steps_reward": 0.9409722238779068, |
| "rewards/repetition_penalty_reward": -0.03799628745764494, |
| "rewards/tag_count_reward": 0.3867187574505806, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 889.4166870117188, |
| "epoch": 0.1739782527184102, |
| "grad_norm": 46.43260724644146, |
| "kl": 7.609375, |
| "learning_rate": 1.9671589745686563e-05, |
| "loss": 0.3225, |
| "reward": 1.1709461510181427, |
| "reward_std": 0.3466350585222244, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/reasoning_steps_reward": 0.9201389402151108, |
| "rewards/repetition_penalty_reward": -0.023932393174618483, |
| "rewards/tag_count_reward": 0.2643229216337204, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 855.1302185058594, |
| "epoch": 0.17547806524184478, |
| "grad_norm": 19.881821438298648, |
| "kl": 2.70703125, |
| "learning_rate": 1.965812614390743e-05, |
| "loss": 0.1427, |
| "reward": 1.1638481318950653, |
| "reward_std": 0.343635730445385, |
| "rewards/accuracy_reward": 0.010416666977107525, |
| "rewards/reasoning_steps_reward": 0.9027778059244156, |
| "rewards/repetition_penalty_reward": -0.027992176823318005, |
| "rewards/tag_count_reward": 0.2786458358168602, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 837.4010620117188, |
| "epoch": 0.17697787776527935, |
| "grad_norm": 10.649672323547897, |
| "kl": 1.78125, |
| "learning_rate": 1.9644396874903865e-05, |
| "loss": 0.056, |
| "reward": 1.1770220398902893, |
| "reward_std": 0.30946608632802963, |
| "rewards/accuracy_reward": 0.0052083334885537624, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.028707201592624187, |
| "rewards/tag_count_reward": 0.2734375074505806, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 783.4479370117188, |
| "epoch": 0.1784776902887139, |
| "grad_norm": 6.976349962907348, |
| "kl": 1.45703125, |
| "learning_rate": 1.9630402316328506e-05, |
| "loss": 0.0257, |
| "reward": 1.2304543852806091, |
| "reward_std": 0.30063216388225555, |
| "rewards/accuracy_reward": 0.0052083334885537624, |
| "rewards/reasoning_steps_reward": 0.9461805671453476, |
| "rewards/repetition_penalty_reward": -0.033434574957937, |
| "rewards/tag_count_reward": 0.3125000149011612, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.8281402587891, |
| "epoch": 0.17997750281214847, |
| "grad_norm": 4.798337799999206, |
| "kl": 0.5478515625, |
| "learning_rate": 1.9616142853131342e-05, |
| "loss": 0.1748, |
| "reward": 2.003317028284073, |
| "reward_std": 0.682994619011879, |
| "rewards/accuracy_reward": 0.3489583432674408, |
| "rewards/reasoning_steps_reward": 0.9826389253139496, |
| "rewards/repetition_penalty_reward": -0.1004156544804573, |
| "rewards/tag_count_reward": 0.7721354514360428, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 596.4583587646484, |
| "epoch": 0.18147731533558306, |
| "grad_norm": 7.178534631101688, |
| "kl": 0.65283203125, |
| "learning_rate": 1.9601618877549113e-05, |
| "loss": 0.1098, |
| "reward": 2.141425609588623, |
| "reward_std": 0.6803022921085358, |
| "rewards/accuracy_reward": 0.453125, |
| "rewards/reasoning_steps_reward": 0.9548611640930176, |
| "rewards/repetition_penalty_reward": -0.12593566998839378, |
| "rewards/tag_count_reward": 0.8593750298023224, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 585.9843902587891, |
| "epoch": 0.18297712785901762, |
| "grad_norm": 5.756866189784068, |
| "kl": 1.4296875, |
| "learning_rate": 1.9586830789094548e-05, |
| "loss": 0.0643, |
| "reward": 2.145204782485962, |
| "reward_std": 0.6050543040037155, |
| "rewards/accuracy_reward": 0.4322916939854622, |
| "rewards/reasoning_steps_reward": 0.9687500298023224, |
| "rewards/repetition_penalty_reward": -0.13474315032362938, |
| "rewards/tag_count_reward": 0.8789062649011612, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.2760543823242, |
| "epoch": 0.1844769403824522, |
| "grad_norm": 104.47967057005842, |
| "kl": 4.3759765625, |
| "learning_rate": 1.9571778994545356e-05, |
| "loss": 0.2084, |
| "reward": 2.2705207467079163, |
| "reward_std": 0.7510545551776886, |
| "rewards/accuracy_reward": 0.5729166865348816, |
| "rewards/reasoning_steps_reward": 0.9409722834825516, |
| "rewards/repetition_penalty_reward": -0.11706613004207611, |
| "rewards/tag_count_reward": 0.8736979365348816, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.5937652587891, |
| "epoch": 0.18597675290588678, |
| "grad_norm": 14.350020581816006, |
| "kl": 1.861328125, |
| "learning_rate": 1.9556463907933038e-05, |
| "loss": 0.0927, |
| "reward": 2.2636232376098633, |
| "reward_std": 0.6755192577838898, |
| "rewards/accuracy_reward": 0.520833358168602, |
| "rewards/reasoning_steps_reward": 0.9600694328546524, |
| "rewards/repetition_penalty_reward": -0.12092556245625019, |
| "rewards/tag_count_reward": 0.9036458730697632, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 565.2968902587891, |
| "epoch": 0.18747656542932134, |
| "grad_norm": 3.30622411421042, |
| "kl": 1.943359375, |
| "learning_rate": 1.9540885950531507e-05, |
| "loss": 0.1088, |
| "reward": 2.27703320980072, |
| "reward_std": 0.6729772835969925, |
| "rewards/accuracy_reward": 0.5677083432674408, |
| "rewards/reasoning_steps_reward": 0.9392361342906952, |
| "rewards/repetition_penalty_reward": -0.13225507363677025, |
| "rewards/tag_count_reward": 0.9023437649011612, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 582.3177185058594, |
| "epoch": 0.1889763779527559, |
| "grad_norm": 13.34144394107955, |
| "kl": 40.078125, |
| "learning_rate": 1.9525045550845482e-05, |
| "loss": 0.1878, |
| "reward": 2.363175332546234, |
| "reward_std": 0.654795840382576, |
| "rewards/accuracy_reward": 0.6510416716337204, |
| "rewards/reasoning_steps_reward": 0.9409722536802292, |
| "rewards/repetition_penalty_reward": -0.15722395852208138, |
| "rewards/tag_count_reward": 0.9283854514360428, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 574.8698120117188, |
| "epoch": 0.19047619047619047, |
| "grad_norm": 21.137483639050657, |
| "kl": 5.4140625, |
| "learning_rate": 1.9508943144598726e-05, |
| "loss": 0.2436, |
| "reward": 2.1040448546409607, |
| "reward_std": 0.7035096734762192, |
| "rewards/accuracy_reward": 0.4062500149011612, |
| "rewards/reasoning_steps_reward": 0.9270834028720856, |
| "rewards/repetition_penalty_reward": -0.13814280182123184, |
| "rewards/tag_count_reward": 0.9088541865348816, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.4687652587891, |
| "epoch": 0.19197600299962506, |
| "grad_norm": 4.455726874489183, |
| "kl": 0.81103515625, |
| "learning_rate": 1.9492579174722043e-05, |
| "loss": -0.0353, |
| "reward": 2.3104677200317383, |
| "reward_std": 0.5839991569519043, |
| "rewards/accuracy_reward": 0.5885416865348816, |
| "rewards/reasoning_steps_reward": 0.9496528059244156, |
| "rewards/repetition_penalty_reward": -0.16132058203220367, |
| "rewards/tag_count_reward": 0.9335937798023224, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 518.4531402587891, |
| "epoch": 0.19347581552305962, |
| "grad_norm": 681.8928268072538, |
| "kl": 14.496826171875, |
| "learning_rate": 1.9475954091341098e-05, |
| "loss": 0.7073, |
| "reward": 2.2904911935329437, |
| "reward_std": 0.6285631433129311, |
| "rewards/accuracy_reward": 0.5885416865348816, |
| "rewards/reasoning_steps_reward": 0.9236111342906952, |
| "rewards/repetition_penalty_reward": -0.14353665709495544, |
| "rewards/tag_count_reward": 0.9218750149011612, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 495.0937728881836, |
| "epoch": 0.19497562804649418, |
| "grad_norm": 26.634074769249203, |
| "kl": 14.8203125, |
| "learning_rate": 1.9459068351764032e-05, |
| "loss": 0.1622, |
| "reward": 2.11430823802948, |
| "reward_std": 0.6787235736846924, |
| "rewards/accuracy_reward": 0.4322916865348816, |
| "rewards/reasoning_steps_reward": 0.923611119389534, |
| "rewards/repetition_penalty_reward": -0.14263629540801048, |
| "rewards/tag_count_reward": 0.9010416865348816, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.46876525878906, |
| "epoch": 0.19647544056992877, |
| "grad_norm": 3.2870541929839407, |
| "kl": 1.111328125, |
| "learning_rate": 1.94419224204689e-05, |
| "loss": -0.1421, |
| "reward": 2.2763773798942566, |
| "reward_std": 0.7801897525787354, |
| "rewards/accuracy_reward": 0.630208358168602, |
| "rewards/reasoning_steps_reward": 0.9131944626569748, |
| "rewards/repetition_penalty_reward": -0.14202555641531944, |
| "rewards/tag_count_reward": 0.8750000149011612, |
| "step": 131 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 467.06250762939453, |
| "epoch": 0.19797525309336333, |
| "grad_norm": 9.180164568388207, |
| "kl": 1.109375, |
| "learning_rate": 1.9424516769090863e-05, |
| "loss": -0.0418, |
| "reward": 1.8292181193828583, |
| "reward_std": 1.029387205839157, |
| "rewards/accuracy_reward": 0.4114583358168602, |
| "rewards/reasoning_steps_reward": 0.82118059694767, |
| "rewards/repetition_penalty_reward": -0.1039416529238224, |
| "rewards/tag_count_reward": 0.700520858168602, |
| "step": 132 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.7291793823242, |
| "epoch": 0.1994750656167979, |
| "grad_norm": 2.948220887698058, |
| "kl": 1.56640625, |
| "learning_rate": 1.9406851876409254e-05, |
| "loss": -0.1012, |
| "reward": 1.8498985171318054, |
| "reward_std": 1.0783482491970062, |
| "rewards/accuracy_reward": 0.4843750298023224, |
| "rewards/reasoning_steps_reward": 0.7986111342906952, |
| "rewards/repetition_penalty_reward": -0.08673347532749176, |
| "rewards/tag_count_reward": 0.6536458432674408, |
| "step": 133 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 456.6927261352539, |
| "epoch": 0.20097487814023246, |
| "grad_norm": 6.426646825304548, |
| "kl": 2.6953125, |
| "learning_rate": 1.938892822833437e-05, |
| "loss": 0.0348, |
| "reward": 2.087424159049988, |
| "reward_std": 0.9264509230852127, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/reasoning_steps_reward": 0.8697917312383652, |
| "rewards/repetition_penalty_reward": -0.11830509081482887, |
| "rewards/tag_count_reward": 0.7578125149011612, |
| "step": 134 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 476.8333435058594, |
| "epoch": 0.20247469066366705, |
| "grad_norm": 51.991305129273876, |
| "kl": 22.15625, |
| "learning_rate": 1.9370746317894135e-05, |
| "loss": 0.6123, |
| "reward": 1.7000366151332855, |
| "reward_std": 0.9176836758852005, |
| "rewards/accuracy_reward": 0.3072916753590107, |
| "rewards/reasoning_steps_reward": 0.8246528208255768, |
| "rewards/repetition_penalty_reward": -0.102480823174119, |
| "rewards/tag_count_reward": 0.6705729365348816, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.3385543823242, |
| "epoch": 0.2039745031871016, |
| "grad_norm": 8.533590166150786, |
| "kl": 7.75, |
| "learning_rate": 1.9352306645220518e-05, |
| "loss": 0.2573, |
| "reward": 1.376660943031311, |
| "reward_std": 0.9236660450696945, |
| "rewards/accuracy_reward": 0.2187500074505806, |
| "rewards/reasoning_steps_reward": 0.7638889104127884, |
| "rewards/repetition_penalty_reward": -0.07603009976446629, |
| "rewards/tag_count_reward": 0.4700520932674408, |
| "step": 136 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 655.1042022705078, |
| "epoch": 0.20547431571053618, |
| "grad_norm": 4.411821253364697, |
| "kl": 2.53515625, |
| "learning_rate": 1.9333609717535788e-05, |
| "loss": -0.0005, |
| "reward": 0.9961891174316406, |
| "reward_std": 0.7426871508359909, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/reasoning_steps_reward": 0.6788194477558136, |
| "rewards/repetition_penalty_reward": -0.04070331249386072, |
| "rewards/tag_count_reward": 0.295572929084301, |
| "step": 137 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 592.6458435058594, |
| "epoch": 0.20697412823397077, |
| "grad_norm": 1.4735089906729, |
| "kl": 1.529296875, |
| "learning_rate": 1.931465604913856e-05, |
| "loss": -0.0875, |
| "reward": 1.0002163350582123, |
| "reward_std": 0.7004366517066956, |
| "rewards/accuracy_reward": 0.052083334885537624, |
| "rewards/reasoning_steps_reward": 0.7013889253139496, |
| "rewards/repetition_penalty_reward": -0.04362048767507076, |
| "rewards/tag_count_reward": 0.2903645932674408, |
| "step": 138 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.1979370117188, |
| "epoch": 0.20847394075740533, |
| "grad_norm": 2.9297077947117667, |
| "kl": 1.375, |
| "learning_rate": 1.9295446161389644e-05, |
| "loss": 0.1009, |
| "reward": 1.0900856852531433, |
| "reward_std": 0.5820793807506561, |
| "rewards/accuracy_reward": 0.05729166837409139, |
| "rewards/reasoning_steps_reward": 0.8350694924592972, |
| "rewards/repetition_penalty_reward": -0.07310881279408932, |
| "rewards/tag_count_reward": 0.2708333395421505, |
| "step": 139 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 769.4843902587891, |
| "epoch": 0.2099737532808399, |
| "grad_norm": 2.6937963105185894, |
| "kl": 2.18359375, |
| "learning_rate": 1.9275980582697707e-05, |
| "loss": 0.1039, |
| "reward": 1.2186349630355835, |
| "reward_std": 0.5874937772750854, |
| "rewards/accuracy_reward": 0.1770833395421505, |
| "rewards/reasoning_steps_reward": 0.9166667014360428, |
| "rewards/repetition_penalty_reward": -0.13422967679798603, |
| "rewards/tag_count_reward": 0.2591145932674408, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 845.2552337646484, |
| "epoch": 0.21147356580427445, |
| "grad_norm": 1.622095571716092, |
| "kl": 1.2060546875, |
| "learning_rate": 1.9256259848504737e-05, |
| "loss": 0.0388, |
| "reward": 1.2769785821437836, |
| "reward_std": 0.47676292806863785, |
| "rewards/accuracy_reward": 0.2968750074505806, |
| "rewards/reasoning_steps_reward": 0.9704861342906952, |
| "rewards/repetition_penalty_reward": -0.21303880959749222, |
| "rewards/tag_count_reward": 0.2226562574505806, |
| "step": 141 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 826.1771087646484, |
| "epoch": 0.21297337832770905, |
| "grad_norm": 1.4773462554374228, |
| "kl": 0.5615234375, |
| "learning_rate": 1.9236284501271317e-05, |
| "loss": -0.0577, |
| "reward": 1.3371992409229279, |
| "reward_std": 0.49016065895557404, |
| "rewards/accuracy_reward": 0.494791679084301, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.27738412469625473, |
| "rewards/tag_count_reward": 0.1354166716337204, |
| "step": 142 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 909.7500305175781, |
| "epoch": 0.2144731908511436, |
| "grad_norm": 0.9239369002896122, |
| "kl": 0.66259765625, |
| "learning_rate": 1.9216055090461693e-05, |
| "loss": -0.0375, |
| "reward": 1.4242282509803772, |
| "reward_std": 0.5077776834368706, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.97743059694767, |
| "rewards/repetition_penalty_reward": -0.32794189453125, |
| "rewards/tag_count_reward": 0.11848958767950535, |
| "step": 143 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 927.3541870117188, |
| "epoch": 0.21597300337457817, |
| "grad_norm": 0.7490847651592132, |
| "kl": 0.4716796875, |
| "learning_rate": 1.9195572172528678e-05, |
| "loss": 0.0525, |
| "reward": 1.2379167079925537, |
| "reward_std": 0.5809585899114609, |
| "rewards/accuracy_reward": 0.4479166716337204, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.41616319864988327, |
| "rewards/tag_count_reward": 0.2252604216337204, |
| "step": 144 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 782.5052337646484, |
| "epoch": 0.21747281589801276, |
| "grad_norm": 9.579120968298097, |
| "kl": 91.814208984375, |
| "learning_rate": 1.9174836310898334e-05, |
| "loss": 0.1581, |
| "reward": 1.7532488107681274, |
| "reward_std": 0.6942550539970398, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.35612622648477554, |
| "rewards/tag_count_reward": 0.645833358168602, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 806.2708587646484, |
| "epoch": 0.21897262842144732, |
| "grad_norm": 1.8428465118174489, |
| "kl": 0.408447265625, |
| "learning_rate": 1.9153848075954465e-05, |
| "loss": 0.1999, |
| "reward": 1.5887240171432495, |
| "reward_std": 0.7087294459342957, |
| "rewards/accuracy_reward": 0.3437500074505806, |
| "rewards/reasoning_steps_reward": 0.9791667014360428, |
| "rewards/repetition_penalty_reward": -0.37351563572883606, |
| "rewards/tag_count_reward": 0.6393229365348816, |
| "step": 146 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 754.6458435058594, |
| "epoch": 0.2204724409448819, |
| "grad_norm": 0.7681530290808569, |
| "kl": 0.266357421875, |
| "learning_rate": 1.9132608045022954e-05, |
| "loss": 0.2529, |
| "reward": 1.4556803405284882, |
| "reward_std": 0.7271180897951126, |
| "rewards/accuracy_reward": 0.3385416753590107, |
| "rewards/reasoning_steps_reward": 0.987847238779068, |
| "rewards/repetition_penalty_reward": -0.3368544206023216, |
| "rewards/tag_count_reward": 0.4661458432674408, |
| "step": 147 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 771.1458435058594, |
| "epoch": 0.22197225346831645, |
| "grad_norm": 1.2907787204307715, |
| "kl": 0.51708984375, |
| "learning_rate": 1.9111116802355853e-05, |
| "loss": 0.2698, |
| "reward": 0.9035700112581253, |
| "reward_std": 0.7091249525547028, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/reasoning_steps_reward": 0.9513889253139496, |
| "rewards/repetition_penalty_reward": -0.44104810059070587, |
| "rewards/tag_count_reward": 0.2473958432674408, |
| "step": 148 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 578.7760467529297, |
| "epoch": 0.22347206599175104, |
| "grad_norm": 1.002753301602421, |
| "kl": 0.42529296875, |
| "learning_rate": 1.9089374939115335e-05, |
| "loss": 0.3394, |
| "reward": 1.304781287908554, |
| "reward_std": 0.7868980914354324, |
| "rewards/accuracy_reward": 0.2239583395421505, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.3111041784286499, |
| "rewards/tag_count_reward": 0.4335937649011612, |
| "step": 149 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.1771011352539, |
| "epoch": 0.2249718785151856, |
| "grad_norm": 1.5602927845713264, |
| "kl": 0.5703125, |
| "learning_rate": 1.906738305335741e-05, |
| "loss": 0.3492, |
| "reward": 1.3854647278785706, |
| "reward_std": 0.942527711391449, |
| "rewards/accuracy_reward": 0.2656250074505806, |
| "rewards/reasoning_steps_reward": 0.9010417014360428, |
| "rewards/repetition_penalty_reward": -0.272087462246418, |
| "rewards/tag_count_reward": 0.490885429084301, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 455.96876525878906, |
| "epoch": 0.22647169103862017, |
| "grad_norm": 1.4291694851849999, |
| "kl": 0.609375, |
| "learning_rate": 1.90451417500155e-05, |
| "loss": 0.1916, |
| "reward": 1.4960919618606567, |
| "reward_std": 0.7673981636762619, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/reasoning_steps_reward": 0.8871527761220932, |
| "rewards/repetition_penalty_reward": -0.24392545968294144, |
| "rewards/tag_count_reward": 0.6236979365348816, |
| "step": 151 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 487.4947967529297, |
| "epoch": 0.22797150356205476, |
| "grad_norm": 3.3680398671635974, |
| "kl": 0.68359375, |
| "learning_rate": 1.902265164088378e-05, |
| "loss": 0.3357, |
| "reward": 1.656085580587387, |
| "reward_std": 0.8526331186294556, |
| "rewards/accuracy_reward": 0.3333333507180214, |
| "rewards/reasoning_steps_reward": 0.944444477558136, |
| "rewards/repetition_penalty_reward": -0.27143188565969467, |
| "rewards/tag_count_reward": 0.6497395932674408, |
| "step": 152 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 541.6562652587891, |
| "epoch": 0.22947131608548932, |
| "grad_norm": 21.24539554413327, |
| "kl": 6.5703125, |
| "learning_rate": 1.899991334460036e-05, |
| "loss": 0.3699, |
| "reward": 1.4813492000102997, |
| "reward_std": 1.0872852802276611, |
| "rewards/accuracy_reward": 0.385416679084301, |
| "rewards/reasoning_steps_reward": 0.866319477558136, |
| "rewards/repetition_penalty_reward": -0.3628349155187607, |
| "rewards/tag_count_reward": 0.5924479216337204, |
| "step": 153 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.5052337646484, |
| "epoch": 0.23097112860892388, |
| "grad_norm": 7.586485137427004, |
| "kl": 7.958984375, |
| "learning_rate": 1.8976927486630252e-05, |
| "loss": 0.3929, |
| "reward": 1.2929440140724182, |
| "reward_std": 0.9547273218631744, |
| "rewards/accuracy_reward": 0.2760416716337204, |
| "rewards/reasoning_steps_reward": 0.8802083432674408, |
| "rewards/repetition_penalty_reward": -0.4609622582793236, |
| "rewards/tag_count_reward": 0.5976562649011612, |
| "step": 154 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 579.0521087646484, |
| "epoch": 0.23247094113235844, |
| "grad_norm": 5.98725832415842, |
| "kl": 15.900390625, |
| "learning_rate": 1.8953694699248193e-05, |
| "loss": 0.3019, |
| "reward": 1.338868498802185, |
| "reward_std": 1.0572403371334076, |
| "rewards/accuracy_reward": 0.3541666744276881, |
| "rewards/reasoning_steps_reward": 0.8315972536802292, |
| "rewards/repetition_penalty_reward": -0.4198121204972267, |
| "rewards/tag_count_reward": 0.5729167014360428, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 464.4948043823242, |
| "epoch": 0.23397075365579303, |
| "grad_norm": 1.5100410826543946, |
| "kl": 1.6201171875, |
| "learning_rate": 1.893021562152122e-05, |
| "loss": 0.2142, |
| "reward": 1.4770875573158264, |
| "reward_std": 1.0544297099113464, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/reasoning_steps_reward": 0.8368055671453476, |
| "rewards/repetition_penalty_reward": -0.290707603096962, |
| "rewards/tag_count_reward": 0.5768229365348816, |
| "step": 156 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.2135543823242, |
| "epoch": 0.2354705661792276, |
| "grad_norm": 2.5164021567299324, |
| "kl": 0.8330078125, |
| "learning_rate": 1.8906490899291125e-05, |
| "loss": 0.1355, |
| "reward": 1.7763201594352722, |
| "reward_std": 0.8412696719169617, |
| "rewards/accuracy_reward": 0.4270833358168602, |
| "rewards/reasoning_steps_reward": 0.869791716337204, |
| "rewards/repetition_penalty_reward": -0.18852365016937256, |
| "rewards/tag_count_reward": 0.6679687649011612, |
| "step": 157 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 405.21875762939453, |
| "epoch": 0.23697037870266216, |
| "grad_norm": 1.8327817177420191, |
| "kl": 0.39794921875, |
| "learning_rate": 1.888252118515666e-05, |
| "loss": 0.1901, |
| "reward": 2.067378491163254, |
| "reward_std": 0.7569487392902374, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.9409722536802292, |
| "rewards/repetition_penalty_reward": -0.20302094891667366, |
| "rewards/tag_count_reward": 0.7825520932674408, |
| "step": 158 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 394.5520935058594, |
| "epoch": 0.23847019122609675, |
| "grad_norm": 1.1312490111142588, |
| "kl": 0.412109375, |
| "learning_rate": 1.88583071384556e-05, |
| "loss": 0.0986, |
| "reward": 2.053210973739624, |
| "reward_std": 0.7719367817044258, |
| "rewards/accuracy_reward": 0.5000000149011612, |
| "rewards/reasoning_steps_reward": 0.9201389104127884, |
| "rewards/repetition_penalty_reward": -0.18854257836937904, |
| "rewards/tag_count_reward": 0.821614608168602, |
| "step": 159 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 534.4843826293945, |
| "epoch": 0.2399700037495313, |
| "grad_norm": 5.729314262683275, |
| "kl": 0.576171875, |
| "learning_rate": 1.883384942524661e-05, |
| "loss": 0.3881, |
| "reward": 1.6979016363620758, |
| "reward_std": 1.0777118504047394, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/reasoning_steps_reward": 0.8663194626569748, |
| "rewards/repetition_penalty_reward": -0.2621678523719311, |
| "rewards/tag_count_reward": 0.7604166716337204, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 646.6614837646484, |
| "epoch": 0.24146981627296588, |
| "grad_norm": 2.928392277669671, |
| "kl": 0.74658203125, |
| "learning_rate": 1.880914871829092e-05, |
| "loss": 0.2085, |
| "reward": 2.010637640953064, |
| "reward_std": 0.8601251840591431, |
| "rewards/accuracy_reward": 0.526041679084301, |
| "rewards/reasoning_steps_reward": 0.8993055820465088, |
| "rewards/repetition_penalty_reward": -0.1764284037053585, |
| "rewards/tag_count_reward": 0.7617187798023224, |
| "step": 161 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 804.2083587646484, |
| "epoch": 0.24296962879640044, |
| "grad_norm": 1.389994708780378, |
| "kl": 0.5537109375, |
| "learning_rate": 1.8784205697033803e-05, |
| "loss": 0.0807, |
| "reward": 1.8883300125598907, |
| "reward_std": 0.7057935371994972, |
| "rewards/accuracy_reward": 0.4479166716337204, |
| "rewards/reasoning_steps_reward": 0.8576389253139496, |
| "rewards/repetition_penalty_reward": -0.09170474670827389, |
| "rewards/tag_count_reward": 0.6744791865348816, |
| "step": 162 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 766.2187652587891, |
| "epoch": 0.24446944131983503, |
| "grad_norm": 5.926627814628075, |
| "kl": 0.48681640625, |
| "learning_rate": 1.875902104758592e-05, |
| "loss": 0.0925, |
| "reward": 2.008990705013275, |
| "reward_std": 0.7761075049638748, |
| "rewards/accuracy_reward": 0.5885416828095913, |
| "rewards/reasoning_steps_reward": 0.854166716337204, |
| "rewards/repetition_penalty_reward": -0.07304057851433754, |
| "rewards/tag_count_reward": 0.6393229365348816, |
| "step": 163 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 725.9896087646484, |
| "epoch": 0.2459692538432696, |
| "grad_norm": 3252.328387823972, |
| "kl": 6.70703125, |
| "learning_rate": 1.873359546270442e-05, |
| "loss": 0.4991, |
| "reward": 1.9295026063919067, |
| "reward_std": 0.8431710749864578, |
| "rewards/accuracy_reward": 0.5833333507180214, |
| "rewards/reasoning_steps_reward": 0.8489583283662796, |
| "rewards/repetition_penalty_reward": -0.0731016006320715, |
| "rewards/tag_count_reward": 0.5703125298023224, |
| "step": 164 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 711.4948120117188, |
| "epoch": 0.24746906636670415, |
| "grad_norm": 35.24975659123428, |
| "kl": 0.7900390625, |
| "learning_rate": 1.8707929641773876e-05, |
| "loss": 0.2004, |
| "reward": 1.7060418128967285, |
| "reward_std": 0.8163315951824188, |
| "rewards/accuracy_reward": 0.36458334140479565, |
| "rewards/reasoning_steps_reward": 0.8437500447034836, |
| "rewards/repetition_penalty_reward": -0.06739573087543249, |
| "rewards/tag_count_reward": 0.5651041865348816, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 631.3385772705078, |
| "epoch": 0.24896887889013875, |
| "grad_norm": 44.81319877908832, |
| "kl": 0.48486328125, |
| "learning_rate": 1.8682024290787092e-05, |
| "loss": 0.2422, |
| "reward": 1.8670725524425507, |
| "reward_std": 0.7701267302036285, |
| "rewards/accuracy_reward": 0.5000000149011612, |
| "rewards/reasoning_steps_reward": 0.8663195073604584, |
| "rewards/repetition_penalty_reward": -0.07086153514683247, |
| "rewards/tag_count_reward": 0.571614608168602, |
| "step": 166 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 610.9791870117188, |
| "epoch": 0.2504686914135733, |
| "grad_norm": 48.22852108204421, |
| "kl": 1.1845703125, |
| "learning_rate": 1.8655880122325633e-05, |
| "loss": 0.2746, |
| "reward": 1.8831318020820618, |
| "reward_std": 0.737670287489891, |
| "rewards/accuracy_reward": 0.4895833432674408, |
| "rewards/reasoning_steps_reward": 0.8975694924592972, |
| "rewards/repetition_penalty_reward": -0.07042735442519188, |
| "rewards/tag_count_reward": 0.5664062798023224, |
| "step": 167 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 637.8437805175781, |
| "epoch": 0.25196850393700787, |
| "grad_norm": 127.32727674691347, |
| "kl": 2.96484375, |
| "learning_rate": 1.862949785554025e-05, |
| "loss": 0.4427, |
| "reward": 1.6282338500022888, |
| "reward_std": 0.8880203366279602, |
| "rewards/accuracy_reward": 0.4114583432674408, |
| "rewards/reasoning_steps_reward": 0.7916667014360428, |
| "rewards/repetition_penalty_reward": -0.05535993352532387, |
| "rewards/tag_count_reward": 0.4804687649011612, |
| "step": 168 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 593.1562652587891, |
| "epoch": 0.25346831646044243, |
| "grad_norm": 66.6798484485283, |
| "kl": 2.54296875, |
| "learning_rate": 1.8602878216131093e-05, |
| "loss": 0.4588, |
| "reward": 1.8000280857086182, |
| "reward_std": 0.8398674130439758, |
| "rewards/accuracy_reward": 0.4739583432674408, |
| "rewards/reasoning_steps_reward": 0.8541666865348816, |
| "rewards/repetition_penalty_reward": -0.059346938505768776, |
| "rewards/tag_count_reward": 0.5312500149011612, |
| "step": 169 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 600.9687652587891, |
| "epoch": 0.254968128983877, |
| "grad_norm": 1723.0756525306333, |
| "kl": 5.98046875, |
| "learning_rate": 1.8576021936327747e-05, |
| "loss": 0.6964, |
| "reward": 1.6876211762428284, |
| "reward_std": 0.9227285087108612, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/reasoning_steps_reward": 0.7777778059244156, |
| "rewards/repetition_penalty_reward": -0.04718789644539356, |
| "rewards/tag_count_reward": 0.4778645932674408, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 570.3333435058594, |
| "epoch": 0.25646794150731156, |
| "grad_norm": 1469.676229468339, |
| "kl": 11.125, |
| "learning_rate": 1.8548929754869095e-05, |
| "loss": 1.1501, |
| "reward": 1.6916078925132751, |
| "reward_std": 0.8378966599702835, |
| "rewards/accuracy_reward": 0.4687500149011612, |
| "rewards/reasoning_steps_reward": 0.7812500298023224, |
| "rewards/repetition_penalty_reward": -0.050579698756337166, |
| "rewards/tag_count_reward": 0.4921875149011612, |
| "step": 171 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 634.5416870117188, |
| "epoch": 0.2579677540307462, |
| "grad_norm": 308.20087481858275, |
| "kl": 15.05859375, |
| "learning_rate": 1.8521602416982998e-05, |
| "loss": 0.8609, |
| "reward": 1.5223284363746643, |
| "reward_std": 0.8932169824838638, |
| "rewards/accuracy_reward": 0.3906250149011612, |
| "rewards/reasoning_steps_reward": 0.7343750149011612, |
| "rewards/repetition_penalty_reward": -0.044077920727431774, |
| "rewards/tag_count_reward": 0.4414062649011612, |
| "step": 172 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 575.4375228881836, |
| "epoch": 0.25946756655418074, |
| "grad_norm": 27.514037083049207, |
| "kl": 1.0087890625, |
| "learning_rate": 1.8494040674365785e-05, |
| "loss": 0.4806, |
| "reward": 1.7090361416339874, |
| "reward_std": 0.9580790549516678, |
| "rewards/accuracy_reward": 0.5000000223517418, |
| "rewards/reasoning_steps_reward": 0.7500000298023224, |
| "rewards/repetition_penalty_reward": -0.03966184053570032, |
| "rewards/tag_count_reward": 0.4986979365348816, |
| "step": 173 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 540.9948043823242, |
| "epoch": 0.2609673790776153, |
| "grad_norm": 24.242426884849852, |
| "kl": 0.8251953125, |
| "learning_rate": 1.8466245285161593e-05, |
| "loss": 0.4027, |
| "reward": 1.7482682466506958, |
| "reward_std": 0.8416074514389038, |
| "rewards/accuracy_reward": 0.4270833432674408, |
| "rewards/reasoning_steps_reward": 0.8246528506278992, |
| "rewards/repetition_penalty_reward": -0.04773869924247265, |
| "rewards/tag_count_reward": 0.544270858168602, |
| "step": 174 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.1041717529297, |
| "epoch": 0.26246719160104987, |
| "grad_norm": 13.61888783625401, |
| "kl": 1.12109375, |
| "learning_rate": 1.8438217013941494e-05, |
| "loss": 0.4855, |
| "reward": 1.8302274644374847, |
| "reward_std": 0.9374262690544128, |
| "rewards/accuracy_reward": 0.4375000149011612, |
| "rewards/reasoning_steps_reward": 0.8263889253139496, |
| "rewards/repetition_penalty_reward": -0.05215102527290583, |
| "rewards/tag_count_reward": 0.618489608168602, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 509.71356201171875, |
| "epoch": 0.26396700412448443, |
| "grad_norm": 9.502433027265221, |
| "kl": 0.9228515625, |
| "learning_rate": 1.8409956631682475e-05, |
| "loss": 0.5068, |
| "reward": 1.8384932577610016, |
| "reward_std": 0.9605595469474792, |
| "rewards/accuracy_reward": 0.4270833507180214, |
| "rewards/reasoning_steps_reward": 0.80555559694767, |
| "rewards/repetition_penalty_reward": -0.03607274405658245, |
| "rewards/tag_count_reward": 0.641927108168602, |
| "step": 176 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 542.1718902587891, |
| "epoch": 0.265466816647919, |
| "grad_norm": 12.713070926311392, |
| "kl": 1.177734375, |
| "learning_rate": 1.838146491574624e-05, |
| "loss": 0.5857, |
| "reward": 1.5894387364387512, |
| "reward_std": 1.0209160447120667, |
| "rewards/accuracy_reward": 0.4114583507180214, |
| "rewards/reasoning_steps_reward": 0.6440972313284874, |
| "rewards/repetition_penalty_reward": -0.02991898776963353, |
| "rewards/tag_count_reward": 0.5638020932674408, |
| "step": 177 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 547.8333435058594, |
| "epoch": 0.26696662917135355, |
| "grad_norm": 16.206516622301223, |
| "kl": 1.232421875, |
| "learning_rate": 1.83527426498578e-05, |
| "loss": 0.6041, |
| "reward": 1.6392149925231934, |
| "reward_std": 1.0323166698217392, |
| "rewards/accuracy_reward": 0.3281250037252903, |
| "rewards/reasoning_steps_reward": 0.756944477558136, |
| "rewards/repetition_penalty_reward": -0.03830242808908224, |
| "rewards/tag_count_reward": 0.5924479365348816, |
| "step": 178 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.1562805175781, |
| "epoch": 0.26846644169478817, |
| "grad_norm": 129.17932118885184, |
| "kl": 3.412109375, |
| "learning_rate": 1.832379062408394e-05, |
| "loss": 0.7632, |
| "reward": 1.5638504028320312, |
| "reward_std": 0.9709765613079071, |
| "rewards/accuracy_reward": 0.3229166716337204, |
| "rewards/reasoning_steps_reward": 0.6875000298023224, |
| "rewards/repetition_penalty_reward": -0.03120174235664308, |
| "rewards/tag_count_reward": 0.5846354216337204, |
| "step": 179 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 536.7291870117188, |
| "epoch": 0.26996625421822273, |
| "grad_norm": 60.94992386504032, |
| "kl": 1.783203125, |
| "learning_rate": 1.8294609634811482e-05, |
| "loss": 0.6702, |
| "reward": 1.7184252440929413, |
| "reward_std": 1.1222001910209656, |
| "rewards/accuracy_reward": 0.4687500223517418, |
| "rewards/reasoning_steps_reward": 0.671875, |
| "rewards/repetition_penalty_reward": -0.02636647690087557, |
| "rewards/tag_count_reward": 0.6041666716337204, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.0625228881836, |
| "epoch": 0.2714660667416573, |
| "grad_norm": 597.8726346831836, |
| "kl": 4.18359375, |
| "learning_rate": 1.8265200484725364e-05, |
| "loss": 0.882, |
| "reward": 1.7504200041294098, |
| "reward_std": 1.0297911912202835, |
| "rewards/accuracy_reward": 0.4270833507180214, |
| "rewards/reasoning_steps_reward": 0.7291666865348816, |
| "rewards/repetition_penalty_reward": -0.03213207516819239, |
| "rewards/tag_count_reward": 0.626302108168602, |
| "step": 181 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 577.8541870117188, |
| "epoch": 0.27296587926509186, |
| "grad_norm": 289.5259006408696, |
| "kl": 3.12109375, |
| "learning_rate": 1.823556398278657e-05, |
| "loss": 0.7697, |
| "reward": 1.4063380062580109, |
| "reward_std": 1.146880030632019, |
| "rewards/accuracy_reward": 0.3437500037252903, |
| "rewards/reasoning_steps_reward": 0.5381944701075554, |
| "rewards/repetition_penalty_reward": -0.027689829003065825, |
| "rewards/tag_count_reward": 0.5520833507180214, |
| "step": 182 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 544.1302337646484, |
| "epoch": 0.2744656917885264, |
| "grad_norm": 18278.821342314448, |
| "kl": 62.75, |
| "learning_rate": 1.820570094420989e-05, |
| "loss": 5.0174, |
| "reward": 1.5145381093025208, |
| "reward_std": 1.0834265649318695, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/reasoning_steps_reward": 0.6649305820465088, |
| "rewards/repetition_penalty_reward": -0.03320494340732694, |
| "rewards/tag_count_reward": 0.5494791865348816, |
| "step": 183 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 528.7239761352539, |
| "epoch": 0.275965504311961, |
| "grad_norm": 115.68017983054848, |
| "kl": 2.783203125, |
| "learning_rate": 1.817561219044148e-05, |
| "loss": 0.7618, |
| "reward": 1.561537265777588, |
| "reward_std": 1.062522441148758, |
| "rewards/accuracy_reward": 0.3489583469927311, |
| "rewards/reasoning_steps_reward": 0.6597222089767456, |
| "rewards/repetition_penalty_reward": -0.033080867026001215, |
| "rewards/tag_count_reward": 0.5859375149011612, |
| "step": 184 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 552.9062652587891, |
| "epoch": 0.27746531683539555, |
| "grad_norm": 185.26777317096048, |
| "kl": 14.705078125, |
| "learning_rate": 1.814529854913626e-05, |
| "loss": 0.8482, |
| "reward": 1.5542615354061127, |
| "reward_std": 1.0569299161434174, |
| "rewards/accuracy_reward": 0.3281250074505806, |
| "rewards/reasoning_steps_reward": 0.6510417014360428, |
| "rewards/repetition_penalty_reward": -0.03297804016619921, |
| "rewards/tag_count_reward": 0.6080729365348816, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 569.4687805175781, |
| "epoch": 0.27896512935883017, |
| "grad_norm": 711.4722614182526, |
| "kl": 4.55859375, |
| "learning_rate": 1.8114760854135168e-05, |
| "loss": 0.8749, |
| "reward": 1.4769874811172485, |
| "reward_std": 1.1705779433250427, |
| "rewards/accuracy_reward": 0.3489583469927311, |
| "rewards/reasoning_steps_reward": 0.605902835726738, |
| "rewards/repetition_penalty_reward": -0.024748651776462793, |
| "rewards/tag_count_reward": 0.5468750298023224, |
| "step": 186 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 580.2656402587891, |
| "epoch": 0.28046494188226473, |
| "grad_norm": 3683.741373123422, |
| "kl": 6.32421875, |
| "learning_rate": 1.808399994544222e-05, |
| "loss": 0.9344, |
| "reward": 1.4767035245895386, |
| "reward_std": 1.1781282722949982, |
| "rewards/accuracy_reward": 0.3645833432674408, |
| "rewards/reasoning_steps_reward": 0.5954861342906952, |
| "rewards/repetition_penalty_reward": -0.02503264555707574, |
| "rewards/tag_count_reward": 0.5416666939854622, |
| "step": 187 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 512.8333511352539, |
| "epoch": 0.2819647544056993, |
| "grad_norm": 231.95111964687146, |
| "kl": 160.20703125, |
| "learning_rate": 1.805301666920138e-05, |
| "loss": 0.9703, |
| "reward": 1.5358233749866486, |
| "reward_std": 1.097620114684105, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/reasoning_steps_reward": 0.6510417014360428, |
| "rewards/repetition_penalty_reward": -0.03318707179278135, |
| "rewards/tag_count_reward": 0.6054687798023224, |
| "step": 188 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.8073120117188, |
| "epoch": 0.28346456692913385, |
| "grad_norm": 392.6089381205205, |
| "kl": 4.78515625, |
| "learning_rate": 1.802181187767332e-05, |
| "loss": 0.8115, |
| "reward": 1.4347640872001648, |
| "reward_std": 1.0974180102348328, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/reasoning_steps_reward": 0.6093750298023224, |
| "rewards/repetition_penalty_reward": -0.028777593281120062, |
| "rewards/tag_count_reward": 0.5625000149011612, |
| "step": 189 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 590.7552185058594, |
| "epoch": 0.2849643794525684, |
| "grad_norm": 230.8284538739948, |
| "kl": 13.0234375, |
| "learning_rate": 1.7990386429211945e-05, |
| "loss": 1.0111, |
| "reward": 1.3888815939426422, |
| "reward_std": 1.1380842924118042, |
| "rewards/accuracy_reward": 0.2968750074505806, |
| "rewards/reasoning_steps_reward": 0.5781250298023224, |
| "rewards/repetition_penalty_reward": -0.02518094959668815, |
| "rewards/tag_count_reward": 0.5390625149011612, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 523.4375076293945, |
| "epoch": 0.286464191976003, |
| "grad_norm": 198.59410394134304, |
| "kl": 2.521484375, |
| "learning_rate": 1.7958741188240808e-05, |
| "loss": 0.763, |
| "reward": 1.4029823541641235, |
| "reward_std": 1.0676278173923492, |
| "rewards/accuracy_reward": 0.2864583469927311, |
| "rewards/reasoning_steps_reward": 0.5659722536802292, |
| "rewards/repetition_penalty_reward": -0.030177415814250708, |
| "rewards/tag_count_reward": 0.5807291865348816, |
| "step": 191 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 533.6666793823242, |
| "epoch": 0.2879640044994376, |
| "grad_norm": 711.4562338474254, |
| "kl": 26.0546875, |
| "learning_rate": 1.7926877025229313e-05, |
| "loss": 1.868, |
| "reward": 1.5179384350776672, |
| "reward_std": 1.1407716572284698, |
| "rewards/accuracy_reward": 0.3437500074505806, |
| "rewards/reasoning_steps_reward": 0.6128472536802292, |
| "rewards/repetition_penalty_reward": -0.02720050560310483, |
| "rewards/tag_count_reward": 0.5885416865348816, |
| "step": 192 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 532.0208511352539, |
| "epoch": 0.28946381702287216, |
| "grad_norm": 68.3445984974822, |
| "kl": 1.984375, |
| "learning_rate": 1.789479481666878e-05, |
| "loss": 0.7144, |
| "reward": 1.4272576868534088, |
| "reward_std": 1.1221881210803986, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/reasoning_steps_reward": 0.5798611342906952, |
| "rewards/repetition_penalty_reward": -0.021093112416565418, |
| "rewards/tag_count_reward": 0.5768229216337204, |
| "step": 193 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 591.0364761352539, |
| "epoch": 0.2909636295463067, |
| "grad_norm": 50.22604651230194, |
| "kl": 2.06640625, |
| "learning_rate": 1.786249544504834e-05, |
| "loss": 0.6381, |
| "reward": 1.2664842307567596, |
| "reward_std": 1.0500086545944214, |
| "rewards/accuracy_reward": 0.24479167722165585, |
| "rewards/reasoning_steps_reward": 0.5451388955116272, |
| "rewards/repetition_penalty_reward": -0.024748508352786303, |
| "rewards/tag_count_reward": 0.5013020932674408, |
| "step": 194 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 491.8958435058594, |
| "epoch": 0.2924634420697413, |
| "grad_norm": 133.86316178041838, |
| "kl": 3.8955078125, |
| "learning_rate": 1.7829979798830646e-05, |
| "loss": 0.7041, |
| "reward": 1.6353325545787811, |
| "reward_std": 1.0081156641244888, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/reasoning_steps_reward": 0.6684028208255768, |
| "rewards/repetition_penalty_reward": -0.039580670185387135, |
| "rewards/tag_count_reward": 0.631510429084301, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 519.8072967529297, |
| "epoch": 0.29396325459317585, |
| "grad_norm": 44.76317495895618, |
| "kl": 3.6826171875, |
| "learning_rate": 1.779724877242745e-05, |
| "loss": 0.6891, |
| "reward": 1.4282979369163513, |
| "reward_std": 1.150389850139618, |
| "rewards/accuracy_reward": 0.2968750111758709, |
| "rewards/reasoning_steps_reward": 0.5972222685813904, |
| "rewards/repetition_penalty_reward": -0.051736912690103054, |
| "rewards/tag_count_reward": 0.5859375149011612, |
| "step": 196 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 498.42708587646484, |
| "epoch": 0.2954630671166104, |
| "grad_norm": 48.296982691961496, |
| "kl": 0.994140625, |
| "learning_rate": 1.776430326617498e-05, |
| "loss": 0.6805, |
| "reward": 1.6272485554218292, |
| "reward_std": 1.1874802112579346, |
| "rewards/accuracy_reward": 0.4166666716337204, |
| "rewards/reasoning_steps_reward": 0.6458333432674408, |
| "rewards/repetition_penalty_reward": -0.07327230088412762, |
| "rewards/tag_count_reward": 0.6380208432674408, |
| "step": 197 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 516.9583435058594, |
| "epoch": 0.296962879640045, |
| "grad_norm": 70.18509930572817, |
| "kl": 2.08203125, |
| "learning_rate": 1.77311441863092e-05, |
| "loss": 0.7866, |
| "reward": 1.406771183013916, |
| "reward_std": 1.206770658493042, |
| "rewards/accuracy_reward": 0.3229166716337204, |
| "rewards/reasoning_steps_reward": 0.5833333730697632, |
| "rewards/repetition_penalty_reward": -0.0867184977978468, |
| "rewards/tag_count_reward": 0.587239608168602, |
| "step": 198 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 543.0312652587891, |
| "epoch": 0.2984626921634796, |
| "grad_norm": 45.26003893019006, |
| "kl": 1.7421875, |
| "learning_rate": 1.769777244494086e-05, |
| "loss": 0.6958, |
| "reward": 1.30002062022686, |
| "reward_std": 1.114796221256256, |
| "rewards/accuracy_reward": 0.2447916753590107, |
| "rewards/reasoning_steps_reward": 0.569444477558136, |
| "rewards/repetition_penalty_reward": -0.08583013713359833, |
| "rewards/tag_count_reward": 0.5716145932674408, |
| "step": 199 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 517.4166793823242, |
| "epoch": 0.29996250468691416, |
| "grad_norm": 29.376902334980826, |
| "kl": 1.3662109375, |
| "learning_rate": 1.7664188960030422e-05, |
| "loss": 0.6637, |
| "reward": 1.3883242011070251, |
| "reward_std": 1.100932002067566, |
| "rewards/accuracy_reward": 0.2552083432674408, |
| "rewards/reasoning_steps_reward": 0.5937500149011612, |
| "rewards/repetition_penalty_reward": -0.04136331286281347, |
| "rewards/tag_count_reward": 0.5807291865348816, |
| "step": 200 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 550.4062652587891, |
| "epoch": 0.3014623172103487, |
| "grad_norm": 17.485691153105112, |
| "kl": 1.34765625, |
| "learning_rate": 1.7630394655362798e-05, |
| "loss": 0.6612, |
| "reward": 1.3975980877876282, |
| "reward_std": 1.0539609044790268, |
| "rewards/accuracy_reward": 0.322916679084301, |
| "rewards/reasoning_steps_reward": 0.5434028059244156, |
| "rewards/repetition_penalty_reward": -0.031221389304846525, |
| "rewards/tag_count_reward": 0.5625000149011612, |
| "step": 201 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 527.9635543823242, |
| "epoch": 0.3029621297337833, |
| "grad_norm": 90.79232868194525, |
| "kl": 1.744140625, |
| "learning_rate": 1.7596390460521946e-05, |
| "loss": 0.7488, |
| "reward": 1.4379164278507233, |
| "reward_std": 1.1352272033691406, |
| "rewards/accuracy_reward": 0.3020833432674408, |
| "rewards/reasoning_steps_reward": 0.5781250149011612, |
| "rewards/repetition_penalty_reward": -0.025625293143093586, |
| "rewards/tag_count_reward": 0.5833333432674408, |
| "step": 202 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 531.1771087646484, |
| "epoch": 0.30446194225721784, |
| "grad_norm": 7199.74671681739, |
| "kl": 48.05078125, |
| "learning_rate": 1.7562177310865296e-05, |
| "loss": 4.0565, |
| "reward": 1.4444026350975037, |
| "reward_std": 1.1348033249378204, |
| "rewards/accuracy_reward": 0.3072916716337204, |
| "rewards/reasoning_steps_reward": 0.57118059694767, |
| "rewards/repetition_penalty_reward": -0.033027936704456806, |
| "rewards/tag_count_reward": 0.598958358168602, |
| "step": 203 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.8073043823242, |
| "epoch": 0.3059617547806524, |
| "grad_norm": 1098.2572442297885, |
| "kl": 54.375, |
| "learning_rate": 1.7527756147498026e-05, |
| "loss": 4.475, |
| "reward": 1.5475478768348694, |
| "reward_std": 1.118485450744629, |
| "rewards/accuracy_reward": 0.3645833432674408, |
| "rewards/reasoning_steps_reward": 0.6041667014360428, |
| "rewards/repetition_penalty_reward": -0.03187928069382906, |
| "rewards/tag_count_reward": 0.6106770932674408, |
| "step": 204 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 506.47396087646484, |
| "epoch": 0.30746156730408697, |
| "grad_norm": 257.4360594846129, |
| "kl": 16.2890625, |
| "learning_rate": 1.7493127917247168e-05, |
| "loss": 1.7909, |
| "reward": 1.476582944393158, |
| "reward_std": 1.1201183497905731, |
| "rewards/accuracy_reward": 0.3125000074505806, |
| "rewards/reasoning_steps_reward": 0.60243059694767, |
| "rewards/repetition_penalty_reward": -0.0255872611887753, |
| "rewards/tag_count_reward": 0.587239608168602, |
| "step": 205 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 474.36981201171875, |
| "epoch": 0.3089613798275216, |
| "grad_norm": 82.09981426436468, |
| "kl": 20.0625, |
| "learning_rate": 1.7458293572635573e-05, |
| "loss": 0.7928, |
| "reward": 1.5302923321723938, |
| "reward_std": 1.1175011098384857, |
| "rewards/accuracy_reward": 0.3177083432674408, |
| "rewards/reasoning_steps_reward": 0.625, |
| "rewards/repetition_penalty_reward": -0.03741603484377265, |
| "rewards/tag_count_reward": 0.6250000149011612, |
| "step": 206 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 488.75000762939453, |
| "epoch": 0.31046119235095615, |
| "grad_norm": 96.44909255545524, |
| "kl": 2.359375, |
| "learning_rate": 1.7423254071855696e-05, |
| "loss": 0.7126, |
| "reward": 1.6391958892345428, |
| "reward_std": 1.2212097346782684, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/reasoning_steps_reward": 0.6250000298023224, |
| "rewards/repetition_penalty_reward": -0.0313770417124033, |
| "rewards/tag_count_reward": 0.6080729365348816, |
| "step": 207 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 434.65626525878906, |
| "epoch": 0.3119610048743907, |
| "grad_norm": 120.67164118704962, |
| "kl": 24.044921875, |
| "learning_rate": 1.7388010378743255e-05, |
| "loss": 0.8024, |
| "reward": 1.6531369984149933, |
| "reward_std": 1.085878610610962, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/reasoning_steps_reward": 0.6770833432674408, |
| "rewards/repetition_penalty_reward": -0.03696717880666256, |
| "rewards/tag_count_reward": 0.638020858168602, |
| "step": 208 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 416.17189025878906, |
| "epoch": 0.3134608173978253, |
| "grad_norm": 106.78168334463734, |
| "kl": 3.224609375, |
| "learning_rate": 1.735256346275071e-05, |
| "loss": 0.8412, |
| "reward": 1.7143349051475525, |
| "reward_std": 1.07014599442482, |
| "rewards/accuracy_reward": 0.3958333507180214, |
| "rewards/reasoning_steps_reward": 0.677083358168602, |
| "rewards/repetition_penalty_reward": -0.029154742136597633, |
| "rewards/tag_count_reward": 0.6705729365348816, |
| "step": 209 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 448.38021087646484, |
| "epoch": 0.31496062992125984, |
| "grad_norm": 133.47773562969348, |
| "kl": 3.859375, |
| "learning_rate": 1.7316914298920592e-05, |
| "loss": 0.983, |
| "reward": 1.591974526643753, |
| "reward_std": 1.1013777256011963, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/reasoning_steps_reward": 0.63368059694767, |
| "rewards/repetition_penalty_reward": -0.032591511495411396, |
| "rewards/tag_count_reward": 0.6367187649011612, |
| "step": 210 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 436.5364685058594, |
| "epoch": 0.3164604424446944, |
| "grad_norm": 108.63670939149665, |
| "kl": 4.8046875, |
| "learning_rate": 1.7281063867858687e-05, |
| "loss": 0.9645, |
| "reward": 1.6065455377101898, |
| "reward_std": 1.0716453790664673, |
| "rewards/accuracy_reward": 0.3593750149011612, |
| "rewards/reasoning_steps_reward": 0.657986119389534, |
| "rewards/repetition_penalty_reward": -0.04493015632033348, |
| "rewards/tag_count_reward": 0.6341145932674408, |
| "step": 211 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 398.9948043823242, |
| "epoch": 0.31796025496812896, |
| "grad_norm": 96.14698671429733, |
| "kl": 83.048828125, |
| "learning_rate": 1.7245013155707076e-05, |
| "loss": 0.8663, |
| "reward": 1.7979373633861542, |
| "reward_std": 1.0779232680797577, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/reasoning_steps_reward": 0.7152777910232544, |
| "rewards/repetition_penalty_reward": -0.034527900628745556, |
| "rewards/tag_count_reward": 0.6796875149011612, |
| "step": 212 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 443.6145935058594, |
| "epoch": 0.3194600674915636, |
| "grad_norm": 409.81304589430863, |
| "kl": 6.1015625, |
| "learning_rate": 1.7208763154116973e-05, |
| "loss": 1.0656, |
| "reward": 1.603056788444519, |
| "reward_std": 1.083729773759842, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/reasoning_steps_reward": 0.645833358168602, |
| "rewards/repetition_penalty_reward": -0.031057825777679682, |
| "rewards/tag_count_reward": 0.6132812798023224, |
| "step": 213 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.0364685058594, |
| "epoch": 0.32095988001499814, |
| "grad_norm": 57.764921292245205, |
| "kl": 1.998046875, |
| "learning_rate": 1.7172314860221494e-05, |
| "loss": 0.7731, |
| "reward": 1.5737330913543701, |
| "reward_std": 1.0800977945327759, |
| "rewards/accuracy_reward": 0.3229166865348816, |
| "rewards/reasoning_steps_reward": 0.64930559694767, |
| "rewards/repetition_penalty_reward": -0.03781210444867611, |
| "rewards/tag_count_reward": 0.6393229365348816, |
| "step": 214 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 478.05731201171875, |
| "epoch": 0.3224596925384327, |
| "grad_norm": 50.79407157857832, |
| "kl": 7.47265625, |
| "learning_rate": 1.713566927660818e-05, |
| "loss": 0.7083, |
| "reward": 1.4539363086223602, |
| "reward_std": 1.0611682534217834, |
| "rewards/accuracy_reward": 0.2447916679084301, |
| "rewards/reasoning_steps_reward": 0.6440972536802292, |
| "rewards/repetition_penalty_reward": -0.03391093295067549, |
| "rewards/tag_count_reward": 0.5989583432674408, |
| "step": 215 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 418.6614761352539, |
| "epoch": 0.32395950506186727, |
| "grad_norm": 62.35489423972326, |
| "kl": 1.40625, |
| "learning_rate": 1.7098827411291474e-05, |
| "loss": 0.7932, |
| "reward": 1.7967810034751892, |
| "reward_std": 1.071500152349472, |
| "rewards/accuracy_reward": 0.5052083432674408, |
| "rewards/reasoning_steps_reward": 0.6666667014360428, |
| "rewards/repetition_penalty_reward": -0.0443648905493319, |
| "rewards/tag_count_reward": 0.6692708432674408, |
| "step": 216 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 465.15625762939453, |
| "epoch": 0.32545931758530183, |
| "grad_norm": 523.1686817311862, |
| "kl": 14.51171875, |
| "learning_rate": 1.7061790277684935e-05, |
| "loss": 1.1443, |
| "reward": 1.7601740062236786, |
| "reward_std": 0.9184492155909538, |
| "rewards/accuracy_reward": 0.463541679084301, |
| "rewards/reasoning_steps_reward": 0.6875000298023224, |
| "rewards/repetition_penalty_reward": -0.03670105990022421, |
| "rewards/tag_count_reward": 0.645833358168602, |
| "step": 217 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 440.9166793823242, |
| "epoch": 0.3269591301087364, |
| "grad_norm": 138.26908034820204, |
| "kl": 2.8447265625, |
| "learning_rate": 1.7024558894573408e-05, |
| "loss": 0.8425, |
| "reward": 1.6384324729442596, |
| "reward_std": 1.0819440335035324, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/reasoning_steps_reward": 0.675347238779068, |
| "rewards/repetition_penalty_reward": -0.035612753592431545, |
| "rewards/tag_count_reward": 0.6236979365348816, |
| "step": 218 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.1927185058594, |
| "epoch": 0.32845894263217096, |
| "grad_norm": 36.752519571585786, |
| "kl": 1.0751953125, |
| "learning_rate": 1.698713428608497e-05, |
| "loss": 0.6927, |
| "reward": 1.7269428670406342, |
| "reward_std": 1.1123250424861908, |
| "rewards/accuracy_reward": 0.4427083507180214, |
| "rewards/reasoning_steps_reward": 0.6840278208255768, |
| "rewards/repetition_penalty_reward": -0.036512063816189766, |
| "rewards/tag_count_reward": 0.6367187798023224, |
| "step": 219 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 428.6510467529297, |
| "epoch": 0.3299587551556056, |
| "grad_norm": 27.095258985089583, |
| "kl": 1.2314453125, |
| "learning_rate": 1.694951748166278e-05, |
| "loss": 0.6542, |
| "reward": 1.7653653621673584, |
| "reward_std": 1.0500112771987915, |
| "rewards/accuracy_reward": 0.4270833432674408, |
| "rewards/reasoning_steps_reward": 0.734375, |
| "rewards/repetition_penalty_reward": -0.0458325962536037, |
| "rewards/tag_count_reward": 0.649739608168602, |
| "step": 220 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.37500762939453, |
| "epoch": 0.33145856767904014, |
| "grad_norm": 47.16742116134022, |
| "kl": 2.20703125, |
| "learning_rate": 1.6911709516036755e-05, |
| "loss": 0.7411, |
| "reward": 1.8477334082126617, |
| "reward_std": 1.0211681723594666, |
| "rewards/accuracy_reward": 0.4427083432674408, |
| "rewards/reasoning_steps_reward": 0.769097238779068, |
| "rewards/repetition_penalty_reward": -0.04245765320956707, |
| "rewards/tag_count_reward": 0.6783854365348816, |
| "step": 221 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 397.3645935058594, |
| "epoch": 0.3329583802024747, |
| "grad_norm": 45.5053118676597, |
| "kl": 2.3515625, |
| "learning_rate": 1.6873711429195095e-05, |
| "loss": 0.8572, |
| "reward": 1.7332542836666107, |
| "reward_std": 1.0097034871578217, |
| "rewards/accuracy_reward": 0.3906250074505806, |
| "rewards/reasoning_steps_reward": 0.7309028059244156, |
| "rewards/repetition_penalty_reward": -0.03801308758556843, |
| "rewards/tag_count_reward": 0.649739608168602, |
| "step": 222 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.78126525878906, |
| "epoch": 0.33445819272590926, |
| "grad_norm": 2129.317338646113, |
| "kl": 12.60546875, |
| "learning_rate": 1.6835524266355698e-05, |
| "loss": 1.8881, |
| "reward": 1.8117730021476746, |
| "reward_std": 0.9651193022727966, |
| "rewards/accuracy_reward": 0.4687500149011612, |
| "rewards/reasoning_steps_reward": 0.7083333283662796, |
| "rewards/repetition_penalty_reward": -0.03978960122913122, |
| "rewards/tag_count_reward": 0.6744791716337204, |
| "step": 223 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.84375762939453, |
| "epoch": 0.3359580052493438, |
| "grad_norm": 341.57819859952355, |
| "kl": 2.9423828125, |
| "learning_rate": 1.6797149077937395e-05, |
| "loss": 0.5076, |
| "reward": 1.869988203048706, |
| "reward_std": 0.9940178692340851, |
| "rewards/accuracy_reward": 0.4687500074505806, |
| "rewards/reasoning_steps_reward": 0.765625, |
| "rewards/repetition_penalty_reward": -0.05188690684735775, |
| "rewards/tag_count_reward": 0.6875000149011612, |
| "step": 224 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 330.26563262939453, |
| "epoch": 0.3374578177727784, |
| "grad_norm": 430.46296061670853, |
| "kl": 2.5078125, |
| "learning_rate": 1.6758586919531054e-05, |
| "loss": 0.5861, |
| "reward": 1.754932165145874, |
| "reward_std": 0.967212975025177, |
| "rewards/accuracy_reward": 0.4114583432674408, |
| "rewards/reasoning_steps_reward": 0.7517361342906952, |
| "rewards/repetition_penalty_reward": -0.03847070410847664, |
| "rewards/tag_count_reward": 0.6302083432674408, |
| "step": 225 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 345.4322929382324, |
| "epoch": 0.33895763029621295, |
| "grad_norm": 357.14681488285794, |
| "kl": 9.0390625, |
| "learning_rate": 1.671983885187055e-05, |
| "loss": 0.4549, |
| "reward": 1.7626317143440247, |
| "reward_std": 0.9779582172632217, |
| "rewards/accuracy_reward": 0.432291679084301, |
| "rewards/reasoning_steps_reward": 0.7309027910232544, |
| "rewards/repetition_penalty_reward": -0.035979412496089935, |
| "rewards/tag_count_reward": 0.6354167014360428, |
| "step": 226 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 446.6510467529297, |
| "epoch": 0.34045744281964757, |
| "grad_norm": 155.57552226255254, |
| "kl": 1.998046875, |
| "learning_rate": 1.6680905940803596e-05, |
| "loss": 0.7187, |
| "reward": 1.6927993595600128, |
| "reward_std": 1.1329753398895264, |
| "rewards/accuracy_reward": 0.4843750074505806, |
| "rewards/reasoning_steps_reward": 0.6996527910232544, |
| "rewards/repetition_penalty_reward": -0.04721810668706894, |
| "rewards/tag_count_reward": 0.555989608168602, |
| "step": 227 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 400.7708435058594, |
| "epoch": 0.34195725534308213, |
| "grad_norm": 52.313264872011814, |
| "kl": 1.01904296875, |
| "learning_rate": 1.66417892572624e-05, |
| "loss": 0.6703, |
| "reward": 1.7310058176517487, |
| "reward_std": 0.9815861284732819, |
| "rewards/accuracy_reward": 0.411458358168602, |
| "rewards/reasoning_steps_reward": 0.7361111491918564, |
| "rewards/repetition_penalty_reward": -0.05458456836640835, |
| "rewards/tag_count_reward": 0.6380208432674408, |
| "step": 228 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 362.2291793823242, |
| "epoch": 0.3434570678665167, |
| "grad_norm": 3.414097906304966, |
| "kl": 0.44677734375, |
| "learning_rate": 1.6602489877234235e-05, |
| "loss": 0.5655, |
| "reward": 1.9467909336090088, |
| "reward_std": 0.9363191574811935, |
| "rewards/accuracy_reward": 0.494791679084301, |
| "rewards/reasoning_steps_reward": 0.8229166865348816, |
| "rewards/repetition_penalty_reward": -0.04409462306648493, |
| "rewards/tag_count_reward": 0.6731770932674408, |
| "step": 229 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.7916793823242, |
| "epoch": 0.34495688038995126, |
| "grad_norm": 1.2104539856778693, |
| "kl": 0.3408203125, |
| "learning_rate": 1.656300888173181e-05, |
| "loss": 0.4857, |
| "reward": 1.8293315470218658, |
| "reward_std": 0.9052053391933441, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/reasoning_steps_reward": 0.7795139253139496, |
| "rewards/repetition_penalty_reward": -0.04393233545124531, |
| "rewards/tag_count_reward": 0.6562500298023224, |
| "step": 230 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 432.2864761352539, |
| "epoch": 0.3464566929133858, |
| "grad_norm": 2.412360731904819, |
| "kl": 0.49462890625, |
| "learning_rate": 1.6523347356763572e-05, |
| "loss": 0.5985, |
| "reward": 1.8154855072498322, |
| "reward_std": 1.1332524120807648, |
| "rewards/accuracy_reward": 0.5000000223517418, |
| "rewards/reasoning_steps_reward": 0.723958358168602, |
| "rewards/repetition_penalty_reward": -0.0517019834369421, |
| "rewards/tag_count_reward": 0.6432291865348816, |
| "step": 231 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 395.9271011352539, |
| "epoch": 0.3479565054368204, |
| "grad_norm": 4.9562590681699294, |
| "kl": 0.408203125, |
| "learning_rate": 1.6483506393303807e-05, |
| "loss": 0.6441, |
| "reward": 1.9554131031036377, |
| "reward_std": 0.9711915105581284, |
| "rewards/accuracy_reward": 0.484375, |
| "rewards/reasoning_steps_reward": 0.7968750149011612, |
| "rewards/repetition_penalty_reward": -0.04458692017942667, |
| "rewards/tag_count_reward": 0.7187500298023224, |
| "step": 232 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 427.2552261352539, |
| "epoch": 0.34945631796025495, |
| "grad_norm": 7.911974161092371, |
| "kl": 0.58642578125, |
| "learning_rate": 1.644348708726263e-05, |
| "loss": 0.5395, |
| "reward": 1.9124858379364014, |
| "reward_std": 1.0289543271064758, |
| "rewards/accuracy_reward": 0.5052083507180214, |
| "rewards/reasoning_steps_reward": 0.7673611491918564, |
| "rewards/repetition_penalty_reward": -0.046281606424599886, |
| "rewards/tag_count_reward": 0.6861979365348816, |
| "step": 233 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 377.78126525878906, |
| "epoch": 0.35095613048368957, |
| "grad_norm": 10.19550375864435, |
| "kl": 0.98681640625, |
| "learning_rate": 1.640329053945585e-05, |
| "loss": 0.5746, |
| "reward": 1.934269905090332, |
| "reward_std": 1.0438069850206375, |
| "rewards/accuracy_reward": 0.4687500074505806, |
| "rewards/reasoning_steps_reward": 0.8072916865348816, |
| "rewards/repetition_penalty_reward": -0.08526142686605453, |
| "rewards/tag_count_reward": 0.743489608168602, |
| "step": 234 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 407.5208511352539, |
| "epoch": 0.35245594300712413, |
| "grad_norm": 8.119699899556887, |
| "kl": 0.48193359375, |
| "learning_rate": 1.6362917855574694e-05, |
| "loss": 0.8051, |
| "reward": 1.804307907819748, |
| "reward_std": 1.5109181106090546, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.7447916865348816, |
| "rewards/repetition_penalty_reward": -0.24647344648838043, |
| "rewards/tag_count_reward": 0.7070312649011612, |
| "step": 235 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.2135543823242, |
| "epoch": 0.3539557555305587, |
| "grad_norm": 11.968853066603838, |
| "kl": 1.50634765625, |
| "learning_rate": 1.6322370146155372e-05, |
| "loss": 0.6385, |
| "reward": 1.9699607491493225, |
| "reward_std": 1.293946921825409, |
| "rewards/accuracy_reward": 0.5729166865348816, |
| "rewards/reasoning_steps_reward": 0.8072916865348816, |
| "rewards/repetition_penalty_reward": -0.19279972463846207, |
| "rewards/tag_count_reward": 0.782552108168602, |
| "step": 236 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.5416793823242, |
| "epoch": 0.35545556805399325, |
| "grad_norm": 10.830086232933182, |
| "kl": 0.59326171875, |
| "learning_rate": 1.6281648526548556e-05, |
| "loss": 0.5964, |
| "reward": 2.065182775259018, |
| "reward_std": 1.1769357174634933, |
| "rewards/accuracy_reward": 0.5781250298023224, |
| "rewards/reasoning_steps_reward": 0.8420139104127884, |
| "rewards/repetition_penalty_reward": -0.17266453802585602, |
| "rewards/tag_count_reward": 0.8177083432674408, |
| "step": 237 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.87500762939453, |
| "epoch": 0.3569553805774278, |
| "grad_norm": 13.320898103061964, |
| "kl": 0.56640625, |
| "learning_rate": 1.6240754116888673e-05, |
| "loss": 0.5948, |
| "reward": 1.9682828783988953, |
| "reward_std": 1.2442002594470978, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/reasoning_steps_reward": 0.8263888955116272, |
| "rewards/repetition_penalty_reward": -0.20315821841359138, |
| "rewards/tag_count_reward": 0.8033854216337204, |
| "step": 238 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.1354217529297, |
| "epoch": 0.3584551931008624, |
| "grad_norm": 12.247593958285009, |
| "kl": 0.85302734375, |
| "learning_rate": 1.6199688042063118e-05, |
| "loss": 0.6045, |
| "reward": 1.9835784435272217, |
| "reward_std": 1.2273003607988358, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/reasoning_steps_reward": 0.8263889253139496, |
| "rewards/repetition_penalty_reward": -0.1995813064277172, |
| "rewards/tag_count_reward": 0.8151041865348816, |
| "step": 239 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 385.89583587646484, |
| "epoch": 0.35995500562429694, |
| "grad_norm": 8.226772677936836, |
| "kl": 0.62060546875, |
| "learning_rate": 1.6158451431681292e-05, |
| "loss": 0.5773, |
| "reward": 1.98191300034523, |
| "reward_std": 1.175494760274887, |
| "rewards/accuracy_reward": 0.526041679084301, |
| "rewards/reasoning_steps_reward": 0.8437500149011612, |
| "rewards/repetition_penalty_reward": -0.19256622344255447, |
| "rewards/tag_count_reward": 0.8046875149011612, |
| "step": 240 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 338.96876525878906, |
| "epoch": 0.36145481814773156, |
| "grad_norm": 2.961510864903608, |
| "kl": 0.4189453125, |
| "learning_rate": 1.6117045420043545e-05, |
| "loss": 0.4682, |
| "reward": 2.1378689408302307, |
| "reward_std": 0.9543499946594238, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.8784722238779068, |
| "rewards/repetition_penalty_reward": -0.1546658743172884, |
| "rewards/tag_count_reward": 0.8515625149011612, |
| "step": 241 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.7760467529297, |
| "epoch": 0.3629546306711661, |
| "grad_norm": 4.568175543895563, |
| "kl": 0.7275390625, |
| "learning_rate": 1.6075471146109957e-05, |
| "loss": 0.5672, |
| "reward": 1.996113270521164, |
| "reward_std": 0.9025778025388718, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.7881944626569748, |
| "rewards/repetition_penalty_reward": -0.13192503154277802, |
| "rewards/tag_count_reward": 0.7929687798023224, |
| "step": 242 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 317.5260543823242, |
| "epoch": 0.3644544431946007, |
| "grad_norm": 6.725239225005341, |
| "kl": 0.38720703125, |
| "learning_rate": 1.603372975346903e-05, |
| "loss": 0.4668, |
| "reward": 2.1939920783042908, |
| "reward_std": 0.7260987460613251, |
| "rewards/accuracy_reward": 0.4635416865348816, |
| "rewards/reasoning_steps_reward": 0.9236111342906952, |
| "rewards/repetition_penalty_reward": -0.06946289446204901, |
| "rewards/tag_count_reward": 0.8763020932674408, |
| "step": 243 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 336.2552185058594, |
| "epoch": 0.36595425571803525, |
| "grad_norm": 6.395489506905219, |
| "kl": 0.546875, |
| "learning_rate": 1.599182239030621e-05, |
| "loss": 0.5301, |
| "reward": 2.0523171722888947, |
| "reward_std": 0.8000431656837463, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/reasoning_steps_reward": 0.8888888955116272, |
| "rewards/repetition_penalty_reward": -0.07094682566821575, |
| "rewards/tag_count_reward": 0.8385417014360428, |
| "step": 244 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 336.4635543823242, |
| "epoch": 0.3674540682414698, |
| "grad_norm": 1.7231966906953229, |
| "kl": 0.44287109375, |
| "learning_rate": 1.594975020937233e-05, |
| "loss": 0.4191, |
| "reward": 2.3042526245117188, |
| "reward_std": 0.7582926452159882, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.8576389104127884, |
| "rewards/repetition_penalty_reward": -0.07031344994902611, |
| "rewards/tag_count_reward": 0.860677108168602, |
| "step": 245 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 297.7083435058594, |
| "epoch": 0.3689538807649044, |
| "grad_norm": 78.49085937534032, |
| "kl": 0.81298828125, |
| "learning_rate": 1.590751436795186e-05, |
| "loss": 0.5026, |
| "reward": 2.197529286146164, |
| "reward_std": 0.8267102539539337, |
| "rewards/accuracy_reward": 0.4895833358168602, |
| "rewards/reasoning_steps_reward": 0.8923611342906952, |
| "rewards/repetition_penalty_reward": -0.07373820524662733, |
| "rewards/tag_count_reward": 0.8893229365348816, |
| "step": 246 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 352.8229293823242, |
| "epoch": 0.37045369328833894, |
| "grad_norm": 12.08436983860603, |
| "kl": 0.72998046875, |
| "learning_rate": 1.5865116027831123e-05, |
| "loss": 0.6898, |
| "reward": 2.0662292540073395, |
| "reward_std": 1.1430340111255646, |
| "rewards/accuracy_reward": 0.5312500074505806, |
| "rewards/reasoning_steps_reward": 0.8368055671453476, |
| "rewards/repetition_penalty_reward": -0.1273471899330616, |
| "rewards/tag_count_reward": 0.825520858168602, |
| "step": 247 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 339.5208435058594, |
| "epoch": 0.37195350581177355, |
| "grad_norm": 2.931100861119855, |
| "kl": 0.4052734375, |
| "learning_rate": 1.5822556355266302e-05, |
| "loss": 0.5591, |
| "reward": 2.0680652260780334, |
| "reward_std": 1.04281547665596, |
| "rewards/accuracy_reward": 0.5208333507180214, |
| "rewards/reasoning_steps_reward": 0.8576388955116272, |
| "rewards/repetition_penalty_reward": -0.15025084279477596, |
| "rewards/tag_count_reward": 0.8398437649011612, |
| "step": 248 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.25000762939453, |
| "epoch": 0.3734533183352081, |
| "grad_norm": 661.2695289482906, |
| "kl": 5.46142578125, |
| "learning_rate": 1.577983652095137e-05, |
| "loss": 0.7068, |
| "reward": 1.943572849035263, |
| "reward_std": 1.0748744010925293, |
| "rewards/accuracy_reward": 0.4270833507180214, |
| "rewards/reasoning_steps_reward": 0.861111119389534, |
| "rewards/repetition_penalty_reward": -0.1805591806769371, |
| "rewards/tag_count_reward": 0.8359375298023224, |
| "step": 249 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.01043701171875, |
| "epoch": 0.3749531308586427, |
| "grad_norm": 3.771451390000111, |
| "kl": 0.345703125, |
| "learning_rate": 1.5736957699985887e-05, |
| "loss": 0.4327, |
| "reward": 2.2907695174217224, |
| "reward_std": 0.8615385890007019, |
| "rewards/accuracy_reward": 0.6041666865348816, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.13761597499251366, |
| "rewards/tag_count_reward": 0.8971354365348816, |
| "step": 250 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.3489761352539, |
| "epoch": 0.37645294338207724, |
| "grad_norm": 194.96197462696594, |
| "kl": 0.50048828125, |
| "learning_rate": 1.5693921071842688e-05, |
| "loss": 0.6103, |
| "reward": 2.0306463837623596, |
| "reward_std": 1.2375063300132751, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.850694477558136, |
| "rewards/repetition_penalty_reward": -0.20155855640769005, |
| "rewards/tag_count_reward": 0.8190104216337204, |
| "step": 251 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.0833511352539, |
| "epoch": 0.3779527559055118, |
| "grad_norm": 14.689129884295669, |
| "kl": 0.41357421875, |
| "learning_rate": 1.5650727820335417e-05, |
| "loss": 0.6158, |
| "reward": 1.993709921836853, |
| "reward_std": 1.1109019815921783, |
| "rewards/accuracy_reward": 0.5104166716337204, |
| "rewards/reasoning_steps_reward": 0.831597238779068, |
| "rewards/repetition_penalty_reward": -0.18554353341460228, |
| "rewards/tag_count_reward": 0.837239608168602, |
| "step": 252 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 388.89064025878906, |
| "epoch": 0.37945256842894637, |
| "grad_norm": 14.608873859558672, |
| "kl": 0.4140625, |
| "learning_rate": 1.5607379133585978e-05, |
| "loss": 0.7007, |
| "reward": 2.0233902037143707, |
| "reward_std": 1.345583826303482, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.835069477558136, |
| "rewards/repetition_penalty_reward": -0.21662724763154984, |
| "rewards/tag_count_reward": 0.7955729365348816, |
| "step": 253 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.6927261352539, |
| "epoch": 0.38095238095238093, |
| "grad_norm": 10.596812649365775, |
| "kl": 0.3828125, |
| "learning_rate": 1.5563876203991856e-05, |
| "loss": 0.6745, |
| "reward": 1.9966700077056885, |
| "reward_std": 1.2292464971542358, |
| "rewards/accuracy_reward": 0.5208333507180214, |
| "rewards/reasoning_steps_reward": 0.8454861640930176, |
| "rewards/repetition_penalty_reward": -0.18735776841640472, |
| "rewards/tag_count_reward": 0.8177083432674408, |
| "step": 254 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.64064025878906, |
| "epoch": 0.38245219347581555, |
| "grad_norm": 6.633191911603578, |
| "kl": 0.36279296875, |
| "learning_rate": 1.55202202281933e-05, |
| "loss": 0.5756, |
| "reward": 2.1044811606407166, |
| "reward_std": 1.2099826782941818, |
| "rewards/accuracy_reward": 0.6041666939854622, |
| "rewards/reasoning_steps_reward": 0.8472222536802292, |
| "rewards/repetition_penalty_reward": -0.1776370257139206, |
| "rewards/tag_count_reward": 0.8307291865348816, |
| "step": 255 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 411.07813262939453, |
| "epoch": 0.3839520059992501, |
| "grad_norm": 1.8892823585301353, |
| "kl": 0.4150390625, |
| "learning_rate": 1.5476412407040445e-05, |
| "loss": 0.7152, |
| "reward": 1.811952918767929, |
| "reward_std": 1.4293029010295868, |
| "rewards/accuracy_reward": 0.572916679084301, |
| "rewards/reasoning_steps_reward": 0.7361111491918564, |
| "rewards/repetition_penalty_reward": -0.2223353162407875, |
| "rewards/tag_count_reward": 0.7252604365348816, |
| "step": 256 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.4166793823242, |
| "epoch": 0.3854518185226847, |
| "grad_norm": 2.9450172922219524, |
| "kl": 0.57177734375, |
| "learning_rate": 1.5432453945560223e-05, |
| "loss": 0.6283, |
| "reward": 1.983522891998291, |
| "reward_std": 1.2600146383047104, |
| "rewards/accuracy_reward": 0.5833333507180214, |
| "rewards/reasoning_steps_reward": 0.7777778208255768, |
| "rewards/repetition_penalty_reward": -0.13019242137670517, |
| "rewards/tag_count_reward": 0.7526041865348816, |
| "step": 257 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.9739685058594, |
| "epoch": 0.38695163104611924, |
| "grad_norm": 4.635567397393551, |
| "kl": 0.43603515625, |
| "learning_rate": 1.5388346052923268e-05, |
| "loss": 0.5593, |
| "reward": 2.0053739845752716, |
| "reward_std": 1.152603343129158, |
| "rewards/accuracy_reward": 0.5677083507180214, |
| "rewards/reasoning_steps_reward": 0.7847222536802292, |
| "rewards/repetition_penalty_reward": -0.0866400208324194, |
| "rewards/tag_count_reward": 0.7395833432674408, |
| "step": 258 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 511.49481201171875, |
| "epoch": 0.3884514435695538, |
| "grad_norm": 3.270106987863464, |
| "kl": 0.4833984375, |
| "learning_rate": 1.534408994241063e-05, |
| "loss": 0.4957, |
| "reward": 1.8226185142993927, |
| "reward_std": 1.1505940705537796, |
| "rewards/accuracy_reward": 0.447916679084301, |
| "rewards/reasoning_steps_reward": 0.7465278059244156, |
| "rewards/repetition_penalty_reward": -0.07495102658867836, |
| "rewards/tag_count_reward": 0.7031250149011612, |
| "step": 259 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 485.95314025878906, |
| "epoch": 0.38995125609298836, |
| "grad_norm": 2.75945020139557, |
| "kl": 0.47900390625, |
| "learning_rate": 1.5299686831380395e-05, |
| "loss": 0.4981, |
| "reward": 1.8642282485961914, |
| "reward_std": 1.083889901638031, |
| "rewards/accuracy_reward": 0.4427083432674408, |
| "rewards/reasoning_steps_reward": 0.7708333730697632, |
| "rewards/repetition_penalty_reward": -0.08108432777225971, |
| "rewards/tag_count_reward": 0.731770858168602, |
| "step": 260 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 483.5677185058594, |
| "epoch": 0.3914510686164229, |
| "grad_norm": 2.796799480256457, |
| "kl": 1.189453125, |
| "learning_rate": 1.5255137941234228e-05, |
| "loss": 0.3239, |
| "reward": 2.0941615104675293, |
| "reward_std": 1.0367512106895447, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.817708358168602, |
| "rewards/repetition_penalty_reward": -0.08292189985513687, |
| "rewards/tag_count_reward": 0.796875, |
| "step": 261 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 490.7343978881836, |
| "epoch": 0.39295088113985754, |
| "grad_norm": 1.775868555651766, |
| "kl": 0.47119140625, |
| "learning_rate": 1.5210444497383745e-05, |
| "loss": 0.4116, |
| "reward": 2.075753331184387, |
| "reward_std": 1.1199206858873367, |
| "rewards/accuracy_reward": 0.5625000074505806, |
| "rewards/reasoning_steps_reward": 0.8177083432674408, |
| "rewards/repetition_penalty_reward": -0.12216338887810707, |
| "rewards/tag_count_reward": 0.8177083432674408, |
| "step": 262 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 496.3958511352539, |
| "epoch": 0.3944506936632921, |
| "grad_norm": 33.57969017695382, |
| "kl": 2.5966796875, |
| "learning_rate": 1.5165607729216822e-05, |
| "loss": 0.4809, |
| "reward": 1.9561425149440765, |
| "reward_std": 1.0842882692813873, |
| "rewards/accuracy_reward": 0.432291679084301, |
| "rewards/reasoning_steps_reward": 0.8541666865348816, |
| "rewards/repetition_penalty_reward": -0.15453467145562172, |
| "rewards/tag_count_reward": 0.8242187649011612, |
| "step": 263 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 445.7448043823242, |
| "epoch": 0.39595050618672667, |
| "grad_norm": 5.931757796863058, |
| "kl": 15.0380859375, |
| "learning_rate": 1.5120628870063772e-05, |
| "loss": 0.285, |
| "reward": 2.2481858134269714, |
| "reward_std": 0.9314106553792953, |
| "rewards/accuracy_reward": 0.6145833507180214, |
| "rewards/reasoning_steps_reward": 0.9184027910232544, |
| "rewards/repetition_penalty_reward": -0.13115456514060497, |
| "rewards/tag_count_reward": 0.8463541865348816, |
| "step": 264 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 430.67188262939453, |
| "epoch": 0.39745031871016123, |
| "grad_norm": 4.7870139797358275, |
| "kl": 0.74755859375, |
| "learning_rate": 1.5075509157163422e-05, |
| "loss": 0.3762, |
| "reward": 2.189552366733551, |
| "reward_std": 1.0156923830509186, |
| "rewards/accuracy_reward": 0.6145833432674408, |
| "rewards/reasoning_steps_reward": 0.897569477558136, |
| "rewards/repetition_penalty_reward": -0.14942347817122936, |
| "rewards/tag_count_reward": 0.8268229365348816, |
| "step": 265 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 438.3958435058594, |
| "epoch": 0.3989501312335958, |
| "grad_norm": 7.646687103422767, |
| "kl": 0.68505859375, |
| "learning_rate": 1.503024983162908e-05, |
| "loss": 0.4374, |
| "reward": 2.135951668024063, |
| "reward_std": 1.0654807686805725, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.8906250149011612, |
| "rewards/repetition_penalty_reward": -0.17654842138290405, |
| "rewards/tag_count_reward": 0.8281250149011612, |
| "step": 266 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 425.43751525878906, |
| "epoch": 0.40044994375703036, |
| "grad_norm": 6.4245183766465415, |
| "kl": 0.4541015625, |
| "learning_rate": 1.49848521384144e-05, |
| "loss": 0.4737, |
| "reward": 2.125093847513199, |
| "reward_std": 1.0813192874193192, |
| "rewards/accuracy_reward": 0.5677083507180214, |
| "rewards/reasoning_steps_reward": 0.8906250298023224, |
| "rewards/repetition_penalty_reward": -0.16396865621209145, |
| "rewards/tag_count_reward": 0.8307292014360428, |
| "step": 267 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.36458587646484, |
| "epoch": 0.4019497562804649, |
| "grad_norm": 0.963320833591549, |
| "kl": 0.65771484375, |
| "learning_rate": 1.4939317326279125e-05, |
| "loss": 0.3726, |
| "reward": 2.1674315333366394, |
| "reward_std": 0.8946598768234253, |
| "rewards/accuracy_reward": 0.5781250074505806, |
| "rewards/reasoning_steps_reward": 0.8697916865348816, |
| "rewards/repetition_penalty_reward": -0.12423517927527428, |
| "rewards/tag_count_reward": 0.8437500298023224, |
| "step": 268 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 435.85938262939453, |
| "epoch": 0.40344956880389954, |
| "grad_norm": 0.7554295329920303, |
| "kl": 0.61767578125, |
| "learning_rate": 1.489364664775475e-05, |
| "loss": 0.3679, |
| "reward": 1.9763840436935425, |
| "reward_std": 0.8836520612239838, |
| "rewards/accuracy_reward": 0.588541679084301, |
| "rewards/reasoning_steps_reward": 0.8715278059244156, |
| "rewards/repetition_penalty_reward": -0.14514389261603355, |
| "rewards/tag_count_reward": 0.6614583432674408, |
| "step": 269 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 461.4635543823242, |
| "epoch": 0.4049493813273341, |
| "grad_norm": 1.4465486237113498, |
| "kl": 1.75048828125, |
| "learning_rate": 1.4847841359110058e-05, |
| "loss": 0.2759, |
| "reward": 2.053072690963745, |
| "reward_std": 0.9807803928852081, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.8576388955116272, |
| "rewards/repetition_penalty_reward": -0.1131600309163332, |
| "rewards/tag_count_reward": 0.6523437649011612, |
| "step": 270 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 526.7500152587891, |
| "epoch": 0.40644919385076866, |
| "grad_norm": 3.0949957954601928, |
| "kl": 0.60693359375, |
| "learning_rate": 1.480190272031657e-05, |
| "loss": 0.3736, |
| "reward": 1.8765326142311096, |
| "reward_std": 1.0632285475730896, |
| "rewards/accuracy_reward": 0.4739583469927311, |
| "rewards/reasoning_steps_reward": 0.8211805522441864, |
| "rewards/repetition_penalty_reward": -0.1126167606562376, |
| "rewards/tag_count_reward": 0.6940104216337204, |
| "step": 271 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 598.4635772705078, |
| "epoch": 0.4079490063742032, |
| "grad_norm": 2.8462767374545037, |
| "kl": 0.43017578125, |
| "learning_rate": 1.475583199501389e-05, |
| "loss": 0.2644, |
| "reward": 1.9981749355793, |
| "reward_std": 1.0920402854681015, |
| "rewards/accuracy_reward": 0.567708358168602, |
| "rewards/reasoning_steps_reward": 0.8072916865348816, |
| "rewards/repetition_penalty_reward": -0.13463760912418365, |
| "rewards/tag_count_reward": 0.7578125298023224, |
| "step": 272 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 624.3802337646484, |
| "epoch": 0.4094488188976378, |
| "grad_norm": 1.682946550152949, |
| "kl": 0.3154296875, |
| "learning_rate": 1.4709630450474936e-05, |
| "loss": 0.3079, |
| "reward": 1.9099436402320862, |
| "reward_std": 1.3163824081420898, |
| "rewards/accuracy_reward": 0.5989583507180214, |
| "rewards/reasoning_steps_reward": 0.7760416865348816, |
| "rewards/repetition_penalty_reward": -0.1812022142112255, |
| "rewards/tag_count_reward": 0.716145858168602, |
| "step": 273 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 662.1250152587891, |
| "epoch": 0.41094863142107235, |
| "grad_norm": 0.8488180233484801, |
| "kl": 0.421875, |
| "learning_rate": 1.466329935757109e-05, |
| "loss": 0.2648, |
| "reward": 1.6805398762226105, |
| "reward_std": 1.3386418521404266, |
| "rewards/accuracy_reward": 0.4687500074505806, |
| "rewards/reasoning_steps_reward": 0.7447916865348816, |
| "rewards/repetition_penalty_reward": -0.24263722822070122, |
| "rewards/tag_count_reward": 0.7096354365348816, |
| "step": 274 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 623.0469055175781, |
| "epoch": 0.4124484439445069, |
| "grad_norm": 3.2203198278586647, |
| "kl": 0.39111328125, |
| "learning_rate": 1.4616839990737232e-05, |
| "loss": 0.2989, |
| "reward": 1.7977931797504425, |
| "reward_std": 1.3433575332164764, |
| "rewards/accuracy_reward": 0.4739583358168602, |
| "rewards/reasoning_steps_reward": 0.7916666865348816, |
| "rewards/repetition_penalty_reward": -0.22955061122775078, |
| "rewards/tag_count_reward": 0.7617187649011612, |
| "step": 275 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 542.328125, |
| "epoch": 0.41394825646794153, |
| "grad_norm": 1.263395821949462, |
| "kl": 0.92236328125, |
| "learning_rate": 1.4570253627936693e-05, |
| "loss": 0.2418, |
| "reward": 2.0932159423828125, |
| "reward_std": 1.0350356549024582, |
| "rewards/accuracy_reward": 0.5416666865348816, |
| "rewards/reasoning_steps_reward": 0.8593750298023224, |
| "rewards/repetition_penalty_reward": -0.168502826243639, |
| "rewards/tag_count_reward": 0.860677108168602, |
| "step": 276 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 587.0781402587891, |
| "epoch": 0.4154480689913761, |
| "grad_norm": 0.6161220934962688, |
| "kl": 0.4345703125, |
| "learning_rate": 1.4523541550626093e-05, |
| "loss": 0.1862, |
| "reward": 2.1349116563796997, |
| "reward_std": 0.9175398647785187, |
| "rewards/accuracy_reward": 0.5677083432674408, |
| "rewards/reasoning_steps_reward": 0.8854166865348816, |
| "rewards/repetition_penalty_reward": -0.18409884721040726, |
| "rewards/tag_count_reward": 0.8658854365348816, |
| "step": 277 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 481.4427185058594, |
| "epoch": 0.41694788151481066, |
| "grad_norm": 0.6544307412376995, |
| "kl": 1.18701171875, |
| "learning_rate": 1.4476705043720099e-05, |
| "loss": 0.1448, |
| "reward": 2.3552486300468445, |
| "reward_std": 0.7834517806768417, |
| "rewards/accuracy_reward": 0.6614583507180214, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.1252202671021223, |
| "rewards/tag_count_reward": 0.8919270932674408, |
| "step": 278 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 469.12501525878906, |
| "epoch": 0.4184476940382452, |
| "grad_norm": 1.6940544921635285, |
| "kl": 0.65087890625, |
| "learning_rate": 1.4429745395556073e-05, |
| "loss": 0.1976, |
| "reward": 2.295401096343994, |
| "reward_std": 0.7886304408311844, |
| "rewards/accuracy_reward": 0.6145833432674408, |
| "rewards/reasoning_steps_reward": 0.9079861342906952, |
| "rewards/repetition_penalty_reward": -0.12300180085003376, |
| "rewards/tag_count_reward": 0.8958333432674408, |
| "step": 279 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 444.00521087646484, |
| "epoch": 0.4199475065616798, |
| "grad_norm": 2.02027175437092, |
| "kl": 0.3291015625, |
| "learning_rate": 1.4382663897858647e-05, |
| "loss": 0.0773, |
| "reward": 2.3556065559387207, |
| "reward_std": 0.5934457406401634, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.10489701479673386, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 280 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 408.82814025878906, |
| "epoch": 0.42144731908511435, |
| "grad_norm": 2.1520369019481316, |
| "kl": 0.3623046875, |
| "learning_rate": 1.4335461845704173e-05, |
| "loss": 0.3092, |
| "reward": 2.3092455863952637, |
| "reward_std": 0.8222566097974777, |
| "rewards/accuracy_reward": 0.6197916865348816, |
| "rewards/reasoning_steps_reward": 0.9149305820465088, |
| "rewards/repetition_penalty_reward": -0.1161017045378685, |
| "rewards/tag_count_reward": 0.8906250298023224, |
| "step": 281 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 389.1302185058594, |
| "epoch": 0.4229471316085489, |
| "grad_norm": 2.1074908824998912, |
| "kl": 0.31689453125, |
| "learning_rate": 1.428814053748512e-05, |
| "loss": 0.2336, |
| "reward": 2.3464564085006714, |
| "reward_std": 0.8324245363473892, |
| "rewards/accuracy_reward": 0.6666666716337204, |
| "rewards/reasoning_steps_reward": 0.9166666865348816, |
| "rewards/repetition_penalty_reward": -0.13140839524567127, |
| "rewards/tag_count_reward": 0.8945312798023224, |
| "step": 282 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 382.51564025878906, |
| "epoch": 0.42444694413198353, |
| "grad_norm": 2.4209991865943645, |
| "kl": 0.30810546875, |
| "learning_rate": 1.4240701274874331e-05, |
| "loss": 0.3742, |
| "reward": 2.1715636253356934, |
| "reward_std": 0.9031594395637512, |
| "rewards/accuracy_reward": 0.5729166865348816, |
| "rewards/reasoning_steps_reward": 0.8697916865348816, |
| "rewards/repetition_penalty_reward": -0.1435406319797039, |
| "rewards/tag_count_reward": 0.872395858168602, |
| "step": 283 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 351.92188262939453, |
| "epoch": 0.4259467566554181, |
| "grad_norm": 2.6366018633296884, |
| "kl": 0.48486328125, |
| "learning_rate": 1.419314536278925e-05, |
| "loss": 0.5396, |
| "reward": 2.206935942173004, |
| "reward_std": 1.114256203174591, |
| "rewards/accuracy_reward": 0.6510416865348816, |
| "rewards/reasoning_steps_reward": 0.8559028208255768, |
| "rewards/repetition_penalty_reward": -0.1619878150522709, |
| "rewards/tag_count_reward": 0.8619791865348816, |
| "step": 284 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 305.18751525878906, |
| "epoch": 0.42744656917885265, |
| "grad_norm": 1.5196765033206825, |
| "kl": 0.53564453125, |
| "learning_rate": 1.4145474109356008e-05, |
| "loss": 0.2239, |
| "reward": 2.3387808799743652, |
| "reward_std": 0.7637557685375214, |
| "rewards/accuracy_reward": 0.6458333432674408, |
| "rewards/reasoning_steps_reward": 0.9131944626569748, |
| "rewards/repetition_penalty_reward": -0.10696578957140446, |
| "rewards/tag_count_reward": 0.8867187649011612, |
| "step": 285 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 367.5989685058594, |
| "epoch": 0.4289463817022872, |
| "grad_norm": 1.1289332268594263, |
| "kl": 0.37158203125, |
| "learning_rate": 1.4097688825873437e-05, |
| "loss": 0.3786, |
| "reward": 2.217616856098175, |
| "reward_std": 0.9250410199165344, |
| "rewards/accuracy_reward": 0.6354166865348816, |
| "rewards/reasoning_steps_reward": 0.897569477558136, |
| "rewards/repetition_penalty_reward": -0.1643276885151863, |
| "rewards/tag_count_reward": 0.848958358168602, |
| "step": 286 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.53126525878906, |
| "epoch": 0.4304461942257218, |
| "grad_norm": 0.8172511360961945, |
| "kl": 0.3134765625, |
| "learning_rate": 1.4049790826777016e-05, |
| "loss": 0.316, |
| "reward": 2.058010071516037, |
| "reward_std": 0.7540641278028488, |
| "rewards/accuracy_reward": 0.4270833432674408, |
| "rewards/reasoning_steps_reward": 0.911458358168602, |
| "rewards/repetition_penalty_reward": -0.13990668766200542, |
| "rewards/tag_count_reward": 0.8593750298023224, |
| "step": 287 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.4479293823242, |
| "epoch": 0.43194600674915634, |
| "grad_norm": 2.8494106639784276, |
| "kl": 0.578125, |
| "learning_rate": 1.4001781429602704e-05, |
| "loss": 0.2977, |
| "reward": 2.2450991570949554, |
| "reward_std": 0.8115597367286682, |
| "rewards/accuracy_reward": 0.5572916716337204, |
| "rewards/reasoning_steps_reward": 0.9270833283662796, |
| "rewards/repetition_penalty_reward": -0.11036965623497963, |
| "rewards/tag_count_reward": 0.8710937649011612, |
| "step": 288 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 362.0573043823242, |
| "epoch": 0.4334458192725909, |
| "grad_norm": 1.5771836903904428, |
| "kl": 0.34228515625, |
| "learning_rate": 1.3953661954950693e-05, |
| "loss": 0.213, |
| "reward": 2.2435672879219055, |
| "reward_std": 0.7003582268953323, |
| "rewards/accuracy_reward": 0.5260416865348816, |
| "rewards/reasoning_steps_reward": 0.9461805820465088, |
| "rewards/repetition_penalty_reward": -0.11407159268856049, |
| "rewards/tag_count_reward": 0.8854166865348816, |
| "step": 289 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.3020935058594, |
| "epoch": 0.4349456317960255, |
| "grad_norm": 1.307478764620523, |
| "kl": 0.40576171875, |
| "learning_rate": 1.3905433726449102e-05, |
| "loss": 0.1718, |
| "reward": 2.420071065425873, |
| "reward_std": 0.5883182883262634, |
| "rewards/accuracy_reward": 0.6041666865348816, |
| "rewards/reasoning_steps_reward": 0.9652777761220932, |
| "rewards/repetition_penalty_reward": -0.08426924794912338, |
| "rewards/tag_count_reward": 0.934895858168602, |
| "step": 290 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.4270935058594, |
| "epoch": 0.4364454443194601, |
| "grad_norm": 1.066920831840518, |
| "kl": 0.69140625, |
| "learning_rate": 1.3857098070717543e-05, |
| "loss": 0.2395, |
| "reward": 2.4870429635047913, |
| "reward_std": 0.7747205495834351, |
| "rewards/accuracy_reward": 0.7291666939854622, |
| "rewards/reasoning_steps_reward": 0.9461805820465088, |
| "rewards/repetition_penalty_reward": -0.10106483101844788, |
| "rewards/tag_count_reward": 0.9127604365348816, |
| "step": 291 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.6822967529297, |
| "epoch": 0.43794525684289465, |
| "grad_norm": 0.737418903498002, |
| "kl": 0.33349609375, |
| "learning_rate": 1.3808656317330646e-05, |
| "loss": 0.1173, |
| "reward": 2.453067660331726, |
| "reward_std": 0.43977494165301323, |
| "rewards/accuracy_reward": 0.6406250298023224, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.10465812310576439, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 292 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 384.39063262939453, |
| "epoch": 0.4394450693663292, |
| "grad_norm": 1.8989977209226134, |
| "kl": 0.5810546875, |
| "learning_rate": 1.3760109798781489e-05, |
| "loss": 0.3604, |
| "reward": 2.2495803833007812, |
| "reward_std": 0.849450945854187, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.13974259793758392, |
| "rewards/tag_count_reward": 0.8997395932674408, |
| "step": 293 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 369.5833435058594, |
| "epoch": 0.4409448818897638, |
| "grad_norm": 0.5939727210211808, |
| "kl": 0.7841796875, |
| "learning_rate": 1.3711459850444923e-05, |
| "loss": 0.2594, |
| "reward": 2.291065901517868, |
| "reward_std": 0.7707606852054596, |
| "rewards/accuracy_reward": 0.5416666716337204, |
| "rewards/reasoning_steps_reward": 0.9479166865348816, |
| "rewards/repetition_penalty_reward": -0.11648627929389477, |
| "rewards/tag_count_reward": 0.9179687649011612, |
| "step": 294 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 386.82813262939453, |
| "epoch": 0.44244469441319834, |
| "grad_norm": 2.0394609526475236, |
| "kl": 0.73681640625, |
| "learning_rate": 1.3662707810540867e-05, |
| "loss": 0.3294, |
| "reward": 2.276679277420044, |
| "reward_std": 1.02875255048275, |
| "rewards/accuracy_reward": 0.6354166865348816, |
| "rewards/reasoning_steps_reward": 0.8888889104127884, |
| "rewards/repetition_penalty_reward": -0.13825136795639992, |
| "rewards/tag_count_reward": 0.8906250298023224, |
| "step": 295 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 359.828125, |
| "epoch": 0.4439445069366329, |
| "grad_norm": 0.9901125024422729, |
| "kl": 0.392578125, |
| "learning_rate": 1.3613855020097477e-05, |
| "loss": 0.2768, |
| "reward": 2.5605881214141846, |
| "reward_std": 0.7392320334911346, |
| "rewards/accuracy_reward": 0.8177083432674408, |
| "rewards/reasoning_steps_reward": 0.9392361491918564, |
| "rewards/repetition_penalty_reward": -0.12083561439067125, |
| "rewards/tag_count_reward": 0.9244791716337204, |
| "step": 296 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.4427185058594, |
| "epoch": 0.4454443194600675, |
| "grad_norm": 0.6075657581629706, |
| "kl": 1.2001953125, |
| "learning_rate": 1.3564902822914274e-05, |
| "loss": 0.3837, |
| "reward": 2.383248746395111, |
| "reward_std": 1.0118870586156845, |
| "rewards/accuracy_reward": 0.7239583432674408, |
| "rewards/reasoning_steps_reward": 0.907986119389534, |
| "rewards/repetition_penalty_reward": -0.1510394662618637, |
| "rewards/tag_count_reward": 0.9023437798023224, |
| "step": 297 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 421.82813262939453, |
| "epoch": 0.4469441319835021, |
| "grad_norm": 0.7992325861558693, |
| "kl": 0.39208984375, |
| "learning_rate": 1.3515852565525167e-05, |
| "loss": 0.4008, |
| "reward": 2.303019016981125, |
| "reward_std": 1.0580395609140396, |
| "rewards/accuracy_reward": 0.723958358168602, |
| "rewards/reasoning_steps_reward": 0.8854166716337204, |
| "rewards/repetition_penalty_reward": -0.1839602366089821, |
| "rewards/tag_count_reward": 0.8776042014360428, |
| "step": 298 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.6666793823242, |
| "epoch": 0.44844394450693664, |
| "grad_norm": 0.7019719444995071, |
| "kl": 0.45654296875, |
| "learning_rate": 1.3466705597161416e-05, |
| "loss": 0.0816, |
| "reward": 2.5587180852890015, |
| "reward_std": 0.4348931238055229, |
| "rewards/accuracy_reward": 0.6927083656191826, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.08971942402422428, |
| "rewards/tag_count_reward": 0.9765625149011612, |
| "step": 299 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 360.20833587646484, |
| "epoch": 0.4499437570303712, |
| "grad_norm": 0.8473895074586508, |
| "kl": 0.51953125, |
| "learning_rate": 1.3417463269714525e-05, |
| "loss": 0.1846, |
| "reward": 2.5228134989738464, |
| "reward_std": 0.6405130326747894, |
| "rewards/accuracy_reward": 0.6927083432674408, |
| "rewards/reasoning_steps_reward": 0.9652777910232544, |
| "rewards/repetition_penalty_reward": -0.09350610896945, |
| "rewards/tag_count_reward": 0.9583333432674408, |
| "step": 300 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 404.3698043823242, |
| "epoch": 0.45144356955380577, |
| "grad_norm": 1.59719244073744, |
| "kl": 0.48876953125, |
| "learning_rate": 1.3368126937699055e-05, |
| "loss": 0.3579, |
| "reward": 2.2757957577705383, |
| "reward_std": 0.9483175575733185, |
| "rewards/accuracy_reward": 0.6145833507180214, |
| "rewards/reasoning_steps_reward": 0.904513880610466, |
| "rewards/repetition_penalty_reward": -0.14694742858409882, |
| "rewards/tag_count_reward": 0.903645858168602, |
| "step": 301 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 378.29688262939453, |
| "epoch": 0.45294338207724033, |
| "grad_norm": 0.47093010825160103, |
| "kl": 0.86376953125, |
| "learning_rate": 1.3318697958215358e-05, |
| "loss": 0.3148, |
| "reward": 2.259345531463623, |
| "reward_std": 1.0043332874774933, |
| "rewards/accuracy_reward": 0.5885416716337204, |
| "rewards/reasoning_steps_reward": 0.9062500298023224, |
| "rewards/repetition_penalty_reward": -0.13258161395788193, |
| "rewards/tag_count_reward": 0.8971354514360428, |
| "step": 302 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.34376525878906, |
| "epoch": 0.4544431946006749, |
| "grad_norm": 1.6840109923746656, |
| "kl": 0.615234375, |
| "learning_rate": 1.3269177690912244e-05, |
| "loss": 0.5099, |
| "reward": 2.000718355178833, |
| "reward_std": 1.0972090363502502, |
| "rewards/accuracy_reward": 0.494791679084301, |
| "rewards/reasoning_steps_reward": 0.8593750149011612, |
| "rewards/repetition_penalty_reward": -0.20370880514383316, |
| "rewards/tag_count_reward": 0.8502604514360428, |
| "step": 303 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 503.51564025878906, |
| "epoch": 0.4559430071241095, |
| "grad_norm": 0.8957611247139153, |
| "kl": 0.5615234375, |
| "learning_rate": 1.3219567497949603e-05, |
| "loss": 0.5607, |
| "reward": 1.748295396566391, |
| "reward_std": 1.3299023509025574, |
| "rewards/accuracy_reward": 0.4270833469927311, |
| "rewards/reasoning_steps_reward": 0.8368055522441864, |
| "rewards/repetition_penalty_reward": -0.311166375875473, |
| "rewards/tag_count_reward": 0.7955729514360428, |
| "step": 304 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 463.82813262939453, |
| "epoch": 0.4574428196475441, |
| "grad_norm": 0.7521310552587273, |
| "kl": 0.373046875, |
| "learning_rate": 1.3169868743960904e-05, |
| "loss": 0.5557, |
| "reward": 2.0611188113689423, |
| "reward_std": 1.065147504210472, |
| "rewards/accuracy_reward": 0.5364583432674408, |
| "rewards/reasoning_steps_reward": 0.9236111342906952, |
| "rewards/repetition_penalty_reward": -0.25572141259908676, |
| "rewards/tag_count_reward": 0.856770858168602, |
| "step": 305 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 403.75001525878906, |
| "epoch": 0.45894263217097864, |
| "grad_norm": 0.531106231660718, |
| "kl": 0.33935546875, |
| "learning_rate": 1.3120082796015694e-05, |
| "loss": 0.4341, |
| "reward": 2.1974433958530426, |
| "reward_std": 0.9661893397569656, |
| "rewards/accuracy_reward": 0.5468750074505806, |
| "rewards/reasoning_steps_reward": 0.9270833432674408, |
| "rewards/repetition_penalty_reward": -0.17234836518764496, |
| "rewards/tag_count_reward": 0.895833358168602, |
| "step": 306 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 389.5104293823242, |
| "epoch": 0.4604424446944132, |
| "grad_norm": 0.9868194546337975, |
| "kl": 8.66748046875, |
| "learning_rate": 1.3070211023581959e-05, |
| "loss": 0.5127, |
| "reward": 2.096434473991394, |
| "reward_std": 1.0376380234956741, |
| "rewards/accuracy_reward": 0.5000000260770321, |
| "rewards/reasoning_steps_reward": 0.9027778208255768, |
| "rewards/repetition_penalty_reward": -0.18134328350424767, |
| "rewards/tag_count_reward": 0.8750000149011612, |
| "step": 307 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.2239761352539, |
| "epoch": 0.46194225721784776, |
| "grad_norm": 1.1602735630677965, |
| "kl": 0.60205078125, |
| "learning_rate": 1.302025479848847e-05, |
| "loss": 0.6158, |
| "reward": 2.2226256132125854, |
| "reward_std": 1.0068841725587845, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.9236111491918564, |
| "rewards/repetition_penalty_reward": -0.15801683440804482, |
| "rewards/tag_count_reward": 0.8945312649011612, |
| "step": 308 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 365.1145935058594, |
| "epoch": 0.4634420697412823, |
| "grad_norm": 0.5983467148707611, |
| "kl": 0.783203125, |
| "learning_rate": 1.2970215494887057e-05, |
| "loss": 0.4409, |
| "reward": 2.213726222515106, |
| "reward_std": 0.9471542239189148, |
| "rewards/accuracy_reward": 0.5312500149011612, |
| "rewards/reasoning_steps_reward": 0.9375000298023224, |
| "rewards/repetition_penalty_reward": -0.1339300237596035, |
| "rewards/tag_count_reward": 0.8789062649011612, |
| "step": 309 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 383.40626525878906, |
| "epoch": 0.4649418822647169, |
| "grad_norm": 0.5734600272497099, |
| "kl": 0.890625, |
| "learning_rate": 1.2920094489214794e-05, |
| "loss": 0.4356, |
| "reward": 2.073159396648407, |
| "reward_std": 1.0565957874059677, |
| "rewards/accuracy_reward": 0.4843750223517418, |
| "rewards/reasoning_steps_reward": 0.9062500298023224, |
| "rewards/repetition_penalty_reward": -0.16642405465245247, |
| "rewards/tag_count_reward": 0.848958358168602, |
| "step": 310 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.56250762939453, |
| "epoch": 0.4664416947881515, |
| "grad_norm": 0.7211277806322656, |
| "kl": 0.6845703125, |
| "learning_rate": 1.2869893160156144e-05, |
| "loss": 0.2848, |
| "reward": 2.1854459047317505, |
| "reward_std": 0.8107419461011887, |
| "rewards/accuracy_reward": 0.4375000149011612, |
| "rewards/reasoning_steps_reward": 0.9322916865348816, |
| "rewards/repetition_penalty_reward": -0.1036166287958622, |
| "rewards/tag_count_reward": 0.9192708432674408, |
| "step": 311 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.3333511352539, |
| "epoch": 0.46794150731158607, |
| "grad_norm": 1.8610423986745928, |
| "kl": 0.54345703125, |
| "learning_rate": 1.2819612888605038e-05, |
| "loss": 0.4624, |
| "reward": 2.25553822517395, |
| "reward_std": 1.062050774693489, |
| "rewards/accuracy_reward": 0.6406250149011612, |
| "rewards/reasoning_steps_reward": 0.880208358168602, |
| "rewards/repetition_penalty_reward": -0.13769106939435005, |
| "rewards/tag_count_reward": 0.872395858168602, |
| "step": 312 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 269.31771087646484, |
| "epoch": 0.46944131983502063, |
| "grad_norm": 1.106507460021578, |
| "kl": 1.8193359375, |
| "learning_rate": 1.2769255057626879e-05, |
| "loss": 0.0841, |
| "reward": 2.6005271077156067, |
| "reward_std": 0.4192248545587063, |
| "rewards/accuracy_reward": 0.7291666939854622, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.06353538297116756, |
| "rewards/tag_count_reward": 0.9661458432674408, |
| "step": 313 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.1979217529297, |
| "epoch": 0.4709411323584552, |
| "grad_norm": 11.712890420404275, |
| "kl": 12.896484375, |
| "learning_rate": 1.2718821052420518e-05, |
| "loss": 0.4463, |
| "reward": 2.492148220539093, |
| "reward_std": 0.6998696699738503, |
| "rewards/accuracy_reward": 0.6822917014360428, |
| "rewards/reasoning_steps_reward": 0.9461805671453476, |
| "rewards/repetition_penalty_reward": -0.0842407438904047, |
| "rewards/tag_count_reward": 0.9479167014360428, |
| "step": 314 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 265.77605056762695, |
| "epoch": 0.47244094488188976, |
| "grad_norm": 0.6975411820310382, |
| "kl": 1.18310546875, |
| "learning_rate": 1.2668312260280136e-05, |
| "loss": 0.1567, |
| "reward": 2.5813058018684387, |
| "reward_std": 0.6979668289422989, |
| "rewards/accuracy_reward": 0.7656250149011612, |
| "rewards/reasoning_steps_reward": 0.9375, |
| "rewards/repetition_penalty_reward": -0.07103797234594822, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 315 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 237.0989646911621, |
| "epoch": 0.4739407574053243, |
| "grad_norm": 0.9429145740895207, |
| "kl": 0.6083984375, |
| "learning_rate": 1.2617730070557079e-05, |
| "loss": 0.1167, |
| "reward": 2.5124951004981995, |
| "reward_std": 0.5172496251761913, |
| "rewards/accuracy_reward": 0.619791679084301, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.05391131527721882, |
| "rewards/tag_count_reward": 0.97265625, |
| "step": 316 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.03125762939453, |
| "epoch": 0.4754405699287589, |
| "grad_norm": 0.8535133809055152, |
| "kl": 0.49755859375, |
| "learning_rate": 1.2567075874621658e-05, |
| "loss": 0.1603, |
| "reward": 2.632077991962433, |
| "reward_std": 0.5518362149596214, |
| "rewards/accuracy_reward": 0.7656250298023224, |
| "rewards/reasoning_steps_reward": 0.9687500447034836, |
| "rewards/repetition_penalty_reward": -0.06583872064948082, |
| "rewards/tag_count_reward": 0.9635416716337204, |
| "step": 317 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 311.51564025878906, |
| "epoch": 0.4769403824521935, |
| "grad_norm": 0.7780541688251532, |
| "kl": 0.5283203125, |
| "learning_rate": 1.2516351065824864e-05, |
| "loss": 0.1901, |
| "reward": 2.402199685573578, |
| "reward_std": 0.6393345445394516, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.9548611491918564, |
| "rewards/repetition_penalty_reward": -0.10083861276507378, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 318 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 299.9947967529297, |
| "epoch": 0.47844019497562806, |
| "grad_norm": 0.7104797645236225, |
| "kl": 0.8134765625, |
| "learning_rate": 1.2465557039460048e-05, |
| "loss": 0.3304, |
| "reward": 2.40736585855484, |
| "reward_std": 0.7196965366601944, |
| "rewards/accuracy_reward": 0.6197916865348816, |
| "rewards/reasoning_steps_reward": 0.94618059694767, |
| "rewards/repetition_penalty_reward": -0.10001271404325962, |
| "rewards/tag_count_reward": 0.9414062798023224, |
| "step": 319 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.98439025878906, |
| "epoch": 0.4799400074990626, |
| "grad_norm": 0.7194211623846433, |
| "kl": 0.42431640625, |
| "learning_rate": 1.241469519272453e-05, |
| "loss": 0.5949, |
| "reward": 2.308800458908081, |
| "reward_std": 0.9084680825471878, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9427083432674408, |
| "rewards/repetition_penalty_reward": -0.1495329923927784, |
| "rewards/tag_count_reward": 0.9062500298023224, |
| "step": 320 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.56771087646484, |
| "epoch": 0.4814398200224972, |
| "grad_norm": 0.9593534999944733, |
| "kl": 0.48095703125, |
| "learning_rate": 1.2363766924681178e-05, |
| "loss": 0.5733, |
| "reward": 2.4493626952171326, |
| "reward_std": 0.9347756206989288, |
| "rewards/accuracy_reward": 0.7447917014360428, |
| "rewards/reasoning_steps_reward": 0.9496527910232544, |
| "rewards/repetition_penalty_reward": -0.15914421156048775, |
| "rewards/tag_count_reward": 0.9140625298023224, |
| "step": 321 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.7448043823242, |
| "epoch": 0.48293963254593175, |
| "grad_norm": 4.081284787550109, |
| "kl": 0.697265625, |
| "learning_rate": 1.2312773636219919e-05, |
| "loss": 0.5756, |
| "reward": 2.2688207626342773, |
| "reward_std": 0.8435943201184273, |
| "rewards/accuracy_reward": 0.5364583507180214, |
| "rewards/reasoning_steps_reward": 0.9288194924592972, |
| "rewards/repetition_penalty_reward": -0.12614460662007332, |
| "rewards/tag_count_reward": 0.9296875149011612, |
| "step": 322 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.8489685058594, |
| "epoch": 0.4844394450693663, |
| "grad_norm": 1.404497176795523, |
| "kl": 1.14306640625, |
| "learning_rate": 1.2261716730019202e-05, |
| "loss": 0.5032, |
| "reward": 2.3895642161369324, |
| "reward_std": 0.8716422617435455, |
| "rewards/accuracy_reward": 0.6770833432674408, |
| "rewards/reasoning_steps_reward": 0.9479167014360428, |
| "rewards/repetition_penalty_reward": -0.14819632470607758, |
| "rewards/tag_count_reward": 0.9127604216337204, |
| "step": 323 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.86459732055664, |
| "epoch": 0.4859392575928009, |
| "grad_norm": 1.926446222656527, |
| "kl": 0.4814453125, |
| "learning_rate": 1.2210597610507418e-05, |
| "loss": 0.4739, |
| "reward": 2.306950032711029, |
| "reward_std": 0.7142234891653061, |
| "rewards/accuracy_reward": 0.510416679084301, |
| "rewards/reasoning_steps_reward": 0.9548611491918564, |
| "rewards/repetition_penalty_reward": -0.10884871147572994, |
| "rewards/tag_count_reward": 0.950520858168602, |
| "step": 324 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.2187614440918, |
| "epoch": 0.4874390701162355, |
| "grad_norm": 5.256123203383221, |
| "kl": 0.7255859375, |
| "learning_rate": 1.2159417683824266e-05, |
| "loss": 0.5998, |
| "reward": 2.4224319458007812, |
| "reward_std": 0.787856787443161, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/reasoning_steps_reward": 0.9357639104127884, |
| "rewards/repetition_penalty_reward": -0.12270701117813587, |
| "rewards/tag_count_reward": 0.942708358168602, |
| "step": 325 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 327.9114685058594, |
| "epoch": 0.48893888263967006, |
| "grad_norm": 9.312199613733055, |
| "kl": 1.2685546875, |
| "learning_rate": 1.2108178357782079e-05, |
| "loss": 0.8976, |
| "reward": 1.884236216545105, |
| "reward_std": 1.2027931809425354, |
| "rewards/accuracy_reward": 0.4375000074505806, |
| "rewards/reasoning_steps_reward": 0.8020833730697632, |
| "rewards/repetition_penalty_reward": -0.2108159288764, |
| "rewards/tag_count_reward": 0.8554687649011612, |
| "step": 326 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 275.9791717529297, |
| "epoch": 0.4904386951631046, |
| "grad_norm": 15.398650924864615, |
| "kl": 0.73828125, |
| "learning_rate": 1.205688104182709e-05, |
| "loss": 0.7201, |
| "reward": 2.0736686289310455, |
| "reward_std": 0.9543730318546295, |
| "rewards/accuracy_reward": 0.4635416939854622, |
| "rewards/reasoning_steps_reward": 0.866319477558136, |
| "rewards/repetition_penalty_reward": -0.1533280350267887, |
| "rewards/tag_count_reward": 0.8971354365348816, |
| "step": 327 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 241.10417556762695, |
| "epoch": 0.4919385076865392, |
| "grad_norm": 37.30122920300535, |
| "kl": 0.7822265625, |
| "learning_rate": 1.2005527147000663e-05, |
| "loss": 0.7101, |
| "reward": 2.245617926120758, |
| "reward_std": 0.8258701711893082, |
| "rewards/accuracy_reward": 0.552083358168602, |
| "rewards/reasoning_steps_reward": 0.9027778059244156, |
| "rewards/repetition_penalty_reward": -0.12070165574550629, |
| "rewards/tag_count_reward": 0.911458358168602, |
| "step": 328 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 215.0520896911621, |
| "epoch": 0.49343832020997375, |
| "grad_norm": 60.05024321951368, |
| "kl": 0.625, |
| "learning_rate": 1.1954118085900503e-05, |
| "loss": 0.5475, |
| "reward": 2.432074725627899, |
| "reward_std": 0.7434251010417938, |
| "rewards/accuracy_reward": 0.6354167014360428, |
| "rewards/reasoning_steps_reward": 0.9479166865348816, |
| "rewards/repetition_penalty_reward": -0.08224830403923988, |
| "rewards/tag_count_reward": 0.9309895932674408, |
| "step": 329 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.40625381469727, |
| "epoch": 0.4949381327334083, |
| "grad_norm": 98.14120807441263, |
| "kl": 0.5615234375, |
| "learning_rate": 1.1902655272641756e-05, |
| "loss": 0.5814, |
| "reward": 2.4700867533683777, |
| "reward_std": 0.6907303482294083, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.9392361342906952, |
| "rewards/repetition_penalty_reward": -0.08112853486090899, |
| "rewards/tag_count_reward": 0.9401041865348816, |
| "step": 330 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 227.70833587646484, |
| "epoch": 0.49643794525684287, |
| "grad_norm": 28.380941791399632, |
| "kl": 0.796875, |
| "learning_rate": 1.1851140122818155e-05, |
| "loss": 0.6384, |
| "reward": 2.440628468990326, |
| "reward_std": 0.5516751855611801, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.9652777761220932, |
| "rewards/repetition_penalty_reward": -0.05069108493626118, |
| "rewards/tag_count_reward": 0.9322917014360428, |
| "step": 331 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.9635467529297, |
| "epoch": 0.4979377577802775, |
| "grad_norm": 90.64584397678644, |
| "kl": 0.7421875, |
| "learning_rate": 1.1799574053463048e-05, |
| "loss": 0.2312, |
| "reward": 2.4951672554016113, |
| "reward_std": 0.4933183267712593, |
| "rewards/accuracy_reward": 0.6145833432674408, |
| "rewards/reasoning_steps_reward": 0.9774305820465088, |
| "rewards/repetition_penalty_reward": -0.04476337507367134, |
| "rewards/tag_count_reward": 0.9479166865348816, |
| "step": 332 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 198.1822967529297, |
| "epoch": 0.49943757030371205, |
| "grad_norm": 47.44537979910489, |
| "kl": 0.79296875, |
| "learning_rate": 1.1747958483010438e-05, |
| "loss": 0.1878, |
| "reward": 2.4803009629249573, |
| "reward_std": 0.4909311309456825, |
| "rewards/accuracy_reward": 0.614583358168602, |
| "rewards/reasoning_steps_reward": 0.9687500447034836, |
| "rewards/repetition_penalty_reward": -0.054855335503816605, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 333 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.3489646911621, |
| "epoch": 0.5009373828271466, |
| "grad_norm": 320.06224533206546, |
| "kl": 2.462890625, |
| "learning_rate": 1.1696294831255961e-05, |
| "loss": 0.9267, |
| "reward": 2.6039064526557922, |
| "reward_std": 0.5018587484955788, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/reasoning_steps_reward": 0.960069477558136, |
| "rewards/repetition_penalty_reward": -0.04366318695247173, |
| "rewards/tag_count_reward": 0.9375000298023224, |
| "step": 334 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 173.62500381469727, |
| "epoch": 0.5024371953505812, |
| "grad_norm": 1405.8210110352265, |
| "kl": 1.1259765625, |
| "learning_rate": 1.1644584519317828e-05, |
| "loss": 0.2665, |
| "reward": 2.5448213815689087, |
| "reward_std": 0.4410114288330078, |
| "rewards/accuracy_reward": 0.6302083358168602, |
| "rewards/reasoning_steps_reward": 0.9809027761220932, |
| "rewards/repetition_penalty_reward": -0.03764389827847481, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 335 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.0885467529297, |
| "epoch": 0.5039370078740157, |
| "grad_norm": 266.36597808183785, |
| "kl": 1.8984375, |
| "learning_rate": 1.159282896959774e-05, |
| "loss": 0.6665, |
| "reward": 2.542451798915863, |
| "reward_std": 0.4349544197320938, |
| "rewards/accuracy_reward": 0.6562500223517418, |
| "rewards/reasoning_steps_reward": 0.9826388955116272, |
| "rewards/repetition_penalty_reward": -0.04695798270404339, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 336 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.6666717529297, |
| "epoch": 0.5054368203974503, |
| "grad_norm": 124.39297508584644, |
| "kl": 1.185546875, |
| "learning_rate": 1.1541029605741758e-05, |
| "loss": 0.6041, |
| "reward": 2.4829863905906677, |
| "reward_std": 0.5544790178537369, |
| "rewards/accuracy_reward": 0.6093750298023224, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.043055362068116665, |
| "rewards/tag_count_reward": 0.9375000298023224, |
| "step": 337 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 182.04167556762695, |
| "epoch": 0.5069366329208849, |
| "grad_norm": 178.0490848911099, |
| "kl": 0.923828125, |
| "learning_rate": 1.1489187852601147e-05, |
| "loss": 0.5043, |
| "reward": 2.643836498260498, |
| "reward_std": 0.4666139706969261, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9756944626569748, |
| "rewards/repetition_penalty_reward": -0.03237887378782034, |
| "rewards/tag_count_reward": 0.9557292014360428, |
| "step": 338 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 168.6822967529297, |
| "epoch": 0.5084364454443194, |
| "grad_norm": 33.126865780616335, |
| "kl": 0.7275390625, |
| "learning_rate": 1.143730513619317e-05, |
| "loss": 0.2163, |
| "reward": 2.5825945138931274, |
| "reward_std": 0.41466938704252243, |
| "rewards/accuracy_reward": 0.6666666716337204, |
| "rewards/reasoning_steps_reward": 0.97743059694767, |
| "rewards/repetition_penalty_reward": -0.04587779473513365, |
| "rewards/tag_count_reward": 0.9843750149011612, |
| "step": 339 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 199.1666717529297, |
| "epoch": 0.509936257967754, |
| "grad_norm": 100.89545618937568, |
| "kl": 1.208984375, |
| "learning_rate": 1.1385382883661881e-05, |
| "loss": 0.6377, |
| "reward": 2.652986526489258, |
| "reward_std": 0.45409294590353966, |
| "rewards/accuracy_reward": 0.7968750149011612, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.05144065525382757, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 340 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.04687881469727, |
| "epoch": 0.5114360704911886, |
| "grad_norm": 294.488166826066, |
| "kl": 0.87890625, |
| "learning_rate": 1.1333422523238858e-05, |
| "loss": 0.3826, |
| "reward": 2.572603166103363, |
| "reward_std": 0.42086203396320343, |
| "rewards/accuracy_reward": 0.6614583432674408, |
| "rewards/reasoning_steps_reward": 0.9774305820465088, |
| "rewards/repetition_penalty_reward": -0.04154628235846758, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 341 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.58333587646484, |
| "epoch": 0.5129358830146231, |
| "grad_norm": 56.33307741176176, |
| "kl": 1.21875, |
| "learning_rate": 1.1281425484203908e-05, |
| "loss": 0.4977, |
| "reward": 2.42953622341156, |
| "reward_std": 0.4737429544329643, |
| "rewards/accuracy_reward": 0.5572916716337204, |
| "rewards/reasoning_steps_reward": 0.972222238779068, |
| "rewards/repetition_penalty_reward": -0.060915243811905384, |
| "rewards/tag_count_reward": 0.9609375149011612, |
| "step": 342 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 166.45313262939453, |
| "epoch": 0.5144356955380578, |
| "grad_norm": 7.667419781427111, |
| "kl": 0.6015625, |
| "learning_rate": 1.122939319684577e-05, |
| "loss": 0.1143, |
| "reward": 2.5116847157478333, |
| "reward_std": 0.43062853813171387, |
| "rewards/accuracy_reward": 0.6302083507180214, |
| "rewards/reasoning_steps_reward": 0.9513888955116272, |
| "rewards/repetition_penalty_reward": -0.05428746622055769, |
| "rewards/tag_count_reward": 0.9843750149011612, |
| "step": 343 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 178.51562881469727, |
| "epoch": 0.5159355080614924, |
| "grad_norm": 23.5357959938352, |
| "kl": 1.126953125, |
| "learning_rate": 1.1177327092422761e-05, |
| "loss": 0.3605, |
| "reward": 2.4563730359077454, |
| "reward_std": 0.4659854732453823, |
| "rewards/accuracy_reward": 0.572916679084301, |
| "rewards/reasoning_steps_reward": 0.9600694477558136, |
| "rewards/repetition_penalty_reward": -0.04275886481627822, |
| "rewards/tag_count_reward": 0.9661458432674408, |
| "step": 344 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.5104217529297, |
| "epoch": 0.5174353205849269, |
| "grad_norm": 31.72517924718908, |
| "kl": 0.88671875, |
| "learning_rate": 1.1125228603123408e-05, |
| "loss": 0.3874, |
| "reward": 2.5461575388908386, |
| "reward_std": 0.5080005489289761, |
| "rewards/accuracy_reward": 0.6875000298023224, |
| "rewards/reasoning_steps_reward": 0.9444444626569748, |
| "rewards/repetition_penalty_reward": -0.05193278752267361, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 345 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 180.89062881469727, |
| "epoch": 0.5189351331083615, |
| "grad_norm": 87.98939762631694, |
| "kl": 0.7275390625, |
| "learning_rate": 1.107309916202705e-05, |
| "loss": 0.381, |
| "reward": 2.5558876395225525, |
| "reward_std": 0.5220728367567062, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9635416716337204, |
| "rewards/repetition_penalty_reward": -0.05609158892184496, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 346 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 176.32812881469727, |
| "epoch": 0.520434945631796, |
| "grad_norm": 12.257830254456051, |
| "kl": 0.60009765625, |
| "learning_rate": 1.1020940203064425e-05, |
| "loss": 0.228, |
| "reward": 2.6993812322616577, |
| "reward_std": 0.442532442510128, |
| "rewards/accuracy_reward": 0.848958358168602, |
| "rewards/reasoning_steps_reward": 0.9427083432674408, |
| "rewards/repetition_penalty_reward": -0.05582709517329931, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 347 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.01042938232422, |
| "epoch": 0.5219347581552306, |
| "grad_norm": 166.2835482202829, |
| "kl": 3.4541015625, |
| "learning_rate": 1.096875316097822e-05, |
| "loss": 0.7129, |
| "reward": 2.664002478122711, |
| "reward_std": 0.5798608511686325, |
| "rewards/accuracy_reward": 0.8593750149011612, |
| "rewards/reasoning_steps_reward": 0.9305556118488312, |
| "rewards/repetition_penalty_reward": -0.0829594787210226, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 348 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 216.75000381469727, |
| "epoch": 0.5234345706786652, |
| "grad_norm": 24.78884718715676, |
| "kl": 0.576171875, |
| "learning_rate": 1.0916539471283607e-05, |
| "loss": 0.3904, |
| "reward": 2.557326078414917, |
| "reward_std": 0.5754078030586243, |
| "rewards/accuracy_reward": 0.723958358168602, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.093715600669384, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 349 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.2604217529297, |
| "epoch": 0.5249343832020997, |
| "grad_norm": 10.924883170422843, |
| "kl": 0.55810546875, |
| "learning_rate": 1.0864300570228757e-05, |
| "loss": 0.2519, |
| "reward": 2.49711149930954, |
| "reward_std": 0.5190073773264885, |
| "rewards/accuracy_reward": 0.6406250223517418, |
| "rewards/reasoning_steps_reward": 0.9618055522441864, |
| "rewards/repetition_penalty_reward": -0.07016285322606564, |
| "rewards/tag_count_reward": 0.9648437649011612, |
| "step": 350 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 198.96875, |
| "epoch": 0.5264341957255343, |
| "grad_norm": 32.87973860027691, |
| "kl": 1.43359375, |
| "learning_rate": 1.0812037894755336e-05, |
| "loss": 0.5601, |
| "reward": 2.497571051120758, |
| "reward_std": 0.710141509771347, |
| "rewards/accuracy_reward": 0.692708358168602, |
| "rewards/reasoning_steps_reward": 0.9236111491918564, |
| "rewards/repetition_penalty_reward": -0.07577977981418371, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 351 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.5885467529297, |
| "epoch": 0.5279340082489689, |
| "grad_norm": 35.65905039343771, |
| "kl": 1.181640625, |
| "learning_rate": 1.0759752882458972e-05, |
| "loss": 0.4482, |
| "reward": 2.4979015588760376, |
| "reward_std": 0.5550749897956848, |
| "rewards/accuracy_reward": 0.6562500223517418, |
| "rewards/reasoning_steps_reward": 0.9565972238779068, |
| "rewards/repetition_penalty_reward": -0.07067496795207262, |
| "rewards/tag_count_reward": 0.9557291865348816, |
| "step": 352 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.76563262939453, |
| "epoch": 0.5294338207724034, |
| "grad_norm": 37.03762904065605, |
| "kl": 1.2021484375, |
| "learning_rate": 1.0707446971549717e-05, |
| "loss": 0.5284, |
| "reward": 2.480072259902954, |
| "reward_std": 0.5370368957519531, |
| "rewards/accuracy_reward": 0.6510416865348816, |
| "rewards/reasoning_steps_reward": 0.9409722834825516, |
| "rewards/repetition_penalty_reward": -0.07938963826745749, |
| "rewards/tag_count_reward": 0.9674479365348816, |
| "step": 353 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.4010467529297, |
| "epoch": 0.530933633295838, |
| "grad_norm": 47.85939515897191, |
| "kl": 1.6240234375, |
| "learning_rate": 1.0655121600812482e-05, |
| "loss": 0.3828, |
| "reward": 2.545462191104889, |
| "reward_std": 0.44792424887418747, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9635416865348816, |
| "rewards/repetition_penalty_reward": -0.06521502183750272, |
| "rewards/tag_count_reward": 0.9648437649011612, |
| "step": 354 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.23437881469727, |
| "epoch": 0.5324334458192725, |
| "grad_norm": 46.48162850362066, |
| "kl": 1.5673828125, |
| "learning_rate": 1.0602778209567462e-05, |
| "loss": 0.501, |
| "reward": 2.4815279245376587, |
| "reward_std": 0.5560100227594376, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/reasoning_steps_reward": 0.9635416716337204, |
| "rewards/repetition_penalty_reward": -0.07055553328245878, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 355 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.2968864440918, |
| "epoch": 0.5339332583427071, |
| "grad_norm": 29.59912755896137, |
| "kl": 1.427734375, |
| "learning_rate": 1.0550418237630547e-05, |
| "loss": 0.4327, |
| "reward": 2.5716618597507477, |
| "reward_std": 0.5098543167114258, |
| "rewards/accuracy_reward": 0.776041679084301, |
| "rewards/reasoning_steps_reward": 0.9288194924592972, |
| "rewards/repetition_penalty_reward": -0.08241807296872139, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 356 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.03125, |
| "epoch": 0.5354330708661418, |
| "grad_norm": 39.48130096076877, |
| "kl": 1.7080078125, |
| "learning_rate": 1.0498043125273714e-05, |
| "loss": 0.6548, |
| "reward": 2.4025214314460754, |
| "reward_std": 0.6107548177242279, |
| "rewards/accuracy_reward": 0.5781250223517418, |
| "rewards/reasoning_steps_reward": 0.9375000596046448, |
| "rewards/repetition_penalty_reward": -0.05190564412623644, |
| "rewards/tag_count_reward": 0.9388020932674408, |
| "step": 357 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 209.78125381469727, |
| "epoch": 0.5369328833895763, |
| "grad_norm": 38.82027339580314, |
| "kl": 1.8056640625, |
| "learning_rate": 1.0445654313185402e-05, |
| "loss": 0.7392, |
| "reward": 2.5126326084136963, |
| "reward_std": 0.5534802153706551, |
| "rewards/accuracy_reward": 0.6510416865348816, |
| "rewards/reasoning_steps_reward": 0.9513888955116272, |
| "rewards/repetition_penalty_reward": -0.04813140258193016, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 358 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.0729217529297, |
| "epoch": 0.5384326959130109, |
| "grad_norm": 24.170858033164397, |
| "kl": 1.42626953125, |
| "learning_rate": 1.0393253242430898e-05, |
| "loss": 0.4034, |
| "reward": 2.4246557354927063, |
| "reward_std": 0.5046445429325104, |
| "rewards/accuracy_reward": 0.5625000149011612, |
| "rewards/reasoning_steps_reward": 0.9565972685813904, |
| "rewards/repetition_penalty_reward": -0.04496251605451107, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 359 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.46355056762695, |
| "epoch": 0.5399325084364455, |
| "grad_norm": 153.4349595006434, |
| "kl": 2.580078125, |
| "learning_rate": 1.0340841354412688e-05, |
| "loss": 0.8895, |
| "reward": 2.522955060005188, |
| "reward_std": 0.5852703154087067, |
| "rewards/accuracy_reward": 0.713541679084301, |
| "rewards/reasoning_steps_reward": 0.9236111342906952, |
| "rewards/repetition_penalty_reward": -0.043885353952646255, |
| "rewards/tag_count_reward": 0.9296875149011612, |
| "step": 360 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.3958396911621, |
| "epoch": 0.54143232095988, |
| "grad_norm": 166.56500053569175, |
| "kl": 4.076171875, |
| "learning_rate": 1.0288420090830803e-05, |
| "loss": 1.174, |
| "reward": 2.4644437432289124, |
| "reward_std": 0.5811575800180435, |
| "rewards/accuracy_reward": 0.6093750223517418, |
| "rewards/reasoning_steps_reward": 0.9618055671453476, |
| "rewards/repetition_penalty_reward": -0.04033063165843487, |
| "rewards/tag_count_reward": 0.9335937649011612, |
| "step": 361 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.4583396911621, |
| "epoch": 0.5429321334833146, |
| "grad_norm": 33.889969513651074, |
| "kl": 1.30078125, |
| "learning_rate": 1.0235990893643184e-05, |
| "loss": 0.5957, |
| "reward": 2.4738687872886658, |
| "reward_std": 0.524218238890171, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.04913472477346659, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 362 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.3645896911621, |
| "epoch": 0.5444319460067492, |
| "grad_norm": 60.56364684515394, |
| "kl": 2.970703125, |
| "learning_rate": 1.0183555205025986e-05, |
| "loss": 0.6895, |
| "reward": 2.5394935607910156, |
| "reward_std": 0.5755364149808884, |
| "rewards/accuracy_reward": 0.682291679084301, |
| "rewards/reasoning_steps_reward": 0.9461805820465088, |
| "rewards/repetition_penalty_reward": -0.03819744661450386, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 363 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 202.82813262939453, |
| "epoch": 0.5459317585301837, |
| "grad_norm": 152.09115123663648, |
| "kl": 24.283203125, |
| "learning_rate": 1.0131114467333935e-05, |
| "loss": 1.2772, |
| "reward": 2.660370111465454, |
| "reward_std": 0.5182667449116707, |
| "rewards/accuracy_reward": 0.8072916865348816, |
| "rewards/reasoning_steps_reward": 0.9392361044883728, |
| "rewards/repetition_penalty_reward": -0.044491049367934465, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 364 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.3750114440918, |
| "epoch": 0.5474315710536183, |
| "grad_norm": 45.62424706171924, |
| "kl": 6.0390625, |
| "learning_rate": 1.0078670123060638e-05, |
| "loss": 1.4476, |
| "reward": 2.325054883956909, |
| "reward_std": 0.6013178080320358, |
| "rewards/accuracy_reward": 0.536458358168602, |
| "rewards/reasoning_steps_reward": 0.911458358168602, |
| "rewards/repetition_penalty_reward": -0.047341120429337025, |
| "rewards/tag_count_reward": 0.9244791865348816, |
| "step": 365 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.9895896911621, |
| "epoch": 0.5489313835770528, |
| "grad_norm": 394.34439096252885, |
| "kl": 3.89453125, |
| "learning_rate": 1.002622361479891e-05, |
| "loss": 1.04, |
| "reward": 2.2679548859596252, |
| "reward_std": 0.636200875043869, |
| "rewards/accuracy_reward": 0.4531250223517418, |
| "rewards/reasoning_steps_reward": 0.9218750298023224, |
| "rewards/repetition_penalty_reward": -0.032826476730406284, |
| "rewards/tag_count_reward": 0.9257812798023224, |
| "step": 366 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.17188262939453, |
| "epoch": 0.5504311961004874, |
| "grad_norm": 27.602993232186183, |
| "kl": 1.998046875, |
| "learning_rate": 9.973776385201093e-06, |
| "loss": 0.5853, |
| "reward": 2.419560134410858, |
| "reward_std": 0.6460568159818649, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.9357638657093048, |
| "rewards/repetition_penalty_reward": -0.04354754835367203, |
| "rewards/tag_count_reward": 0.9335937649011612, |
| "step": 367 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 245.15625381469727, |
| "epoch": 0.551931008623922, |
| "grad_norm": 31.72593423602673, |
| "kl": 2.052734375, |
| "learning_rate": 9.921329876939365e-06, |
| "loss": 0.725, |
| "reward": 2.489229917526245, |
| "reward_std": 0.5430986732244492, |
| "rewards/accuracy_reward": 0.6302083432674408, |
| "rewards/reasoning_steps_reward": 0.9531250298023224, |
| "rewards/repetition_penalty_reward": -0.04722849931567907, |
| "rewards/tag_count_reward": 0.9531250149011612, |
| "step": 368 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.01563262939453, |
| "epoch": 0.5534308211473565, |
| "grad_norm": 11.874407407969304, |
| "kl": 1.76171875, |
| "learning_rate": 9.868885532666068e-06, |
| "loss": 0.6634, |
| "reward": 2.2123183608055115, |
| "reward_std": 0.538506917655468, |
| "rewards/accuracy_reward": 0.3906250260770321, |
| "rewards/reasoning_steps_reward": 0.9427083283662796, |
| "rewards/repetition_penalty_reward": -0.052004692144691944, |
| "rewards/tag_count_reward": 0.9309895932674408, |
| "step": 369 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 211.4739646911621, |
| "epoch": 0.5549306336707911, |
| "grad_norm": 46.973349870839904, |
| "kl": 2.095703125, |
| "learning_rate": 9.816444794974018e-06, |
| "loss": 0.7028, |
| "reward": 2.6153268814086914, |
| "reward_std": 0.5184621512889862, |
| "rewards/accuracy_reward": 0.7656250298023224, |
| "rewards/reasoning_steps_reward": 0.9427083432674408, |
| "rewards/repetition_penalty_reward": -0.0591523889452219, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 370 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.7395896911621, |
| "epoch": 0.5564304461942258, |
| "grad_norm": 57.08569018760819, |
| "kl": 16.5546875, |
| "learning_rate": 9.76400910635682e-06, |
| "loss": 0.7212, |
| "reward": 2.2974973320961, |
| "reward_std": 0.499550960958004, |
| "rewards/accuracy_reward": 0.4583333432674408, |
| "rewards/reasoning_steps_reward": 0.94618059694767, |
| "rewards/repetition_penalty_reward": -0.045818757731467485, |
| "rewards/tag_count_reward": 0.938802108168602, |
| "step": 371 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.7239646911621, |
| "epoch": 0.5579302587176603, |
| "grad_norm": 18.858608822380706, |
| "kl": 1.96875, |
| "learning_rate": 9.7115799091692e-06, |
| "loss": 0.4937, |
| "reward": 2.616053879261017, |
| "reward_std": 0.44517990201711655, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.03845999389886856, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 372 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 227.83334350585938, |
| "epoch": 0.5594300712410949, |
| "grad_norm": 28.416379388658825, |
| "kl": 3.048828125, |
| "learning_rate": 9.659158645587319e-06, |
| "loss": 0.9709, |
| "reward": 2.659669041633606, |
| "reward_std": 0.5419558137655258, |
| "rewards/accuracy_reward": 0.7864583432674408, |
| "rewards/reasoning_steps_reward": 0.9600694626569748, |
| "rewards/repetition_penalty_reward": -0.04649423388764262, |
| "rewards/tag_count_reward": 0.9596354514360428, |
| "step": 373 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.84375381469727, |
| "epoch": 0.5609298837645295, |
| "grad_norm": 94.84357384526665, |
| "kl": 5.3369140625, |
| "learning_rate": 9.606746757569107e-06, |
| "loss": 1.197, |
| "reward": 2.5181403756141663, |
| "reward_std": 0.6076074615120888, |
| "rewards/accuracy_reward": 0.7135416716337204, |
| "rewards/reasoning_steps_reward": 0.9166666716337204, |
| "rewards/repetition_penalty_reward": -0.04175550863146782, |
| "rewards/tag_count_reward": 0.9296875149011612, |
| "step": 374 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.70313262939453, |
| "epoch": 0.562429696287964, |
| "grad_norm": 26.282174538587235, |
| "kl": 1.5439453125, |
| "learning_rate": 9.554345686814601e-06, |
| "loss": 0.767, |
| "reward": 2.4584012627601624, |
| "reward_std": 0.5215486437082291, |
| "rewards/accuracy_reward": 0.6093750074505806, |
| "rewards/reasoning_steps_reward": 0.9548610895872116, |
| "rewards/repetition_penalty_reward": -0.04854332935065031, |
| "rewards/tag_count_reward": 0.942708358168602, |
| "step": 375 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.25, |
| "epoch": 0.5639295088113986, |
| "grad_norm": 12.593832343409222, |
| "kl": 1.1826171875, |
| "learning_rate": 9.501956874726289e-06, |
| "loss": 0.5623, |
| "reward": 2.5942240357398987, |
| "reward_std": 0.4434950575232506, |
| "rewards/accuracy_reward": 0.7031250074505806, |
| "rewards/reasoning_steps_reward": 0.960069477558136, |
| "rewards/repetition_penalty_reward": -0.03121019806712866, |
| "rewards/tag_count_reward": 0.962239608168602, |
| "step": 376 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.0572967529297, |
| "epoch": 0.5654293213348331, |
| "grad_norm": 4.53299182911921, |
| "kl": 1.1650390625, |
| "learning_rate": 9.449581762369454e-06, |
| "loss": 0.4889, |
| "reward": 2.432799220085144, |
| "reward_std": 0.5172925740480423, |
| "rewards/accuracy_reward": 0.5416666939854622, |
| "rewards/reasoning_steps_reward": 0.9670138955116272, |
| "rewards/repetition_penalty_reward": -0.03942307736724615, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 377 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 273.8177146911621, |
| "epoch": 0.5669291338582677, |
| "grad_norm": 148.2456214132894, |
| "kl": 5.01953125, |
| "learning_rate": 9.39722179043254e-06, |
| "loss": 1.2941, |
| "reward": 2.3988120555877686, |
| "reward_std": 0.6988454312086105, |
| "rewards/accuracy_reward": 0.6093750298023224, |
| "rewards/reasoning_steps_reward": 0.918402835726738, |
| "rewards/repetition_penalty_reward": -0.03912200313061476, |
| "rewards/tag_count_reward": 0.9101562649011612, |
| "step": 378 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.6145935058594, |
| "epoch": 0.5684289463817023, |
| "grad_norm": 116.2915823519784, |
| "kl": 11.8125, |
| "learning_rate": 9.344878399187521e-06, |
| "loss": 2.0047, |
| "reward": 2.2520939111709595, |
| "reward_std": 0.821685403585434, |
| "rewards/accuracy_reward": 0.5572916716337204, |
| "rewards/reasoning_steps_reward": 0.8524305522441864, |
| "rewards/repetition_penalty_reward": -0.03262835554778576, |
| "rewards/tag_count_reward": 0.8750000149011612, |
| "step": 379 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 420.4947967529297, |
| "epoch": 0.5699287589051368, |
| "grad_norm": 1197.114266206702, |
| "kl": 12.4609375, |
| "learning_rate": 9.292553028450286e-06, |
| "loss": 1.6502, |
| "reward": 2.098519653081894, |
| "reward_std": 0.8776163309812546, |
| "rewards/accuracy_reward": 0.5052083432674408, |
| "rewards/reasoning_steps_reward": 0.8263889402151108, |
| "rewards/repetition_penalty_reward": -0.0390672804787755, |
| "rewards/tag_count_reward": 0.805989608168602, |
| "step": 380 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 426.5677185058594, |
| "epoch": 0.5714285714285714, |
| "grad_norm": 26.549926170164234, |
| "kl": 6.15234375, |
| "learning_rate": 9.240247117541031e-06, |
| "loss": 1.2117, |
| "reward": 2.119348645210266, |
| "reward_std": 0.8967611789703369, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/reasoning_steps_reward": 0.84375, |
| "rewards/repetition_penalty_reward": -0.02648477186448872, |
| "rewards/tag_count_reward": 0.8229166865348816, |
| "step": 381 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.78125762939453, |
| "epoch": 0.572928383952006, |
| "grad_norm": 88.00179700818907, |
| "kl": 7.25, |
| "learning_rate": 9.187962105244667e-06, |
| "loss": 1.377, |
| "reward": 2.291730046272278, |
| "reward_std": 0.8747196942567825, |
| "rewards/accuracy_reward": 0.6354166716337204, |
| "rewards/reasoning_steps_reward": 0.835069477558136, |
| "rewards/repetition_penalty_reward": -0.02511032810434699, |
| "rewards/tag_count_reward": 0.8463542014360428, |
| "step": 382 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 451.3541793823242, |
| "epoch": 0.5744281964754405, |
| "grad_norm": 13.361311656907796, |
| "kl": 1.662109375, |
| "learning_rate": 9.135699429771245e-06, |
| "loss": 0.7943, |
| "reward": 2.1707218885421753, |
| "reward_std": 0.9110157489776611, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.8298611044883728, |
| "rewards/repetition_penalty_reward": -0.02372267236933112, |
| "rewards/tag_count_reward": 0.817708358168602, |
| "step": 383 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.9948043823242, |
| "epoch": 0.5759280089988752, |
| "grad_norm": 30.04886642000228, |
| "kl": 0.908203125, |
| "learning_rate": 9.083460528716396e-06, |
| "loss": 0.6667, |
| "reward": 2.325647294521332, |
| "reward_std": 0.6908200830221176, |
| "rewards/accuracy_reward": 0.5520833395421505, |
| "rewards/reasoning_steps_reward": 0.9045139253139496, |
| "rewards/repetition_penalty_reward": -0.0345958243124187, |
| "rewards/tag_count_reward": 0.903645858168602, |
| "step": 384 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 326.80730056762695, |
| "epoch": 0.5774278215223098, |
| "grad_norm": 10.663780511762228, |
| "kl": 0.76953125, |
| "learning_rate": 9.031246839021783e-06, |
| "loss": 0.6302, |
| "reward": 2.3826356530189514, |
| "reward_std": 0.6792797073721886, |
| "rewards/accuracy_reward": 0.5937500298023224, |
| "rewards/reasoning_steps_reward": 0.9340278208255768, |
| "rewards/repetition_penalty_reward": -0.051392185501754284, |
| "rewards/tag_count_reward": 0.9062500149011612, |
| "step": 385 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.76041793823242, |
| "epoch": 0.5789276340457443, |
| "grad_norm": 61988.54428984002, |
| "kl": 7898.1875, |
| "learning_rate": 8.979059796935578e-06, |
| "loss": 243.7071, |
| "reward": 2.450250804424286, |
| "reward_std": 0.6126392781734467, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9409722238779068, |
| "rewards/repetition_penalty_reward": -0.0402006134390831, |
| "rewards/tag_count_reward": 0.9401041865348816, |
| "step": 386 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 200.89062881469727, |
| "epoch": 0.5804274465691789, |
| "grad_norm": 40219.98047828671, |
| "kl": 1208.828125, |
| "learning_rate": 8.926900837972953e-06, |
| "loss": 56.1702, |
| "reward": 2.596910834312439, |
| "reward_std": 0.3909384198486805, |
| "rewards/accuracy_reward": 0.6822916716337204, |
| "rewards/reasoning_steps_reward": 0.9774305671453476, |
| "rewards/repetition_penalty_reward": -0.04197810683399439, |
| "rewards/tag_count_reward": 0.9791666865348816, |
| "step": 387 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 225.0260467529297, |
| "epoch": 0.5819272590926134, |
| "grad_norm": 21433.51255502709, |
| "kl": 3312.6201171875, |
| "learning_rate": 8.874771396876597e-06, |
| "loss": 88.2373, |
| "reward": 2.6155471205711365, |
| "reward_std": 0.478760302066803, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9687500447034836, |
| "rewards/repetition_penalty_reward": -0.040702925994992256, |
| "rewards/tag_count_reward": 0.9531250298023224, |
| "step": 388 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.98958587646484, |
| "epoch": 0.583427071616048, |
| "grad_norm": 599372.7708149451, |
| "kl": 12804.0703125, |
| "learning_rate": 8.822672907577244e-06, |
| "loss": 575.4508, |
| "reward": 2.526801884174347, |
| "reward_std": 0.5001873224973679, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.04437878727912903, |
| "rewards/tag_count_reward": 0.9583333730697632, |
| "step": 389 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 196.5833396911621, |
| "epoch": 0.5849268841394826, |
| "grad_norm": 98.60380013944835, |
| "kl": 11.30810546875, |
| "learning_rate": 8.770606803154235e-06, |
| "loss": 0.4166, |
| "reward": 2.5563814640045166, |
| "reward_std": 0.3433222845196724, |
| "rewards/accuracy_reward": 0.6093750223517418, |
| "rewards/reasoning_steps_reward": 0.998263880610466, |
| "rewards/repetition_penalty_reward": -0.04344499483704567, |
| "rewards/tag_count_reward": 0.9921875149011612, |
| "step": 390 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 196.7083396911621, |
| "epoch": 0.5864266966629171, |
| "grad_norm": 100.03328215519356, |
| "kl": 8.48876953125, |
| "learning_rate": 8.718574515796099e-06, |
| "loss": 0.6527, |
| "reward": 2.6383371353149414, |
| "reward_std": 0.4097052291035652, |
| "rewards/accuracy_reward": 0.723958358168602, |
| "rewards/reasoning_steps_reward": 0.9791667014360428, |
| "rewards/repetition_penalty_reward": -0.04395460430532694, |
| "rewards/tag_count_reward": 0.9791666865348816, |
| "step": 391 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.48437881469727, |
| "epoch": 0.5879265091863517, |
| "grad_norm": 4314.703902280122, |
| "kl": 8.80859375, |
| "learning_rate": 8.666577476761147e-06, |
| "loss": 0.6093, |
| "reward": 2.56997287273407, |
| "reward_std": 0.39948707073926926, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.04851668328046799, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 392 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.4739646911621, |
| "epoch": 0.5894263217097863, |
| "grad_norm": 5.999735316199694, |
| "kl": 0.830078125, |
| "learning_rate": 8.61461711633812e-06, |
| "loss": 0.2302, |
| "reward": 2.6648696064949036, |
| "reward_std": 0.3630646914243698, |
| "rewards/accuracy_reward": 0.7552083432674408, |
| "rewards/reasoning_steps_reward": 0.987847238779068, |
| "rewards/repetition_penalty_reward": -0.04954020772129297, |
| "rewards/tag_count_reward": 0.9713542014360428, |
| "step": 393 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 187.77083587646484, |
| "epoch": 0.5909261342332208, |
| "grad_norm": 1.1014377641969626, |
| "kl": 0.564453125, |
| "learning_rate": 8.562694863806833e-06, |
| "loss": 0.1169, |
| "reward": 2.4198758602142334, |
| "reward_std": 0.39315300434827805, |
| "rewards/accuracy_reward": 0.4895833432674408, |
| "rewards/reasoning_steps_reward": 0.9913194477558136, |
| "rewards/repetition_penalty_reward": -0.04930820316076279, |
| "rewards/tag_count_reward": 0.9882812649011612, |
| "step": 394 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.7500114440918, |
| "epoch": 0.5924259467566554, |
| "grad_norm": 4.04988083724822, |
| "kl": 0.7509765625, |
| "learning_rate": 8.510812147398857e-06, |
| "loss": 0.4591, |
| "reward": 2.632589817047119, |
| "reward_std": 0.5965047925710678, |
| "rewards/accuracy_reward": 0.7812500149011612, |
| "rewards/reasoning_steps_reward": 0.9600694626569748, |
| "rewards/repetition_penalty_reward": -0.06706297304481268, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 395 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.34896850585938, |
| "epoch": 0.59392575928009, |
| "grad_norm": 2.0214381753763924, |
| "kl": 0.5986328125, |
| "learning_rate": 8.458970394258244e-06, |
| "loss": 0.308, |
| "reward": 2.5586500763893127, |
| "reward_std": 0.43028920516371727, |
| "rewards/accuracy_reward": 0.6562500298023224, |
| "rewards/reasoning_steps_reward": 0.9774305671453476, |
| "rewards/repetition_penalty_reward": -0.05289509380236268, |
| "rewards/tag_count_reward": 0.9778645932674408, |
| "step": 396 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 183.1979217529297, |
| "epoch": 0.5954255718035245, |
| "grad_norm": 0.9700944248252047, |
| "kl": 0.46533203125, |
| "learning_rate": 8.407171030402263e-06, |
| "loss": 0.0956, |
| "reward": 2.5613616704940796, |
| "reward_std": 0.3627118840813637, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/reasoning_steps_reward": 1.0, |
| "rewards/repetition_penalty_reward": -0.0545238540507853, |
| "rewards/tag_count_reward": 0.9908854216337204, |
| "step": 397 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.43750381469727, |
| "epoch": 0.5969253843269592, |
| "grad_norm": 3.0901812570047826, |
| "kl": 0.66455078125, |
| "learning_rate": 8.355415480682176e-06, |
| "loss": 0.4091, |
| "reward": 2.456976294517517, |
| "reward_std": 0.3992188200354576, |
| "rewards/accuracy_reward": 0.5520833432674408, |
| "rewards/reasoning_steps_reward": 0.9826389104127884, |
| "rewards/repetition_penalty_reward": -0.05300639010965824, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 398 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.4322967529297, |
| "epoch": 0.5984251968503937, |
| "grad_norm": 0.9560339346770327, |
| "kl": 0.677734375, |
| "learning_rate": 8.303705168744042e-06, |
| "loss": 0.386, |
| "reward": 2.42321240901947, |
| "reward_std": 0.4326848238706589, |
| "rewards/accuracy_reward": 0.5052083432674408, |
| "rewards/reasoning_steps_reward": 0.9774305820465088, |
| "rewards/repetition_penalty_reward": -0.03338485397398472, |
| "rewards/tag_count_reward": 0.9739583432674408, |
| "step": 399 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.17708587646484, |
| "epoch": 0.5999250093738283, |
| "grad_norm": 3.695423652449584, |
| "kl": 1.115234375, |
| "learning_rate": 8.252041516989565e-06, |
| "loss": 0.7209, |
| "reward": 2.4974151253700256, |
| "reward_std": 0.5143779292702675, |
| "rewards/accuracy_reward": 0.5937500223517418, |
| "rewards/reasoning_steps_reward": 0.9704861342906952, |
| "rewards/repetition_penalty_reward": -0.02775857038795948, |
| "rewards/tag_count_reward": 0.9609375298023224, |
| "step": 400 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 278.4583435058594, |
| "epoch": 0.6014248218972629, |
| "grad_norm": 1.4515991916390434, |
| "kl": 1.1875, |
| "learning_rate": 8.200425946536956e-06, |
| "loss": 0.9453, |
| "reward": 2.54727965593338, |
| "reward_std": 0.6352264881134033, |
| "rewards/accuracy_reward": 0.7083333432674408, |
| "rewards/reasoning_steps_reward": 0.9409722685813904, |
| "rewards/repetition_penalty_reward": -0.023900966625660658, |
| "rewards/tag_count_reward": 0.9218750149011612, |
| "step": 401 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 253.0260467529297, |
| "epoch": 0.6029246344206974, |
| "grad_norm": 1.6992040499230678, |
| "kl": 1.1376953125, |
| "learning_rate": 8.148859877181849e-06, |
| "loss": 0.6776, |
| "reward": 2.558239758014679, |
| "reward_std": 0.5012032613158226, |
| "rewards/accuracy_reward": 0.6927083432674408, |
| "rewards/reasoning_steps_reward": 0.951388880610466, |
| "rewards/repetition_penalty_reward": -0.02205547597259283, |
| "rewards/tag_count_reward": 0.9361979514360428, |
| "step": 402 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 280.1458435058594, |
| "epoch": 0.604424446944132, |
| "grad_norm": 1.2357512902908514, |
| "kl": 1.0625, |
| "learning_rate": 8.097344727358247e-06, |
| "loss": 0.6856, |
| "reward": 2.4696335196495056, |
| "reward_std": 0.6513710990548134, |
| "rewards/accuracy_reward": 0.6197916716337204, |
| "rewards/reasoning_steps_reward": 0.9444444626569748, |
| "rewards/repetition_penalty_reward": -0.019081805367022753, |
| "rewards/tag_count_reward": 0.9244791865348816, |
| "step": 403 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 218.3802146911621, |
| "epoch": 0.6059242594675666, |
| "grad_norm": 2.5806348482702632, |
| "kl": 0.6376953125, |
| "learning_rate": 8.045881914099503e-06, |
| "loss": 0.6086, |
| "reward": 2.3685402274131775, |
| "reward_std": 0.4312494173645973, |
| "rewards/accuracy_reward": 0.463541679084301, |
| "rewards/reasoning_steps_reward": 0.967013880610466, |
| "rewards/repetition_penalty_reward": -0.024254921358078718, |
| "rewards/tag_count_reward": 0.962239608168602, |
| "step": 404 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 214.18750381469727, |
| "epoch": 0.6074240719910011, |
| "grad_norm": 3.0889995708717803, |
| "kl": 0.802734375, |
| "learning_rate": 7.99447285299934e-06, |
| "loss": 0.9386, |
| "reward": 2.6483540534973145, |
| "reward_std": 0.57868642359972, |
| "rewards/accuracy_reward": 0.7760416865348816, |
| "rewards/reasoning_steps_reward": 0.9357639253139496, |
| "rewards/repetition_penalty_reward": -0.012670394266024232, |
| "rewards/tag_count_reward": 0.9492187798023224, |
| "step": 405 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.90625381469727, |
| "epoch": 0.6089238845144357, |
| "grad_norm": 6.886520330181463, |
| "kl": 1.41796875, |
| "learning_rate": 7.943118958172917e-06, |
| "loss": 0.9431, |
| "reward": 2.4443989396095276, |
| "reward_std": 0.7025687843561172, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.8854166865348816, |
| "rewards/repetition_penalty_reward": -0.01914279453922063, |
| "rewards/tag_count_reward": 0.9218750149011612, |
| "step": 406 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.8437614440918, |
| "epoch": 0.6104236970378702, |
| "grad_norm": 4.059148084337778, |
| "kl": 1.1845703125, |
| "learning_rate": 7.891821642217926e-06, |
| "loss": 0.9331, |
| "reward": 2.216310352087021, |
| "reward_std": 0.7253094911575317, |
| "rewards/accuracy_reward": 0.442708358168602, |
| "rewards/reasoning_steps_reward": 0.8854167014360428, |
| "rewards/repetition_penalty_reward": -0.02066887845285237, |
| "rewards/tag_count_reward": 0.9088541865348816, |
| "step": 407 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.8072967529297, |
| "epoch": 0.6119235095613048, |
| "grad_norm": 3.615908850384094, |
| "kl": 0.9541015625, |
| "learning_rate": 7.840582316175737e-06, |
| "loss": 0.7562, |
| "reward": 2.1759954690933228, |
| "reward_std": 0.7425311505794525, |
| "rewards/accuracy_reward": 0.3958333432674408, |
| "rewards/reasoning_steps_reward": 0.892361119389534, |
| "rewards/repetition_penalty_reward": -0.01975110382772982, |
| "rewards/tag_count_reward": 0.907552108168602, |
| "step": 408 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.29687881469727, |
| "epoch": 0.6134233220847394, |
| "grad_norm": 7.82629047907828, |
| "kl": 0.95703125, |
| "learning_rate": 7.789402389492582e-06, |
| "loss": 1.0177, |
| "reward": 2.383803129196167, |
| "reward_std": 0.6715415120124817, |
| "rewards/accuracy_reward": 0.5520833507180214, |
| "rewards/reasoning_steps_reward": 0.9184027910232544, |
| "rewards/repetition_penalty_reward": -0.013766373042017221, |
| "rewards/tag_count_reward": 0.9270833432674408, |
| "step": 409 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.07292556762695, |
| "epoch": 0.6149231346081739, |
| "grad_norm": 7.0703944532697145, |
| "kl": 0.80419921875, |
| "learning_rate": 7.738283269980798e-06, |
| "loss": 0.8963, |
| "reward": 2.4084761142730713, |
| "reward_std": 0.7272981628775597, |
| "rewards/accuracy_reward": 0.6197916716337204, |
| "rewards/reasoning_steps_reward": 0.8958333730697632, |
| "rewards/repetition_penalty_reward": -0.018607289995998144, |
| "rewards/tag_count_reward": 0.9114583432674408, |
| "step": 410 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.3489646911621, |
| "epoch": 0.6164229471316085, |
| "grad_norm": 3.740634216779511, |
| "kl": 0.63427734375, |
| "learning_rate": 7.687226363780084e-06, |
| "loss": 0.5151, |
| "reward": 2.5853647589683533, |
| "reward_std": 0.46327926218509674, |
| "rewards/accuracy_reward": 0.677083358168602, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.023576234467327595, |
| "rewards/tag_count_reward": 0.970052108168602, |
| "step": 411 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.3177146911621, |
| "epoch": 0.6179227596550432, |
| "grad_norm": 2.7293051628640344, |
| "kl": 0.50927734375, |
| "learning_rate": 7.636233075318824e-06, |
| "loss": 0.6524, |
| "reward": 2.4624382853507996, |
| "reward_std": 0.5226383320987225, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.9340278059244156, |
| "rewards/repetition_penalty_reward": -0.017162481555715203, |
| "rewards/tag_count_reward": 0.9466145932674408, |
| "step": 412 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.1822967529297, |
| "epoch": 0.6194225721784777, |
| "grad_norm": 14.252860190179556, |
| "kl": 0.8994140625, |
| "learning_rate": 7.585304807275473e-06, |
| "loss": 0.7206, |
| "reward": 2.4753499627113342, |
| "reward_std": 0.536663368344307, |
| "rewards/accuracy_reward": 0.5989583507180214, |
| "rewards/reasoning_steps_reward": 0.9409722238779068, |
| "rewards/repetition_penalty_reward": -0.01510148635134101, |
| "rewards/tag_count_reward": 0.950520858168602, |
| "step": 413 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 174.04687881469727, |
| "epoch": 0.6209223847019123, |
| "grad_norm": 2.8964016768843623, |
| "kl": 0.73388671875, |
| "learning_rate": 7.534442960539956e-06, |
| "loss": 0.4618, |
| "reward": 2.6283841133117676, |
| "reward_std": 0.4127518758177757, |
| "rewards/accuracy_reward": 0.7031250223517418, |
| "rewards/reasoning_steps_reward": 0.9670139253139496, |
| "rewards/repetition_penalty_reward": -0.011807008180767298, |
| "rewards/tag_count_reward": 0.970052108168602, |
| "step": 414 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 184.96875762939453, |
| "epoch": 0.6224221972253469, |
| "grad_norm": 5.401451233630221, |
| "kl": 0.54443359375, |
| "learning_rate": 7.483648934175138e-06, |
| "loss": 0.5382, |
| "reward": 2.485370635986328, |
| "reward_std": 0.4186149761080742, |
| "rewards/accuracy_reward": 0.5833333432674408, |
| "rewards/reasoning_steps_reward": 0.9600694626569748, |
| "rewards/repetition_penalty_reward": -0.020271844463422894, |
| "rewards/tag_count_reward": 0.962239608168602, |
| "step": 415 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 156.63021087646484, |
| "epoch": 0.6239220097487814, |
| "grad_norm": 12.179499233084556, |
| "kl": 0.55126953125, |
| "learning_rate": 7.432924125378345e-06, |
| "loss": 0.4525, |
| "reward": 2.612027883529663, |
| "reward_std": 0.34135545045137405, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/reasoning_steps_reward": 0.9687500298023224, |
| "rewards/repetition_penalty_reward": -0.011670089792460203, |
| "rewards/tag_count_reward": 0.9674479514360428, |
| "step": 416 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.3802146911621, |
| "epoch": 0.625421822272216, |
| "grad_norm": 27.64718995248203, |
| "kl": 0.61962890625, |
| "learning_rate": 7.382269929442925e-06, |
| "loss": 0.3543, |
| "reward": 2.6146509051322937, |
| "reward_std": 0.3778613116592169, |
| "rewards/accuracy_reward": 0.6979166716337204, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.01425547618418932, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 417 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 200.8333396911621, |
| "epoch": 0.6269216347956506, |
| "grad_norm": 35.112382210723155, |
| "kl": 0.74609375, |
| "learning_rate": 7.331687739719868e-06, |
| "loss": 0.3159, |
| "reward": 2.7482373118400574, |
| "reward_std": 0.4861888214945793, |
| "rewards/accuracy_reward": 0.8750000149011612, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.03388085588812828, |
| "rewards/tag_count_reward": 0.9453125298023224, |
| "step": 418 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 186.5416717529297, |
| "epoch": 0.6284214473190851, |
| "grad_norm": 9.808027472374857, |
| "kl": 0.75390625, |
| "learning_rate": 7.281178947579484e-06, |
| "loss": 0.3908, |
| "reward": 2.5124480724334717, |
| "reward_std": 0.39367250353097916, |
| "rewards/accuracy_reward": 0.6145833432674408, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.050051978789269924, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 419 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.65625762939453, |
| "epoch": 0.6299212598425197, |
| "grad_norm": 316.11143834667564, |
| "kl": 6.1318359375, |
| "learning_rate": 7.230744942373125e-06, |
| "loss": 1.196, |
| "reward": 2.612563908100128, |
| "reward_std": 0.5070172511041164, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.06451944634318352, |
| "rewards/tag_count_reward": 0.953125, |
| "step": 420 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 167.5520896911621, |
| "epoch": 0.6314210723659542, |
| "grad_norm": 2256.612496535129, |
| "kl": 120.529296875, |
| "learning_rate": 7.1803871113949675e-06, |
| "loss": 5.9692, |
| "reward": 2.5741260647773743, |
| "reward_std": 0.364616222679615, |
| "rewards/accuracy_reward": 0.645833358168602, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.03134271129965782, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 421 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.5885467529297, |
| "epoch": 0.6329208848893888, |
| "grad_norm": 31650.105311482544, |
| "kl": 379.5390625, |
| "learning_rate": 7.13010683984386e-06, |
| "loss": 45.4656, |
| "reward": 2.404063105583191, |
| "reward_std": 0.6292674243450165, |
| "rewards/accuracy_reward": 0.5833333507180214, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.07336751371622086, |
| "rewards/tag_count_reward": 0.9375000149011612, |
| "step": 422 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.60937881469727, |
| "epoch": 0.6344206974128234, |
| "grad_norm": 69.45154521104752, |
| "kl": 2.13525390625, |
| "learning_rate": 7.07990551078521e-06, |
| "loss": 0.6039, |
| "reward": 2.7996047139167786, |
| "reward_std": 0.37217236310243607, |
| "rewards/accuracy_reward": 0.8854166716337204, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.03633271250873804, |
| "rewards/tag_count_reward": 0.9661458432674408, |
| "step": 423 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.22917938232422, |
| "epoch": 0.6359205099362579, |
| "grad_norm": 63.609726332225755, |
| "kl": 2.5576171875, |
| "learning_rate": 7.029784505112948e-06, |
| "loss": 0.7597, |
| "reward": 2.4892146587371826, |
| "reward_std": 0.55179613083601, |
| "rewards/accuracy_reward": 0.6406250149011612, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.05332011543214321, |
| "rewards/tag_count_reward": 0.9401042014360428, |
| "step": 424 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 196.7864646911621, |
| "epoch": 0.6374203224596925, |
| "grad_norm": 30.322718341900693, |
| "kl": 0.7666015625, |
| "learning_rate": 6.979745201511531e-06, |
| "loss": 0.5128, |
| "reward": 2.699239432811737, |
| "reward_std": 0.4398190379142761, |
| "rewards/accuracy_reward": 0.8020833432674408, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.031229355605319142, |
| "rewards/tag_count_reward": 0.954427108168602, |
| "step": 425 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 175.7291717529297, |
| "epoch": 0.6389201349831272, |
| "grad_norm": 763.6392481614376, |
| "kl": 442.4111328125, |
| "learning_rate": 6.929788976418044e-06, |
| "loss": 1.9958, |
| "reward": 2.833931624889374, |
| "reward_std": 0.3742608136963099, |
| "rewards/accuracy_reward": 0.9062500149011612, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.0215371900703758, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 426 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.6927146911621, |
| "epoch": 0.6404199475065617, |
| "grad_norm": 11.61389336470008, |
| "kl": 0.611328125, |
| "learning_rate": 6.879917203984306e-06, |
| "loss": 0.4047, |
| "reward": 2.439822733402252, |
| "reward_std": 0.40970153361558914, |
| "rewards/accuracy_reward": 0.5520833507180214, |
| "rewards/reasoning_steps_reward": 0.984375, |
| "rewards/repetition_penalty_reward": -0.05366701539605856, |
| "rewards/tag_count_reward": 0.9570312798023224, |
| "step": 427 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.5989646911621, |
| "epoch": 0.6419197600299963, |
| "grad_norm": 8.516759254331243, |
| "kl": 0.5595703125, |
| "learning_rate": 6.830131256039094e-06, |
| "loss": 0.4026, |
| "reward": 2.511458396911621, |
| "reward_std": 0.4569522365927696, |
| "rewards/accuracy_reward": 0.614583358168602, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.058854201808571815, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 428 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.1458396911621, |
| "epoch": 0.6434195725534309, |
| "grad_norm": 3.0244823017102758, |
| "kl": 0.5078125, |
| "learning_rate": 6.7804325020504e-06, |
| "loss": 0.3328, |
| "reward": 2.6280587315559387, |
| "reward_std": 0.33434533327817917, |
| "rewards/accuracy_reward": 0.6875000298023224, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.04251436982303858, |
| "rewards/tag_count_reward": 0.9882812798023224, |
| "step": 429 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.2708396911621, |
| "epoch": 0.6449193850768654, |
| "grad_norm": 54.16361365177273, |
| "kl": 1.1318359375, |
| "learning_rate": 6.730822309087756e-06, |
| "loss": 0.5304, |
| "reward": 2.617822825908661, |
| "reward_std": 0.5841463133692741, |
| "rewards/accuracy_reward": 0.7343750298023224, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.061864775605499744, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 430 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.47396087646484, |
| "epoch": 0.6464191976003, |
| "grad_norm": 9.396207453842536, |
| "kl": 0.5869140625, |
| "learning_rate": 6.6813020417846456e-06, |
| "loss": 0.5755, |
| "reward": 2.444421410560608, |
| "reward_std": 0.732051394879818, |
| "rewards/accuracy_reward": 0.630208358168602, |
| "rewards/reasoning_steps_reward": 0.9479166567325592, |
| "rewards/repetition_penalty_reward": -0.08552652504295111, |
| "rewards/tag_count_reward": 0.9518229514360428, |
| "step": 431 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.7604217529297, |
| "epoch": 0.6479190101237345, |
| "grad_norm": 104.85884344998551, |
| "kl": 0.8720703125, |
| "learning_rate": 6.6318730623009465e-06, |
| "loss": 0.7651, |
| "reward": 2.6597663164138794, |
| "reward_std": 0.7930338382720947, |
| "rewards/accuracy_reward": 0.8333333432674408, |
| "rewards/reasoning_steps_reward": 0.9496527910232544, |
| "rewards/repetition_penalty_reward": -0.07634495198726654, |
| "rewards/tag_count_reward": 0.9531250149011612, |
| "step": 432 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.21875381469727, |
| "epoch": 0.6494188226471691, |
| "grad_norm": 4283739.768658449, |
| "kl": 21367.5, |
| "learning_rate": 6.582536730285476e-06, |
| "loss": 4439.4258, |
| "reward": 2.63147896528244, |
| "reward_std": 0.7544302493333817, |
| "rewards/accuracy_reward": 0.833333358168602, |
| "rewards/reasoning_steps_reward": 0.928819477558136, |
| "rewards/repetition_penalty_reward": -0.07338221184909344, |
| "rewards/tag_count_reward": 0.9427083730697632, |
| "step": 433 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.91667556762695, |
| "epoch": 0.6509186351706037, |
| "grad_norm": 210786.70991777242, |
| "kl": 8894.5, |
| "learning_rate": 6.5332944028385885e-06, |
| "loss": 447.474, |
| "reward": 2.3411070704460144, |
| "reward_std": 0.8666418790817261, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9149305820465088, |
| "rewards/repetition_penalty_reward": -0.10637565143406391, |
| "rewards/tag_count_reward": 0.923177108168602, |
| "step": 434 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.79687881469727, |
| "epoch": 0.6524184476940382, |
| "grad_norm": 1518.3629252174667, |
| "kl": 64.171875, |
| "learning_rate": 6.484147434474837e-06, |
| "loss": 5.1614, |
| "reward": 2.2417006492614746, |
| "reward_std": 0.8650868535041809, |
| "rewards/accuracy_reward": 0.5104166865348816, |
| "rewards/reasoning_steps_reward": 0.9114583432674408, |
| "rewards/repetition_penalty_reward": -0.09814327582716942, |
| "rewards/tag_count_reward": 0.9179687649011612, |
| "step": 435 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.6197967529297, |
| "epoch": 0.6539182602174728, |
| "grad_norm": 50.76737216477434, |
| "kl": 0.66455078125, |
| "learning_rate": 6.435097177085728e-06, |
| "loss": 0.6146, |
| "reward": 2.2728312611579895, |
| "reward_std": 0.8381006345152855, |
| "rewards/accuracy_reward": 0.5052083432674408, |
| "rewards/reasoning_steps_reward": 0.9218750298023224, |
| "rewards/repetition_penalty_reward": -0.08914803247898817, |
| "rewards/tag_count_reward": 0.934895858168602, |
| "step": 436 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 190.6458396911621, |
| "epoch": 0.6554180727409074, |
| "grad_norm": 27.797869724896955, |
| "kl": 0.7705078125, |
| "learning_rate": 6.386144979902527e-06, |
| "loss": 0.3753, |
| "reward": 2.3050162196159363, |
| "reward_std": 0.77731654047966, |
| "rewards/accuracy_reward": 0.5208333432674408, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.07909846305847168, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 437 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.3697967529297, |
| "epoch": 0.6569178852643419, |
| "grad_norm": 117.99768545242851, |
| "kl": 1.0126953125, |
| "learning_rate": 6.337292189459139e-06, |
| "loss": 0.6907, |
| "reward": 2.48311048746109, |
| "reward_std": 0.833274632692337, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.91493059694767, |
| "rewards/repetition_penalty_reward": -0.10109082609415054, |
| "rewards/tag_count_reward": 0.9244791716337204, |
| "step": 438 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.05208587646484, |
| "epoch": 0.6584176977877765, |
| "grad_norm": 2775.548898191414, |
| "kl": 79.640625, |
| "learning_rate": 6.2885401495550826e-06, |
| "loss": 10.0176, |
| "reward": 2.4023959636688232, |
| "reward_std": 0.7233280688524246, |
| "rewards/accuracy_reward": 0.5937500298023224, |
| "rewards/reasoning_steps_reward": 0.935763880610466, |
| "rewards/repetition_penalty_reward": -0.0737327765673399, |
| "rewards/tag_count_reward": 0.9466145932674408, |
| "step": 439 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 211.5260467529297, |
| "epoch": 0.6599175103112112, |
| "grad_norm": 19900.674181692015, |
| "kl": 446.0, |
| "learning_rate": 6.239890201218517e-06, |
| "loss": 65.0693, |
| "reward": 2.270346522331238, |
| "reward_std": 0.8372077494859695, |
| "rewards/accuracy_reward": 0.5208333507180214, |
| "rewards/reasoning_steps_reward": 0.9114583730697632, |
| "rewards/repetition_penalty_reward": -0.08512232266366482, |
| "rewards/tag_count_reward": 0.923177108168602, |
| "step": 440 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.95313262939453, |
| "epoch": 0.6614173228346457, |
| "grad_norm": 1172.7691458595361, |
| "kl": 36.59375, |
| "learning_rate": 6.191343682669357e-06, |
| "loss": 5.7344, |
| "reward": 2.4517070651054382, |
| "reward_std": 0.7322164475917816, |
| "rewards/accuracy_reward": 0.6406250074505806, |
| "rewards/reasoning_steps_reward": 0.944444477558136, |
| "rewards/repetition_penalty_reward": -0.07867502607405186, |
| "rewards/tag_count_reward": 0.9453125149011612, |
| "step": 441 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 250.46355056762695, |
| "epoch": 0.6629171353580803, |
| "grad_norm": 105.35481274422847, |
| "kl": 1.66796875, |
| "learning_rate": 6.142901929282459e-06, |
| "loss": 0.852, |
| "reward": 2.3720561265945435, |
| "reward_std": 0.936857059597969, |
| "rewards/accuracy_reward": 0.6354166865348816, |
| "rewards/reasoning_steps_reward": 0.9166666865348816, |
| "rewards/repetition_penalty_reward": -0.10190227814018726, |
| "rewards/tag_count_reward": 0.921875, |
| "step": 442 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.20313262939453, |
| "epoch": 0.6644169478815148, |
| "grad_norm": 95.75169686366347, |
| "kl": 1.2333984375, |
| "learning_rate": 6.094566273550899e-06, |
| "loss": 0.6136, |
| "reward": 2.508533537387848, |
| "reward_std": 0.9894896894693375, |
| "rewards/accuracy_reward": 0.8072917014360428, |
| "rewards/reasoning_steps_reward": 0.8958333432674408, |
| "rewards/repetition_penalty_reward": -0.09823731146752834, |
| "rewards/tag_count_reward": 0.9036458432674408, |
| "step": 443 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 287.8229217529297, |
| "epoch": 0.6659167604049494, |
| "grad_norm": 62.198954440927515, |
| "kl": 3.91796875, |
| "learning_rate": 6.046338045049307e-06, |
| "loss": 1.0033, |
| "reward": 2.1896302700042725, |
| "reward_std": 1.0947272032499313, |
| "rewards/accuracy_reward": 0.5677083730697632, |
| "rewards/reasoning_steps_reward": 0.8802083432674408, |
| "rewards/repetition_penalty_reward": -0.14760945178568363, |
| "rewards/tag_count_reward": 0.8893229216337204, |
| "step": 444 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.73438262939453, |
| "epoch": 0.667416572928384, |
| "grad_norm": 624.3572181079038, |
| "kl": 32.46875, |
| "learning_rate": 5.998218570397298e-06, |
| "loss": 3.7004, |
| "reward": 2.489894211292267, |
| "reward_std": 0.7204201519489288, |
| "rewards/accuracy_reward": 0.6979166865348816, |
| "rewards/reasoning_steps_reward": 0.9340277761220932, |
| "rewards/repetition_penalty_reward": -0.08345651999115944, |
| "rewards/tag_count_reward": 0.9414062649011612, |
| "step": 445 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 192.6927146911621, |
| "epoch": 0.6689163854518185, |
| "grad_norm": 1291.0531241853187, |
| "kl": 58.40625, |
| "learning_rate": 5.950209173222985e-06, |
| "loss": 6.5055, |
| "reward": 2.347103238105774, |
| "reward_std": 1.0178454369306564, |
| "rewards/accuracy_reward": 0.666666679084301, |
| "rewards/reasoning_steps_reward": 0.8715278059244156, |
| "rewards/repetition_penalty_reward": -0.08432047069072723, |
| "rewards/tag_count_reward": 0.8932291865348816, |
| "step": 446 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 188.81250381469727, |
| "epoch": 0.6704161979752531, |
| "grad_norm": 167.1713247377813, |
| "kl": 12.703125, |
| "learning_rate": 5.902311174126565e-06, |
| "loss": 1.3266, |
| "reward": 2.487611711025238, |
| "reward_std": 0.847619041800499, |
| "rewards/accuracy_reward": 0.723958358168602, |
| "rewards/reasoning_steps_reward": 0.9184027910232544, |
| "rewards/repetition_penalty_reward": -0.07792646810412407, |
| "rewards/tag_count_reward": 0.9231770932674408, |
| "step": 447 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.70312881469727, |
| "epoch": 0.6719160104986877, |
| "grad_norm": 46.56836347829574, |
| "kl": 2.587890625, |
| "learning_rate": 5.854525890643996e-06, |
| "loss": 0.5503, |
| "reward": 2.271707057952881, |
| "reward_std": 0.9676771610975266, |
| "rewards/accuracy_reward": 0.583333358168602, |
| "rewards/reasoning_steps_reward": 0.880208358168602, |
| "rewards/repetition_penalty_reward": -0.09678260423243046, |
| "rewards/tag_count_reward": 0.9049479365348816, |
| "step": 448 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.70833587646484, |
| "epoch": 0.6734158230221222, |
| "grad_norm": 58.535345861262144, |
| "kl": 1.36328125, |
| "learning_rate": 5.806854637210752e-06, |
| "loss": 0.4487, |
| "reward": 2.4692699909210205, |
| "reward_std": 1.0627684146165848, |
| "rewards/accuracy_reward": 0.7812500149011612, |
| "rewards/reasoning_steps_reward": 0.8802083730697632, |
| "rewards/repetition_penalty_reward": -0.09583419561386108, |
| "rewards/tag_count_reward": 0.9036458432674408, |
| "step": 449 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.21875381469727, |
| "epoch": 0.6749156355455568, |
| "grad_norm": 44.853870574601515, |
| "kl": 2.22265625, |
| "learning_rate": 5.759298725125671e-06, |
| "loss": 0.57, |
| "reward": 2.438233256340027, |
| "reward_std": 0.9969596564769745, |
| "rewards/accuracy_reward": 0.739583358168602, |
| "rewards/reasoning_steps_reward": 0.8975694626569748, |
| "rewards/repetition_penalty_reward": -0.09475286118686199, |
| "rewards/tag_count_reward": 0.895833358168602, |
| "step": 450 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 181.16146850585938, |
| "epoch": 0.6764154480689913, |
| "grad_norm": 45.94535859885079, |
| "kl": 5.15234375, |
| "learning_rate": 5.711859462514883e-06, |
| "loss": 0.6531, |
| "reward": 2.3718193769454956, |
| "reward_std": 0.8176105469465256, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.9149305671453476, |
| "rewards/repetition_penalty_reward": -0.05483006127178669, |
| "rewards/tag_count_reward": 0.9179687649011612, |
| "step": 451 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 183.48958587646484, |
| "epoch": 0.6779152605924259, |
| "grad_norm": 54.34402687729516, |
| "kl": 7.81640625, |
| "learning_rate": 5.664538154295827e-06, |
| "loss": 0.6031, |
| "reward": 2.3453221917152405, |
| "reward_std": 0.8092229068279266, |
| "rewards/accuracy_reward": 0.604166679084301, |
| "rewards/reasoning_steps_reward": 0.8906250149011612, |
| "rewards/repetition_penalty_reward": -0.05702169891446829, |
| "rewards/tag_count_reward": 0.907552108168602, |
| "step": 452 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 177.88541793823242, |
| "epoch": 0.6794150731158605, |
| "grad_norm": 8.683387363521502, |
| "kl": 2.787109375, |
| "learning_rate": 5.617336102141356e-06, |
| "loss": 0.3205, |
| "reward": 2.4910064935684204, |
| "reward_std": 0.6290017366409302, |
| "rewards/accuracy_reward": 0.6354166865348816, |
| "rewards/reasoning_steps_reward": 0.9531250149011612, |
| "rewards/repetition_penalty_reward": -0.04545190371572971, |
| "rewards/tag_count_reward": 0.9479166865348816, |
| "step": 453 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 178.65625381469727, |
| "epoch": 0.6809148856392951, |
| "grad_norm": 21.261847254596837, |
| "kl": 1.23828125, |
| "learning_rate": 5.570254604443929e-06, |
| "loss": 0.0513, |
| "reward": 2.4800949692726135, |
| "reward_std": 0.9092780351638794, |
| "rewards/accuracy_reward": 0.7083333432674408, |
| "rewards/reasoning_steps_reward": 0.9149305820465088, |
| "rewards/repetition_penalty_reward": -0.052023096941411495, |
| "rewards/tag_count_reward": 0.9088541865348816, |
| "step": 454 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 189.70833587646484, |
| "epoch": 0.6824146981627297, |
| "grad_norm": 32.86231011635845, |
| "kl": 0.9814453125, |
| "learning_rate": 5.5232949562799055e-06, |
| "loss": 0.2151, |
| "reward": 2.370435267686844, |
| "reward_std": 0.8025718629360199, |
| "rewards/accuracy_reward": 0.6041666939854622, |
| "rewards/reasoning_steps_reward": 0.911458358168602, |
| "rewards/repetition_penalty_reward": -0.05143989436328411, |
| "rewards/tag_count_reward": 0.9062500298023224, |
| "step": 455 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 191.734375, |
| "epoch": 0.6839145106861643, |
| "grad_norm": 17.49572401123107, |
| "kl": 1.21484375, |
| "learning_rate": 5.4764584493739095e-06, |
| "loss": 0.0338, |
| "reward": 2.206531524658203, |
| "reward_std": 0.7957804501056671, |
| "rewards/accuracy_reward": 0.4375000223517418, |
| "rewards/reasoning_steps_reward": 0.913194477558136, |
| "rewards/repetition_penalty_reward": -0.05171508714556694, |
| "rewards/tag_count_reward": 0.907552108168602, |
| "step": 456 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.22916793823242, |
| "epoch": 0.6854143232095988, |
| "grad_norm": 12.624756226573414, |
| "kl": 2.28515625, |
| "learning_rate": 5.429746372063309e-06, |
| "loss": 0.2271, |
| "reward": 2.3943753242492676, |
| "reward_std": 0.7135017514228821, |
| "rewards/accuracy_reward": 0.5885416716337204, |
| "rewards/reasoning_steps_reward": 0.9270833432674408, |
| "rewards/repetition_penalty_reward": -0.04703101795166731, |
| "rewards/tag_count_reward": 0.9257812798023224, |
| "step": 457 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.81250381469727, |
| "epoch": 0.6869141357330334, |
| "grad_norm": 105.8474864914367, |
| "kl": 8.5908203125, |
| "learning_rate": 5.3831600092627704e-06, |
| "loss": 1.1342, |
| "reward": 2.7418383955955505, |
| "reward_std": 0.4850631505250931, |
| "rewards/accuracy_reward": 0.864583358168602, |
| "rewards/reasoning_steps_reward": 0.9670139104127884, |
| "rewards/repetition_penalty_reward": -0.04418608546257019, |
| "rewards/tag_count_reward": 0.954427108168602, |
| "step": 458 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 200.70833587646484, |
| "epoch": 0.688413948256468, |
| "grad_norm": 87.25261858275486, |
| "kl": 10.28515625, |
| "learning_rate": 5.336700642428913e-06, |
| "loss": 0.6609, |
| "reward": 2.498172700405121, |
| "reward_std": 0.5890463814139366, |
| "rewards/accuracy_reward": 0.5937500223517418, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.04219203256070614, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 459 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.2083396911621, |
| "epoch": 0.6899137607799025, |
| "grad_norm": 125.79936841970297, |
| "kl": 8.3828125, |
| "learning_rate": 5.290369549525066e-06, |
| "loss": 0.9341, |
| "reward": 2.5255751609802246, |
| "reward_std": 0.44053927063941956, |
| "rewards/accuracy_reward": 0.6197916865348816, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.03952907770872116, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 460 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 192.34375762939453, |
| "epoch": 0.6914135733033371, |
| "grad_norm": 32.0003121799561, |
| "kl": 5.546875, |
| "learning_rate": 5.2441680049861125e-06, |
| "loss": 0.4044, |
| "reward": 2.5426307916641235, |
| "reward_std": 0.570788636803627, |
| "rewards/accuracy_reward": 0.6979166865348816, |
| "rewards/reasoning_steps_reward": 0.942708358168602, |
| "rewards/repetition_penalty_reward": -0.039400530513376, |
| "rewards/tag_count_reward": 0.9414062649011612, |
| "step": 461 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 192.3854217529297, |
| "epoch": 0.6929133858267716, |
| "grad_norm": 1.8867372567328653, |
| "kl": 0.93701171875, |
| "learning_rate": 5.198097279683434e-06, |
| "loss": 0.0048, |
| "reward": 2.4953662753105164, |
| "reward_std": 0.4802846685051918, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9635416865348816, |
| "rewards/repetition_penalty_reward": -0.04239422548562288, |
| "rewards/tag_count_reward": 0.9648437649011612, |
| "step": 462 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 218.31250381469727, |
| "epoch": 0.6944131983502062, |
| "grad_norm": 14.34352821328907, |
| "kl": 2.4140625, |
| "learning_rate": 5.152158640889947e-06, |
| "loss": 0.1836, |
| "reward": 2.5026406049728394, |
| "reward_std": 0.5393296033143997, |
| "rewards/accuracy_reward": 0.6458333432674408, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.05595315434038639, |
| "rewards/tag_count_reward": 0.9544270932674408, |
| "step": 463 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 218.9270896911621, |
| "epoch": 0.6959130108736408, |
| "grad_norm": 2.647430771526064, |
| "kl": 0.92138671875, |
| "learning_rate": 5.106353352245254e-06, |
| "loss": -0.0398, |
| "reward": 2.567262351512909, |
| "reward_std": 0.44548412412405014, |
| "rewards/accuracy_reward": 0.6979166865348816, |
| "rewards/reasoning_steps_reward": 0.9635416865348816, |
| "rewards/repetition_penalty_reward": -0.05252937041223049, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 464 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.65105056762695, |
| "epoch": 0.6974128233970753, |
| "grad_norm": 1.81334431827224, |
| "kl": 0.81884765625, |
| "learning_rate": 5.060682673720878e-06, |
| "loss": -0.0188, |
| "reward": 2.3878976106643677, |
| "reward_std": 0.4217522442340851, |
| "rewards/accuracy_reward": 0.510416679084301, |
| "rewards/reasoning_steps_reward": 0.9670138955116272, |
| "rewards/repetition_penalty_reward": -0.05307460017502308, |
| "rewards/tag_count_reward": 0.9635416716337204, |
| "step": 465 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.82813262939453, |
| "epoch": 0.6989126359205099, |
| "grad_norm": 4.851401629836836, |
| "kl": 1.81005859375, |
| "learning_rate": 5.015147861585603e-06, |
| "loss": 0.0997, |
| "reward": 2.486442983150482, |
| "reward_std": 0.49670958518981934, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.9635416865348816, |
| "rewards/repetition_penalty_reward": -0.03959879372268915, |
| "rewards/tag_count_reward": 0.9635416716337204, |
| "step": 466 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 294.14583587646484, |
| "epoch": 0.7004124484439445, |
| "grad_norm": 9.019683071245192, |
| "kl": 1.919921875, |
| "learning_rate": 4.969750168370924e-06, |
| "loss": 0.2538, |
| "reward": 2.627793550491333, |
| "reward_std": 0.3618628829717636, |
| "rewards/accuracy_reward": 0.7239583432674408, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.0688210241496563, |
| "rewards/tag_count_reward": 0.9830729216337204, |
| "step": 467 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.39062881469727, |
| "epoch": 0.7019122609673791, |
| "grad_norm": 38.2303230093381, |
| "kl": 7.177734375, |
| "learning_rate": 4.924490842836584e-06, |
| "loss": 0.3121, |
| "reward": 2.486271381378174, |
| "reward_std": 0.3947894722223282, |
| "rewards/accuracy_reward": 0.5885416828095913, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.04497876111418009, |
| "rewards/tag_count_reward": 0.9687500149011612, |
| "step": 468 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 341.1302185058594, |
| "epoch": 0.7034120734908137, |
| "grad_norm": 12.42233946312446, |
| "kl": 1.4111328125, |
| "learning_rate": 4.879371129936233e-06, |
| "loss": 0.2385, |
| "reward": 2.7423893213272095, |
| "reward_std": 0.3288619890809059, |
| "rewards/accuracy_reward": 0.848958358168602, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.09094424080103636, |
| "rewards/tag_count_reward": 0.9895833432674408, |
| "step": 469 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.8645935058594, |
| "epoch": 0.7049118860142483, |
| "grad_norm": 4.8140326164405645, |
| "kl": 3.64794921875, |
| "learning_rate": 4.834392270783183e-06, |
| "loss": 0.0767, |
| "reward": 2.6093015670776367, |
| "reward_std": 0.49695973843336105, |
| "rewards/accuracy_reward": 0.7343750298023224, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.08080272283405066, |
| "rewards/tag_count_reward": 0.9713542014360428, |
| "step": 470 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 342.6979293823242, |
| "epoch": 0.7064116985376828, |
| "grad_norm": 2.1505682137010256, |
| "kl": 0.66015625, |
| "learning_rate": 4.789555502616258e-06, |
| "loss": 0.0636, |
| "reward": 2.6130124926567078, |
| "reward_std": 0.35396523028612137, |
| "rewards/accuracy_reward": 0.7239583730697632, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.07839393150061369, |
| "rewards/tag_count_reward": 0.9830729365348816, |
| "step": 471 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 393.6770935058594, |
| "epoch": 0.7079115110611174, |
| "grad_norm": 5.089873005092783, |
| "kl": 0.68115234375, |
| "learning_rate": 4.744862058765776e-06, |
| "loss": 0.1038, |
| "reward": 2.644687831401825, |
| "reward_std": 0.3760165199637413, |
| "rewards/accuracy_reward": 0.7812500149011612, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.10661418363451958, |
| "rewards/tag_count_reward": 0.9752604216337204, |
| "step": 472 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 348.5104217529297, |
| "epoch": 0.709411323584552, |
| "grad_norm": 2.613896254667977, |
| "kl": 0.80517578125, |
| "learning_rate": 4.700313168619608e-06, |
| "loss": 0.0745, |
| "reward": 2.535249173641205, |
| "reward_std": 0.3934590071439743, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.08975092135369778, |
| "rewards/tag_count_reward": 0.9739583432674408, |
| "step": 473 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 342.15625762939453, |
| "epoch": 0.7109111361079865, |
| "grad_norm": 0.927779678297728, |
| "kl": 1.31298828125, |
| "learning_rate": 4.655910057589377e-06, |
| "loss": 0.0322, |
| "reward": 2.5929868817329407, |
| "reward_std": 0.37964776903390884, |
| "rewards/accuracy_reward": 0.7291666716337204, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.07498200424015522, |
| "rewards/tag_count_reward": 0.96484375, |
| "step": 474 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 286.5677185058594, |
| "epoch": 0.7124109486314211, |
| "grad_norm": 3.0029493567846113, |
| "kl": 1.36328125, |
| "learning_rate": 4.611653947076732e-06, |
| "loss": 0.1304, |
| "reward": 2.478628635406494, |
| "reward_std": 0.5730241611599922, |
| "rewards/accuracy_reward": 0.6197916865348816, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.059131866320967674, |
| "rewards/tag_count_reward": 0.938802108168602, |
| "step": 475 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 287.01563262939453, |
| "epoch": 0.7139107611548556, |
| "grad_norm": 15.492730466605252, |
| "kl": 4.71484375, |
| "learning_rate": 4.567546054439777e-06, |
| "loss": 0.2672, |
| "reward": 2.366079330444336, |
| "reward_std": 0.7147725075483322, |
| "rewards/accuracy_reward": 0.5885416865348816, |
| "rewards/reasoning_steps_reward": 0.9756944626569748, |
| "rewards/repetition_penalty_reward": -0.06143816187977791, |
| "rewards/tag_count_reward": 0.8632812649011612, |
| "step": 476 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.7343864440918, |
| "epoch": 0.7154105736782902, |
| "grad_norm": 49.985480375628754, |
| "kl": 8.77734375, |
| "learning_rate": 4.523587592959557e-06, |
| "loss": 0.6553, |
| "reward": 2.199244737625122, |
| "reward_std": 0.696681559085846, |
| "rewards/accuracy_reward": 0.463541679084301, |
| "rewards/reasoning_steps_reward": 0.9704861044883728, |
| "rewards/repetition_penalty_reward": -0.07853319868445396, |
| "rewards/tag_count_reward": 0.8437500298023224, |
| "step": 477 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.0104293823242, |
| "epoch": 0.7169103862017248, |
| "grad_norm": 9.130997990308742, |
| "kl": 2.22802734375, |
| "learning_rate": 4.479779771806699e-06, |
| "loss": 0.2573, |
| "reward": 2.621893048286438, |
| "reward_std": 0.5179613158106804, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.05779455881565809, |
| "rewards/tag_count_reward": 0.9557292014360428, |
| "step": 478 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 250.98958587646484, |
| "epoch": 0.7184101987251593, |
| "grad_norm": 1.642461450377282, |
| "kl": 1.078125, |
| "learning_rate": 4.436123796008149e-06, |
| "loss": 0.081, |
| "reward": 2.540480315685272, |
| "reward_std": 0.4395308271050453, |
| "rewards/accuracy_reward": 0.6406250149011612, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.046759432181715965, |
| "rewards/tag_count_reward": 0.9674479216337204, |
| "step": 479 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.43750762939453, |
| "epoch": 0.7199100112485939, |
| "grad_norm": 2.118847794368944, |
| "kl": 0.8173828125, |
| "learning_rate": 4.392620866414026e-06, |
| "loss": 0.0134, |
| "reward": 2.5877268314361572, |
| "reward_std": 0.45883308351039886, |
| "rewards/accuracy_reward": 0.697916679084301, |
| "rewards/reasoning_steps_reward": 0.9826389104127884, |
| "rewards/repetition_penalty_reward": -0.05246431287378073, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 480 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 263.8698043823242, |
| "epoch": 0.7214098237720284, |
| "grad_norm": 2.650392492259305, |
| "kl": 0.63818359375, |
| "learning_rate": 4.349272179664586e-06, |
| "loss": 0.0475, |
| "reward": 2.623681426048279, |
| "reward_std": 0.4385247528553009, |
| "rewards/accuracy_reward": 0.7083333432674408, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.04689165949821472, |
| "rewards/tag_count_reward": 0.977864608168602, |
| "step": 481 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 260.54687881469727, |
| "epoch": 0.7229096362954631, |
| "grad_norm": 1.8488154063721305, |
| "kl": 1.2900390625, |
| "learning_rate": 4.3060789281573135e-06, |
| "loss": 0.0759, |
| "reward": 2.620662033557892, |
| "reward_std": 0.5110099017620087, |
| "rewards/accuracy_reward": 0.755208358168602, |
| "rewards/reasoning_steps_reward": 0.9635416716337204, |
| "rewards/repetition_penalty_reward": -0.053817191161215305, |
| "rewards/tag_count_reward": 0.9557291716337204, |
| "step": 482 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 236.7708396911621, |
| "epoch": 0.7244094488188977, |
| "grad_norm": 0.8685220289150081, |
| "kl": 0.9541015625, |
| "learning_rate": 4.263042300014112e-06, |
| "loss": 0.0159, |
| "reward": 2.6224602460861206, |
| "reward_std": 0.432863712310791, |
| "rewards/accuracy_reward": 0.7135416865348816, |
| "rewards/reasoning_steps_reward": 0.975694477558136, |
| "rewards/repetition_penalty_reward": -0.036828051786869764, |
| "rewards/tag_count_reward": 0.9700520932674408, |
| "step": 483 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.7604217529297, |
| "epoch": 0.7259092613423322, |
| "grad_norm": 8.634827245487672, |
| "kl": 2.59814453125, |
| "learning_rate": 4.220163479048632e-06, |
| "loss": 0.0962, |
| "reward": 2.589983582496643, |
| "reward_std": 0.55703204870224, |
| "rewards/accuracy_reward": 0.7239583507180214, |
| "rewards/reasoning_steps_reward": 0.9548611491918564, |
| "rewards/repetition_penalty_reward": -0.039356810972094536, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 484 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 243.9947967529297, |
| "epoch": 0.7274090738657668, |
| "grad_norm": 2.8247050710119654, |
| "kl": 1.40234375, |
| "learning_rate": 4.177443644733699e-06, |
| "loss": 0.0324, |
| "reward": 2.7192403078079224, |
| "reward_std": 0.2972189523279667, |
| "rewards/accuracy_reward": 0.8020833432674408, |
| "rewards/reasoning_steps_reward": 0.984375, |
| "rewards/repetition_penalty_reward": -0.04117643600329757, |
| "rewards/tag_count_reward": 0.973958358168602, |
| "step": 485 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.07291793823242, |
| "epoch": 0.7289088863892014, |
| "grad_norm": 24.101080804460526, |
| "kl": 4.90234375, |
| "learning_rate": 4.134883972168877e-06, |
| "loss": 0.3001, |
| "reward": 2.681569218635559, |
| "reward_std": 0.5304646193981171, |
| "rewards/accuracy_reward": 0.802083358168602, |
| "rewards/reasoning_steps_reward": 0.9618055671453476, |
| "rewards/repetition_penalty_reward": -0.03935106098651886, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 486 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.77604293823242, |
| "epoch": 0.7304086989126359, |
| "grad_norm": 9.052170965793191, |
| "kl": 2.7578125, |
| "learning_rate": 4.092485632048142e-06, |
| "loss": 0.2795, |
| "reward": 2.5998495221138, |
| "reward_std": 0.3634401187300682, |
| "rewards/accuracy_reward": 0.6718750074505806, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.031660950277000666, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 487 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 257.9947967529297, |
| "epoch": 0.7319085114360705, |
| "grad_norm": 1.4255147156673067, |
| "kl": 1.27685546875, |
| "learning_rate": 4.050249790627675e-06, |
| "loss": 0.1139, |
| "reward": 2.5903435349464417, |
| "reward_std": 0.47754330188035965, |
| "rewards/accuracy_reward": 0.677083358168602, |
| "rewards/reasoning_steps_reward": 0.9809027910232544, |
| "rewards/repetition_penalty_reward": -0.04029886703938246, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 488 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 243.2864646911621, |
| "epoch": 0.7334083239595051, |
| "grad_norm": 8.575587858559526, |
| "kl": 3.0546875, |
| "learning_rate": 4.008177609693791e-06, |
| "loss": 0.0417, |
| "reward": 2.663625717163086, |
| "reward_std": 0.5341070555150509, |
| "rewards/accuracy_reward": 0.7812500149011612, |
| "rewards/reasoning_steps_reward": 0.960069477558136, |
| "rewards/repetition_penalty_reward": -0.03342306334525347, |
| "rewards/tag_count_reward": 0.9557292014360428, |
| "step": 489 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 245.7395896911621, |
| "epoch": 0.7349081364829396, |
| "grad_norm": 2.765671602090238, |
| "kl": 0.9609375, |
| "learning_rate": 3.966270246530975e-06, |
| "loss": 0.0215, |
| "reward": 2.6076095700263977, |
| "reward_std": 0.4131758604198694, |
| "rewards/accuracy_reward": 0.692708358168602, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.03561965608969331, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 490 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.2135543823242, |
| "epoch": 0.7364079490063742, |
| "grad_norm": 2.188660242549928, |
| "kl": 0.86328125, |
| "learning_rate": 3.924528853890046e-06, |
| "loss": 0.0266, |
| "reward": 2.5408560633659363, |
| "reward_std": 0.4740516468882561, |
| "rewards/accuracy_reward": 0.6718750223517418, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.05159200169146061, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 491 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 270.3020935058594, |
| "epoch": 0.7379077615298087, |
| "grad_norm": 3.1759937413803816, |
| "kl": 0.900390625, |
| "learning_rate": 3.882954579956455e-06, |
| "loss": 0.001, |
| "reward": 2.583053767681122, |
| "reward_std": 0.4794854000210762, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.043248409405350685, |
| "rewards/tag_count_reward": 0.9648437798023224, |
| "step": 492 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 282.2239761352539, |
| "epoch": 0.7394075740532433, |
| "grad_norm": 2.975773686014791, |
| "kl": 1.705078125, |
| "learning_rate": 3.841548568318706e-06, |
| "loss": 0.0655, |
| "reward": 2.4317139387130737, |
| "reward_std": 0.6113808900117874, |
| "rewards/accuracy_reward": 0.5885416716337204, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.048320941627025604, |
| "rewards/tag_count_reward": 0.9348958432674408, |
| "step": 493 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.14583587646484, |
| "epoch": 0.7409073865766779, |
| "grad_norm": 1.441571062260193, |
| "kl": 1.27587890625, |
| "learning_rate": 3.8003119579368806e-06, |
| "loss": 0.0064, |
| "reward": 2.4567288756370544, |
| "reward_std": 0.39962563663721085, |
| "rewards/accuracy_reward": 0.541666679084301, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.03285460267215967, |
| "rewards/tag_count_reward": 0.9739583432674408, |
| "step": 494 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 259.42188262939453, |
| "epoch": 0.7424071991001124, |
| "grad_norm": 2.6425323421375118, |
| "kl": 0.888671875, |
| "learning_rate": 3.7592458831113256e-06, |
| "loss": 0.0337, |
| "reward": 2.3374595046043396, |
| "reward_std": 0.3100433573126793, |
| "rewards/accuracy_reward": 0.38541668839752674, |
| "rewards/reasoning_steps_reward": 1.0, |
| "rewards/repetition_penalty_reward": -0.04144685994833708, |
| "rewards/tag_count_reward": 0.9934895932674408, |
| "step": 495 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 285.37500762939453, |
| "epoch": 0.7439070116235471, |
| "grad_norm": 1.6657973548432092, |
| "kl": 1.005859375, |
| "learning_rate": 3.718351473451448e-06, |
| "loss": 0.0419, |
| "reward": 2.6762454509735107, |
| "reward_std": 0.2155936686322093, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.04380669817328453, |
| "rewards/tag_count_reward": 0.9804687649011612, |
| "step": 496 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 288.8593864440918, |
| "epoch": 0.7454068241469817, |
| "grad_norm": 0.7716130912654924, |
| "kl": 0.6650390625, |
| "learning_rate": 3.6776298538446307e-06, |
| "loss": 0.0417, |
| "reward": 2.51417738199234, |
| "reward_std": 0.32952259480953217, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.04962483886629343, |
| "rewards/tag_count_reward": 0.9804687798023224, |
| "step": 497 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 310.2760543823242, |
| "epoch": 0.7469066366704162, |
| "grad_norm": 5.134089109133943, |
| "kl": 2.931640625, |
| "learning_rate": 3.6370821444253112e-06, |
| "loss": 0.0652, |
| "reward": 2.4335711002349854, |
| "reward_std": 0.3948415219783783, |
| "rewards/accuracy_reward": 0.5572916865348816, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.050803929567337036, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 498 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 295.4270935058594, |
| "epoch": 0.7484064491938508, |
| "grad_norm": 2.574704298931235, |
| "kl": 1.828125, |
| "learning_rate": 3.5967094605441545e-06, |
| "loss": -0.0396, |
| "reward": 2.6697959899902344, |
| "reward_std": 0.49928344041109085, |
| "rewards/accuracy_reward": 0.7968750149011612, |
| "rewards/reasoning_steps_reward": 0.9670139253139496, |
| "rewards/repetition_penalty_reward": -0.05112431012094021, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 499 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.76563262939453, |
| "epoch": 0.7499062617172854, |
| "grad_norm": 1.121236474277334, |
| "kl": 0.8974609375, |
| "learning_rate": 3.5565129127373765e-06, |
| "loss": -0.0091, |
| "reward": 2.7433581352233887, |
| "reward_std": 0.39212197065353394, |
| "rewards/accuracy_reward": 0.8229166865348816, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.04049617797136307, |
| "rewards/tag_count_reward": 0.9765625149011612, |
| "step": 500 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.3906364440918, |
| "epoch": 0.7514060742407199, |
| "grad_norm": 0.7018570637816092, |
| "kl": 1.29638671875, |
| "learning_rate": 3.5164936066961984e-06, |
| "loss": -0.0223, |
| "reward": 2.6698635816574097, |
| "reward_std": 0.3935448888223618, |
| "rewards/accuracy_reward": 0.7760416865348816, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.03846996137872338, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 501 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 314.0052185058594, |
| "epoch": 0.7529058867641545, |
| "grad_norm": 1.1907324808542725, |
| "kl": 0.833984375, |
| "learning_rate": 3.476652643236431e-06, |
| "loss": 0.0334, |
| "reward": 2.6077919006347656, |
| "reward_std": 0.3832826167345047, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.05757272336632013, |
| "rewards/tag_count_reward": 0.962239608168602, |
| "step": 502 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 317.2448043823242, |
| "epoch": 0.754405699287589, |
| "grad_norm": 1.0569480071148656, |
| "kl": 1.544921875, |
| "learning_rate": 3.436991118268195e-06, |
| "loss": 0.0521, |
| "reward": 2.522133708000183, |
| "reward_std": 0.4893868714570999, |
| "rewards/accuracy_reward": 0.6562500223517418, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.06119971442967653, |
| "rewards/tag_count_reward": 0.9531250298023224, |
| "step": 503 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 301.3802146911621, |
| "epoch": 0.7559055118110236, |
| "grad_norm": 5.910389855911775, |
| "kl": 11.95703125, |
| "learning_rate": 3.3975101227657726e-06, |
| "loss": -0.1243, |
| "reward": 2.3981975317001343, |
| "reward_std": 0.6437404751777649, |
| "rewards/accuracy_reward": 0.5468750223517418, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.05102120712399483, |
| "rewards/tag_count_reward": 0.9440104514360428, |
| "step": 504 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 340.42189025878906, |
| "epoch": 0.7574053243344582, |
| "grad_norm": 12.978062482129692, |
| "kl": 16.43701171875, |
| "learning_rate": 3.3582107427376044e-06, |
| "loss": -0.0418, |
| "reward": 2.647988438606262, |
| "reward_std": 0.5043715462088585, |
| "rewards/accuracy_reward": 0.770833358168602, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.057740709744393826, |
| "rewards/tag_count_reward": 0.9609375149011612, |
| "step": 505 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 349.92188262939453, |
| "epoch": 0.7589051368578927, |
| "grad_norm": 6.073801480071967, |
| "kl": 1.833984375, |
| "learning_rate": 3.3190940591964094e-06, |
| "loss": 0.0047, |
| "reward": 2.440945327281952, |
| "reward_std": 0.4255194067955017, |
| "rewards/accuracy_reward": 0.5781250149011612, |
| "rewards/reasoning_steps_reward": 0.972222238779068, |
| "rewards/repetition_penalty_reward": -0.07554787117987871, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 506 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 413.4791793823242, |
| "epoch": 0.7604049493813273, |
| "grad_norm": 2.37952801915241, |
| "kl": 4.4140625, |
| "learning_rate": 3.2801611481294538e-06, |
| "loss": 0.0238, |
| "reward": 2.686852276325226, |
| "reward_std": 0.3788864966481924, |
| "rewards/accuracy_reward": 0.8385416716337204, |
| "rewards/reasoning_steps_reward": 0.9774305820465088, |
| "rewards/repetition_penalty_reward": -0.08875534310936928, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 507 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 381.5520935058594, |
| "epoch": 0.7619047619047619, |
| "grad_norm": 0.9844048707484628, |
| "kl": 1.52587890625, |
| "learning_rate": 3.2414130804689492e-06, |
| "loss": 0.0616, |
| "reward": 2.4849266409873962, |
| "reward_std": 0.44034913927316666, |
| "rewards/accuracy_reward": 0.6093750074505806, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.06845879275351763, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 508 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 337.81251525878906, |
| "epoch": 0.7634045744281964, |
| "grad_norm": 0.8845374524712641, |
| "kl": 1.2822265625, |
| "learning_rate": 3.202850922062607e-06, |
| "loss": -0.0007, |
| "reward": 2.6837249398231506, |
| "reward_std": 0.5106394588947296, |
| "rewards/accuracy_reward": 0.8125000298023224, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.06106678955256939, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 509 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 366.96875762939453, |
| "epoch": 0.7649043869516311, |
| "grad_norm": 0.6486204612451865, |
| "kl": 1.52978515625, |
| "learning_rate": 3.1644757336443023e-06, |
| "loss": -0.0686, |
| "reward": 2.4383880496025085, |
| "reward_std": 0.602533221244812, |
| "rewards/accuracy_reward": 0.609375, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.06551818642765284, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 510 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 355.81250762939453, |
| "epoch": 0.7664041994750657, |
| "grad_norm": 0.8344794010535831, |
| "kl": 0.908203125, |
| "learning_rate": 3.126288570804906e-06, |
| "loss": 0.0427, |
| "reward": 2.516266345977783, |
| "reward_std": 0.5227550566196442, |
| "rewards/accuracy_reward": 0.635416679084301, |
| "rewards/reasoning_steps_reward": 0.9826388955116272, |
| "rewards/repetition_penalty_reward": -0.05882055405527353, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 511 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 365.3385467529297, |
| "epoch": 0.7679040119985002, |
| "grad_norm": 159.77587834072062, |
| "kl": 10.79296875, |
| "learning_rate": 3.0882904839632476e-06, |
| "loss": 0.6306, |
| "reward": 2.5336210131645203, |
| "reward_std": 0.548546776175499, |
| "rewards/accuracy_reward": 0.6718750149011612, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.06143125705420971, |
| "rewards/tag_count_reward": 0.9440104365348816, |
| "step": 512 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 364.0677185058594, |
| "epoch": 0.7694038245219348, |
| "grad_norm": 45.25299778164268, |
| "kl": 2.84375, |
| "learning_rate": 3.050482518337221e-06, |
| "loss": 0.1528, |
| "reward": 2.5458881855010986, |
| "reward_std": 0.5097367987036705, |
| "rewards/accuracy_reward": 0.6927083432674408, |
| "rewards/reasoning_steps_reward": 0.9687500298023224, |
| "rewards/repetition_penalty_reward": -0.06739312317222357, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 513 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 346.5416793823242, |
| "epoch": 0.7709036370453693, |
| "grad_norm": 1.324438735835731, |
| "kl": 1.3828125, |
| "learning_rate": 3.012865713915033e-06, |
| "loss": -0.0002, |
| "reward": 2.548605740070343, |
| "reward_std": 0.4542975649237633, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.05686304159462452, |
| "rewards/tag_count_reward": 0.9440104365348816, |
| "step": 514 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 372.2552261352539, |
| "epoch": 0.7724034495688039, |
| "grad_norm": 1.3346884321616983, |
| "kl": 1.03759765625, |
| "learning_rate": 2.9754411054265966e-06, |
| "loss": 0.033, |
| "reward": 2.5564919114112854, |
| "reward_std": 0.36123840510845184, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.05809137877076864, |
| "rewards/tag_count_reward": 0.9687500298023224, |
| "step": 515 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 350.9791717529297, |
| "epoch": 0.7739032620922385, |
| "grad_norm": 4.616902608747506, |
| "kl": 3.259765625, |
| "learning_rate": 2.9382097223150675e-06, |
| "loss": 0.0779, |
| "reward": 2.524458348751068, |
| "reward_std": 0.5577153712511063, |
| "rewards/accuracy_reward": 0.677083358168602, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.0549686960875988, |
| "rewards/tag_count_reward": 0.9335937649011612, |
| "step": 516 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 437.5729217529297, |
| "epoch": 0.775403074615673, |
| "grad_norm": 2.0004592558673195, |
| "kl": 0.81689453125, |
| "learning_rate": 2.9011725887085286e-06, |
| "loss": 0.0369, |
| "reward": 2.481614410877228, |
| "reward_std": 0.5043549910187721, |
| "rewards/accuracy_reward": 0.6093750298023224, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.07177106104791164, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 517 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 333.14063262939453, |
| "epoch": 0.7769028871391076, |
| "grad_norm": 2.055433226987207, |
| "kl": 2.642578125, |
| "learning_rate": 2.8643307233918192e-06, |
| "loss": 0.0882, |
| "reward": 2.5522547364234924, |
| "reward_std": 0.5509222820401192, |
| "rewards/accuracy_reward": 0.708333358168602, |
| "rewards/reasoning_steps_reward": 0.9618055820465088, |
| "rewards/repetition_penalty_reward": -0.05408210679888725, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 518 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 391.0208435058594, |
| "epoch": 0.7784026996625422, |
| "grad_norm": 4.053898086811571, |
| "kl": 2.04736328125, |
| "learning_rate": 2.827685139778511e-06, |
| "loss": 0.0504, |
| "reward": 2.5092581510543823, |
| "reward_std": 0.5159156918525696, |
| "rewards/accuracy_reward": 0.6354166716337204, |
| "rewards/reasoning_steps_reward": 0.9861111342906952, |
| "rewards/repetition_penalty_reward": -0.06148845702409744, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 519 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 337.0885543823242, |
| "epoch": 0.7799025121859767, |
| "grad_norm": 0.8528892895209994, |
| "kl": 1.2861328125, |
| "learning_rate": 2.7912368458830295e-06, |
| "loss": 0.0657, |
| "reward": 2.51828670501709, |
| "reward_std": 0.46771006286144257, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.05463008023798466, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 520 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 324.42708587646484, |
| "epoch": 0.7814023247094113, |
| "grad_norm": 1.5995624359002987, |
| "kl": 1.50927734375, |
| "learning_rate": 2.7549868442929286e-06, |
| "loss": 0.0114, |
| "reward": 2.534961521625519, |
| "reward_std": 0.42034848034381866, |
| "rewards/accuracy_reward": 0.6510416716337204, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.04706980008631945, |
| "rewards/tag_count_reward": 0.946614608168602, |
| "step": 521 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 343.7395935058594, |
| "epoch": 0.7829021372328459, |
| "grad_norm": 2.8590022801404706, |
| "kl": 1.525390625, |
| "learning_rate": 2.7189361321413144e-06, |
| "loss": 0.1087, |
| "reward": 2.500767946243286, |
| "reward_std": 0.4306958243250847, |
| "rewards/accuracy_reward": 0.6041666865348816, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05044758692383766, |
| "rewards/tag_count_reward": 0.9661458432674408, |
| "step": 522 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 353.6145935058594, |
| "epoch": 0.7844019497562804, |
| "grad_norm": 0.9462589535923942, |
| "kl": 1.6015625, |
| "learning_rate": 2.683085701079412e-06, |
| "loss": 0.0087, |
| "reward": 2.624489724636078, |
| "reward_std": 0.46847573667764664, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05563185177743435, |
| "rewards/tag_count_reward": 0.9544270932674408, |
| "step": 523 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.2135467529297, |
| "epoch": 0.7859017622797151, |
| "grad_norm": 0.6580260031686143, |
| "kl": 1.3935546875, |
| "learning_rate": 2.647436537249294e-06, |
| "loss": -0.0381, |
| "reward": 2.4149619936943054, |
| "reward_std": 0.39340294152498245, |
| "rewards/accuracy_reward": 0.5052083544433117, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.036860992200672626, |
| "rewards/tag_count_reward": 0.9674479365348816, |
| "step": 524 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 323.4114761352539, |
| "epoch": 0.7874015748031497, |
| "grad_norm": 1.021591339198619, |
| "kl": 1.7890625, |
| "learning_rate": 2.611989621256745e-06, |
| "loss": 0.0141, |
| "reward": 2.434917449951172, |
| "reward_std": 0.5576092228293419, |
| "rewards/accuracy_reward": 0.588541679084301, |
| "rewards/reasoning_steps_reward": 0.9583333730697632, |
| "rewards/repetition_penalty_reward": -0.05075975460931659, |
| "rewards/tag_count_reward": 0.938802108168602, |
| "step": 525 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.5000114440918, |
| "epoch": 0.7889013873265842, |
| "grad_norm": 2.0873915039216073, |
| "kl": 2.01171875, |
| "learning_rate": 2.5767459281443064e-06, |
| "loss": -0.0672, |
| "reward": 2.2912232875823975, |
| "reward_std": 0.42987949773669243, |
| "rewards/accuracy_reward": 0.42708335584029555, |
| "rewards/reasoning_steps_reward": 0.9652778059244156, |
| "rewards/repetition_penalty_reward": -0.036033790558576584, |
| "rewards/tag_count_reward": 0.9348958432674408, |
| "step": 526 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 308.4895896911621, |
| "epoch": 0.7904011998500188, |
| "grad_norm": 1.7291159865969803, |
| "kl": 1.93310546875, |
| "learning_rate": 2.541706427364431e-06, |
| "loss": -0.0153, |
| "reward": 2.6689711213111877, |
| "reward_std": 0.5232843309640884, |
| "rewards/accuracy_reward": 0.7760416716337204, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.04500464163720608, |
| "rewards/tag_count_reward": 0.9570312798023224, |
| "step": 527 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.5572967529297, |
| "epoch": 0.7919010123734533, |
| "grad_norm": 1.784246342813351, |
| "kl": 0.90576171875, |
| "learning_rate": 2.506872082752834e-06, |
| "loss": 0.0971, |
| "reward": 2.664152204990387, |
| "reward_std": 0.30452917888760567, |
| "rewards/accuracy_reward": 0.7239583432674408, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.03376449551433325, |
| "rewards/tag_count_reward": 0.9791666865348816, |
| "step": 528 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 325.4791793823242, |
| "epoch": 0.7934008248968879, |
| "grad_norm": 3.266992308801891, |
| "kl": 1.8212890625, |
| "learning_rate": 2.4722438525019764e-06, |
| "loss": 0.0034, |
| "reward": 2.676040768623352, |
| "reward_std": 0.44805118814110756, |
| "rewards/accuracy_reward": 0.770833358168602, |
| "rewards/reasoning_steps_reward": 0.9878472238779068, |
| "rewards/repetition_penalty_reward": -0.05399402230978012, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 529 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 258.14063262939453, |
| "epoch": 0.7949006374203225, |
| "grad_norm": 2.2367523347428366, |
| "kl": 2.3408203125, |
| "learning_rate": 2.4378226891347056e-06, |
| "loss": -0.0131, |
| "reward": 2.6123253107070923, |
| "reward_std": 0.4946172907948494, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.03611219022423029, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 530 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 357.5104293823242, |
| "epoch": 0.796400449943757, |
| "grad_norm": 2.0536966546593494, |
| "kl": 2.0966796875, |
| "learning_rate": 2.403609539478056e-06, |
| "loss": 0.015, |
| "reward": 2.6887767910957336, |
| "reward_std": 0.43632883578538895, |
| "rewards/accuracy_reward": 0.8072917014360428, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.06382743921130896, |
| "rewards/tag_count_reward": 0.9557291716337204, |
| "step": 531 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.60937881469727, |
| "epoch": 0.7979002624671916, |
| "grad_norm": 68.06193885878405, |
| "kl": 3.470703125, |
| "learning_rate": 2.3696053446372026e-06, |
| "loss": 0.1695, |
| "reward": 2.4727484583854675, |
| "reward_std": 0.5058952420949936, |
| "rewards/accuracy_reward": 0.5833333432674408, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.04721691645681858, |
| "rewards/tag_count_reward": 0.9557291865348816, |
| "step": 532 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 283.70833587646484, |
| "epoch": 0.7994000749906262, |
| "grad_norm": 7.175346457591991, |
| "kl": 2.5625, |
| "learning_rate": 2.3358110399695788e-06, |
| "loss": 0.1764, |
| "reward": 2.499284565448761, |
| "reward_std": 0.42855924367904663, |
| "rewards/accuracy_reward": 0.5989583507180214, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.04108010325580835, |
| "rewards/tag_count_reward": 0.9622396230697632, |
| "step": 533 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.94792556762695, |
| "epoch": 0.8008998875140607, |
| "grad_norm": 1.6428729718898072, |
| "kl": 0.5927734375, |
| "learning_rate": 2.302227555059141e-06, |
| "loss": 0.0496, |
| "reward": 2.5920801758766174, |
| "reward_std": 0.3266643173992634, |
| "rewards/accuracy_reward": 0.6458333432674408, |
| "rewards/reasoning_steps_reward": 1.0, |
| "rewards/repetition_penalty_reward": -0.03422196349129081, |
| "rewards/tag_count_reward": 0.9804687798023224, |
| "step": 534 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 309.5833396911621, |
| "epoch": 0.8023997000374953, |
| "grad_norm": 47.540127693483335, |
| "kl": 4.4296875, |
| "learning_rate": 2.2688558136908025e-06, |
| "loss": 0.0615, |
| "reward": 2.5951295495033264, |
| "reward_std": 0.41650524735450745, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.04940184485167265, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 535 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 260.6510467529297, |
| "epoch": 0.8038995125609298, |
| "grad_norm": 15.375690774431247, |
| "kl": 17.03125, |
| "learning_rate": 2.2356967338250223e-06, |
| "loss": 0.1811, |
| "reward": 2.492598533630371, |
| "reward_std": 0.4195599779486656, |
| "rewards/accuracy_reward": 0.5989583507180214, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.03604749217629433, |
| "rewards/tag_count_reward": 0.9557291865348816, |
| "step": 536 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 292.3854217529297, |
| "epoch": 0.8053993250843644, |
| "grad_norm": 9.475530371016637, |
| "kl": 2.486328125, |
| "learning_rate": 2.202751227572556e-06, |
| "loss": 0.0533, |
| "reward": 2.648472845554352, |
| "reward_std": 0.3912891000509262, |
| "rewards/accuracy_reward": 0.7343750298023224, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.050745952408760786, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 537 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 315.9739761352539, |
| "epoch": 0.8068991376077991, |
| "grad_norm": 1.4250516842234306, |
| "kl": 1.0771484375, |
| "learning_rate": 2.1700202011693573e-06, |
| "loss": 0.0594, |
| "reward": 2.7068479657173157, |
| "reward_std": 0.3395584188401699, |
| "rewards/accuracy_reward": 0.7968750149011612, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.05487091187387705, |
| "rewards/tag_count_reward": 0.9752604514360428, |
| "step": 538 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.8489646911621, |
| "epoch": 0.8083989501312336, |
| "grad_norm": 3.4214757569933, |
| "kl": 1.81640625, |
| "learning_rate": 2.1375045549516636e-06, |
| "loss": 0.0096, |
| "reward": 2.5113611817359924, |
| "reward_std": 0.5310123562812805, |
| "rewards/accuracy_reward": 0.6250000149011612, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.044628492556512356, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 539 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.57292556762695, |
| "epoch": 0.8098987626546682, |
| "grad_norm": 2.1241861424196147, |
| "kl": 1.56494140625, |
| "learning_rate": 2.105205183331224e-06, |
| "loss": 0.0114, |
| "reward": 2.7599127888679504, |
| "reward_std": 0.18678605183959007, |
| "rewards/accuracy_reward": 0.8333333432674408, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.04477462079375982, |
| "rewards/tag_count_reward": 0.9817708432674408, |
| "step": 540 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 331.3385543823242, |
| "epoch": 0.8113985751781028, |
| "grad_norm": 1.5714564014928512, |
| "kl": 1.33203125, |
| "learning_rate": 2.0731229747706926e-06, |
| "loss": 0.0368, |
| "reward": 2.4418932795524597, |
| "reward_std": 0.4528404325246811, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.058106820564717054, |
| "rewards/tag_count_reward": 0.9635416865348816, |
| "step": 541 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 344.2760467529297, |
| "epoch": 0.8128983877015373, |
| "grad_norm": 0.7740942337468878, |
| "kl": 1.365234375, |
| "learning_rate": 2.041258811759195e-06, |
| "loss": -0.0408, |
| "reward": 2.485717535018921, |
| "reward_std": 0.3474634326994419, |
| "rewards/accuracy_reward": 0.5937500111758709, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.07417836599051952, |
| "rewards/tag_count_reward": 0.9765625149011612, |
| "step": 542 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.31771850585938, |
| "epoch": 0.8143982002249719, |
| "grad_norm": 2.0654717881411986, |
| "kl": 1.12646484375, |
| "learning_rate": 2.009613570788057e-06, |
| "loss": 0.1574, |
| "reward": 2.770683705806732, |
| "reward_std": 0.2916657757014036, |
| "rewards/accuracy_reward": 0.8177083730697632, |
| "rewards/reasoning_steps_reward": 1.0, |
| "rewards/repetition_penalty_reward": -0.03660803521052003, |
| "rewards/tag_count_reward": 0.9895833432674408, |
| "step": 543 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 298.3854217529297, |
| "epoch": 0.8158980127484065, |
| "grad_norm": 3.1495657832552553, |
| "kl": 2.18017578125, |
| "learning_rate": 1.978188122326683e-06, |
| "loss": 0.1062, |
| "reward": 2.6357369422912598, |
| "reward_std": 0.3864106610417366, |
| "rewards/accuracy_reward": 0.723958358168602, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.053065232932567596, |
| "rewards/tag_count_reward": 0.9804687798023224, |
| "step": 544 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.87501525878906, |
| "epoch": 0.817397825271841, |
| "grad_norm": 4.472839155741773, |
| "kl": 2.248046875, |
| "learning_rate": 1.946983330798621e-06, |
| "loss": 0.1409, |
| "reward": 2.5282241702079773, |
| "reward_std": 0.4547024220228195, |
| "rewards/accuracy_reward": 0.640625, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.043390512466430664, |
| "rewards/tag_count_reward": 0.9570312798023224, |
| "step": 545 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 247.5989646911621, |
| "epoch": 0.8188976377952756, |
| "grad_norm": 24.974566214450327, |
| "kl": 2.99853515625, |
| "learning_rate": 1.916000054557783e-06, |
| "loss": 0.4573, |
| "reward": 2.6690892577171326, |
| "reward_std": 0.4272429645061493, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.03794198762625456, |
| "rewards/tag_count_reward": 0.9726562798023224, |
| "step": 546 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 299.86458587646484, |
| "epoch": 0.8203974503187101, |
| "grad_norm": 10.956923315304682, |
| "kl": 5.841796875, |
| "learning_rate": 1.8852391458648323e-06, |
| "loss": 0.3112, |
| "reward": 2.711550772190094, |
| "reward_std": 0.4324747771024704, |
| "rewards/accuracy_reward": 0.817708358168602, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.04886595252901316, |
| "rewards/tag_count_reward": 0.9583333432674408, |
| "step": 547 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 264.12500762939453, |
| "epoch": 0.8218972628421447, |
| "grad_norm": 10.26335940256424, |
| "kl": 3.107421875, |
| "learning_rate": 1.854701450863744e-06, |
| "loss": 0.3093, |
| "reward": 2.495303452014923, |
| "reward_std": 0.406343936920166, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.0489674867130816, |
| "rewards/tag_count_reward": 0.9557291865348816, |
| "step": 548 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.10938262939453, |
| "epoch": 0.8233970753655793, |
| "grad_norm": 3.7641473071951963, |
| "kl": 3.146484375, |
| "learning_rate": 1.8243878095585244e-06, |
| "loss": 0.1475, |
| "reward": 2.7090840935707092, |
| "reward_std": 0.482110857963562, |
| "rewards/accuracy_reward": 0.8281250298023224, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.04872854123823345, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 549 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 293.93750762939453, |
| "epoch": 0.8248968878890138, |
| "grad_norm": 4.346280398373657, |
| "kl": 1.73583984375, |
| "learning_rate": 1.7942990557901119e-06, |
| "loss": 0.1208, |
| "reward": 2.5294079780578613, |
| "reward_std": 0.44582589715719223, |
| "rewards/accuracy_reward": 0.6197916939854622, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.05262336507439613, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 550 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.5572967529297, |
| "epoch": 0.8263967004124484, |
| "grad_norm": 12.194807976372957, |
| "kl": 0.8740234375, |
| "learning_rate": 1.7644360172134323e-06, |
| "loss": 0.2462, |
| "reward": 2.5935970544815063, |
| "reward_std": 0.3754644878208637, |
| "rewards/accuracy_reward": 0.6666667014360428, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.044423856772482395, |
| "rewards/tag_count_reward": 0.981770858168602, |
| "step": 551 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 271.2864761352539, |
| "epoch": 0.8278965129358831, |
| "grad_norm": 3.492663781225749, |
| "kl": 0.78857421875, |
| "learning_rate": 1.734799515274641e-06, |
| "loss": 0.1673, |
| "reward": 2.57718825340271, |
| "reward_std": 0.3006473407149315, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.9913194477558136, |
| "rewards/repetition_penalty_reward": -0.0521521158516407, |
| "rewards/tag_count_reward": 0.981770858168602, |
| "step": 552 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.7031364440918, |
| "epoch": 0.8293963254593176, |
| "grad_norm": 1.0800649313598587, |
| "kl": 0.86865234375, |
| "learning_rate": 1.7053903651885217e-06, |
| "loss": 0.069, |
| "reward": 2.665630042552948, |
| "reward_std": 0.27732831984758377, |
| "rewards/accuracy_reward": 0.7135416865348816, |
| "rewards/reasoning_steps_reward": 0.9930555522441864, |
| "rewards/repetition_penalty_reward": -0.02924854727461934, |
| "rewards/tag_count_reward": 0.9882812649011612, |
| "step": 553 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.51562881469727, |
| "epoch": 0.8308961379827522, |
| "grad_norm": 6.658607380817447, |
| "kl": 2.01953125, |
| "learning_rate": 1.6762093759160614e-06, |
| "loss": 0.247, |
| "reward": 2.5456653237342834, |
| "reward_std": 0.5418491065502167, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.051991009153425694, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 554 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.35938262939453, |
| "epoch": 0.8323959505061868, |
| "grad_norm": 5.821343444236744, |
| "kl": 2.45849609375, |
| "learning_rate": 1.647257350142204e-06, |
| "loss": 0.1243, |
| "reward": 2.5485109090805054, |
| "reward_std": 0.4390419125556946, |
| "rewards/accuracy_reward": 0.645833358168602, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.04133300017565489, |
| "rewards/tag_count_reward": 0.9648437649011612, |
| "step": 555 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.7291717529297, |
| "epoch": 0.8338957630296213, |
| "grad_norm": 1.79998365352896, |
| "kl": 1.49853515625, |
| "learning_rate": 1.618535084253765e-06, |
| "loss": 0.1319, |
| "reward": 2.61370986700058, |
| "reward_std": 0.3187000434845686, |
| "rewards/accuracy_reward": 0.6822916716337204, |
| "rewards/reasoning_steps_reward": 0.9930555522441864, |
| "rewards/repetition_penalty_reward": -0.03689793427474797, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 556 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.59375762939453, |
| "epoch": 0.8353955755530559, |
| "grad_norm": 16.824204922339664, |
| "kl": 3.234375, |
| "learning_rate": 1.5900433683175277e-06, |
| "loss": 0.5503, |
| "reward": 2.6835938692092896, |
| "reward_std": 0.4415631741285324, |
| "rewards/accuracy_reward": 0.7604166865348816, |
| "rewards/reasoning_steps_reward": 0.9930555522441864, |
| "rewards/repetition_penalty_reward": -0.04253482352942228, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 557 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 225.4375114440918, |
| "epoch": 0.8368953880764904, |
| "grad_norm": 8.685664685007387, |
| "kl": 2.427734375, |
| "learning_rate": 1.5617829860585087e-06, |
| "loss": 0.1994, |
| "reward": 2.531205892562866, |
| "reward_std": 0.43269092589616776, |
| "rewards/accuracy_reward": 0.6093750298023224, |
| "rewards/reasoning_steps_reward": 0.986111119389534, |
| "rewards/repetition_penalty_reward": -0.03172821528278291, |
| "rewards/tag_count_reward": 0.9674479365348816, |
| "step": 558 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 208.06250762939453, |
| "epoch": 0.838395200599925, |
| "grad_norm": 11.917922673045826, |
| "kl": 2.44091796875, |
| "learning_rate": 1.533754714838408e-06, |
| "loss": 0.1954, |
| "reward": 2.4975586533546448, |
| "reward_std": 0.44412345439195633, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.9635416865348816, |
| "rewards/repetition_penalty_reward": -0.02848310861736536, |
| "rewards/tag_count_reward": 0.9687500298023224, |
| "step": 559 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 259.4427185058594, |
| "epoch": 0.8398950131233596, |
| "grad_norm": 4.400151882822108, |
| "kl": 2.544921875, |
| "learning_rate": 1.5059593256342142e-06, |
| "loss": 0.0563, |
| "reward": 2.6064106225967407, |
| "reward_std": 0.4480983540415764, |
| "rewards/accuracy_reward": 0.7187500149011612, |
| "rewards/reasoning_steps_reward": 0.9722222238779068, |
| "rewards/repetition_penalty_reward": -0.05331165995448828, |
| "rewards/tag_count_reward": 0.9687500298023224, |
| "step": 560 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.78646850585938, |
| "epoch": 0.8413948256467941, |
| "grad_norm": 4.108382543830119, |
| "kl": 3.171875, |
| "learning_rate": 1.4783975830170028e-06, |
| "loss": 0.6147, |
| "reward": 2.4788121581077576, |
| "reward_std": 0.5859697312116623, |
| "rewards/accuracy_reward": 0.614583358168602, |
| "rewards/reasoning_steps_reward": 0.967013880610466, |
| "rewards/repetition_penalty_reward": -0.046795559115707874, |
| "rewards/tag_count_reward": 0.9440104514360428, |
| "step": 561 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.51042938232422, |
| "epoch": 0.8428946381702287, |
| "grad_norm": 3.9632943549968522, |
| "kl": 2.30029296875, |
| "learning_rate": 1.4510702451309055e-06, |
| "loss": 0.2233, |
| "reward": 2.5965715646743774, |
| "reward_std": 0.48612427711486816, |
| "rewards/accuracy_reward": 0.7239583730697632, |
| "rewards/reasoning_steps_reward": 0.9600694924592972, |
| "rewards/repetition_penalty_reward": -0.04578963201493025, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 562 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.33855438232422, |
| "epoch": 0.8443944506936633, |
| "grad_norm": 12.689273098315706, |
| "kl": 2.89453125, |
| "learning_rate": 1.4239780636722555e-06, |
| "loss": 0.5173, |
| "reward": 2.568632483482361, |
| "reward_std": 0.4845578037202358, |
| "rewards/accuracy_reward": 0.6875000074505806, |
| "rewards/reasoning_steps_reward": 0.9756944924592972, |
| "rewards/repetition_penalty_reward": -0.05029124254360795, |
| "rewards/tag_count_reward": 0.9557292014360428, |
| "step": 563 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.00521087646484, |
| "epoch": 0.8458942632170978, |
| "grad_norm": 22.887883651104566, |
| "kl": 2.71875, |
| "learning_rate": 1.39712178386891e-06, |
| "loss": 0.7815, |
| "reward": 2.606330990791321, |
| "reward_std": 0.6394063234329224, |
| "rewards/accuracy_reward": 0.7500000298023224, |
| "rewards/reasoning_steps_reward": 0.9618056118488312, |
| "rewards/repetition_penalty_reward": -0.05859959963709116, |
| "rewards/tag_count_reward": 0.9531250298023224, |
| "step": 564 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 291.60939025878906, |
| "epoch": 0.8473940757405324, |
| "grad_norm": 26.21108108487782, |
| "kl": 7.546875, |
| "learning_rate": 1.3705021444597521e-06, |
| "loss": 1.1027, |
| "reward": 2.4728458523750305, |
| "reward_std": 0.6928077340126038, |
| "rewards/accuracy_reward": 0.6406250149011612, |
| "rewards/reasoning_steps_reward": 0.96180559694767, |
| "rewards/repetition_penalty_reward": -0.07489718683063984, |
| "rewards/tag_count_reward": 0.9453125149011612, |
| "step": 565 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.21875381469727, |
| "epoch": 0.8488938882639671, |
| "grad_norm": 3.5925945801436803, |
| "kl": 2.39453125, |
| "learning_rate": 1.344119877674368e-06, |
| "loss": 0.301, |
| "reward": 2.4886980652809143, |
| "reward_std": 0.45957837253808975, |
| "rewards/accuracy_reward": 0.5885416865348816, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.036041736137121916, |
| "rewards/tag_count_reward": 0.9674479216337204, |
| "step": 566 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.2395896911621, |
| "epoch": 0.8503937007874016, |
| "grad_norm": 20.274471532130267, |
| "kl": 5.91796875, |
| "learning_rate": 1.3179757092129087e-06, |
| "loss": 1.0635, |
| "reward": 2.3603609800338745, |
| "reward_std": 0.6862609535455704, |
| "rewards/accuracy_reward": 0.5625000260770321, |
| "rewards/reasoning_steps_reward": 0.9513889104127884, |
| "rewards/repetition_penalty_reward": -0.06238204799592495, |
| "rewards/tag_count_reward": 0.9088541865348816, |
| "step": 567 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.59375762939453, |
| "epoch": 0.8518935133108362, |
| "grad_norm": 9.701437197347186, |
| "kl": 8.5390625, |
| "learning_rate": 1.292070358226124e-06, |
| "loss": 0.3018, |
| "reward": 2.3854750990867615, |
| "reward_std": 0.6592359095811844, |
| "rewards/accuracy_reward": 0.5416666939854622, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.05028895568102598, |
| "rewards/tag_count_reward": 0.9375000298023224, |
| "step": 568 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.29687881469727, |
| "epoch": 0.8533933258342707, |
| "grad_norm": 10.655977657198079, |
| "kl": 3.45703125, |
| "learning_rate": 1.2664045372955858e-06, |
| "loss": 0.6047, |
| "reward": 2.418867766857147, |
| "reward_std": 0.6335871517658234, |
| "rewards/accuracy_reward": 0.5729166865348816, |
| "rewards/reasoning_steps_reward": 0.9548611491918564, |
| "rewards/repetition_penalty_reward": -0.045107895508408546, |
| "rewards/tag_count_reward": 0.9361979365348816, |
| "step": 569 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 277.40625762939453, |
| "epoch": 0.8548931383577053, |
| "grad_norm": 11.027587159101916, |
| "kl": 6.21484375, |
| "learning_rate": 1.2409789524140813e-06, |
| "loss": 0.854, |
| "reward": 2.5739906430244446, |
| "reward_std": 0.7429305166006088, |
| "rewards/accuracy_reward": 0.770833358168602, |
| "rewards/reasoning_steps_reward": 0.935763955116272, |
| "rewards/repetition_penalty_reward": -0.06750249583274126, |
| "rewards/tag_count_reward": 0.934895858168602, |
| "step": 570 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 266.0937614440918, |
| "epoch": 0.8563929508811399, |
| "grad_norm": 14.64342890452312, |
| "kl": 9.6875, |
| "learning_rate": 1.2157943029661977e-06, |
| "loss": 0.9433, |
| "reward": 2.343918889760971, |
| "reward_std": 0.8442835658788681, |
| "rewards/accuracy_reward": 0.6510416716337204, |
| "rewards/reasoning_steps_reward": 0.8923611491918564, |
| "rewards/repetition_penalty_reward": -0.0770882060751319, |
| "rewards/tag_count_reward": 0.8776041865348816, |
| "step": 571 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 190.70312881469727, |
| "epoch": 0.8578927634045744, |
| "grad_norm": 4.726128645967551, |
| "kl": 1.662109375, |
| "learning_rate": 1.1908512817090833e-06, |
| "loss": 0.2499, |
| "reward": 2.4979942440986633, |
| "reward_std": 0.40954509750008583, |
| "rewards/accuracy_reward": 0.5937500223517418, |
| "rewards/reasoning_steps_reward": 0.9826389253139496, |
| "rewards/repetition_penalty_reward": -0.032821862027049065, |
| "rewards/tag_count_reward": 0.954427108168602, |
| "step": 572 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 229.52605056762695, |
| "epoch": 0.859392575928009, |
| "grad_norm": 5.56550512002148, |
| "kl": 2.75390625, |
| "learning_rate": 1.1661505747533897e-06, |
| "loss": 0.3795, |
| "reward": 2.5485902428627014, |
| "reward_std": 0.7243528515100479, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9288194626569748, |
| "rewards/repetition_penalty_reward": -0.048198044300079346, |
| "rewards/tag_count_reward": 0.9335937649011612, |
| "step": 573 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 243.60938262939453, |
| "epoch": 0.8608923884514436, |
| "grad_norm": 7.741451341316265, |
| "kl": 3.44140625, |
| "learning_rate": 1.1416928615444013e-06, |
| "loss": 0.5439, |
| "reward": 2.4548073410987854, |
| "reward_std": 0.7477606385946274, |
| "rewards/accuracy_reward": 0.6406250298023224, |
| "rewards/reasoning_steps_reward": 0.9531250596046448, |
| "rewards/repetition_penalty_reward": -0.05040103476494551, |
| "rewards/tag_count_reward": 0.911458358168602, |
| "step": 574 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 239.36459350585938, |
| "epoch": 0.8623922009748781, |
| "grad_norm": 12.69637648259022, |
| "kl": 5.462890625, |
| "learning_rate": 1.1174788148433423e-06, |
| "loss": 0.7872, |
| "reward": 2.5563725233078003, |
| "reward_std": 0.7221258133649826, |
| "rewards/accuracy_reward": 0.7760416716337204, |
| "rewards/reasoning_steps_reward": 0.9184028208255768, |
| "rewards/repetition_penalty_reward": -0.0625512283295393, |
| "rewards/tag_count_reward": 0.9244792014360428, |
| "step": 575 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.67708587646484, |
| "epoch": 0.8638920134983127, |
| "grad_norm": 6.651240220699383, |
| "kl": 2.841796875, |
| "learning_rate": 1.0935091007088761e-06, |
| "loss": 0.5463, |
| "reward": 2.4423410296440125, |
| "reward_std": 0.6437135934829712, |
| "rewards/accuracy_reward": 0.619791679084301, |
| "rewards/reasoning_steps_reward": 0.9409722238779068, |
| "rewards/repetition_penalty_reward": -0.0559229115024209, |
| "rewards/tag_count_reward": 0.9375000298023224, |
| "step": 576 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 215.6822967529297, |
| "epoch": 0.8653918260217472, |
| "grad_norm": 9.069503918590028, |
| "kl": 3.73046875, |
| "learning_rate": 1.069784378478781e-06, |
| "loss": 0.5511, |
| "reward": 2.417185962200165, |
| "reward_std": 0.6344876810908318, |
| "rewards/accuracy_reward": 0.6093750223517418, |
| "rewards/reasoning_steps_reward": 0.9340277761220932, |
| "rewards/repetition_penalty_reward": -0.04809193778783083, |
| "rewards/tag_count_reward": 0.9218750149011612, |
| "step": 577 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 210.9166717529297, |
| "epoch": 0.8668916385451818, |
| "grad_norm": 3.9657531302416285, |
| "kl": 2.365234375, |
| "learning_rate": 1.046305300751811e-06, |
| "loss": 0.2354, |
| "reward": 2.5444520115852356, |
| "reward_std": 0.678747646510601, |
| "rewards/accuracy_reward": 0.7135416865348816, |
| "rewards/reasoning_steps_reward": 0.9531250149011612, |
| "rewards/repetition_penalty_reward": -0.0466939271427691, |
| "rewards/tag_count_reward": 0.9244791716337204, |
| "step": 578 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.43750381469727, |
| "epoch": 0.8683914510686164, |
| "grad_norm": 13.803690175273248, |
| "kl": 3.46484375, |
| "learning_rate": 1.0230725133697495e-06, |
| "loss": 0.7416, |
| "reward": 2.505746603012085, |
| "reward_std": 0.6760745644569397, |
| "rewards/accuracy_reward": 0.692708358168602, |
| "rewards/reasoning_steps_reward": 0.947916716337204, |
| "rewards/repetition_penalty_reward": -0.05935762915760279, |
| "rewards/tag_count_reward": 0.9244792014360428, |
| "step": 579 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.7552146911621, |
| "epoch": 0.869891263592051, |
| "grad_norm": 6.038364170548584, |
| "kl": 1.857421875, |
| "learning_rate": 1.0000866553996436e-06, |
| "loss": 0.3797, |
| "reward": 2.563169300556183, |
| "reward_std": 0.49185874313116074, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.958333358168602, |
| "rewards/repetition_penalty_reward": -0.0383933181874454, |
| "rewards/tag_count_reward": 0.9609375298023224, |
| "step": 580 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.24480056762695, |
| "epoch": 0.8713910761154856, |
| "grad_norm": 2.7457030988838373, |
| "kl": 2.619140625, |
| "learning_rate": 9.773483591162203e-07, |
| "loss": 0.3962, |
| "reward": 2.5448489785194397, |
| "reward_std": 0.5937497019767761, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/reasoning_steps_reward": 0.9635417014360428, |
| "rewards/repetition_penalty_reward": -0.04499485623091459, |
| "rewards/tag_count_reward": 0.938802108168602, |
| "step": 581 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 204.1354217529297, |
| "epoch": 0.8728908886389202, |
| "grad_norm": 3.9924980474405847, |
| "kl": 3.34765625, |
| "learning_rate": 9.548582499845015e-07, |
| "loss": 0.0155, |
| "reward": 2.46910959482193, |
| "reward_std": 0.7316218465566635, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.9114583730697632, |
| "rewards/repetition_penalty_reward": -0.04260919149965048, |
| "rewards/tag_count_reward": 0.9283854365348816, |
| "step": 582 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 185.3177146911621, |
| "epoch": 0.8743907011623547, |
| "grad_norm": 3.3857582260464882, |
| "kl": 1.8125, |
| "learning_rate": 9.326169466425916e-07, |
| "loss": 0.2775, |
| "reward": 2.697448194026947, |
| "reward_std": 0.5078239142894745, |
| "rewards/accuracy_reward": 0.7760416865348816, |
| "rewards/reasoning_steps_reward": 0.9843750149011612, |
| "rewards/repetition_penalty_reward": -0.03432267066091299, |
| "rewards/tag_count_reward": 0.9713541865348816, |
| "step": 583 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.1875, |
| "epoch": 0.8758905136857893, |
| "grad_norm": 11.742745507237299, |
| "kl": 2.9140625, |
| "learning_rate": 9.106250608846679e-07, |
| "loss": 0.5072, |
| "reward": 2.6075586080551147, |
| "reward_std": 0.620074912905693, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9600694924592972, |
| "rewards/repetition_penalty_reward": -0.036104677245020866, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 584 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.8541717529297, |
| "epoch": 0.8773903262092239, |
| "grad_norm": 2.2041577561531684, |
| "kl": 2.1953125, |
| "learning_rate": 8.888831976441481e-07, |
| "loss": 0.4111, |
| "reward": 2.6042516231536865, |
| "reward_std": 0.47759709507226944, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.03159915190190077, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 585 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.8229217529297, |
| "epoch": 0.8788901387326584, |
| "grad_norm": 4.274612409465976, |
| "kl": 2.3701171875, |
| "learning_rate": 8.673919549770483e-07, |
| "loss": 0.3398, |
| "reward": 2.611320972442627, |
| "reward_std": 0.5121708884835243, |
| "rewards/accuracy_reward": 0.7291666865348816, |
| "rewards/reasoning_steps_reward": 0.960069477558136, |
| "rewards/repetition_penalty_reward": -0.044061001390218735, |
| "rewards/tag_count_reward": 0.9661458432674408, |
| "step": 586 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 202.00521087646484, |
| "epoch": 0.880389951256093, |
| "grad_norm": 3.736601399113756, |
| "kl": 2.330078125, |
| "learning_rate": 8.461519240455362e-07, |
| "loss": 0.3635, |
| "reward": 2.6235088109970093, |
| "reward_std": 0.5086311176419258, |
| "rewards/accuracy_reward": 0.7187500149011612, |
| "rewards/reasoning_steps_reward": 0.986111119389534, |
| "rewards/repetition_penalty_reward": -0.037081570364534855, |
| "rewards/tag_count_reward": 0.9557291716337204, |
| "step": 587 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 209.96355056762695, |
| "epoch": 0.8818897637795275, |
| "grad_norm": 17.45784244105432, |
| "kl": 20.078125, |
| "learning_rate": 8.251636891016702e-07, |
| "loss": 0.5062, |
| "reward": 2.5059803128242493, |
| "reward_std": 0.631468877196312, |
| "rewards/accuracy_reward": 0.6354166865348816, |
| "rewards/reasoning_steps_reward": 0.9618055522441864, |
| "rewards/repetition_penalty_reward": -0.037856731098145247, |
| "rewards/tag_count_reward": 0.9466146230697632, |
| "step": 588 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.26042556762695, |
| "epoch": 0.8833895763029621, |
| "grad_norm": 2.4924144055714925, |
| "kl": 2.31640625, |
| "learning_rate": 8.044278274713246e-07, |
| "loss": 0.6179, |
| "reward": 2.7582138180732727, |
| "reward_std": 0.4812832549214363, |
| "rewards/accuracy_reward": 0.8697916865348816, |
| "rewards/reasoning_steps_reward": 0.9722222685813904, |
| "rewards/repetition_penalty_reward": -0.048643797636032104, |
| "rewards/tag_count_reward": 0.9648437649011612, |
| "step": 589 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.8072967529297, |
| "epoch": 0.8848893888263967, |
| "grad_norm": 4.087042616486125, |
| "kl": 2.037109375, |
| "learning_rate": 7.839449095383111e-07, |
| "loss": 0.4912, |
| "reward": 2.69020676612854, |
| "reward_std": 0.5107688158750534, |
| "rewards/accuracy_reward": 0.786458358168602, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.04026202671229839, |
| "rewards/tag_count_reward": 0.9648437798023224, |
| "step": 590 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.7447967529297, |
| "epoch": 0.8863892013498312, |
| "grad_norm": 14.915815383649845, |
| "kl": 2.236328125, |
| "learning_rate": 7.637154987286888e-07, |
| "loss": 0.4128, |
| "reward": 2.6318877935409546, |
| "reward_std": 0.41991502046585083, |
| "rewards/accuracy_reward": 0.7343750298023224, |
| "rewards/reasoning_steps_reward": 0.9826389104127884, |
| "rewards/repetition_penalty_reward": -0.04476172663271427, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 591 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 212.46875762939453, |
| "epoch": 0.8878890138732658, |
| "grad_norm": 6.050727563412662, |
| "kl": 2.1787109375, |
| "learning_rate": 7.437401514952646e-07, |
| "loss": 0.3731, |
| "reward": 2.4536707997322083, |
| "reward_std": 0.4574524015188217, |
| "rewards/accuracy_reward": 0.5729166716337204, |
| "rewards/reasoning_steps_reward": 0.9704861342906952, |
| "rewards/repetition_penalty_reward": -0.04676327481865883, |
| "rewards/tag_count_reward": 0.9570312798023224, |
| "step": 592 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.2552146911621, |
| "epoch": 0.8893888263967004, |
| "grad_norm": 9.16124525709699, |
| "kl": 4.005859375, |
| "learning_rate": 7.240194173022941e-07, |
| "loss": 0.5731, |
| "reward": 2.6355690360069275, |
| "reward_std": 0.5524905323982239, |
| "rewards/accuracy_reward": 0.7812500149011612, |
| "rewards/reasoning_steps_reward": 0.963541716337204, |
| "rewards/repetition_penalty_reward": -0.05713945999741554, |
| "rewards/tag_count_reward": 0.9479166865348816, |
| "step": 593 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.3489646911621, |
| "epoch": 0.890888638920135, |
| "grad_norm": 22.380535968926747, |
| "kl": 3.015625, |
| "learning_rate": 7.045538386103579e-07, |
| "loss": 0.4854, |
| "reward": 2.4118546843528748, |
| "reward_std": 0.6012802645564079, |
| "rewards/accuracy_reward": 0.5677083507180214, |
| "rewards/reasoning_steps_reward": 0.9600694477558136, |
| "rewards/repetition_penalty_reward": -0.057329487055540085, |
| "rewards/tag_count_reward": 0.9414062798023224, |
| "step": 594 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 268.4322967529297, |
| "epoch": 0.8923884514435696, |
| "grad_norm": 3.774203451456834, |
| "kl": 3.220703125, |
| "learning_rate": 6.853439508614412e-07, |
| "loss": 0.6382, |
| "reward": 2.497507393360138, |
| "reward_std": 0.5804141908884048, |
| "rewards/accuracy_reward": 0.6510416865348816, |
| "rewards/reasoning_steps_reward": 0.9739583283662796, |
| "rewards/repetition_penalty_reward": -0.06759683508425951, |
| "rewards/tag_count_reward": 0.9401041865348816, |
| "step": 595 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 195.17709350585938, |
| "epoch": 0.8938882639670042, |
| "grad_norm": 4.560347288059126, |
| "kl": 2.53125, |
| "learning_rate": 6.663902824642132e-07, |
| "loss": 0.2865, |
| "reward": 2.6934563517570496, |
| "reward_std": 0.5186980329453945, |
| "rewards/accuracy_reward": 0.8281250149011612, |
| "rewards/reasoning_steps_reward": 0.9496527761220932, |
| "rewards/repetition_penalty_reward": -0.03614447545260191, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 596 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 251.75, |
| "epoch": 0.8953880764904387, |
| "grad_norm": 20.778553368266753, |
| "kl": 2.6484375, |
| "learning_rate": 6.47693354779484e-07, |
| "loss": 0.6244, |
| "reward": 2.6013853549957275, |
| "reward_std": 0.6506573259830475, |
| "rewards/accuracy_reward": 0.7552083507180214, |
| "rewards/reasoning_steps_reward": 0.9635416716337204, |
| "rewards/repetition_penalty_reward": -0.0626771878451109, |
| "rewards/tag_count_reward": 0.9453125, |
| "step": 597 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.87500762939453, |
| "epoch": 0.8968878890138733, |
| "grad_norm": 9.731098045072335, |
| "kl": 2.041015625, |
| "learning_rate": 6.29253682105866e-07, |
| "loss": 0.4267, |
| "reward": 2.6623805165290833, |
| "reward_std": 0.5037910491228104, |
| "rewards/accuracy_reward": 0.7916667014360428, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05940787214785814, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 598 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 239.6197967529297, |
| "epoch": 0.8983877015373078, |
| "grad_norm": 8.23836216740641, |
| "kl": 2.60546875, |
| "learning_rate": 6.110717716656289e-07, |
| "loss": 0.4356, |
| "reward": 2.4078712463378906, |
| "reward_std": 0.6441041380167007, |
| "rewards/accuracy_reward": 0.5677083432674408, |
| "rewards/reasoning_steps_reward": 0.9496528059244156, |
| "rewards/repetition_penalty_reward": -0.06001073541119695, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 599 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.6458396911621, |
| "epoch": 0.8998875140607424, |
| "grad_norm": 5.950701052370844, |
| "kl": 3.33203125, |
| "learning_rate": 5.931481235907466e-07, |
| "loss": 0.5106, |
| "reward": 2.4382471442222595, |
| "reward_std": 0.7465489208698273, |
| "rewards/accuracy_reward": 0.6406250149011612, |
| "rewards/reasoning_steps_reward": 0.927083358168602, |
| "rewards/repetition_penalty_reward": -0.05394035950303078, |
| "rewards/tag_count_reward": 0.9244791865348816, |
| "step": 600 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.4635467529297, |
| "epoch": 0.901387326584177, |
| "grad_norm": 8.835485931240282, |
| "kl": 3.53515625, |
| "learning_rate": 5.754832309091362e-07, |
| "loss": 0.6214, |
| "reward": 2.6882564425468445, |
| "reward_std": 0.6361054480075836, |
| "rewards/accuracy_reward": 0.833333358168602, |
| "rewards/reasoning_steps_reward": 0.9670139253139496, |
| "rewards/repetition_penalty_reward": -0.058705421164631844, |
| "rewards/tag_count_reward": 0.946614608168602, |
| "step": 601 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.64584350585938, |
| "epoch": 0.9028871391076115, |
| "grad_norm": 4.936233068343242, |
| "kl": 2.7421875, |
| "learning_rate": 5.580775795311033e-07, |
| "loss": 0.3129, |
| "reward": 2.5619428157806396, |
| "reward_std": 0.6096071302890778, |
| "rewards/accuracy_reward": 0.713541679084301, |
| "rewards/reasoning_steps_reward": 0.9513889402151108, |
| "rewards/repetition_penalty_reward": -0.04439397854730487, |
| "rewards/tag_count_reward": 0.9414062798023224, |
| "step": 602 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.42188262939453, |
| "epoch": 0.9043869516310461, |
| "grad_norm": 4.234086119916309, |
| "kl": 3.302734375, |
| "learning_rate": 5.409316482359694e-07, |
| "loss": 0.4198, |
| "reward": 2.5895190834999084, |
| "reward_std": 0.5498283728957176, |
| "rewards/accuracy_reward": 0.7083333432674408, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.0471997894346714, |
| "rewards/tag_count_reward": 0.954427108168602, |
| "step": 603 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.32291793823242, |
| "epoch": 0.9058867641544807, |
| "grad_norm": 11.124045036921478, |
| "kl": 3.056640625, |
| "learning_rate": 5.240459086589056e-07, |
| "loss": 0.5874, |
| "reward": 2.640231668949127, |
| "reward_std": 0.507370337843895, |
| "rewards/accuracy_reward": 0.7656250298023224, |
| "rewards/reasoning_steps_reward": 0.9722222089767456, |
| "rewards/repetition_penalty_reward": -0.04683440877124667, |
| "rewards/tag_count_reward": 0.9492187649011612, |
| "step": 604 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 252.64064025878906, |
| "epoch": 0.9073865766779152, |
| "grad_norm": 8.760847362194967, |
| "kl": 3.0859375, |
| "learning_rate": 5.074208252779589e-07, |
| "loss": 0.6574, |
| "reward": 2.524926483631134, |
| "reward_std": 0.6083704084157944, |
| "rewards/accuracy_reward": 0.630208358168602, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.048424381762742996, |
| "rewards/tag_count_reward": 0.962239608168602, |
| "step": 605 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 255.46355056762695, |
| "epoch": 0.9088863892013498, |
| "grad_norm": 10.952534285750453, |
| "kl": 3.94921875, |
| "learning_rate": 4.910568554012751e-07, |
| "loss": 0.7252, |
| "reward": 2.4818572998046875, |
| "reward_std": 0.5148201733827591, |
| "rewards/accuracy_reward": 0.5937500149011612, |
| "rewards/reasoning_steps_reward": 0.9878472238779068, |
| "rewards/repetition_penalty_reward": -0.06067758519202471, |
| "rewards/tag_count_reward": 0.9609375298023224, |
| "step": 606 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.06771087646484, |
| "epoch": 0.9103862017247843, |
| "grad_norm": 9.032595902462477, |
| "kl": 3.17578125, |
| "learning_rate": 4.749544491545199e-07, |
| "loss": 0.5553, |
| "reward": 2.5263285636901855, |
| "reward_std": 0.5500727593898773, |
| "rewards/accuracy_reward": 0.6562500223517418, |
| "rewards/reasoning_steps_reward": 0.9687500149011612, |
| "rewards/repetition_penalty_reward": -0.046588233672082424, |
| "rewards/tag_count_reward": 0.9479166865348816, |
| "step": 607 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.6510467529297, |
| "epoch": 0.911886014248219, |
| "grad_norm": 3.4371655420891853, |
| "kl": 2.94140625, |
| "learning_rate": 4.591140494684965e-07, |
| "loss": 0.5164, |
| "reward": 2.340811610221863, |
| "reward_std": 0.4512625113129616, |
| "rewards/accuracy_reward": 0.4635416865348816, |
| "rewards/reasoning_steps_reward": 0.9774305671453476, |
| "rewards/repetition_penalty_reward": -0.05198352737352252, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 608 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 238.08333587646484, |
| "epoch": 0.9133858267716536, |
| "grad_norm": 5.5444529742596265, |
| "kl": 2.482421875, |
| "learning_rate": 4.435360920669618e-07, |
| "loss": 0.4373, |
| "reward": 2.7700709104537964, |
| "reward_std": 0.48688896745443344, |
| "rewards/accuracy_reward": 0.8750000149011612, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.0454674381762743, |
| "rewards/tag_count_reward": 0.9596354216337204, |
| "step": 609 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 215.10937881469727, |
| "epoch": 0.9148856392950881, |
| "grad_norm": 2.692978335091646, |
| "kl": 3.86328125, |
| "learning_rate": 4.282210054546454e-07, |
| "loss": 0.7832, |
| "reward": 2.5629987120628357, |
| "reward_std": 0.5417983531951904, |
| "rewards/accuracy_reward": 0.7031250223517418, |
| "rewards/reasoning_steps_reward": 0.9652778059244156, |
| "rewards/repetition_penalty_reward": -0.053320798091590405, |
| "rewards/tag_count_reward": 0.9479166865348816, |
| "step": 610 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 201.1666717529297, |
| "epoch": 0.9163854518185227, |
| "grad_norm": 6.972427384378082, |
| "kl": 2.6015625, |
| "learning_rate": 4.1316921090545305e-07, |
| "loss": 0.1151, |
| "reward": 2.58855801820755, |
| "reward_std": 0.4848596230149269, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.03513997979462147, |
| "rewards/tag_count_reward": 0.9674479365348816, |
| "step": 611 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.22396850585938, |
| "epoch": 0.9178852643419573, |
| "grad_norm": 4.93152288537315, |
| "kl": 3.115234375, |
| "learning_rate": 3.9838112245088934e-07, |
| "loss": 0.5991, |
| "reward": 2.601243317127228, |
| "reward_std": 0.5846336483955383, |
| "rewards/accuracy_reward": 0.7395833730697632, |
| "rewards/reasoning_steps_reward": 0.9635416716337204, |
| "rewards/repetition_penalty_reward": -0.05240248655900359, |
| "rewards/tag_count_reward": 0.950520858168602, |
| "step": 612 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.22396087646484, |
| "epoch": 0.9193850768653918, |
| "grad_norm": 12.62005524980775, |
| "kl": 5.4765625, |
| "learning_rate": 3.8385714686866137e-07, |
| "loss": 1.1448, |
| "reward": 2.5160588026046753, |
| "reward_std": 0.7233484238386154, |
| "rewards/accuracy_reward": 0.7031250149011612, |
| "rewards/reasoning_steps_reward": 0.9531249850988388, |
| "rewards/repetition_penalty_reward": -0.06987872999161482, |
| "rewards/tag_count_reward": 0.9296875298023224, |
| "step": 613 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.5989646911621, |
| "epoch": 0.9208848893888264, |
| "grad_norm": 11.546226463954667, |
| "kl": 4.08984375, |
| "learning_rate": 3.695976836714932e-07, |
| "loss": 0.6332, |
| "reward": 2.414819598197937, |
| "reward_std": 0.6042335405945778, |
| "rewards/accuracy_reward": 0.5677083507180214, |
| "rewards/reasoning_steps_reward": 0.9652778208255768, |
| "rewards/repetition_penalty_reward": -0.05306240823119879, |
| "rewards/tag_count_reward": 0.9348958730697632, |
| "step": 614 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 227.06250381469727, |
| "epoch": 0.922384701912261, |
| "grad_norm": 8.789826083743023, |
| "kl": 2.94140625, |
| "learning_rate": 3.556031250961356e-07, |
| "loss": 0.5855, |
| "reward": 2.4650281071662903, |
| "reward_std": 0.5545367747545242, |
| "rewards/accuracy_reward": 0.6041666865348816, |
| "rewards/reasoning_steps_reward": 0.9670139104127884, |
| "rewards/repetition_penalty_reward": -0.05146504007279873, |
| "rewards/tag_count_reward": 0.9453125, |
| "step": 615 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.88542556762695, |
| "epoch": 0.9238845144356955, |
| "grad_norm": 5.27373650876101, |
| "kl": 2.8310546875, |
| "learning_rate": 3.4187385609257275e-07, |
| "loss": 0.4564, |
| "reward": 2.5718677043914795, |
| "reward_std": 0.5033154115080833, |
| "rewards/accuracy_reward": 0.7031250298023224, |
| "rewards/reasoning_steps_reward": 0.9791667014360428, |
| "rewards/repetition_penalty_reward": -0.06224700016900897, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 616 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 254.36458587646484, |
| "epoch": 0.9253843269591301, |
| "grad_norm": 7.526278127901626, |
| "kl": 1.7294921875, |
| "learning_rate": 3.284102543134426e-07, |
| "loss": 0.3597, |
| "reward": 2.546690046787262, |
| "reward_std": 0.5290744379162788, |
| "rewards/accuracy_reward": 0.6666666865348816, |
| "rewards/reasoning_steps_reward": 0.972222238779068, |
| "rewards/repetition_penalty_reward": -0.055740559473633766, |
| "rewards/tag_count_reward": 0.9635416716337204, |
| "step": 617 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.28125762939453, |
| "epoch": 0.9268841394825647, |
| "grad_norm": 7.942979632041828, |
| "kl": 5.625, |
| "learning_rate": 3.152126901036401e-07, |
| "loss": 0.2052, |
| "reward": 2.519493043422699, |
| "reward_std": 0.4696878641843796, |
| "rewards/accuracy_reward": 0.6093750149011612, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.035194564145058393, |
| "rewards/tag_count_reward": 0.966145858168602, |
| "step": 618 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.27084350585938, |
| "epoch": 0.9283839520059992, |
| "grad_norm": 7.258620893937008, |
| "kl": 2.21875, |
| "learning_rate": 3.0228152649013133e-07, |
| "loss": 0.5912, |
| "reward": 2.6479897499084473, |
| "reward_std": 0.4475140795111656, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/reasoning_steps_reward": 0.9930555671453476, |
| "rewards/repetition_penalty_reward": -0.05209715012460947, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 619 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 205.40625381469727, |
| "epoch": 0.9298837645294338, |
| "grad_norm": 4.763052053787403, |
| "kl": 2.03466796875, |
| "learning_rate": 2.896171191719743e-07, |
| "loss": 0.2601, |
| "reward": 2.6514768600463867, |
| "reward_std": 0.5323489122092724, |
| "rewards/accuracy_reward": 0.7552083432674408, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.03602320980280638, |
| "rewards/tag_count_reward": 0.9583333432674408, |
| "step": 620 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 246.7291717529297, |
| "epoch": 0.9313835770528683, |
| "grad_norm": 4.657452344039985, |
| "kl": 3.841796875, |
| "learning_rate": 2.772198165105267e-07, |
| "loss": 0.9583, |
| "reward": 2.4812204241752625, |
| "reward_std": 0.6263424828648567, |
| "rewards/accuracy_reward": 0.6197916716337204, |
| "rewards/reasoning_steps_reward": 0.9722222238779068, |
| "rewards/repetition_penalty_reward": -0.06261651404201984, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 621 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.05730056762695, |
| "epoch": 0.932883389576303, |
| "grad_norm": 7.137978430111716, |
| "kl": 3.41796875, |
| "learning_rate": 2.6508995951986526e-07, |
| "loss": 0.6458, |
| "reward": 2.60908043384552, |
| "reward_std": 0.6289382129907608, |
| "rewards/accuracy_reward": 0.7656250298023224, |
| "rewards/reasoning_steps_reward": 0.9565972685813904, |
| "rewards/repetition_penalty_reward": -0.06236053630709648, |
| "rewards/tag_count_reward": 0.9492187798023224, |
| "step": 622 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 203.58855056762695, |
| "epoch": 0.9343832020997376, |
| "grad_norm": 8.613599095668501, |
| "kl": 2.337890625, |
| "learning_rate": 2.532278818574108e-07, |
| "loss": 0.5707, |
| "reward": 2.6460150480270386, |
| "reward_std": 0.5471399649977684, |
| "rewards/accuracy_reward": 0.755208358168602, |
| "rewards/reasoning_steps_reward": 0.9774305671453476, |
| "rewards/repetition_penalty_reward": -0.04625924210995436, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 623 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.9322967529297, |
| "epoch": 0.9358830146231721, |
| "grad_norm": 3.902612646632179, |
| "kl": 0.83935546875, |
| "learning_rate": 2.4163390981474354e-07, |
| "loss": 0.2083, |
| "reward": 2.6754491925239563, |
| "reward_std": 0.3636674992740154, |
| "rewards/accuracy_reward": 0.7500000298023224, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.041998728178441525, |
| "rewards/tag_count_reward": 0.977864608168602, |
| "step": 624 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 235.92187881469727, |
| "epoch": 0.9373828271466067, |
| "grad_norm": 31.877078305835827, |
| "kl": 4.0859375, |
| "learning_rate": 2.3030836230863108e-07, |
| "loss": 0.728, |
| "reward": 2.5619365572929382, |
| "reward_std": 0.5274968221783638, |
| "rewards/accuracy_reward": 0.677083358168602, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05568512622267008, |
| "rewards/tag_count_reward": 0.9596354514360428, |
| "step": 625 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 234.72917556762695, |
| "epoch": 0.9388826396700413, |
| "grad_norm": 5.128413845770082, |
| "kl": 3.59765625, |
| "learning_rate": 2.192515508722559e-07, |
| "loss": 0.5811, |
| "reward": 2.5278205275535583, |
| "reward_std": 0.5462368726730347, |
| "rewards/accuracy_reward": 0.6510416865348816, |
| "rewards/reasoning_steps_reward": 0.9687500298023224, |
| "rewards/repetition_penalty_reward": -0.04900246858596802, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 626 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.0572967529297, |
| "epoch": 0.9403824521934758, |
| "grad_norm": 11.417578022441003, |
| "kl": 3.625, |
| "learning_rate": 2.08463779646646e-07, |
| "loss": 0.6657, |
| "reward": 2.617713212966919, |
| "reward_std": 0.613451674580574, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/reasoning_steps_reward": 0.9652778208255768, |
| "rewards/repetition_penalty_reward": -0.049387591890990734, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 627 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 220.74480056762695, |
| "epoch": 0.9418822647169104, |
| "grad_norm": 13.927624029030772, |
| "kl": 6.705078125, |
| "learning_rate": 1.979453453723057e-07, |
| "loss": 0.5758, |
| "reward": 2.5704716444015503, |
| "reward_std": 0.4331911653280258, |
| "rewards/accuracy_reward": 0.6770833432674408, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.03890332626178861, |
| "rewards/tag_count_reward": 0.9531250298023224, |
| "step": 628 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.04687881469727, |
| "epoch": 0.943382077240345, |
| "grad_norm": 2.359934750542652, |
| "kl": 1.4609375, |
| "learning_rate": 1.8769653738105797e-07, |
| "loss": 0.1944, |
| "reward": 2.650016129016876, |
| "reward_std": 0.4671716019511223, |
| "rewards/accuracy_reward": 0.7812500298023224, |
| "rewards/reasoning_steps_reward": 0.9479166865348816, |
| "rewards/repetition_penalty_reward": -0.05180681962519884, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 629 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 241.45312881469727, |
| "epoch": 0.9448818897637795, |
| "grad_norm": 6.681812402325648, |
| "kl": 4.34375, |
| "learning_rate": 1.7771763758808403e-07, |
| "loss": 0.7057, |
| "reward": 2.5360206961631775, |
| "reward_std": 0.6756476908922195, |
| "rewards/accuracy_reward": 0.6875000298023224, |
| "rewards/reasoning_steps_reward": 0.9704861342906952, |
| "rewards/repetition_penalty_reward": -0.06727802660316229, |
| "rewards/tag_count_reward": 0.9453125149011612, |
| "step": 630 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 193.1041717529297, |
| "epoch": 0.9463817022872141, |
| "grad_norm": 4.5498762858817585, |
| "kl": 2.009765625, |
| "learning_rate": 1.6800892048416618e-07, |
| "loss": 0.4715, |
| "reward": 2.745138168334961, |
| "reward_std": 0.4257928729057312, |
| "rewards/accuracy_reward": 0.8177083730697632, |
| "rewards/reasoning_steps_reward": 0.9878472238779068, |
| "rewards/repetition_penalty_reward": -0.03828221885487437, |
| "rewards/tag_count_reward": 0.977864608168602, |
| "step": 631 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 224.62500762939453, |
| "epoch": 0.9478815148106486, |
| "grad_norm": 2.3610080061202314, |
| "kl": 3.5078125, |
| "learning_rate": 1.5857065312814058e-07, |
| "loss": 0.786, |
| "reward": 2.422154188156128, |
| "reward_std": 0.622757077217102, |
| "rewards/accuracy_reward": 0.5781250074505806, |
| "rewards/reasoning_steps_reward": 0.956597238779068, |
| "rewards/repetition_penalty_reward": -0.05657860916107893, |
| "rewards/tag_count_reward": 0.9440104365348816, |
| "step": 632 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 206.20312881469727, |
| "epoch": 0.9493813273340832, |
| "grad_norm": 14.033209063185351, |
| "kl": 3.33984375, |
| "learning_rate": 1.4940309513955088e-07, |
| "loss": 0.782, |
| "reward": 2.64290452003479, |
| "reward_std": 0.5470430329442024, |
| "rewards/accuracy_reward": 0.7447917014360428, |
| "rewards/reasoning_steps_reward": 0.987847238779068, |
| "rewards/repetition_penalty_reward": -0.048067858442664146, |
| "rewards/tag_count_reward": 0.958333358168602, |
| "step": 633 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 217.67709350585938, |
| "epoch": 0.9508811398575178, |
| "grad_norm": 1.7544974641972677, |
| "kl": 2.810546875, |
| "learning_rate": 1.405064986915028e-07, |
| "loss": 0.5779, |
| "reward": 2.556717336177826, |
| "reward_std": 0.5784207582473755, |
| "rewards/accuracy_reward": 0.6927083432674408, |
| "rewards/reasoning_steps_reward": 0.9583333730697632, |
| "rewards/repetition_penalty_reward": -0.051355627831071615, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 634 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.2031364440918, |
| "epoch": 0.9523809523809523, |
| "grad_norm": 2.733833244509274, |
| "kl": 2.9609375, |
| "learning_rate": 1.3188110850373527e-07, |
| "loss": 0.5971, |
| "reward": 2.5860196948051453, |
| "reward_std": 0.5797842293977737, |
| "rewards/accuracy_reward": 0.7135416865348816, |
| "rewards/reasoning_steps_reward": 0.970486119389534, |
| "rewards/repetition_penalty_reward": -0.05503946170210838, |
| "rewards/tag_count_reward": 0.9570312798023224, |
| "step": 635 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 256.3697967529297, |
| "epoch": 0.953880764904387, |
| "grad_norm": 16.69375796765008, |
| "kl": 4.93359375, |
| "learning_rate": 1.2352716183588022e-07, |
| "loss": 0.9024, |
| "reward": 2.4972673654556274, |
| "reward_std": 0.5852114260196686, |
| "rewards/accuracy_reward": 0.6822916865348816, |
| "rewards/reasoning_steps_reward": 0.9565972536802292, |
| "rewards/repetition_penalty_reward": -0.0674027856439352, |
| "rewards/tag_count_reward": 0.9257812649011612, |
| "step": 636 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 237.43750381469727, |
| "epoch": 0.9553805774278216, |
| "grad_norm": 2.7912977723178796, |
| "kl": 2.740234375, |
| "learning_rate": 1.1544488848094338e-07, |
| "loss": 0.4881, |
| "reward": 2.582638919353485, |
| "reward_std": 0.6020847111940384, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9652778208255768, |
| "rewards/repetition_penalty_reward": -0.05190982669591904, |
| "rewards/tag_count_reward": 0.934895858168602, |
| "step": 637 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 240.64063262939453, |
| "epoch": 0.9568803899512561, |
| "grad_norm": 6.968113723896311, |
| "kl": 2.9375, |
| "learning_rate": 1.0763451075897713e-07, |
| "loss": 0.6358, |
| "reward": 2.5640820264816284, |
| "reward_std": 0.6016581207513809, |
| "rewards/accuracy_reward": 0.6875000223517418, |
| "rewards/reasoning_steps_reward": 0.9791666716337204, |
| "rewards/repetition_penalty_reward": -0.0635221004486084, |
| "rewards/tag_count_reward": 0.9609375298023224, |
| "step": 638 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 226.46875762939453, |
| "epoch": 0.9583802024746907, |
| "grad_norm": 3.460380168793295, |
| "kl": 3.59765625, |
| "learning_rate": 1.0009624351097313e-07, |
| "loss": 0.5328, |
| "reward": 2.6055824160575867, |
| "reward_std": 0.5846581794321537, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9600694477558136, |
| "rewards/repetition_penalty_reward": -0.05240369774401188, |
| "rewards/tag_count_reward": 0.9531250149011612, |
| "step": 639 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.04688262939453, |
| "epoch": 0.9598800149981253, |
| "grad_norm": 5.342162377565693, |
| "kl": 2.919921875, |
| "learning_rate": 9.283029409294263e-08, |
| "loss": 0.5766, |
| "reward": 2.615228056907654, |
| "reward_std": 0.5253854840993881, |
| "rewards/accuracy_reward": 0.7291667014360428, |
| "rewards/reasoning_steps_reward": 0.972222238779068, |
| "rewards/repetition_penalty_reward": -0.058817002922296524, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 640 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 231.62500762939453, |
| "epoch": 0.9613798275215598, |
| "grad_norm": 15.144779529429583, |
| "kl": 2.9951171875, |
| "learning_rate": 8.583686237022105e-08, |
| "loss": 0.6178, |
| "reward": 2.459158480167389, |
| "reward_std": 0.49489714950323105, |
| "rewards/accuracy_reward": 0.567708358168602, |
| "rewards/reasoning_steps_reward": 0.9791667014360428, |
| "rewards/repetition_penalty_reward": -0.055164570454508066, |
| "rewards/tag_count_reward": 0.9674479365348816, |
| "step": 641 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.58334732055664, |
| "epoch": 0.9628796400449944, |
| "grad_norm": 7.8949182961312445, |
| "kl": 3.7138671875, |
| "learning_rate": 7.911614071196671e-08, |
| "loss": 0.7138, |
| "reward": 2.503873646259308, |
| "reward_std": 0.4214767701923847, |
| "rewards/accuracy_reward": 0.6562500149011612, |
| "rewards/reasoning_steps_reward": 0.9670139104127884, |
| "rewards/repetition_penalty_reward": -0.07251533772796392, |
| "rewards/tag_count_reward": 0.953125, |
| "step": 642 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.640625, |
| "epoch": 0.9643794525684289, |
| "grad_norm": 3.935827963006508, |
| "kl": 2.220703125, |
| "learning_rate": 7.266831398587082e-08, |
| "loss": 0.3811, |
| "reward": 2.5909196734428406, |
| "reward_std": 0.5641670525074005, |
| "rewards/accuracy_reward": 0.7031250149011612, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.043195023201406, |
| "rewards/tag_count_reward": 0.9518229514360428, |
| "step": 643 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.91667938232422, |
| "epoch": 0.9658792650918635, |
| "grad_norm": 3.3366997354557224, |
| "kl": 2.41796875, |
| "learning_rate": 6.649355955306802e-08, |
| "loss": 0.558, |
| "reward": 2.6643757820129395, |
| "reward_std": 0.4191970229148865, |
| "rewards/accuracy_reward": 0.770833358168602, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05611046589910984, |
| "rewards/tag_count_reward": 0.9687500298023224, |
| "step": 644 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 232.72396850585938, |
| "epoch": 0.9673790776152981, |
| "grad_norm": 6.998025777128476, |
| "kl": 2.767578125, |
| "learning_rate": 6.059204726326373e-08, |
| "loss": 0.6031, |
| "reward": 2.2929863333702087, |
| "reward_std": 0.6047337800264359, |
| "rewards/accuracy_reward": 0.416666679084301, |
| "rewards/reasoning_steps_reward": 0.973958358168602, |
| "rewards/repetition_penalty_reward": -0.04815956763923168, |
| "rewards/tag_count_reward": 0.9505208432674408, |
| "step": 645 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 209.05208587646484, |
| "epoch": 0.9688788901387326, |
| "grad_norm": 7.463341518776408, |
| "kl": 2.57421875, |
| "learning_rate": 5.4963939450057846e-08, |
| "loss": 0.5299, |
| "reward": 2.7416443824768066, |
| "reward_std": 0.513618029654026, |
| "rewards/accuracy_reward": 0.8385416865348816, |
| "rewards/reasoning_steps_reward": 0.9774305820465088, |
| "rewards/repetition_penalty_reward": -0.03917174320667982, |
| "rewards/tag_count_reward": 0.9648437798023224, |
| "step": 646 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.57813262939453, |
| "epoch": 0.9703787026621672, |
| "grad_norm": 6.850765735428718, |
| "kl": 1.62255859375, |
| "learning_rate": 4.960939092648165e-08, |
| "loss": 0.3969, |
| "reward": 2.651937246322632, |
| "reward_std": 0.4466264098882675, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9947916865348816, |
| "rewards/repetition_penalty_reward": -0.04728165362030268, |
| "rewards/tag_count_reward": 0.970052108168602, |
| "step": 647 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 244.28646087646484, |
| "epoch": 0.9718785151856018, |
| "grad_norm": 18.96391662542068, |
| "kl": 3.640625, |
| "learning_rate": 4.452854898073788e-08, |
| "loss": 0.5956, |
| "reward": 2.4319705963134766, |
| "reward_std": 0.581806406378746, |
| "rewards/accuracy_reward": 0.5885416716337204, |
| "rewards/reasoning_steps_reward": 0.9565972983837128, |
| "rewards/repetition_penalty_reward": -0.06368919461965561, |
| "rewards/tag_count_reward": 0.950520858168602, |
| "step": 648 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 228.5677146911621, |
| "epoch": 0.9733783277090363, |
| "grad_norm": 6.53111177098122, |
| "kl": 2.421875, |
| "learning_rate": 3.9721553372150665e-08, |
| "loss": 0.4821, |
| "reward": 2.584895968437195, |
| "reward_std": 0.510861761868, |
| "rewards/accuracy_reward": 0.697916679084301, |
| "rewards/reasoning_steps_reward": 0.9670139104127884, |
| "rewards/repetition_penalty_reward": -0.048784732818603516, |
| "rewards/tag_count_reward": 0.9687500149011612, |
| "step": 649 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 248.70834350585938, |
| "epoch": 0.974878140232471, |
| "grad_norm": 8.617456792461612, |
| "kl": 3.447265625, |
| "learning_rate": 3.5188536327318554e-08, |
| "loss": 0.7325, |
| "reward": 2.51455420255661, |
| "reward_std": 0.6581666991114616, |
| "rewards/accuracy_reward": 0.6614583432674408, |
| "rewards/reasoning_steps_reward": 0.9652777761220932, |
| "rewards/repetition_penalty_reward": -0.05879658181220293, |
| "rewards/tag_count_reward": 0.946614608168602, |
| "step": 650 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 219.52083587646484, |
| "epoch": 0.9763779527559056, |
| "grad_norm": 6.114393105789686, |
| "kl": 2.314453125, |
| "learning_rate": 3.092962253648302e-08, |
| "loss": 0.4754, |
| "reward": 2.638841927051544, |
| "reward_std": 0.5403295606374741, |
| "rewards/accuracy_reward": 0.7500000149011612, |
| "rewards/reasoning_steps_reward": 0.9739583432674408, |
| "rewards/repetition_penalty_reward": -0.044752005487680435, |
| "rewards/tag_count_reward": 0.9596354365348816, |
| "step": 651 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.9791717529297, |
| "epoch": 0.9778777652793401, |
| "grad_norm": 4.664735172295086, |
| "kl": 4.345703125, |
| "learning_rate": 2.694492915009006e-08, |
| "loss": 0.5058, |
| "reward": 2.429598391056061, |
| "reward_std": 0.4925737604498863, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.9791666865348816, |
| "rewards/repetition_penalty_reward": -0.0482663013972342, |
| "rewards/tag_count_reward": 0.9518229365348816, |
| "step": 652 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 262.4739685058594, |
| "epoch": 0.9793775778027747, |
| "grad_norm": 9.946082586920667, |
| "kl": 3.6181640625, |
| "learning_rate": 2.3234565775575034e-08, |
| "loss": 0.7296, |
| "reward": 2.446805000305176, |
| "reward_std": 0.5700404495000839, |
| "rewards/accuracy_reward": 0.5989583432674408, |
| "rewards/reasoning_steps_reward": 0.9756944626569748, |
| "rewards/repetition_penalty_reward": -0.07316032983362675, |
| "rewards/tag_count_reward": 0.9453125298023224, |
| "step": 653 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 213.90625381469727, |
| "epoch": 0.9808773903262092, |
| "grad_norm": 7.538573623853772, |
| "kl": 2.619140625, |
| "learning_rate": 1.9798634474345048e-08, |
| "loss": 0.5537, |
| "reward": 2.517378091812134, |
| "reward_std": 0.41347331553697586, |
| "rewards/accuracy_reward": 0.5989583507180214, |
| "rewards/reasoning_steps_reward": 0.9895833432674408, |
| "rewards/repetition_penalty_reward": -0.043819915503263474, |
| "rewards/tag_count_reward": 0.9726562649011612, |
| "step": 654 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 222.4791717529297, |
| "epoch": 0.9823772028496438, |
| "grad_norm": 1.4494224041136212, |
| "kl": 1.38916015625, |
| "learning_rate": 1.6637229758970087e-08, |
| "loss": 0.2744, |
| "reward": 2.6669066548347473, |
| "reward_std": 0.3448048084974289, |
| "rewards/accuracy_reward": 0.7447916865348816, |
| "rewards/reasoning_steps_reward": 0.9878472238779068, |
| "rewards/repetition_penalty_reward": -0.04229486454278231, |
| "rewards/tag_count_reward": 0.9765625149011612, |
| "step": 655 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 241.97396850585938, |
| "epoch": 0.9838770153730784, |
| "grad_norm": 5.752377864686793, |
| "kl": 1.896484375, |
| "learning_rate": 1.3750438590586223e-08, |
| "loss": 0.4106, |
| "reward": 2.751551926136017, |
| "reward_std": 0.3733288496732712, |
| "rewards/accuracy_reward": 0.8437500149011612, |
| "rewards/reasoning_steps_reward": 0.9878472536802292, |
| "rewards/repetition_penalty_reward": -0.055305857211351395, |
| "rewards/tag_count_reward": 0.9752604365348816, |
| "step": 656 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 233.6093864440918, |
| "epoch": 0.9853768278965129, |
| "grad_norm": 1.7146871873625331, |
| "kl": 2.14794921875, |
| "learning_rate": 1.1138340376501966e-08, |
| "loss": 0.2227, |
| "reward": 2.6160547137260437, |
| "reward_std": 0.5579579994082451, |
| "rewards/accuracy_reward": 0.7343750149011612, |
| "rewards/reasoning_steps_reward": 0.9670139104127884, |
| "rewards/repetition_penalty_reward": -0.05538627551868558, |
| "rewards/tag_count_reward": 0.9700520932674408, |
| "step": 657 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 221.94271087646484, |
| "epoch": 0.9868766404199475, |
| "grad_norm": 9.514836835014375, |
| "kl": 3.48046875, |
| "learning_rate": 8.801006968012227e-09, |
| "loss": 0.6615, |
| "reward": 2.577666759490967, |
| "reward_std": 0.4780506566166878, |
| "rewards/accuracy_reward": 0.6875000149011612, |
| "rewards/reasoning_steps_reward": 0.9809028059244156, |
| "rewards/repetition_penalty_reward": -0.05167360603809357, |
| "rewards/tag_count_reward": 0.9609375149011612, |
| "step": 658 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 194.54687881469727, |
| "epoch": 0.9883764529433821, |
| "grad_norm": 7.048358144573533, |
| "kl": 2.072265625, |
| "learning_rate": 6.738502658426571e-09, |
| "loss": 0.4597, |
| "reward": 2.7473713755607605, |
| "reward_std": 0.4053123965859413, |
| "rewards/accuracy_reward": 0.8281250149011612, |
| "rewards/reasoning_steps_reward": 0.9861111342906952, |
| "rewards/repetition_penalty_reward": -0.035614716820418835, |
| "rewards/tag_count_reward": 0.9687500298023224, |
| "step": 659 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 197.7760467529297, |
| "epoch": 0.9898762654668166, |
| "grad_norm": 7.604534130768124, |
| "kl": 1.7197265625, |
| "learning_rate": 4.950884181295079e-09, |
| "loss": 0.2393, |
| "reward": 2.58710515499115, |
| "reward_std": 0.4081360250711441, |
| "rewards/accuracy_reward": 0.661458358168602, |
| "rewards/reasoning_steps_reward": 0.986111119389534, |
| "rewards/repetition_penalty_reward": -0.04353731218725443, |
| "rewards/tag_count_reward": 0.9830729514360428, |
| "step": 660 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 249.40105056762695, |
| "epoch": 0.9913760779902512, |
| "grad_norm": 7.864265781403792, |
| "kl": 4.31640625, |
| "learning_rate": 3.4382007088518134e-09, |
| "loss": 0.7781, |
| "reward": 2.3653112053871155, |
| "reward_std": 0.5962060615420341, |
| "rewards/accuracy_reward": 0.5468750149011612, |
| "rewards/reasoning_steps_reward": 0.9479167014360428, |
| "rewards/repetition_penalty_reward": -0.06047023739665747, |
| "rewards/tag_count_reward": 0.930989608168602, |
| "step": 661 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 230.3489646911621, |
| "epoch": 0.9928758905136857, |
| "grad_norm": 3.5888298694634995, |
| "kl": 3.0, |
| "learning_rate": 2.200493850662566e-09, |
| "loss": 0.5777, |
| "reward": 2.563965141773224, |
| "reward_std": 0.5070576220750809, |
| "rewards/accuracy_reward": 0.6822916716337204, |
| "rewards/reasoning_steps_reward": 0.9826389104127884, |
| "rewards/repetition_penalty_reward": -0.057996807619929314, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 662 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.26041793823242, |
| "epoch": 0.9943757030371203, |
| "grad_norm": 4.0715790482727705, |
| "kl": 2.197265625, |
| "learning_rate": 1.2377976524746705e-09, |
| "loss": 0.2855, |
| "reward": 2.672169864177704, |
| "reward_std": 0.48223114758729935, |
| "rewards/accuracy_reward": 0.7916666865348816, |
| "rewards/reasoning_steps_reward": 0.9687500447034836, |
| "rewards/repetition_penalty_reward": -0.04527819249778986, |
| "rewards/tag_count_reward": 0.9570312649011612, |
| "step": 663 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 246.0677146911621, |
| "epoch": 0.995875515560555, |
| "grad_norm": 3.596922089624617, |
| "kl": 2.80078125, |
| "learning_rate": 5.501385952888516e-10, |
| "loss": 0.8216, |
| "reward": 2.5335150957107544, |
| "reward_std": 0.5313072800636292, |
| "rewards/accuracy_reward": 0.6302083507180214, |
| "rewards/reasoning_steps_reward": 0.9947916716337204, |
| "rewards/repetition_penalty_reward": -0.04981834441423416, |
| "rewards/tag_count_reward": 0.9583333432674408, |
| "step": 664 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 218.2447967529297, |
| "epoch": 0.9973753280839895, |
| "grad_norm": 3.9248590354460755, |
| "kl": 1.875, |
| "learning_rate": 1.375355946242607e-10, |
| "loss": 0.3974, |
| "reward": 2.5833805203437805, |
| "reward_std": 0.4426957219839096, |
| "rewards/accuracy_reward": 0.6718750298023224, |
| "rewards/reasoning_steps_reward": 0.9809027910232544, |
| "rewards/repetition_penalty_reward": -0.03944949712604284, |
| "rewards/tag_count_reward": 0.970052108168602, |
| "step": 665 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 223.3671875, |
| "epoch": 0.9988751406074241, |
| "grad_norm": 7.299493407835698, |
| "kl": 2.71875, |
| "learning_rate": 0.0, |
| "loss": 0.5344, |
| "reward": 2.6338536143302917, |
| "reward_std": 0.6461013555526733, |
| "rewards/accuracy_reward": 0.7656250149011612, |
| "rewards/reasoning_steps_reward": 0.963541716337204, |
| "rewards/repetition_penalty_reward": -0.045833950862288475, |
| "rewards/tag_count_reward": 0.950520858168602, |
| "step": 666 |
| }, |
| { |
| "epoch": 0.9988751406074241, |
| "step": 666, |
| "total_flos": 0.0, |
| "train_loss": 14.041643001426444, |
| "train_runtime": 23054.463, |
| "train_samples_per_second": 0.347, |
| "train_steps_per_second": 0.029 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 666, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|