diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,15019 +10,13019 @@ "log_history": [ { "clip_ratio": 0.0, - "completion_length": 451.6458435058594, + "completion_length": 483.91668701171875, "epoch": 0.001, - "grad_norm": 3.0469120263636107, + "grad_norm": 1.9742541324645637, "kl": 0.0, "learning_rate": 1e-08, - "loss": -0.0585, - "reward": 1.1093750596046448, - "reward_std": 0.3163445144891739, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0694, + "reward": 0.5208333432674408, + "reward_std": 0.05689104273915291, + "rewards/tag_count_reward": 0.5208333432674408, "step": 1 }, { "clip_ratio": 0.0, - "completion_length": 440.37501525878906, + "completion_length": 461.93751525878906, "epoch": 0.002, - "grad_norm": 2.655689746041415, + "grad_norm": 1.9070621603104843, "kl": 0.0, "learning_rate": 2e-08, - "loss": 0.0377, - "reward": 1.140625, - "reward_std": 0.4682169407606125, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, + "loss": 0.0922, + "reward": 0.515625, + "reward_std": 0.06549893878400326, "rewards/tag_count_reward": 0.515625, "step": 2 }, { "clip_ratio": 0.0, - "completion_length": 425.3541717529297, + "completion_length": 397.56251525878906, "epoch": 0.003, - "grad_norm": 2.6777296148152496, - "kl": 0.00014400482177734375, + "grad_norm": 1.3934674579092003, + "kl": 0.00014257431030273438, "learning_rate": 3e-08, - "loss": 0.0133, - "reward": 1.2395833730697632, - "reward_std": 0.47577761113643646, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5312500298023224, + "loss": -0.0461, + "reward": 0.5052083432674408, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.5052083432674408, "step": 3 }, { "clip_ratio": 0.0, - "completion_length": 464.25001525878906, + "completion_length": 474.1458435058594, "epoch": 0.004, - "grad_norm": 2.5137815072466263, - "kl": 0.00014829635620117188, + "grad_norm": 2.0218828049837487, + "kl": 0.00012159347534179688, "learning_rate": 4e-08, - "loss": 0.008, - "reward": 1.1250000596046448, - "reward_std": 0.2899996042251587, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.1318, + "reward": 0.5156250298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.5156250298023224, "step": 4 }, { "clip_ratio": 0.0, - "completion_length": 455.5208435058594, + "completion_length": 456.31251525878906, "epoch": 0.005, - "grad_norm": 2.764883600981797, - "kl": 0.0001773834228515625, + "grad_norm": 0.008997411780226844, + "kl": 0.00015211105346679688, "learning_rate": 5e-08, - "loss": 0.0064, - "reward": 0.9375000596046448, - "reward_std": 0.4112800657749176, - "rewards/accuracy_reward": 0.4375000149011612, - "rewards/format_reward": 0.0, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, "rewards/tag_count_reward": 0.5, "step": 5 }, { "clip_ratio": 0.0, - "completion_length": 485.89585876464844, + "completion_length": 459.9166717529297, "epoch": 0.006, - "grad_norm": 2.2184862306289843, - "kl": 0.00014925003051757812, + "grad_norm": 2.2346740493751307, + "kl": 0.00015592575073242188, "learning_rate": 6e-08, - "loss": 0.0465, - "reward": 1.3229167461395264, - "reward_std": 0.28646042197942734, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0896, + "reward": 0.515625, + "reward_std": 0.04237028583884239, + "rewards/tag_count_reward": 0.515625, "step": 6 }, { "clip_ratio": 0.0, - "completion_length": 412.4791717529297, + "completion_length": 397.1041717529297, "epoch": 0.007, - "grad_norm": 2.279756981941202, - "kl": 0.0001544952392578125, + "grad_norm": 1.460322899370902, + "kl": 0.0001239776611328125, "learning_rate": 7e-08, - "loss": 0.0142, - "reward": 1.1770833432674408, - "reward_std": 0.22131992876529694, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/format_reward": 0.0, + "loss": 0.0045, + "reward": 0.5104166865348816, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.5104166865348816, "step": 7 }, { "clip_ratio": 0.0, - "completion_length": 432.7291717529297, + "completion_length": 457.7708435058594, "epoch": 0.008, - "grad_norm": 2.4871182111101153, - "kl": 0.00017023086547851562, + "grad_norm": 1.6052570950629088, + "kl": 0.000202178955078125, "learning_rate": 8e-08, - "loss": -0.0215, - "reward": 1.1458333730697632, - "reward_std": 0.31117893755435944, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": 0.0522, + "reward": 0.5104166865348816, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.5104166865348816, "step": 8 }, { "clip_ratio": 0.0, - "completion_length": 413.0, + "completion_length": 423.68751525878906, "epoch": 0.009, - "grad_norm": 2.569929089471837, - "kl": 0.00015592575073242188, + "grad_norm": 1.7356930498415466, + "kl": 0.00014591217041015625, "learning_rate": 9e-08, - "loss": 0.0395, - "reward": 1.2187500596046448, - "reward_std": 0.3326523005962372, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0082, + "reward": 0.5312500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.5312500298023224, "step": 9 }, { "clip_ratio": 0.0, - "completion_length": 474.0833435058594, + "completion_length": 433.5208435058594, "epoch": 0.01, - "grad_norm": 2.5888318231530376, - "kl": 0.00014209747314453125, + "grad_norm": 1.264090874402394, + "kl": 0.00014019012451171875, "learning_rate": 1e-07, - "loss": 0.0213, - "reward": 1.2916667461395264, - "reward_std": 0.4207582473754883, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": 0.0134, + "reward": 0.5052083432674408, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.5052083432674408, "step": 10 }, { "clip_ratio": 0.0, - "completion_length": 445.75, + "completion_length": 423.6666717529297, "epoch": 0.011, - "grad_norm": 2.6569919846004244, - "kl": 0.0001659393310546875, + "grad_norm": 2.0692894517993046, + "kl": 0.00016450881958007812, "learning_rate": 1.0999999999999999e-07, - "loss": 0.0603, - "reward": 1.1145833432674408, - "reward_std": 0.39135053753852844, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0564, + "reward": 0.5156250298023224, + "reward_std": 0.04237028583884239, + "rewards/tag_count_reward": 0.5156250298023224, "step": 11 }, { "clip_ratio": 0.0, - "completion_length": 446.75, + "completion_length": 423.0416717529297, "epoch": 0.012, - "grad_norm": 2.931889194243234, - "kl": 0.00019550323486328125, + "grad_norm": 2.2937606521774225, + "kl": 0.00015497207641601562, "learning_rate": 1.2e-07, - "loss": -0.0752, - "reward": 1.2552083730697632, - "reward_std": 0.3132514953613281, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.0519, + "reward": 0.5416666865348816, + "reward_std": 0.10202579200267792, + "rewards/tag_count_reward": 0.5416666865348816, "step": 12 }, { "clip_ratio": 0.0, - "completion_length": 494.1458435058594, + "completion_length": 469.0208435058594, "epoch": 0.013, - "grad_norm": 2.00415050750538, - "kl": 0.00014925003051757812, + "grad_norm": 2.0091151598756, + "kl": 0.00016117095947265625, "learning_rate": 1.3e-07, - "loss": -0.0219, - "reward": 1.4114583730697632, - "reward_std": 0.28175655752420425, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5156250298023224, + "loss": -0.0659, + "reward": 0.5208333432674408, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.5208333432674408, "step": 13 }, { "clip_ratio": 0.0, - "completion_length": 465.1666717529297, + "completion_length": 487.0208435058594, "epoch": 0.014, - "grad_norm": 2.6074800631566832, - "kl": 0.00015974044799804688, + "grad_norm": 2.972602173974025, + "kl": 0.0001373291015625, "learning_rate": 1.4e-07, - "loss": 0.0006, - "reward": 1.0677083730697632, - "reward_std": 0.30128014087677, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.1794, + "reward": 0.5364583730697632, + "reward_std": 0.07692287117242813, + "rewards/tag_count_reward": 0.5364583730697632, "step": 14 }, { "clip_ratio": 0.0, - "completion_length": 429.3125, + "completion_length": 426.75001525878906, "epoch": 0.015, - "grad_norm": 1.9696832446572003, - "kl": 0.00016355514526367188, + "grad_norm": 1.196878010043536, + "kl": 0.00016832351684570312, "learning_rate": 1.5e-07, - "loss": 0.0453, - "reward": 1.3125, - "reward_std": 0.2436249926686287, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": 0.0144, + "reward": 0.5104166865348816, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.5104166865348816, "step": 15 }, { "clip_ratio": 0.0, - "completion_length": 468.4583435058594, + "completion_length": 458.2708435058594, "epoch": 0.016, - "grad_norm": 2.40608728487424, - "kl": 0.00013494491577148438, + "grad_norm": 1.3805638695285072, + "kl": 0.00014972686767578125, "learning_rate": 1.6e-07, - "loss": -0.0023, - "reward": 1.2135417461395264, - "reward_std": 0.38365595042705536, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, + "loss": -0.0195, + "reward": 0.5052083432674408, + "reward_std": 0.018042195588350296, "rewards/tag_count_reward": 0.5052083432674408, "step": 16 }, { "clip_ratio": 0.0, - "completion_length": 456.81251525878906, + "completion_length": 440.4583435058594, "epoch": 0.017, - "grad_norm": 2.9064769913617954, - "kl": 0.00015544891357421875, + "grad_norm": 2.336321857820306, + "kl": 0.00016546249389648438, "learning_rate": 1.7000000000000001e-07, - "loss": 0.0178, - "reward": 1.0208333730697632, - "reward_std": 0.32381847500801086, - "rewards/accuracy_reward": 0.5, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333730697632, + "loss": 0.0661, + "reward": 0.5260416865348816, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.5260416865348816, "step": 17 }, { "clip_ratio": 0.0, - "completion_length": 413.50001525878906, + "completion_length": 422.0416717529297, "epoch": 0.018, - "grad_norm": 2.7065281106248147, - "kl": 0.00014734268188476562, + "grad_norm": 1.5675504056835527, + "kl": 0.000171661376953125, "learning_rate": 1.8e-07, - "loss": -0.0132, - "reward": 1.2135417461395264, - "reward_std": 0.24544557929039001, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": 0.0444, + "reward": 0.5208333432674408, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.5208333432674408, "step": 18 }, { "clip_ratio": 0.0, - "completion_length": 509.00001525878906, + "completion_length": 479.37501525878906, "epoch": 0.019, - "grad_norm": 2.559307346193476, - "kl": 0.00016450881958007812, + "grad_norm": 1.8987192317698203, + "kl": 0.00018739700317382812, "learning_rate": 1.8999999999999998e-07, - "loss": -0.0188, - "reward": 1.1718750596046448, - "reward_std": 0.4405389875173569, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.035, + "reward": 0.5208333432674408, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.5208333432674408, "step": 19 }, { "clip_ratio": 0.0, - "completion_length": 496.9791717529297, + "completion_length": 456.3125, "epoch": 0.02, - "grad_norm": 2.512056455278375, - "kl": 0.00015354156494140625, + "grad_norm": 1.579116890174869, + "kl": 0.00020647048950195312, "learning_rate": 2e-07, - "loss": -0.0015, - "reward": 1.1250000596046448, - "reward_std": 0.45455072820186615, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": -0.0064, + "reward": 0.515625, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.515625, "step": 20 }, { "clip_ratio": 0.0, - "completion_length": 425.0416717529297, + "completion_length": 452.68751525878906, "epoch": 0.021, - "grad_norm": 2.730428025307325, - "kl": 0.00012826919555664062, + "grad_norm": 2.2831411081336745, + "kl": 0.00019550323486328125, "learning_rate": 2.0999999999999997e-07, - "loss": -0.0113, - "reward": 1.3333333730697632, - "reward_std": 0.4157644957304001, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.0486, + "reward": 0.5364583432674408, + "reward_std": 0.10278276726603508, + "rewards/tag_count_reward": 0.5364583432674408, "step": 21 }, { "clip_ratio": 0.0, - "completion_length": 427.0208435058594, + "completion_length": 420.5625, "epoch": 0.022, - "grad_norm": 2.711172676108638, - "kl": 0.00017118453979492188, + "grad_norm": 2.2074315098000388, + "kl": 0.00019168853759765625, "learning_rate": 2.1999999999999998e-07, - "loss": 0.0797, - "reward": 1.4166667461395264, - "reward_std": 0.2304430827498436, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.0398, + "reward": 0.5364583432674408, + "reward_std": 0.0992613285779953, + "rewards/tag_count_reward": 0.5364583432674408, "step": 22 }, { "clip_ratio": 0.0, - "completion_length": 393.3958435058594, + "completion_length": 414.2083435058594, "epoch": 0.023, - "grad_norm": 2.2635175482113086, - "kl": 0.00015163421630859375, + "grad_norm": 1.3397307174661355, + "kl": 0.00022554397583007812, "learning_rate": 2.3e-07, - "loss": 0.0408, - "reward": 1.2395833730697632, - "reward_std": 0.24646349996328354, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0109, + "reward": 0.5052083432674408, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.5052083432674408, "step": 23 }, { "clip_ratio": 0.0, - "completion_length": 462.8958435058594, + "completion_length": 415.2083435058594, "epoch": 0.024, - "grad_norm": 2.6707402281932215, - "kl": 0.00017690658569335938, + "grad_norm": 2.2052134784327566, + "kl": 0.000335693359375, "learning_rate": 2.4e-07, - "loss": 0.0458, - "reward": 1.3177083730697632, - "reward_std": 0.37440434098243713, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": 0.0173, + "reward": 0.53125, + "reward_std": 0.07298427075147629, + "rewards/tag_count_reward": 0.53125, "step": 24 }, { "clip_ratio": 0.0, - "completion_length": 387.4791717529297, + "completion_length": 386.43751525878906, "epoch": 0.025, - "grad_norm": 2.609454999030999, - "kl": 0.00012540817260742188, + "grad_norm": 1.179666671856239, + "kl": 0.00025177001953125, "learning_rate": 2.5e-07, - "loss": 0.0407, - "reward": 1.4270833730697632, - "reward_std": 0.27773431688547134, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0124, + "reward": 0.5104166865348816, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.5104166865348816, "step": 25 }, { "clip_ratio": 0.0, - "completion_length": 417.35418701171875, + "completion_length": 400.9375, "epoch": 0.026, - "grad_norm": 2.9850647698943598, - "kl": 0.00015974044799804688, + "grad_norm": 1.20187270876094, + "kl": 0.0008182525634765625, "learning_rate": 2.6e-07, - "loss": 0.0169, - "reward": 1.3541667461395264, - "reward_std": 0.3558124005794525, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333730697632, + "loss": 0.0233, + "reward": 0.5104166865348816, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.5104166865348816, "step": 26 }, { "clip_ratio": 0.0, - "completion_length": 441.3958435058594, + "completion_length": 409.4583435058594, "epoch": 0.027, - "grad_norm": 1.8743396197191182, - "kl": 0.00015687942504882812, + "grad_norm": 1.6272380189012048, + "kl": 0.000553131103515625, "learning_rate": 2.7e-07, - "loss": 0.0016, - "reward": 1.296875, - "reward_std": 0.2178530991077423, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": 0.0289, + "reward": 0.5156250298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.5156250298023224, "step": 27 }, { "clip_ratio": 0.0, - "completion_length": 448.5833435058594, + "completion_length": 463.31251525878906, "epoch": 0.028, - "grad_norm": 2.3474333126640023, - "kl": 0.00017499923706054688, + "grad_norm": 2.5525370582436557, + "kl": 0.0007915496826171875, "learning_rate": 2.8e-07, - "loss": 0.0327, - "reward": 1.1718750596046448, - "reward_std": 0.3373970687389374, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0902, + "reward": 0.5312500298023224, + "reward_std": 0.07679307460784912, + "rewards/tag_count_reward": 0.5312500298023224, "step": 28 }, { "clip_ratio": 0.0, - "completion_length": 483.72918701171875, + "completion_length": 430.625, "epoch": 0.029, - "grad_norm": 2.92517651378256, - "kl": 0.00015926361083984375, + "grad_norm": 2.1748951967997017, + "kl": 0.000789642333984375, "learning_rate": 2.9e-07, - "loss": 0.0381, - "reward": 1.2552083730697632, - "reward_std": 0.4495217353105545, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.036, + "reward": 0.5312500298023224, + "reward_std": 0.06760228797793388, + "rewards/tag_count_reward": 0.5312500298023224, "step": 29 }, { "clip_ratio": 0.0, - "completion_length": 404.79168701171875, + "completion_length": 397.625, "epoch": 0.03, - "grad_norm": 1.44020398523571, - "kl": 0.00015211105346679688, + "grad_norm": 2.80101139329913, + "kl": 0.001964569091796875, "learning_rate": 3e-07, - "loss": 0.0281, - "reward": 1.125, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": -0.0724, + "reward": 0.5208333432674408, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.5208333432674408, "step": 30 }, { "clip_ratio": 0.0, - "completion_length": 478.7083435058594, + "completion_length": 420.54168701171875, "epoch": 0.031, - "grad_norm": 2.4449797370262556, - "kl": 0.0001583099365234375, + "grad_norm": 2.070920146055396, + "kl": 0.001697540283203125, "learning_rate": 3.1e-07, - "loss": -0.0111, - "reward": 1.1197916865348816, - "reward_std": 0.37443745136260986, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5156250298023224, + "loss": -0.1031, + "reward": 0.5260416865348816, + "reward_std": 0.0631769448518753, + "rewards/tag_count_reward": 0.5260416865348816, "step": 31 }, { "clip_ratio": 0.0, - "completion_length": 442.1666717529297, + "completion_length": 518.0, "epoch": 0.032, - "grad_norm": 2.3212145443145085, - "kl": 0.00015687942504882812, + "grad_norm": 2.620019141438342, + "kl": 0.00270843505859375, "learning_rate": 3.2e-07, - "loss": -0.0285, - "reward": 1.15625, - "reward_std": 0.32524681836366653, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": 0.0303, + "reward": 0.5572916865348816, + "reward_std": 0.12656269967556, + "rewards/tag_count_reward": 0.5572916865348816, "step": 32 }, { "clip_ratio": 0.0, - "completion_length": 481.2916717529297, + "completion_length": 448.1666717529297, "epoch": 0.033, - "grad_norm": 2.6677168481139457, - "kl": 0.00014972686767578125, + "grad_norm": 2.8682602323119757, + "kl": 0.0037078857421875, "learning_rate": 3.3e-07, - "loss": 0.0347, - "reward": 1.359375, - "reward_std": 0.29683054983615875, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": -0.1747, + "reward": 0.5781250298023224, + "reward_std": 0.13166124746203423, + "rewards/tag_count_reward": 0.5781250298023224, "step": 33 }, { "clip_ratio": 0.0, - "completion_length": 420.375, + "completion_length": 462.3333435058594, "epoch": 0.034, - "grad_norm": 2.025430325087902, - "kl": 0.00016021728515625, + "grad_norm": 2.6216099803692225, + "kl": 0.00449371337890625, "learning_rate": 3.4000000000000003e-07, - "loss": -0.0484, - "reward": 1.234375, - "reward_std": 0.18752333521842957, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.1132, + "reward": 0.5729166865348816, + "reward_std": 0.14286355674266815, + "rewards/tag_count_reward": 0.5729166865348816, "step": 34 }, { "clip_ratio": 0.0, - "completion_length": 408.79168701171875, + "completion_length": 452.81251525878906, "epoch": 0.035, - "grad_norm": 3.110884286809781, - "kl": 0.00015544891357421875, + "grad_norm": 3.574170032603353, + "kl": 0.0053863525390625, "learning_rate": 3.5e-07, - "loss": 0.0464, - "reward": 1.1770833730697632, - "reward_std": 0.4579962342977524, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.1787, + "reward": 0.5468750298023224, + "reward_std": 0.12251907214522362, + "rewards/tag_count_reward": 0.5468750298023224, "step": 35 }, { "clip_ratio": 0.0, - "completion_length": 413.3541717529297, + "completion_length": 440.6458435058594, "epoch": 0.036, - "grad_norm": 2.444303300389994, - "kl": 0.00018548965454101562, + "grad_norm": 3.1958385995196497, + "kl": 0.01177978515625, "learning_rate": 3.6e-07, - "loss": -0.0397, - "reward": 1.2447917461395264, - "reward_std": 0.29017117619514465, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.515625, + "loss": -0.0178, + "reward": 0.5989583432674408, + "reward_std": 0.14811573922634125, + "rewards/tag_count_reward": 0.5989583432674408, "step": 36 }, { "clip_ratio": 0.0, - "completion_length": 456.0208435058594, + "completion_length": 490.02085876464844, "epoch": 0.037, - "grad_norm": 2.8517734720401773, - "kl": 0.00019359588623046875, + "grad_norm": 2.7030726759210393, + "kl": 0.0063934326171875, "learning_rate": 3.7e-07, - "loss": 0.0151, - "reward": 1.453125, - "reward_std": 0.3240216076374054, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5364583730697632, + "loss": 0.0312, + "reward": 0.6041666865348816, + "reward_std": 0.16423005610704422, + "rewards/tag_count_reward": 0.6041666865348816, "step": 37 }, { "clip_ratio": 0.0, - "completion_length": 503.3541717529297, + "completion_length": 580.1666870117188, "epoch": 0.038, - "grad_norm": 2.7513322450048086, - "kl": 0.000164031982421875, + "grad_norm": 2.479505822262608, + "kl": 0.01202392578125, "learning_rate": 3.7999999999999996e-07, - "loss": 0.0306, - "reward": 1.046875, - "reward_std": 0.4578232765197754, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.1006, + "reward": 0.5885416865348816, + "reward_std": 0.13912813365459442, + "rewards/tag_count_reward": 0.5885416865348816, "step": 38 }, { "clip_ratio": 0.0, - "completion_length": 422.2291717529297, + "completion_length": 540.9375305175781, "epoch": 0.039, - "grad_norm": 2.2289364216881307, - "kl": 0.00017261505126953125, + "grad_norm": 2.5257873356130585, + "kl": 0.011932373046875, "learning_rate": 3.8999999999999997e-07, - "loss": -0.0232, - "reward": 1.4427083730697632, - "reward_std": 0.24567216634750366, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": 0.0596, + "reward": 0.6250000298023224, + "reward_std": 0.20110102742910385, + "rewards/tag_count_reward": 0.6250000298023224, "step": 39 }, { "clip_ratio": 0.0, - "completion_length": 442.9583435058594, + "completion_length": 479.375, "epoch": 0.04, - "grad_norm": 2.4459574573216525, - "kl": 0.0002002716064453125, + "grad_norm": 2.6339110604344533, + "kl": 0.01409912109375, "learning_rate": 4e-07, - "loss": -0.0124, - "reward": 1.4166666865348816, - "reward_std": 0.31381872296333313, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333730697632, + "loss": -0.156, + "reward": 0.6197916865348816, + "reward_std": 0.1571667194366455, + "rewards/tag_count_reward": 0.6197916865348816, "step": 40 }, { "clip_ratio": 0.0, - "completion_length": 453.3333435058594, + "completion_length": 567.9166870117188, "epoch": 0.041, - "grad_norm": 2.5328860954570365, - "kl": 0.00021409988403320312, + "grad_norm": 2.443225018059453, + "kl": 0.012725830078125, "learning_rate": 4.0999999999999994e-07, - "loss": 0.0051, - "reward": 1.1510416865348816, - "reward_std": 0.44576670229434967, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0628, + "reward": 0.609375, + "reward_std": 0.16926345974206924, + "rewards/tag_count_reward": 0.609375, "step": 41 }, { "clip_ratio": 0.0, - "completion_length": 451.7291717529297, + "completion_length": 532.1250305175781, "epoch": 0.042, - "grad_norm": 2.3704618606581556, - "kl": 0.00020265579223632812, + "grad_norm": 2.3700512836648335, + "kl": 0.019378662109375, "learning_rate": 4.1999999999999995e-07, - "loss": -0.0278, - "reward": 1.1354166865348816, - "reward_std": 0.3743131458759308, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.1196, + "reward": 0.6510416865348816, + "reward_std": 0.15612749755382538, + "rewards/tag_count_reward": 0.6510416865348816, "step": 42 }, { "clip_ratio": 0.0, - "completion_length": 489.00001525878906, + "completion_length": 572.8333587646484, "epoch": 0.043, - "grad_norm": 2.513538712154315, - "kl": 0.00019693374633789062, + "grad_norm": 2.5447339536309275, + "kl": 0.013427734375, "learning_rate": 4.2999999999999996e-07, - "loss": 0.034, - "reward": 1.2708333730697632, - "reward_std": 0.3412121832370758, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": -0.06, + "reward": 0.6354166865348816, + "reward_std": 0.1819113940000534, + "rewards/tag_count_reward": 0.6354166865348816, "step": 43 }, { "clip_ratio": 0.0, - "completion_length": 487.0208435058594, + "completion_length": 593.5416870117188, "epoch": 0.044, - "grad_norm": 2.375064364205028, - "kl": 0.0002231597900390625, + "grad_norm": 2.2168144267626757, + "kl": 0.0108489990234375, "learning_rate": 4.3999999999999997e-07, - "loss": -0.0335, - "reward": 1.0520833730697632, - "reward_std": 0.4231126457452774, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5104166865348816, + "loss": -0.0839, + "reward": 0.6770833432674408, + "reward_std": 0.16930782049894333, + "rewards/tag_count_reward": 0.6770833432674408, "step": 44 }, { "clip_ratio": 0.0, - "completion_length": 426.12501525878906, + "completion_length": 531.3958435058594, "epoch": 0.045, - "grad_norm": 2.5904194710577655, - "kl": 0.000244140625, + "grad_norm": 2.563561843819508, + "kl": 0.01080322265625, "learning_rate": 4.5e-07, - "loss": -0.0468, - "reward": 1.0468750596046448, - "reward_std": 0.31281809508800507, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5052083432674408, + "loss": -0.0386, + "reward": 0.5833333730697632, + "reward_std": 0.13584764301776886, + "rewards/tag_count_reward": 0.5833333730697632, "step": 45 }, { "clip_ratio": 0.0, - "completion_length": 449.4791717529297, + "completion_length": 494.41668701171875, "epoch": 0.046, - "grad_norm": 2.282732157496969, - "kl": 0.000293731689453125, + "grad_norm": 2.7395597608840148, + "kl": 0.016021728515625, "learning_rate": 4.6e-07, - "loss": 0.0189, - "reward": 1.25, - "reward_std": 0.3317491039633751, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5416666865348816, + "loss": -0.0226, + "reward": 0.6562500298023224, + "reward_std": 0.1605742834508419, + "rewards/tag_count_reward": 0.6562500298023224, "step": 46 }, { "clip_ratio": 0.0, - "completion_length": 412.56251525878906, + "completion_length": 429.7708435058594, "epoch": 0.047, - "grad_norm": 2.7291290366275534, - "kl": 0.00025844573974609375, + "grad_norm": 2.9160640815755015, + "kl": 0.011383056640625, "learning_rate": 4.6999999999999995e-07, - "loss": 0.0256, - "reward": 1.1666666865348816, - "reward_std": 0.37212641537189484, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": -0.0479, + "reward": 0.6562500298023224, + "reward_std": 0.19549089670181274, + "rewards/tag_count_reward": 0.6562500298023224, "step": 47 }, { "clip_ratio": 0.0, - "completion_length": 387.9791717529297, + "completion_length": 414.66668701171875, "epoch": 0.048, - "grad_norm": 2.4166184127335852, - "kl": 0.0002994537353515625, + "grad_norm": 2.41669012498755, + "kl": 0.008087158203125, "learning_rate": 4.8e-07, - "loss": 0.0318, - "reward": 1.2604166865348816, - "reward_std": 0.3299110010266304, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.53125, + "loss": 0.0447, + "reward": 0.7083333432674408, + "reward_std": 0.22293969988822937, + "rewards/tag_count_reward": 0.7083333432674408, "step": 48 }, { "clip_ratio": 0.0, - "completion_length": 413.2916717529297, + "completion_length": 489.3958435058594, "epoch": 0.049, - "grad_norm": 2.525493813031061, - "kl": 0.00030803680419921875, + "grad_norm": 2.375274599067812, + "kl": 0.007904052734375, "learning_rate": 4.9e-07, - "loss": -0.0423, - "reward": 0.8437500298023224, - "reward_std": 0.2982381731271744, - "rewards/accuracy_reward": 0.2916666679084301, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5520833730697632, + "loss": -0.0285, + "reward": 0.6979166865348816, + "reward_std": 0.2077343687415123, + "rewards/tag_count_reward": 0.6979166865348816, "step": 49 }, { "clip_ratio": 0.0, - "completion_length": 412.0208435058594, + "completion_length": 497.22918701171875, "epoch": 0.05, - "grad_norm": 1.8318948101423083, - "kl": 0.00035858154296875, + "grad_norm": 2.4330465227691427, + "kl": 0.0083160400390625, "learning_rate": 5e-07, - "loss": 0.0307, - "reward": 1.3958333730697632, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5, + "loss": -0.0264, + "reward": 0.7083333432674408, + "reward_std": 0.19867133349180222, + "rewards/tag_count_reward": 0.7083333432674408, "step": 50 }, { "clip_ratio": 0.0, - "completion_length": 407.2083435058594, + "completion_length": 439.3125, "epoch": 0.051, - "grad_norm": 2.7256697070031546, - "kl": 0.0003833770751953125, + "grad_norm": 2.8050886122578973, + "kl": 0.0108795166015625, "learning_rate": 5.1e-07, - "loss": 0.0402, - "reward": 1.3541667461395264, - "reward_std": 0.346695140004158, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": 0.0674, + "reward": 0.7968750298023224, + "reward_std": 0.22688449919223785, + "rewards/tag_count_reward": 0.7968750298023224, "step": 51 }, { "clip_ratio": 0.0, - "completion_length": 487.3958435058594, + "completion_length": 581.4375305175781, "epoch": 0.052, - "grad_norm": 2.877350462247381, - "kl": 0.00042819976806640625, + "grad_norm": 2.64701027690505, + "kl": 0.008026123046875, "learning_rate": 5.2e-07, - "loss": 0.0166, - "reward": 1.1770833730697632, - "reward_std": 0.42429319024086, - "rewards/accuracy_reward": 0.6250000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5520833432674408, + "loss": 0.1935, + "reward": 0.8125000298023224, + "reward_std": 0.2059411108493805, + "rewards/tag_count_reward": 0.8125000298023224, "step": 52 }, { "clip_ratio": 0.0, - "completion_length": 463.4166717529297, + "completion_length": 441.81251525878906, "epoch": 0.053, - "grad_norm": 3.0118686878979495, - "kl": 0.00055694580078125, + "grad_norm": 3.023119957597956, + "kl": 0.01275634765625, "learning_rate": 5.3e-07, - "loss": 0.0556, - "reward": 1.3072917461395264, - "reward_std": 0.3087276667356491, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": -0.0015, + "reward": 0.8072916865348816, + "reward_std": 0.2015344277024269, + "rewards/tag_count_reward": 0.8072916865348816, "step": 53 }, { "clip_ratio": 0.0, - "completion_length": 419.8958435058594, + "completion_length": 440.87501525878906, "epoch": 0.054, - "grad_norm": 2.5300886286528756, - "kl": 0.0006256103515625, + "grad_norm": 2.901413813399593, + "kl": 0.01080322265625, "learning_rate": 5.4e-07, - "loss": -0.1026, - "reward": 1.1822917461395264, - "reward_std": 0.3605181947350502, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5156250298023224, + "loss": 0.0923, + "reward": 0.8802083432674408, + "reward_std": 0.1856703907251358, + "rewards/tag_count_reward": 0.8802083432674408, "step": 54 }, { "clip_ratio": 0.0, - "completion_length": 401.3958435058594, + "completion_length": 430.16668701171875, "epoch": 0.055, - "grad_norm": 2.8124199366223817, - "kl": 0.0008087158203125, + "grad_norm": 3.111751227197174, + "kl": 0.016357421875, "learning_rate": 5.5e-07, - "loss": -0.0178, - "reward": 1.1302083730697632, - "reward_std": 0.3549676835536957, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5677083730697632, + "loss": 0.1086, + "reward": 0.890625, + "reward_std": 0.18489186465740204, + "rewards/tag_count_reward": 0.890625, "step": 55 }, { "clip_ratio": 0.0, - "completion_length": 404.25001525878906, + "completion_length": 427.3958435058594, "epoch": 0.056, - "grad_norm": 2.577595024653426, - "kl": 0.0007476806640625, + "grad_norm": 2.8553539008783244, + "kl": 0.0155029296875, "learning_rate": 5.6e-07, - "loss": -0.1183, - "reward": 1.5208333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5208333432674408, + "loss": 0.1519, + "reward": 0.9166666865348816, + "reward_std": 0.1548745259642601, + "rewards/tag_count_reward": 0.9166666865348816, "step": 56 }, { "clip_ratio": 0.0, - "completion_length": 413.18751525878906, + "completion_length": 437.8125, "epoch": 0.057, - "grad_norm": 2.730423719759646, - "kl": 0.001117706298828125, + "grad_norm": 2.925778803233364, + "kl": 0.0120849609375, "learning_rate": 5.699999999999999e-07, - "loss": -0.0169, - "reward": 1.0052083730697632, - "reward_std": 0.318894624710083, - "rewards/accuracy_reward": 0.4791666679084301, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5260416865348816, + "loss": 0.1911, + "reward": 0.9479166865348816, + "reward_std": 0.09902853146195412, + "rewards/tag_count_reward": 0.9479166865348816, "step": 57 }, { "clip_ratio": 0.0, - "completion_length": 484.2708435058594, + "completion_length": 404.25, "epoch": 0.058, - "grad_norm": 2.9609179333018605, - "kl": 0.0012359619140625, + "grad_norm": 1.6735431769305273, + "kl": 0.0142822265625, "learning_rate": 5.8e-07, - "loss": 0.0008, - "reward": 1.3333333730697632, - "reward_std": 0.46193382143974304, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5625000298023224, + "loss": 0.0434, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 58 }, { "clip_ratio": 0.0, - "completion_length": 457.0833435058594, + "completion_length": 412.9791717529297, "epoch": 0.059, - "grad_norm": 2.6071090350630057, - "kl": 0.001129150390625, + "grad_norm": 1.7942238123951728, + "kl": 0.0159912109375, "learning_rate": 5.9e-07, - "loss": -0.0464, - "reward": 0.8437500298023224, - "reward_std": 0.37812960147857666, - "rewards/accuracy_reward": 0.2916666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5520833730697632, + "loss": -0.0323, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 59 }, { "clip_ratio": 0.0, - "completion_length": 490.7708435058594, + "completion_length": 427.25001525878906, "epoch": 0.06, - "grad_norm": 2.5629387063773934, - "kl": 0.001255035400390625, + "grad_norm": 2.3794770067519453, + "kl": 0.009979248046875, "learning_rate": 6e-07, - "loss": -0.0507, - "reward": 1.1875000596046448, - "reward_std": 0.3230332285165787, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5416666865348816, + "loss": 0.0452, + "reward": 0.9791666865348816, + "reward_std": 0.060412485152482986, + "rewards/tag_count_reward": 0.9791666865348816, "step": 60 }, { "clip_ratio": 0.0, - "completion_length": 445.00001525878906, + "completion_length": 454.4583435058594, "epoch": 0.061, - "grad_norm": 2.8298683120350017, - "kl": 0.001373291015625, + "grad_norm": 1.8579560006684275, + "kl": 0.0142822265625, "learning_rate": 6.1e-07, - "loss": 0.0768, - "reward": 1.4062500596046448, - "reward_std": 0.3714013397693634, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5520833432674408, + "loss": 0.0847, + "reward": 0.9791666865348816, + "reward_std": 0.046308886259794235, + "rewards/tag_count_reward": 0.9791666865348816, "step": 61 }, { "clip_ratio": 0.0, - "completion_length": 427.4375, + "completion_length": 400.6041717529297, "epoch": 0.062, - "grad_norm": 3.2356835438310516, - "kl": 0.00164031982421875, + "grad_norm": 0.10582058294459042, + "kl": 0.009918212890625, "learning_rate": 6.2e-07, - "loss": 0.0838, - "reward": 1.4635417461395264, - "reward_std": 0.3229318708181381, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5677083730697632, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 62 }, { "clip_ratio": 0.0, - "completion_length": 401.68751525878906, + "completion_length": 397.8333435058594, "epoch": 0.063, - "grad_norm": 2.701273118842741, - "kl": 0.00189208984375, + "grad_norm": 2.117355134501401, + "kl": 0.0135498046875, "learning_rate": 6.3e-07, - "loss": 0.0546, - "reward": 1.5677083730697632, - "reward_std": 0.24152729660272598, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6093750298023224, + "loss": 0.0914, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 63 }, { "clip_ratio": 0.0, - "completion_length": 381.31251525878906, + "completion_length": 366.4791717529297, "epoch": 0.064, - "grad_norm": 2.785220990507944, - "kl": 0.00252532958984375, + "grad_norm": 1.2394704841216948, + "kl": 0.009429931640625, "learning_rate": 6.4e-07, - "loss": 0.047, - "reward": 1.5364583730697632, - "reward_std": 0.19510139524936676, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5572916865348816, + "loss": 0.0065, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 64 }, { "clip_ratio": 0.0, - "completion_length": 396.2916717529297, + "completion_length": 396.7083435058594, "epoch": 0.065, - "grad_norm": 2.7334299455778894, - "kl": 0.00372314453125, + "grad_norm": 0.1021771380380073, + "kl": 0.01220703125, "learning_rate": 6.5e-07, - "loss": -0.027, - "reward": 1.3125000596046448, - "reward_std": 0.35766707360744476, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6041666865348816, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 65 }, { "clip_ratio": 0.0, - "completion_length": 413.5625, + "completion_length": 395.8333435058594, "epoch": 0.066, - "grad_norm": 2.90337412027861, - "kl": 0.00289154052734375, + "grad_norm": 0.08440259324542373, + "kl": 0.0106201171875, "learning_rate": 6.6e-07, - "loss": -0.0296, - "reward": 1.0520833730697632, - "reward_std": 0.28867512941360474, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.5520833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 66 }, { "clip_ratio": 0.0, - "completion_length": 509.1458435058594, + "completion_length": 413.60418701171875, "epoch": 0.067, - "grad_norm": 2.2622909600353247, - "kl": 0.00420379638671875, + "grad_norm": 0.1441362554545231, + "kl": 0.014984130859375, "learning_rate": 6.7e-07, - "loss": -0.095, - "reward": 1.2968750596046448, - "reward_std": 0.4231996387243271, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6302083432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 67 }, { "clip_ratio": 0.0, - "completion_length": 394.4166717529297, + "completion_length": 350.4583435058594, "epoch": 0.068, - "grad_norm": 2.8338086432398226, - "kl": 0.005706787109375, + "grad_norm": 0.11236301409587185, + "kl": 0.0130615234375, "learning_rate": 6.800000000000001e-07, - "loss": 0.0815, - "reward": 1.7135417461395264, - "reward_std": 0.2652830183506012, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7343750298023224, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 68 }, { "clip_ratio": 0.0, - "completion_length": 529.0416870117188, + "completion_length": 414.4583435058594, "epoch": 0.069, - "grad_norm": 2.348883283897237, - "kl": 0.00640869140625, + "grad_norm": 0.09782498202426738, + "kl": 0.010986328125, "learning_rate": 6.9e-07, - "loss": 0.0721, - "reward": 1.5520833730697632, - "reward_std": 0.39166125655174255, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7187500298023224, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 69 }, { "clip_ratio": 0.0, - "completion_length": 542.8958587646484, + "completion_length": 418.4791717529297, "epoch": 0.07, - "grad_norm": 2.6549281198785017, - "kl": 0.007904052734375, + "grad_norm": 0.09408510903169646, + "kl": 0.012115478515625, "learning_rate": 7e-07, - "loss": 0.0391, - "reward": 1.2604167461395264, - "reward_std": 0.42778633534908295, - "rewards/accuracy_reward": 0.5625000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6979166865348816, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 70 }, { "clip_ratio": 0.0, - "completion_length": 439.9791717529297, + "completion_length": 349.5833435058594, "epoch": 0.071, - "grad_norm": 2.7639237072885394, - "kl": 0.01141357421875, + "grad_norm": 0.1727501471584912, + "kl": 0.014862060546875, "learning_rate": 7.1e-07, - "loss": -0.0114, - "reward": 1.6302083730697632, - "reward_std": 0.28416091948747635, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6718750298023224, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 71 }, { "clip_ratio": 0.0, - "completion_length": 480.87501525878906, + "completion_length": 379.5208435058594, "epoch": 0.072, - "grad_norm": 2.8974277486768973, - "kl": 0.00982666015625, + "grad_norm": 0.14195377794463795, + "kl": 0.012359619140625, "learning_rate": 7.2e-07, - "loss": 0.1087, - "reward": 1.7552083730697632, - "reward_std": 0.29917779564857483, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8177083432674408, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 72 }, { "clip_ratio": 0.0, - "completion_length": 435.2291717529297, + "completion_length": 373.04168701171875, "epoch": 0.073, - "grad_norm": 2.410972251211118, - "kl": 0.01416015625, + "grad_norm": 0.08441331456062105, + "kl": 0.011932373046875, "learning_rate": 7.3e-07, - "loss": 0.0276, - "reward": 1.7708333730697632, - "reward_std": 0.2537258267402649, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8125, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 73 }, { "clip_ratio": 0.0, - "completion_length": 426.8333435058594, + "completion_length": 414.5625, "epoch": 0.074, - "grad_norm": 3.106522433918384, - "kl": 0.0172119140625, + "grad_norm": 1.1861509967700965, + "kl": 0.010528564453125, "learning_rate": 7.4e-07, - "loss": 0.1609, - "reward": 1.75, - "reward_std": 0.3627951741218567, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8333333432674408, + "loss": -0.0065, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 74 }, { "clip_ratio": 0.0, - "completion_length": 450.0208435058594, + "completion_length": 427.625, "epoch": 0.075, - "grad_norm": 3.153004661867398, - "kl": 0.014984130859375, + "grad_norm": 1.4796899488926614, + "kl": 0.013580322265625, "learning_rate": 7.5e-07, - "loss": 0.0772, - "reward": 1.6979167461395264, - "reward_std": 0.3426803648471832, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0271, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 75 }, { "clip_ratio": 0.0, - "completion_length": 378.93751525878906, + "completion_length": 371.50001525878906, "epoch": 0.076, - "grad_norm": 3.2859708947711757, - "kl": 0.02044677734375, + "grad_norm": 0.09787357397649554, + "kl": 0.010894775390625, "learning_rate": 7.599999999999999e-07, - "loss": 0.1016, - "reward": 1.9427083730697632, - "reward_std": 0.12557905912399292, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 76 }, { "clip_ratio": 0.0, - "completion_length": 562.4375305175781, + "completion_length": 490.9583435058594, "epoch": 0.077, - "grad_norm": 2.5628822205386146, - "kl": 0.017974853515625, + "grad_norm": 0.07637469397246856, + "kl": 0.0113525390625, "learning_rate": 7.699999999999999e-07, - "loss": 0.139, - "reward": 1.3489583730697632, - "reward_std": 0.4155941307544708, - "rewards/accuracy_reward": 0.4583333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 77 }, { "clip_ratio": 0.0, - "completion_length": 380.3333435058594, + "completion_length": 358.7708435058594, "epoch": 0.078, - "grad_norm": 3.3640871795164458, - "kl": 0.02288818359375, + "grad_norm": 1.6056766807978617, + "kl": 0.010772705078125, "learning_rate": 7.799999999999999e-07, - "loss": 0.085, - "reward": 1.7083333730697632, - "reward_std": 0.27923690527677536, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0164, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 78 }, { "clip_ratio": 0.0, - "completion_length": 366.43751525878906, + "completion_length": 367.625, "epoch": 0.079, - "grad_norm": 1.6857006933575684, - "kl": 0.0169677734375, + "grad_norm": 2.052310159087112, + "kl": 0.0205078125, "learning_rate": 7.9e-07, - "loss": -0.0096, - "reward": 1.8854167461395264, - "reward_std": 0.16481656581163406, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": -0.0571, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 79 }, { "clip_ratio": 0.0, - "completion_length": 419.81251525878906, + "completion_length": 392.25, "epoch": 0.08, - "grad_norm": 1.8369083930873726, - "kl": 0.01873779296875, + "grad_norm": 1.7445790059715878, + "kl": 0.01043701171875, "learning_rate": 8e-07, - "loss": -0.0024, - "reward": 1.8958333730697632, - "reward_std": 0.21037912368774414, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0415, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 80 }, { "clip_ratio": 0.0, - "completion_length": 326.7708435058594, + "completion_length": 403.1458435058594, "epoch": 0.081, - "grad_norm": 7.261870940189173, - "kl": 0.02569580078125, + "grad_norm": 0.08372535905276965, + "kl": 0.01226806640625, "learning_rate": 8.1e-07, - "loss": -0.0493, - "reward": 1.6093750596046448, - "reward_std": 0.22657930105924606, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083730697632, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 81 }, { "clip_ratio": 0.0, - "completion_length": 382.68751525878906, + "completion_length": 377.0208435058594, "epoch": 0.082, - "grad_norm": 1.6235926696398257, - "kl": 0.01806640625, + "grad_norm": 0.07293482460029307, + "kl": 0.0091552734375, "learning_rate": 8.199999999999999e-07, - "loss": 0.0113, - "reward": 1.7187500596046448, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 82 }, { "clip_ratio": 0.0, - "completion_length": 372.5208435058594, + "completion_length": 376.7708435058594, "epoch": 0.083, - "grad_norm": 0.18552224430097738, - "kl": 0.0159912109375, + "grad_norm": 0.08117440047953302, + "kl": 0.010223388671875, "learning_rate": 8.299999999999999e-07, - "loss": 0.0006, - "reward": 1.75, + "loss": 0.0004, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 83 }, { "clip_ratio": 0.0, - "completion_length": 399.1458435058594, + "completion_length": 450.75001525878906, "epoch": 0.084, - "grad_norm": 1.4228905296688426, - "kl": 0.01898193359375, + "grad_norm": 1.3785091767073692, + "kl": 0.01312255859375, "learning_rate": 8.399999999999999e-07, - "loss": 0.0013, - "reward": 1.8541666865348816, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1838, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 84 }, { "clip_ratio": 0.0, - "completion_length": 437.9166717529297, + "completion_length": 378.93751525878906, "epoch": 0.085, - "grad_norm": 2.387879794360072, - "kl": 0.020111083984375, + "grad_norm": 0.08138495566859796, + "kl": 0.01055908203125, "learning_rate": 8.499999999999999e-07, - "loss": 0.2284, - "reward": 1.9479166865348816, - "reward_std": 0.13339676335453987, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 85 }, { "clip_ratio": 0.0, - "completion_length": 421.93751525878906, + "completion_length": 414.0833435058594, "epoch": 0.086, - "grad_norm": 2.2410324569487385, - "kl": 0.0184326171875, + "grad_norm": 0.07696676022736741, + "kl": 0.010498046875, "learning_rate": 8.599999999999999e-07, - "loss": 0.0154, - "reward": 1.6614583730697632, - "reward_std": 0.1835213154554367, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 86 }, { "clip_ratio": 0.0, - "completion_length": 406.5208435058594, + "completion_length": 407.8958435058594, "epoch": 0.087, - "grad_norm": 1.217180883926671, - "kl": 0.01629638671875, + "grad_norm": 0.07467232706116313, + "kl": 0.009857177734375, "learning_rate": 8.699999999999999e-07, - "loss": 0.0065, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 87 }, { "clip_ratio": 0.0, - "completion_length": 468.60418701171875, + "completion_length": 506.3333435058594, "epoch": 0.088, - "grad_norm": 2.593549923551898, - "kl": 0.02294921875, + "grad_norm": 1.2191050233252494, + "kl": 0.013946533203125, "learning_rate": 8.799999999999999e-07, - "loss": 0.0167, - "reward": 1.7916666865348816, - "reward_std": 0.32399244606494904, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, - "step": 88 - }, + "loss": 0.0024, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, + "step": 88 + }, { "clip_ratio": 0.0, - "completion_length": 371.9583435058594, + "completion_length": 430.9583435058594, "epoch": 0.089, - "grad_norm": 1.9770861272965166, - "kl": 0.02215576171875, + "grad_norm": 0.08723707900524888, + "kl": 0.010101318359375, "learning_rate": 8.9e-07, - "loss": 0.0614, - "reward": 1.8645833730697632, - "reward_std": 0.16664262861013412, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 89 }, { "clip_ratio": 0.0, - "completion_length": 409.87501525878906, + "completion_length": 448.7083435058594, "epoch": 0.09, - "grad_norm": 2.783530222413004, - "kl": 0.0206298828125, + "grad_norm": 0.07242099149329641, + "kl": 0.01177978515625, "learning_rate": 9e-07, - "loss": -0.0439, - "reward": 1.4114583730697632, - "reward_std": 0.31871186196804047, - "rewards/accuracy_reward": 0.416666679084301, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 90 }, { "clip_ratio": 0.0, - "completion_length": 403.04168701171875, + "completion_length": 474.00001525878906, "epoch": 0.091, - "grad_norm": 1.5407455230215508, - "kl": 0.01837158203125, + "grad_norm": 0.06937379595279321, + "kl": 0.01031494140625, "learning_rate": 9.1e-07, - "loss": 0.016, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 91 }, { "clip_ratio": 0.0, - "completion_length": 502.7083435058594, + "completion_length": 444.4583435058594, "epoch": 0.092, - "grad_norm": 2.0382323917391596, - "kl": 0.0185546875, + "grad_norm": 0.06533761500302811, + "kl": 0.0113525390625, "learning_rate": 9.2e-07, - "loss": 0.0754, - "reward": 1.7187500596046448, - "reward_std": 0.3716070353984833, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 92 }, { "clip_ratio": 0.0, - "completion_length": 440.0208435058594, + "completion_length": 429.7291717529297, "epoch": 0.093, - "grad_norm": 1.3899528720974876, - "kl": 0.0194091796875, + "grad_norm": 0.06768204712004344, + "kl": 0.0106201171875, "learning_rate": 9.3e-07, - "loss": 0.0246, - "reward": 1.6875, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 93 }, { "clip_ratio": 0.0, - "completion_length": 480.375, + "completion_length": 488.1875305175781, "epoch": 0.094, - "grad_norm": 0.9633234213043935, - "kl": 0.02008056640625, + "grad_norm": 0.07154676649029688, + "kl": 0.010498046875, "learning_rate": 9.399999999999999e-07, - "loss": -0.0384, - "reward": 1.9375, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 94 }, { "clip_ratio": 0.0, - "completion_length": 427.8541717529297, + "completion_length": 453.62501525878906, "epoch": 0.095, - "grad_norm": 2.8878107305036838, - "kl": 0.0238037109375, + "grad_norm": 0.07456151743945785, + "kl": 0.011993408203125, "learning_rate": 9.499999999999999e-07, - "loss": 0.0205, - "reward": 1.6875, - "reward_std": 0.31381870806217194, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 95 }, { "clip_ratio": 0.0, - "completion_length": 508.04168701171875, + "completion_length": 489.3958435058594, "epoch": 0.096, - "grad_norm": 2.508355402096065, - "kl": 0.02386474609375, + "grad_norm": 0.066160922726964, + "kl": 0.011871337890625, "learning_rate": 9.6e-07, - "loss": 0.0262, - "reward": 1.6145833730697632, - "reward_std": 0.378067746758461, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 96 }, { "clip_ratio": 0.0, - "completion_length": 461.9166717529297, + "completion_length": 474.37501525878906, "epoch": 0.097, - "grad_norm": 1.907623551742255, - "kl": 0.01898193359375, + "grad_norm": 0.06491250867856747, + "kl": 0.009765625, "learning_rate": 9.7e-07, - "loss": 0.0308, - "reward": 1.8750000596046448, - "reward_std": 0.22040386497974396, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 97 }, { "clip_ratio": 0.0, - "completion_length": 414.5833435058594, + "completion_length": 428.375, "epoch": 0.098, - "grad_norm": 1.6880791728540803, - "kl": 0.02008056640625, + "grad_norm": 0.06720556796337879, + "kl": 0.010955810546875, "learning_rate": 9.8e-07, - "loss": 0.0412, - "reward": 1.6875, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 98 }, { "clip_ratio": 0.0, - "completion_length": 505.6458435058594, + "completion_length": 517.2708435058594, "epoch": 0.099, - "grad_norm": 1.9770671054334987, - "kl": 0.022705078125, + "grad_norm": 0.07566623409050405, + "kl": 0.0106201171875, "learning_rate": 9.9e-07, - "loss": 0.0379, - "reward": 1.7291666865348816, - "reward_std": 0.21650634706020355, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 99 }, { "clip_ratio": 0.0, - "completion_length": 424.9583435058594, + "completion_length": 438.3541717529297, "epoch": 0.1, - "grad_norm": 1.8906028255517844, - "kl": 0.0213623046875, + "grad_norm": 0.05388612872443928, + "kl": 0.010894775390625, "learning_rate": 1e-06, - "loss": 0.0106, - "reward": 1.9166666865348816, - "reward_std": 0.19462472200393677, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 100 }, { "clip_ratio": 0.0, - "completion_length": 421.60418701171875, + "completion_length": 486.4583435058594, "epoch": 0.101, - "grad_norm": 1.284386257665641, - "kl": 0.02587890625, + "grad_norm": 0.07044171391846912, + "kl": 0.0115966796875, "learning_rate": 9.999972584460056e-07, - "loss": 0.0158, - "reward": 1.8958333730697632, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 101 }, { "clip_ratio": 0.0, - "completion_length": 424.5833435058594, + "completion_length": 439.5833435058594, "epoch": 0.102, - "grad_norm": 1.942495880486948, - "kl": 0.028076171875, + "grad_norm": 0.059048029504382746, + "kl": 0.00982666015625, "learning_rate": 9.999890338174275e-07, - "loss": 0.0074, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 102 }, { "clip_ratio": 0.0, - "completion_length": 431.56251525878906, + "completion_length": 459.8333435058594, "epoch": 0.103, - "grad_norm": 1.8427160999583247, - "kl": 0.0279541015625, + "grad_norm": 0.09184309136650912, + "kl": 0.01324462890625, "learning_rate": 9.999753262144804e-07, - "loss": 0.0017, - "reward": 1.8854167461395264, - "reward_std": 0.20985443890094757, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 103 }, { "clip_ratio": 0.0, - "completion_length": 433.2916717529297, + "completion_length": 454.4583435058594, "epoch": 0.104, - "grad_norm": 1.9574788671578813, - "kl": 0.02099609375, + "grad_norm": 1.231549772721057, + "kl": 0.0106201171875, "learning_rate": 9.999561358041868e-07, - "loss": 0.0284, - "reward": 1.7916666865348816, - "reward_std": 0.22040385007858276, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0137, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 104 }, { "clip_ratio": 0.0, - "completion_length": 506.2500305175781, + "completion_length": 531.8541870117188, "epoch": 0.105, - "grad_norm": 1.8385105848779988, - "kl": 0.03076171875, + "grad_norm": 0.04630714212471174, + "kl": 0.009552001953125, "learning_rate": 9.99931462820376e-07, - "loss": 0.0522, - "reward": 1.9062500596046448, - "reward_std": 0.18471086025238037, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 105 }, { "clip_ratio": 0.0, - "completion_length": 358.375, + "completion_length": 469.12501525878906, "epoch": 0.106, - "grad_norm": 2.0869957030027124, - "kl": 0.03564453125, + "grad_norm": 0.040916036564967215, + "kl": 0.009063720703125, "learning_rate": 9.999013075636804e-07, - "loss": -0.0253, - "reward": 1.6927083730697632, - "reward_std": 0.0819607600569725, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 106 }, { "clip_ratio": 0.0, - "completion_length": 451.5833435058594, + "completion_length": 449.50001525878906, "epoch": 0.107, - "grad_norm": 1.7214935847958999, - "kl": 0.02197265625, + "grad_norm": 0.03666755496102716, + "kl": 0.0082855224609375, "learning_rate": 9.998656704015323e-07, - "loss": 0.0274, - "reward": 1.9583333730697632, - "reward_std": 0.12082496285438538, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 107 }, { "clip_ratio": 0.0, - "completion_length": 424.7708435058594, + "completion_length": 497.33335876464844, "epoch": 0.108, - "grad_norm": 2.9526026618703662, - "kl": 0.02996826171875, + "grad_norm": 0.06750587223762, + "kl": 0.00872802734375, "learning_rate": 9.998245517681593e-07, - "loss": -0.0762, - "reward": 1.5416666865348816, - "reward_std": 0.19702786207199097, - "rewards/accuracy_reward": 0.5833333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 108 }, { "clip_ratio": 0.0, - "completion_length": 416.4583435058594, + "completion_length": 466.93751525878906, "epoch": 0.109, - "grad_norm": 1.3140882663370017, - "kl": 0.0238037109375, + "grad_norm": 0.07079864707100918, + "kl": 0.009246826171875, "learning_rate": 9.997779521645791e-07, - "loss": 0.0102, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 109 }, { "clip_ratio": 0.0, - "completion_length": 493.9791717529297, + "completion_length": 475.8958435058594, "epoch": 0.11, - "grad_norm": 1.513545322724992, - "kl": 0.02520751953125, + "grad_norm": 1.0674719403023372, + "kl": 0.015533447265625, "learning_rate": 9.997258721585931e-07, - "loss": 0.011, - "reward": 1.625, - "reward_std": 0.22040385007858276, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0139, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 110 }, { "clip_ratio": 0.0, - "completion_length": 453.3958435058594, + "completion_length": 440.1875, "epoch": 0.111, - "grad_norm": 2.3692930782569643, - "kl": 0.03082275390625, + "grad_norm": 0.03989195456389211, + "kl": 0.009490966796875, "learning_rate": 9.996683123847795e-07, - "loss": 0.0319, - "reward": 1.78125, - "reward_std": 0.372411385178566, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 111 }, { "clip_ratio": 0.0, - "completion_length": 412.87501525878906, + "completion_length": 442.0833435058594, "epoch": 0.112, - "grad_norm": 2.6740403739253766, - "kl": 0.06787109375, + "grad_norm": 0.04589218813730981, + "kl": 0.0080108642578125, "learning_rate": 9.996052735444862e-07, - "loss": 0.0151, - "reward": 1.7239583730697632, - "reward_std": 0.06669837608933449, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 112 }, { "clip_ratio": 0.0, - "completion_length": 527.5, + "completion_length": 529.1250152587891, "epoch": 0.113, - "grad_norm": 2.3302793585121684, - "kl": 0.02557373046875, + "grad_norm": 0.054086545297594345, + "kl": 0.0091552734375, "learning_rate": 9.995367564058216e-07, - "loss": 0.0383, - "reward": 1.7500000596046448, - "reward_std": 0.364768885076046, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 113 }, { "clip_ratio": 0.0, - "completion_length": 478.6875, + "completion_length": 445.39585876464844, "epoch": 0.114, - "grad_norm": 1.974509624510436, - "kl": 0.02410888671875, + "grad_norm": 1.209655996583225, + "kl": 0.011749267578125, "learning_rate": 9.994627618036452e-07, - "loss": 0.0208, - "reward": 1.8854166865348816, - "reward_std": 0.21218694001436234, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": -0.002, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 114 }, { "clip_ratio": 0.0, - "completion_length": 428.75, + "completion_length": 452.95835876464844, "epoch": 0.115, - "grad_norm": 1.8395033635729268, - "kl": 0.02496337890625, + "grad_norm": 0.039887545798762906, + "kl": 0.0077362060546875, "learning_rate": 9.993832906395582e-07, - "loss": -0.0052, - "reward": 1.9270833730697632, - "reward_std": 0.13579988479614258, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 115 }, { "clip_ratio": 0.0, - "completion_length": 517.9791717529297, + "completion_length": 491.8541717529297, "epoch": 0.116, - "grad_norm": 2.573887435400486, - "kl": 0.0250244140625, + "grad_norm": 0.05109503294914965, + "kl": 0.00970458984375, "learning_rate": 9.992983438818915e-07, - "loss": -0.0187, - "reward": 1.3750000596046448, - "reward_std": 0.3927238881587982, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 116 }, { "clip_ratio": 0.0, - "completion_length": 473.5208435058594, + "completion_length": 450.25, "epoch": 0.117, - "grad_norm": 1.8895271814490733, - "kl": 0.02532958984375, + "grad_norm": 0.050184412277555976, + "kl": 0.0084075927734375, "learning_rate": 9.992079225656944e-07, - "loss": -0.0106, - "reward": 1.7760416865348816, - "reward_std": 0.15143894776701927, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 117 }, { "clip_ratio": 0.0, - "completion_length": 451.06251525878906, + "completion_length": 464.25001525878906, "epoch": 0.118, - "grad_norm": 1.7946350638021566, - "kl": 0.02410888671875, + "grad_norm": 0.04985779343703463, + "kl": 0.009490966796875, "learning_rate": 9.991120277927223e-07, - "loss": -0.0111, - "reward": 1.8072916865348816, - "reward_std": 0.13110895082354546, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 118 }, { "clip_ratio": 0.0, - "completion_length": 463.81251525878906, + "completion_length": 461.81251525878906, "epoch": 0.119, - "grad_norm": 1.1404926719023871, - "kl": 0.02349853515625, + "grad_norm": 0.07227541582028604, + "kl": 0.0108642578125, "learning_rate": 9.990106607314225e-07, - "loss": -0.0086, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 119 }, { "clip_ratio": 0.0, - "completion_length": 593.0625, + "completion_length": 508.33335876464844, "epoch": 0.12, - "grad_norm": 2.0093306234495154, - "kl": 0.02532958984375, + "grad_norm": 0.042175346801523864, + "kl": 0.00927734375, "learning_rate": 9.989038226169207e-07, - "loss": 0.0011, - "reward": 1.59375, - "reward_std": 0.3443092256784439, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 120 }, { "clip_ratio": 0.0, - "completion_length": 507.8125, + "completion_length": 430.62501525878906, "epoch": 0.121, - "grad_norm": 1.7892482170583524, - "kl": 0.0277099609375, + "grad_norm": 0.04214803283818787, + "kl": 0.008148193359375, "learning_rate": 9.98791514751006e-07, - "loss": 0.1722, - "reward": 1.6927083730697632, - "reward_std": 0.12497352808713913, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 121 }, { "clip_ratio": 0.0, - "completion_length": 449.5, + "completion_length": 409.3333435058594, "epoch": 0.122, - "grad_norm": 1.722048989465319, - "kl": 0.030029296875, + "grad_norm": 0.047501096439942436, + "kl": 0.008209228515625, "learning_rate": 9.98673738502114e-07, - "loss": -0.023, - "reward": 1.7968750596046448, - "reward_std": 0.21180957555770874, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 122 }, { "clip_ratio": 0.0, - "completion_length": 469.625, + "completion_length": 454.2708435058594, "epoch": 0.123, - "grad_norm": 1.7255792854875613, - "kl": 0.0279541015625, + "grad_norm": 0.07310038780733595, + "kl": 0.0118408203125, "learning_rate": 9.985504953053113e-07, - "loss": 0.0389, - "reward": 1.8802083730697632, - "reward_std": 0.1636047102510929, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 123 }, { "clip_ratio": 0.0, - "completion_length": 457.5833435058594, + "completion_length": 458.8958435058594, "epoch": 0.124, - "grad_norm": 1.6902228262037184, - "kl": 0.02752685546875, + "grad_norm": 0.07502052646036045, + "kl": 0.008758544921875, "learning_rate": 9.98421786662277e-07, - "loss": -0.0151, - "reward": 1.8645833730697632, - "reward_std": 0.16664262861013412, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 124 }, { "clip_ratio": 0.0, - "completion_length": 542.3333587646484, + "completion_length": 482.50001525878906, "epoch": 0.125, - "grad_norm": 1.9146301637942638, - "kl": 0.0269775390625, + "grad_norm": 0.04546364063861285, + "kl": 0.0083770751953125, "learning_rate": 9.982876141412855e-07, - "loss": -0.0007, - "reward": 1.4166667461395264, - "reward_std": 0.3345904052257538, - "rewards/accuracy_reward": 0.4375000223517418, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 125 }, { "clip_ratio": 0.0, - "completion_length": 491.0416717529297, + "completion_length": 442.9166717529297, "epoch": 0.126, - "grad_norm": 1.712921047727268, - "kl": 0.02764892578125, + "grad_norm": 0.03944185524719935, + "kl": 0.007354736328125, "learning_rate": 9.981479793771866e-07, - "loss": 0.0744, - "reward": 1.7760416865348816, - "reward_std": 0.2057519406080246, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 126 }, { "clip_ratio": 0.0, - "completion_length": 546.5416870117188, + "completion_length": 489.1458435058594, "epoch": 0.127, - "grad_norm": 1.7859916303470895, - "kl": 0.034423828125, + "grad_norm": 0.07225300807985488, + "kl": 0.013092041015625, "learning_rate": 9.98002884071386e-07, - "loss": 0.0464, - "reward": 1.6979167461395264, - "reward_std": 0.2689327597618103, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 127 }, { "clip_ratio": 0.0, - "completion_length": 503.6250305175781, + "completion_length": 484.6875, "epoch": 0.128, - "grad_norm": 2.073269318929038, - "kl": 0.03033447265625, + "grad_norm": 0.11451764875833652, + "kl": 0.010528564453125, "learning_rate": 9.97852329991824e-07, - "loss": 0.0246, - "reward": 1.75, - "reward_std": 0.35104452073574066, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 128 }, { "clip_ratio": 0.0, - "completion_length": 544.0833435058594, + "completion_length": 476.3958435058594, "epoch": 0.129, - "grad_norm": 1.6460538532689097, - "kl": 0.0306396484375, + "grad_norm": 0.039811807244108155, + "kl": 0.008453369140625, "learning_rate": 9.976963189729547e-07, - "loss": 0.0949, - "reward": 1.5833333730697632, - "reward_std": 0.24179892987012863, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 129 }, { "clip_ratio": 0.0, - "completion_length": 580.8541870117188, + "completion_length": 446.3541717529297, "epoch": 0.13, - "grad_norm": 2.5401378528693264, - "kl": 0.04150390625, + "grad_norm": 0.5592846499570234, + "kl": 0.012420654296875, "learning_rate": 9.975348529157229e-07, - "loss": -0.0528, - "reward": 1.3958333730697632, - "reward_std": 0.4156641513109207, - "rewards/accuracy_reward": 0.3958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 130 }, { "clip_ratio": 0.0, - "completion_length": 531.7708435058594, + "completion_length": 517.3125, "epoch": 0.131, - "grad_norm": 1.9573752772301032, - "kl": 0.03125, + "grad_norm": 0.05179490828304956, + "kl": 0.0108642578125, "learning_rate": 9.973679337875418e-07, - "loss": -0.0261, - "reward": 1.7708333730697632, - "reward_std": 0.34913603961467743, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 131 }, { "clip_ratio": 0.0, - "completion_length": 551.0416870117188, + "completion_length": 508.0416717529297, "epoch": 0.132, - "grad_norm": 1.6981949001451364, - "kl": 0.0321044921875, + "grad_norm": 0.05367694843444582, + "kl": 0.00897216796875, "learning_rate": 9.971955636222684e-07, - "loss": -0.0066, - "reward": 1.8697916865348816, - "reward_std": 0.2178531140089035, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 132 }, { "clip_ratio": 0.0, - "completion_length": 449.06251525878906, + "completion_length": 424.3333435058594, "epoch": 0.133, - "grad_norm": 1.2726542950563706, - "kl": 0.03021240234375, + "grad_norm": 0.06917189855894977, + "kl": 0.010284423828125, "learning_rate": 9.970177445201783e-07, - "loss": 0.0341, - "reward": 1.8958333730697632, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 133 }, { "clip_ratio": 0.0, - "completion_length": 470.64585876464844, + "completion_length": 471.66668701171875, "epoch": 0.134, - "grad_norm": 1.0712547778176098, - "kl": 0.0323486328125, + "grad_norm": 0.06043455789032839, + "kl": 0.0084075927734375, "learning_rate": 9.968344786479415e-07, - "loss": -0.0093, - "reward": 1.875, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 134 }, { "clip_ratio": 0.0, - "completion_length": 398.29168701171875, + "completion_length": 399.43751525878906, "epoch": 0.135, - "grad_norm": 1.2032782052397564, - "kl": 0.034912109375, + "grad_norm": 0.03641136487333736, + "kl": 0.0073699951171875, "learning_rate": 9.96645768238595e-07, - "loss": -0.007, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 135 }, { "clip_ratio": 0.0, - "completion_length": 427.4583435058594, + "completion_length": 451.12501525878906, "epoch": 0.136, - "grad_norm": 1.7734535890068437, - "kl": 0.0382080078125, + "grad_norm": 1.112101379858347, + "kl": 0.0093841552734375, "learning_rate": 9.964516155915151e-07, - "loss": 0.0159, - "reward": 1.7239583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.007, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 136 }, { "clip_ratio": 0.0, - "completion_length": 537.0833740234375, + "completion_length": 529.8333435058594, "epoch": 0.137, - "grad_norm": 2.3975298518599217, - "kl": 0.0361328125, + "grad_norm": 0.03110825958701824, + "kl": 0.008819580078125, "learning_rate": 9.962520230723906e-07, - "loss": 0.0452, - "reward": 1.6458333730697632, - "reward_std": 0.3395978510379791, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 137 }, { "clip_ratio": 0.0, - "completion_length": 546.6250305175781, + "completion_length": 481.7708435058594, "epoch": 0.138, - "grad_norm": 1.7081749467887004, - "kl": 0.0362548828125, + "grad_norm": 0.03882544215511118, + "kl": 0.009429931640625, "learning_rate": 9.960469931131936e-07, - "loss": 0.0188, - "reward": 1.625, - "reward_std": 0.22613351047039032, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 138 }, { "clip_ratio": 0.0, - "completion_length": 493.75, + "completion_length": 446.60418701171875, "epoch": 0.139, - "grad_norm": 2.0941768595706978, - "kl": 0.04107666015625, + "grad_norm": 0.03389657927366085, + "kl": 0.0071258544921875, "learning_rate": 9.958365282121496e-07, - "loss": -0.0722, - "reward": 1.7395833730697632, - "reward_std": 0.18042195588350296, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 139 }, { "clip_ratio": 0.0, - "completion_length": 544.0416870117188, + "completion_length": 458.6458435058594, "epoch": 0.14, - "grad_norm": 4.383530283163599, - "kl": 0.0750732421875, + "grad_norm": 0.03482197048060803, + "kl": 0.0071563720703125, "learning_rate": 9.956206309337066e-07, - "loss": -0.0355, - "reward": 1.8333333730697632, - "reward_std": 0.22787059843540192, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 140 }, { "clip_ratio": 0.0, - "completion_length": 487.7916717529297, + "completion_length": 509.25001525878906, "epoch": 0.141, - "grad_norm": 1.8675888286114841, - "kl": 0.0362548828125, + "grad_norm": 0.033669621174171584, + "kl": 0.0080108642578125, "learning_rate": 9.953993039085048e-07, - "loss": 0.0193, - "reward": 1.9739583730697632, - "reward_std": 0.09021097794175148, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 141 }, { "clip_ratio": 0.0, - "completion_length": 528.7500152587891, + "completion_length": 473.3958435058594, "epoch": 0.142, - "grad_norm": 1.7762889556678398, - "kl": 0.0345458984375, + "grad_norm": 1.3199912436596453, + "kl": 0.0085601806640625, "learning_rate": 9.951725498333448e-07, - "loss": 0.0152, - "reward": 1.6250000596046448, - "reward_std": 0.22040386497974396, - "rewards/accuracy_reward": 0.6250000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.001, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 142 }, { "clip_ratio": 0.0, - "completion_length": 418.5416717529297, + "completion_length": 414.9375, "epoch": 0.143, - "grad_norm": 0.11857781401423817, - "kl": 0.03192138671875, + "grad_norm": 0.0685302014271952, + "kl": 0.010650634765625, "learning_rate": 9.949403714711526e-07, - "loss": 0.0013, - "reward": 2.0, + "loss": 0.0004, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 143 }, { "clip_ratio": 0.0, - "completion_length": 461.41668701171875, + "completion_length": 442.37501525878906, "epoch": 0.144, - "grad_norm": 1.4523826657220977, - "kl": 0.0328369140625, + "grad_norm": 0.043254836515515906, + "kl": 0.0075225830078125, "learning_rate": 9.947027716509488e-07, - "loss": 0.0235, - "reward": 1.9583333730697632, - "reward_std": 0.09731237590312958, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 144 }, { "clip_ratio": 0.0, - "completion_length": 475.16668701171875, + "completion_length": 447.2708435058594, "epoch": 0.145, - "grad_norm": 1.6956389235449896, - "kl": 0.0345458984375, + "grad_norm": 0.034540457224639255, + "kl": 0.008514404296875, "learning_rate": 9.944597532678119e-07, - "loss": 0.0218, - "reward": 1.8333333730697632, - "reward_std": 0.24179892987012863, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 145 }, { "clip_ratio": 0.0, - "completion_length": 528.1041870117188, + "completion_length": 488.9583435058594, "epoch": 0.146, - "grad_norm": 2.0562109460663685, - "kl": 0.034912109375, + "grad_norm": 0.032769485521784394, + "kl": 0.008148193359375, "learning_rate": 9.942113192828444e-07, - "loss": -0.0068, - "reward": 1.4583333730697632, - "reward_std": 0.24164992570877075, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 146 }, { "clip_ratio": 0.0, - "completion_length": 528.7083587646484, + "completion_length": 469.9375, "epoch": 0.147, - "grad_norm": 1.690782703279331, - "kl": 0.03515625, + "grad_norm": 1.340172920356849, + "kl": 0.0115966796875, "learning_rate": 9.939574727231362e-07, - "loss": -0.049, - "reward": 1.9062500596046448, - "reward_std": 0.2414991855621338, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0266, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 147 }, { "clip_ratio": 0.0, - "completion_length": 481.18751525878906, + "completion_length": 476.8333435058594, "epoch": 0.148, - "grad_norm": 1.585327369732611, - "kl": 0.035400390625, + "grad_norm": 0.033459278580343914, + "kl": 0.008331298828125, "learning_rate": 9.93698216681727e-07, - "loss": -0.0056, - "reward": 1.8489583730697632, - "reward_std": 0.2344440296292305, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 148 }, { "clip_ratio": 0.0, - "completion_length": 464.3333435058594, + "completion_length": 456.75001525878906, "epoch": 0.149, - "grad_norm": 1.9819641473591838, - "kl": 0.036865234375, + "grad_norm": 0.039450475594316296, + "kl": 0.0071868896484375, "learning_rate": 9.934335543175705e-07, - "loss": 0.0947, - "reward": 1.9479166865348816, - "reward_std": 0.12164045870304108, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 149 }, { "clip_ratio": 0.0, - "completion_length": 440.8333435058594, + "completion_length": 470.37501525878906, "epoch": 0.15, - "grad_norm": 0.12285313912155071, - "kl": 0.0360107421875, + "grad_norm": 0.031938722125296505, + "kl": 0.0070953369140625, "learning_rate": 9.931634888554935e-07, - "loss": 0.0014, - "reward": 1.75, + "loss": 0.0003, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 150 }, { "clip_ratio": 0.0, - "completion_length": 488.7916717529297, + "completion_length": 430.47918701171875, "epoch": 0.151, - "grad_norm": 1.063195499891589, - "kl": 0.0328369140625, + "grad_norm": 0.04371106287499349, + "kl": 0.0083160400390625, "learning_rate": 9.928880235861588e-07, - "loss": -0.0188, - "reward": 1.9114583730697632, - "reward_std": 0.12054072320461273, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 151 }, { "clip_ratio": 0.0, - "completion_length": 474.0625, + "completion_length": 426.2291717529297, "epoch": 0.152, - "grad_norm": 1.2466282783011426, - "kl": 0.0355224609375, + "grad_norm": 0.036907155124602, + "kl": 0.00762939453125, "learning_rate": 9.926071618660237e-07, - "loss": -0.0431, - "reward": 1.7916666865348816, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 152 }, { "clip_ratio": 0.0, - "completion_length": 535.8750305175781, + "completion_length": 481.1875, "epoch": 0.153, - "grad_norm": 1.938342796179097, - "kl": 0.0340576171875, + "grad_norm": 0.04160779790370388, + "kl": 0.007904052734375, "learning_rate": 9.923209071172994e-07, - "loss": -0.0078, - "reward": 1.7708333730697632, - "reward_std": 0.3139677047729492, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 153 }, { "clip_ratio": 0.0, - "completion_length": 455.62501525878906, + "completion_length": 464.1875, "epoch": 0.154, - "grad_norm": 1.4682973363511411, - "kl": 0.034912109375, + "grad_norm": 0.04461132836472903, + "kl": 0.00921630859375, "learning_rate": 9.9202926282791e-07, - "loss": -0.0231, - "reward": 1.8906250596046448, - "reward_std": 0.14677437022328377, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 154 }, { "clip_ratio": 0.0, - "completion_length": 522.0000305175781, + "completion_length": 495.375, "epoch": 0.155, - "grad_norm": 0.9308496613480906, - "kl": 0.03314208984375, + "grad_norm": 1.2860156647484209, + "kl": 0.007720947265625, "learning_rate": 9.917322325514487e-07, - "loss": -0.0261, - "reward": 1.8229166865348816, - "reward_std": 0.11254207789897919, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0091, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 155 }, { "clip_ratio": 0.0, - "completion_length": 391.1041717529297, + "completion_length": 394.7291717529297, "epoch": 0.156, - "grad_norm": 1.4470684501663114, - "kl": 0.03533935546875, + "grad_norm": 0.04035318480378971, + "kl": 0.0077362060546875, "learning_rate": 9.91429819907136e-07, - "loss": 0.0217, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 156 }, { "clip_ratio": 0.0, - "completion_length": 524.1875305175781, + "completion_length": 479.10418701171875, "epoch": 0.157, - "grad_norm": 1.1522579341401598, - "kl": 0.03466796875, + "grad_norm": 0.04387283645291692, + "kl": 0.0073699951171875, "learning_rate": 9.911220285797748e-07, - "loss": -0.0318, - "reward": 1.9375, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 157 }, { "clip_ratio": 0.0, - "completion_length": 538.6250305175781, + "completion_length": 448.9583435058594, "epoch": 0.158, - "grad_norm": 1.659266516264927, - "kl": 0.034423828125, + "grad_norm": 0.038691858484494475, + "kl": 0.00836181640625, "learning_rate": 9.908088623197048e-07, - "loss": 0.0514, - "reward": 1.7708333730697632, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 158 }, { "clip_ratio": 0.0, - "completion_length": 531.5000152587891, + "completion_length": 479.6458435058594, "epoch": 0.159, - "grad_norm": 0.9351784025513035, - "kl": 0.0303955078125, + "grad_norm": 0.03853129628943197, + "kl": 0.007293701171875, "learning_rate": 9.904903249427582e-07, - "loss": -0.022, - "reward": 1.875, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 159 }, { "clip_ratio": 0.0, - "completion_length": 445.85418701171875, + "completion_length": 406.56251525878906, "epoch": 0.16, - "grad_norm": 0.12766514968990195, - "kl": 0.0340576171875, + "grad_norm": 1.1612045082634144, + "kl": 0.0088653564453125, "learning_rate": 9.901664203302124e-07, - "loss": 0.0014, - "reward": 1.75, - "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0065, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 160 }, { "clip_ratio": 0.0, - "completion_length": 470.6458435058594, + "completion_length": 434.2291717529297, "epoch": 0.161, - "grad_norm": 1.2169597147006208, - "kl": 0.0350341796875, + "grad_norm": 0.04436998873448987, + "kl": 0.0077362060546875, "learning_rate": 9.89837152428743e-07, - "loss": -0.0296, - "reward": 1.8333333730697632, - "reward_std": 0.12309150397777557, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 161 }, { "clip_ratio": 0.0, - "completion_length": 515.7500305175781, + "completion_length": 447.8333435058594, "epoch": 0.162, - "grad_norm": 1.3834987531103262, - "kl": 0.0328369140625, + "grad_norm": 0.036244492000791544, + "kl": 0.0079345703125, "learning_rate": 9.895025252503755e-07, - "loss": 0.0062, - "reward": 1.6666667461395264, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 162 }, { "clip_ratio": 0.0, - "completion_length": 570.5625152587891, + "completion_length": 458.4583435058594, "epoch": 0.163, - "grad_norm": 1.5517641138594835, - "kl": 0.0333251953125, + "grad_norm": 0.04100771556606424, + "kl": 0.0080108642578125, "learning_rate": 9.891625428724364e-07, - "loss": 0.0166, - "reward": 1.7083333730697632, - "reward_std": 0.32399244606494904, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 163 }, { "clip_ratio": 0.0, - "completion_length": 562.7708435058594, + "completion_length": 449.8333435058594, "epoch": 0.164, - "grad_norm": 1.2232882841109314, - "kl": 0.03289794921875, + "grad_norm": 0.08359359800747004, + "kl": 0.0088653564453125, "learning_rate": 9.888172094375033e-07, - "loss": -0.0313, - "reward": 1.9375, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 164 }, { "clip_ratio": 0.0, - "completion_length": 607.3125, + "completion_length": 468.2708435058594, "epoch": 0.165, - "grad_norm": 1.4953067054465543, - "kl": 0.0352783203125, + "grad_norm": 0.04022627136874073, + "kl": 0.008453369140625, "learning_rate": 9.88466529153356e-07, - "loss": -0.0317, - "reward": 1.7812500596046448, - "reward_std": 0.17128896713256836, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 165 }, { "clip_ratio": 0.0, - "completion_length": 511.29168701171875, + "completion_length": 459.87501525878906, "epoch": 0.166, - "grad_norm": 2.0805601481445564, - "kl": 0.0374755859375, + "grad_norm": 0.04302669925116951, + "kl": 0.0085906982421875, "learning_rate": 9.881105062929221e-07, - "loss": 0.0403, - "reward": 1.7395833730697632, - "reward_std": 0.14433756098151207, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 166 }, { "clip_ratio": 0.0, - "completion_length": 535.0208740234375, + "completion_length": 473.68751525878906, "epoch": 0.167, - "grad_norm": 2.2606308866143006, - "kl": 0.03857421875, + "grad_norm": 0.04557294629797261, + "kl": 0.008331298828125, "learning_rate": 9.877491451942284e-07, - "loss": 0.0342, - "reward": 1.7604166865348816, - "reward_std": 0.4283548891544342, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 167 }, { "clip_ratio": 0.0, - "completion_length": 515.8750152587891, + "completion_length": 442.5416717529297, "epoch": 0.168, - "grad_norm": 1.3025627066487886, - "kl": 0.031005859375, + "grad_norm": 0.03483320947401022, + "kl": 0.0066680908203125, "learning_rate": 9.873824502603459e-07, - "loss": 0.0295, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 168 }, { "clip_ratio": 0.0, - "completion_length": 590.4583435058594, + "completion_length": 493.35418701171875, "epoch": 0.169, - "grad_norm": 2.2294112391243393, - "kl": 0.0333251953125, + "grad_norm": 1.2768670862840756, + "kl": 0.010406494140625, "learning_rate": 9.870104259593362e-07, - "loss": 0.0586, - "reward": 1.6875000596046448, - "reward_std": 0.31381870806217194, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0031, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 169 }, { "clip_ratio": 0.0, - "completion_length": 494.6875, + "completion_length": 460.5208435058594, "epoch": 0.17, - "grad_norm": 1.554508243963261, - "kl": 0.03131103515625, + "grad_norm": 0.03919333998759371, + "kl": 0.0078582763671875, "learning_rate": 9.866330768241983e-07, - "loss": 0.0263, - "reward": 1.8489583730697632, - "reward_std": 0.14677435159683228, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 170 }, { "clip_ratio": 0.0, - "completion_length": 639.3333435058594, + "completion_length": 444.7708435058594, "epoch": 0.171, - "grad_norm": 1.3098496643658544, - "kl": 0.0316162109375, + "grad_norm": 0.04710061499517116, + "kl": 0.00848388671875, "learning_rate": 9.862504074528126e-07, - "loss": 0.0017, - "reward": 1.75, - "reward_std": 0.22613351047039032, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 171 }, { "clip_ratio": 0.0, - "completion_length": 561.125, + "completion_length": 470.62501525878906, "epoch": 0.172, - "grad_norm": 1.7145836259546225, - "kl": 0.03125, + "grad_norm": 0.03832441075014794, + "kl": 0.00909423828125, "learning_rate": 9.85862422507884e-07, - "loss": 0.054, - "reward": 1.90625, - "reward_std": 0.20762187242507935, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 172 }, { "clip_ratio": 0.0, - "completion_length": 612.6666870117188, + "completion_length": 465.04168701171875, "epoch": 0.173, - "grad_norm": 2.006247979915205, - "kl": 0.032470703125, + "grad_norm": 0.04435668106278205, + "kl": 0.00775146484375, "learning_rate": 9.854691267168871e-07, - "loss": 0.0614, - "reward": 1.6093750596046448, - "reward_std": 0.23601797595620155, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 173 }, { "clip_ratio": 0.0, - "completion_length": 625.2083740234375, + "completion_length": 503.9791717529297, "epoch": 0.174, - "grad_norm": 1.850468417947257, - "kl": 0.036376953125, + "grad_norm": 0.036439831353889465, + "kl": 0.00848388671875, "learning_rate": 9.850705248720068e-07, - "loss": 0.0366, - "reward": 1.7447916865348816, - "reward_std": 0.3420346528291702, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 174 }, { "clip_ratio": 0.0, - "completion_length": 691.0000305175781, + "completion_length": 481.6875, "epoch": 0.175, - "grad_norm": 2.0889329131705137, - "kl": 0.0399169921875, + "grad_norm": 0.04549820373206854, + "kl": 0.008941650390625, "learning_rate": 9.846666218300807e-07, - "loss": 0.0253, - "reward": 1.4062500596046448, - "reward_std": 0.37911172211170197, - "rewards/accuracy_reward": 0.4375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 175 }, { "clip_ratio": 0.0, - "completion_length": 618.8958435058594, + "completion_length": 459.91668701171875, "epoch": 0.176, - "grad_norm": 1.4240098910462737, - "kl": 0.036865234375, + "grad_norm": 0.054447438694994736, + "kl": 0.0087890625, "learning_rate": 9.8425742251254e-07, - "loss": -0.0333, - "reward": 1.5833333730697632, - "reward_std": 0.24618300795555115, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 176 }, { "clip_ratio": 0.0, - "completion_length": 547.4166717529297, + "completion_length": 488.2708435058594, "epoch": 0.177, - "grad_norm": 1.3834162483113395, - "kl": 0.03466796875, + "grad_norm": 0.04069149781366238, + "kl": 0.0084075927734375, "learning_rate": 9.838429319053495e-07, - "loss": -0.0213, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 177 }, { "clip_ratio": 0.0, - "completion_length": 490.5, + "completion_length": 422.9583435058594, "epoch": 0.178, - "grad_norm": 0.11376178840815106, - "kl": 0.03277587890625, + "grad_norm": 0.05079796029083161, + "kl": 0.0077362060546875, "learning_rate": 9.83423155058946e-07, - "loss": 0.0013, - "reward": 2.0, + "loss": 0.0003, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 178 }, { "clip_ratio": 0.0, - "completion_length": 528.375, + "completion_length": 417.10418701171875, "epoch": 0.179, - "grad_norm": 1.9333499177389204, - "kl": 0.041015625, + "grad_norm": 1.61418018880557, + "kl": 0.0090484619140625, "learning_rate": 9.829980970881784e-07, - "loss": -0.0732, - "reward": 1.640625, - "reward_std": 0.36500225961208344, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0033, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 179 }, { "clip_ratio": 0.0, - "completion_length": 512.1666870117188, + "completion_length": 458.5625305175781, "epoch": 0.18, - "grad_norm": 1.164840130947205, - "kl": 0.0325927734375, + "grad_norm": 1.1252202210444988, + "kl": 0.007904052734375, "learning_rate": 9.825677631722435e-07, - "loss": 0.0117, - "reward": 1.625, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0051, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 180 }, { "clip_ratio": 0.0, - "completion_length": 534.1875152587891, + "completion_length": 454.0208435058594, "epoch": 0.181, - "grad_norm": 1.9288689053507775, - "kl": 0.03271484375, + "grad_norm": 0.041347335947395474, + "kl": 0.0069580078125, "learning_rate": 9.821321585546243e-07, - "loss": 0.1236, - "reward": 1.8541666865348816, - "reward_std": 0.23615825921297073, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 181 }, { "clip_ratio": 0.0, - "completion_length": 760.7708435058594, + "completion_length": 514.6041870117188, "epoch": 0.182, - "grad_norm": 1.9428679698216365, - "kl": 0.0335693359375, + "grad_norm": 1.1967680351799976, + "kl": 0.0073089599609375, "learning_rate": 9.816912885430258e-07, - "loss": 0.1351, - "reward": 1.6927083730697632, - "reward_std": 0.351699560880661, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0239, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 182 }, { "clip_ratio": 0.0, - "completion_length": 566.9583435058594, + "completion_length": 490.0416717529297, "epoch": 0.183, - "grad_norm": 1.4126640994539446, - "kl": 0.03125, + "grad_norm": 0.038234135789257266, + "kl": 0.00762939453125, "learning_rate": 9.812451585093098e-07, - "loss": 0.024, - "reward": 1.9114583730697632, - "reward_std": 0.1835213080048561, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 183 }, { "clip_ratio": 0.0, - "completion_length": 516.3125152587891, + "completion_length": 500.31251525878906, "epoch": 0.184, - "grad_norm": 1.6167664461469606, - "kl": 0.035888671875, + "grad_norm": 0.059067413317888416, + "kl": 0.008697509765625, "learning_rate": 9.807937738894303e-07, - "loss": -0.0303, - "reward": 1.7291666865348816, - "reward_std": 0.23615825921297073, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 184 }, { "clip_ratio": 0.0, - "completion_length": 669.875, + "completion_length": 511.0625305175781, "epoch": 0.185, - "grad_norm": 1.6208098253559204, - "kl": 0.0389404296875, + "grad_norm": 0.05103624708512203, + "kl": 0.0093994140625, "learning_rate": 9.80337140183366e-07, - "loss": -0.0114, - "reward": 1.7500000596046448, - "reward_std": 0.3648904263973236, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 185 }, { "clip_ratio": 0.0, - "completion_length": 512.8541870117188, + "completion_length": 455.9791717529297, "epoch": 0.186, - "grad_norm": 1.646219906849929, - "kl": 0.04052734375, + "grad_norm": 0.05080899260139373, + "kl": 0.00958251953125, "learning_rate": 9.798752629550546e-07, - "loss": 0.008, - "reward": 1.6822916865348816, - "reward_std": 0.13110895082354546, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 186 }, { "clip_ratio": 0.0, - "completion_length": 538.6875, + "completion_length": 483.0833435058594, "epoch": 0.187, - "grad_norm": 1.8498384499524474, - "kl": 0.0404052734375, + "grad_norm": 0.05430438471016196, + "kl": 0.00958251953125, "learning_rate": 9.794081478323245e-07, - "loss": -0.0134, - "reward": 1.8072916865348816, - "reward_std": 0.21330247819423676, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 187 }, { "clip_ratio": 0.0, - "completion_length": 500.08335876464844, + "completion_length": 433.7916717529297, "epoch": 0.188, - "grad_norm": 1.2422870567859647, - "kl": 0.0322265625, + "grad_norm": 0.04681447901040309, + "kl": 0.0081634521484375, "learning_rate": 9.78935800506826e-07, - "loss": 0.0061, - "reward": 1.4947916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.5, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 188 }, { "clip_ratio": 0.0, - "completion_length": 502.5833435058594, + "completion_length": 458.9166717529297, "epoch": 0.189, - "grad_norm": 1.525579593814434, - "kl": 0.0367431640625, + "grad_norm": 0.049365074467071016, + "kl": 0.010223388671875, "learning_rate": 9.784582267339622e-07, - "loss": -0.069, - "reward": 1.8125000596046448, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 189 }, { "clip_ratio": 0.0, - "completion_length": 549.6875152587891, + "completion_length": 464.5833435058594, "epoch": 0.19, - "grad_norm": 1.545849045795341, - "kl": 0.0374755859375, + "grad_norm": 0.047110399890504646, + "kl": 0.00836181640625, "learning_rate": 9.779754323328192e-07, - "loss": -0.0449, - "reward": 1.375, - "reward_std": 0.22613351047039032, - "rewards/accuracy_reward": 0.375, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 190 }, { "clip_ratio": 0.0, - "completion_length": 515.8541870117188, + "completion_length": 459.37501525878906, "epoch": 0.191, - "grad_norm": 1.1770893888687948, - "kl": 0.0347900390625, + "grad_norm": 0.05396861361472214, + "kl": 0.010223388671875, "learning_rate": 9.774874231860935e-07, - "loss": 0.0252, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 191 }, { "clip_ratio": 0.0, - "completion_length": 531.0625305175781, + "completion_length": 439.3333435058594, "epoch": 0.192, - "grad_norm": 1.5630214160136489, - "kl": 0.038818359375, + "grad_norm": 0.04360310974022942, + "kl": 0.009002685546875, "learning_rate": 9.769942052400235e-07, - "loss": 0.0085, - "reward": 1.5572916865348816, - "reward_std": 0.13110895082354546, - "rewards/accuracy_reward": 0.5625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 192 }, { "clip_ratio": 0.0, - "completion_length": 529.8125305175781, + "completion_length": 463.0416717529297, "epoch": 0.193, - "grad_norm": 1.823338149977394, - "kl": 0.03759765625, + "grad_norm": 0.0552391990077484, + "kl": 0.008544921875, "learning_rate": 9.764957845043135e-07, - "loss": 0.064, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 193 }, { "clip_ratio": 0.0, - "completion_length": 505.29168701171875, + "completion_length": 430.10418701171875, "epoch": 0.194, - "grad_norm": 1.2765669953869547, - "kl": 0.0360107421875, + "grad_norm": 0.046843083270820006, + "kl": 0.00994873046875, "learning_rate": 9.759921670520634e-07, - "loss": -0.0211, - "reward": 1.9375, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 194 }, { "clip_ratio": 0.0, - "completion_length": 517.7916870117188, + "completion_length": 510.5416717529297, "epoch": 0.195, - "grad_norm": 1.2267648655713164, - "kl": 0.03759765625, + "grad_norm": 1.243856881724339, + "kl": 0.009368896484375, "learning_rate": 9.754833590196926e-07, - "loss": 0.0168, - "reward": 1.7395833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0112, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 195 }, { "clip_ratio": 0.0, - "completion_length": 589.8750305175781, + "completion_length": 499.25, "epoch": 0.196, - "grad_norm": 1.5433762444397714, - "kl": 0.0380859375, + "grad_norm": 0.04662672009974053, + "kl": 0.00970458984375, "learning_rate": 9.749693666068663e-07, - "loss": -0.0119, - "reward": 1.7864583730697632, - "reward_std": 0.264947772026062, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 196 }, { "clip_ratio": 0.0, - "completion_length": 511.7500305175781, + "completion_length": 453.81251525878906, "epoch": 0.197, - "grad_norm": 1.5918163632383577, - "kl": 0.037353515625, + "grad_norm": 0.04207520871736277, + "kl": 0.008514404296875, "learning_rate": 9.744501960764203e-07, - "loss": 0.0372, - "reward": 1.8958333730697632, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 197 }, { "clip_ratio": 0.0, - "completion_length": 470.04168701171875, + "completion_length": 444.8333435058594, "epoch": 0.198, - "grad_norm": 1.4113748478905823, - "kl": 0.0389404296875, + "grad_norm": 1.2748791825422368, + "kl": 0.009185791015625, "learning_rate": 9.739258537542835e-07, - "loss": 0.0054, - "reward": 1.7395833730697632, - "reward_std": 0.19643256068229675, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": -0.0019, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 198 }, { "clip_ratio": 0.0, - "completion_length": 553.375, + "completion_length": 467.1458435058594, "epoch": 0.199, - "grad_norm": 1.6868362902122307, - "kl": 0.039306640625, + "grad_norm": 1.1242637108425146, + "kl": 0.010345458984375, "learning_rate": 9.733963460294015e-07, - "loss": 0.0354, - "reward": 1.6770833730697632, - "reward_std": 0.1589069850742817, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0021, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 199 }, { "clip_ratio": 0.0, - "completion_length": 617.0416870117188, + "completion_length": 500.4791717529297, "epoch": 0.2, - "grad_norm": 0.8811743398857854, - "kl": 0.0352783203125, + "grad_norm": 0.0364220480448336, + "kl": 0.0086669921875, "learning_rate": 9.728616793536587e-07, - "loss": 0.0398, - "reward": 1.8541666865348816, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 200 }, { "clip_ratio": 0.0, - "completion_length": 582.0000305175781, + "completion_length": 455.1666717529297, "epoch": 0.201, - "grad_norm": 2.3848711307280435, - "kl": 0.0361328125, + "grad_norm": 0.046692387506755174, + "kl": 0.008758544921875, "learning_rate": 9.723218602418e-07, - "loss": 0.0927, - "reward": 1.6041667461395264, - "reward_std": 0.3470645844936371, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 201 }, { "clip_ratio": 0.0, - "completion_length": 491.10418701171875, + "completion_length": 443.0833435058594, "epoch": 0.202, - "grad_norm": 1.3880405450574906, - "kl": 0.0335693359375, + "grad_norm": 0.06278266766733745, + "kl": 0.0086669921875, "learning_rate": 9.717768952713511e-07, - "loss": -0.0306, - "reward": 1.7916666865348816, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0003, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 202 }, { "clip_ratio": 0.0, - "completion_length": 637.1041870117188, + "completion_length": 445.8125, "epoch": 0.203, - "grad_norm": 1.5734831132710547, - "kl": 0.040283203125, + "grad_norm": 0.733144439217337, + "kl": 0.026031494140625, "learning_rate": 9.71226791082538e-07, - "loss": 0.0075, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 203 }, { "clip_ratio": 0.0, - "completion_length": 464.9166717529297, + "completion_length": 418.50001525878906, "epoch": 0.204, - "grad_norm": 1.2999212850568949, - "kl": 0.0347900390625, + "grad_norm": 0.04988942656348295, + "kl": 0.0091552734375, "learning_rate": 9.706715543782064e-07, - "loss": 0.0356, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 204 }, { "clip_ratio": 0.0, - "completion_length": 565.6875305175781, + "completion_length": 448.5, "epoch": 0.205, - "grad_norm": 1.5109780429888018, - "kl": 0.03955078125, + "grad_norm": 1.4429664723338587, + "kl": 0.010711669921875, "learning_rate": 9.701111919237408e-07, - "loss": -0.0002, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0029, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 205 }, { "clip_ratio": 0.0, - "completion_length": 554.6875, + "completion_length": 498.68751525878906, "epoch": 0.206, - "grad_norm": 2.4304215425778715, - "kl": 0.037353515625, + "grad_norm": 1.051585226601382, + "kl": 0.0103759765625, "learning_rate": 9.695457105469804e-07, - "loss": 0.0315, - "reward": 1.6875, - "reward_std": 0.31381870806217194, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0194, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 206 }, { "clip_ratio": 0.0, - "completion_length": 751.25, + "completion_length": 510.8541717529297, "epoch": 0.207, - "grad_norm": 1.6054354888342823, - "kl": 0.038818359375, + "grad_norm": 0.051580365905477286, + "kl": 0.0103759765625, "learning_rate": 9.689751171381377e-07, - "loss": -0.0256, - "reward": 1.7500000596046448, - "reward_std": 0.32399246096611023, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0004, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 207 }, { "clip_ratio": 0.0, - "completion_length": 634.1666870117188, + "completion_length": 475.72918701171875, "epoch": 0.208, - "grad_norm": 2.032740200554419, - "kl": 0.042236328125, + "grad_norm": 0.1551803755199804, + "kl": 0.014739990234375, "learning_rate": 9.683994186497132e-07, - "loss": 0.0423, - "reward": 1.8906250596046448, - "reward_std": 0.2754465192556381, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 208 }, { "clip_ratio": 0.0, - "completion_length": 626.9791870117188, + "completion_length": 512.9583435058594, "epoch": 0.209, - "grad_norm": 1.9307476770598715, - "kl": 0.0445556640625, + "grad_norm": 0.05272348357338901, + "kl": 0.012725830078125, "learning_rate": 9.67818622096411e-07, - "loss": 0.0585, - "reward": 1.7447916865348816, - "reward_std": 0.28703896701335907, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 209 }, { "clip_ratio": 0.0, - "completion_length": 616.7291870117188, + "completion_length": 519.5833435058594, "epoch": 0.21, - "grad_norm": 1.6826570417632707, - "kl": 0.0408935546875, + "grad_norm": 1.10417006874386, + "kl": 0.011077880859375, "learning_rate": 9.672327345550543e-07, - "loss": 0.041, - "reward": 1.7916666865348816, - "reward_std": 0.34349535405635834, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0096, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 210 }, { "clip_ratio": 0.0, - "completion_length": 570.9791870117188, + "completion_length": 530.3958435058594, "epoch": 0.211, - "grad_norm": 1.970570054612099, - "kl": 0.04345703125, + "grad_norm": 1.1753773142145971, + "kl": 0.013519287109375, "learning_rate": 9.666417631644976e-07, - "loss": 0.0726, - "reward": 1.75, - "reward_std": 0.2837398797273636, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0078, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 211 }, { "clip_ratio": 0.0, - "completion_length": 595.2916870117188, + "completion_length": 488.3125, "epoch": 0.212, - "grad_norm": 1.4838328368011435, - "kl": 0.04296875, + "grad_norm": 1.4918163438086747, + "kl": 0.02587890625, "learning_rate": 9.66045715125541e-07, - "loss": -0.0738, - "reward": 1.6354167461395264, - "reward_std": 0.2177312970161438, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0116, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 212 }, { "clip_ratio": 0.0, - "completion_length": 549.0416870117188, + "completion_length": 487.4166717529297, "epoch": 0.213, - "grad_norm": 1.5797054551535403, - "kl": 0.0386962890625, + "grad_norm": 2.1340988492664605, + "kl": 0.014801025390625, "learning_rate": 9.654445977008414e-07, - "loss": -0.0014, - "reward": 1.9114583730697632, - "reward_std": 0.20759673416614532, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0412, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 213 }, { "clip_ratio": 0.0, - "completion_length": 625.5416870117188, + "completion_length": 526.3958435058594, "epoch": 0.214, - "grad_norm": 2.170660977881024, - "kl": 0.04638671875, + "grad_norm": 0.11688279967288762, + "kl": 0.015472412109375, "learning_rate": 9.648384182148252e-07, - "loss": 0.0749, - "reward": 1.8385416865348816, - "reward_std": 0.24533500522375107, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 214 }, { "clip_ratio": 0.0, - "completion_length": 591.6666870117188, + "completion_length": 478.5416717529297, "epoch": 0.215, - "grad_norm": 2.17643014116164, - "kl": 0.048583984375, + "grad_norm": 1.5894295793015976, + "kl": 0.017242431640625, "learning_rate": 9.64227184053598e-07, - "loss": 0.0356, - "reward": 1.7812500596046448, - "reward_std": 0.31549227982759476, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0446, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 215 }, { "clip_ratio": 0.0, - "completion_length": 506.7500305175781, + "completion_length": 468.02085876464844, "epoch": 0.216, - "grad_norm": 0.10446632805987377, - "kl": 0.034423828125, + "grad_norm": 0.08453494009874704, + "kl": 0.013946533203125, "learning_rate": 9.636109026648554e-07, - "loss": 0.0014, - "reward": 2.0, + "loss": 0.0006, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 216 }, { "clip_ratio": 0.0, - "completion_length": 752.75, + "completion_length": 545.625, "epoch": 0.217, - "grad_norm": 2.3540574142805526, - "kl": 0.0469970703125, + "grad_norm": 0.07300918934288783, + "kl": 0.01556396484375, "learning_rate": 9.629895815577915e-07, - "loss": -0.1481, - "reward": 1.375, - "reward_std": 0.427034467458725, - "rewards/accuracy_reward": 0.375, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 217 }, { "clip_ratio": 0.0, - "completion_length": 562.6041870117188, + "completion_length": 423.9583435058594, "epoch": 0.218, - "grad_norm": 1.8771225265937168, - "kl": 0.0445556640625, + "grad_norm": 0.0844738916631422, + "kl": 0.0145263671875, "learning_rate": 9.623632283030077e-07, - "loss": 0.0737, - "reward": 1.7239583730697632, - "reward_std": 0.22842131555080414, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 218 }, { "clip_ratio": 0.0, - "completion_length": 484.0, + "completion_length": 393.9583435058594, "epoch": 0.219, - "grad_norm": 2.0436128894263303, - "kl": 0.038330078125, + "grad_norm": 0.09820466302062966, + "kl": 0.01605224609375, "learning_rate": 9.617318505324212e-07, - "loss": 0.0276, - "reward": 1.9166666865348816, - "reward_std": 0.18523553758859634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 219 }, { "clip_ratio": 0.0, - "completion_length": 615.1875305175781, + "completion_length": 506.56251525878906, "epoch": 0.22, - "grad_norm": 1.9336203975256818, - "kl": 0.0416259765625, + "grad_norm": 1.1719180400121392, + "kl": 0.01727294921875, "learning_rate": 9.610954559391704e-07, - "loss": 0.07, - "reward": 1.7083333730697632, - "reward_std": 0.37674127519130707, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0179, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 220 }, { "clip_ratio": 0.0, - "completion_length": 579.0208435058594, + "completion_length": 437.7083435058594, "epoch": 0.221, - "grad_norm": 2.0163065562334404, - "kl": 0.046630859375, + "grad_norm": 1.2988094815882822, + "kl": 0.016204833984375, "learning_rate": 9.604540522775227e-07, - "loss": -0.0314, - "reward": 1.6875, - "reward_std": 0.4269455075263977, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0138, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 221 }, { "clip_ratio": 0.0, - "completion_length": 580.4166870117188, + "completion_length": 475.9791717529297, "epoch": 0.222, - "grad_norm": 1.8765027810152024, - "kl": 0.051025390625, + "grad_norm": 1.2837249478628319, + "kl": 0.016510009765625, "learning_rate": 9.598076473627796e-07, - "loss": 0.0179, - "reward": 1.7239583730697632, - "reward_std": 0.3158010095357895, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, "rewards/tag_count_reward": 0.9947916865348816, "step": 222 }, { "clip_ratio": 0.0, - "completion_length": 494.5416717529297, + "completion_length": 450.7708435058594, "epoch": 0.223, - "grad_norm": 1.0951016884779496, - "kl": 0.0400390625, + "grad_norm": 1.253685798628967, + "kl": 0.016815185546875, "learning_rate": 9.59156249071181e-07, - "loss": 0.0036, - "reward": 1.7291666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.003, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 223 }, { "clip_ratio": 0.0, - "completion_length": 662.7500305175781, + "completion_length": 543.1666717529297, "epoch": 0.224, - "grad_norm": 1.8616225271003006, - "kl": 0.0384521484375, + "grad_norm": 1.30016273991412, + "kl": 0.01727294921875, "learning_rate": 9.58499865339809e-07, - "loss": 0.1526, - "reward": 1.5729167461395264, - "reward_std": 0.27979065477848053, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0038, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 224 }, { "clip_ratio": 0.0, - "completion_length": 500.54168701171875, + "completion_length": 434.12501525878906, "epoch": 0.225, - "grad_norm": 1.8956436916076644, - "kl": 0.0438232421875, + "grad_norm": 0.09757944338988957, + "kl": 0.015350341796875, "learning_rate": 9.578385041664925e-07, - "loss": 0.0097, - "reward": 1.7500000596046448, - "reward_std": 0.24164992570877075, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 225 }, { "clip_ratio": 0.0, - "completion_length": 609.9791870117188, + "completion_length": 518.5416870117188, "epoch": 0.226, - "grad_norm": 2.046731958243305, - "kl": 0.0489501953125, + "grad_norm": 1.2273247367534148, + "kl": 0.016143798828125, "learning_rate": 9.571721736097088e-07, - "loss": 0.0026, - "reward": 1.6979166865348816, - "reward_std": 0.3049342483282089, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, + "loss": -0.0008, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 226 }, { "clip_ratio": 0.0, - "completion_length": 623.7708435058594, + "completion_length": 490.10418701171875, "epoch": 0.227, - "grad_norm": 1.7444868648149183, - "kl": 0.041259765625, + "grad_norm": 0.07686121660930782, + "kl": 0.01434326171875, "learning_rate": 9.565008817884854e-07, - "loss": -0.0396, - "reward": 1.7708333730697632, - "reward_std": 0.2574043273925781, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 227 }, { "clip_ratio": 0.0, - "completion_length": 494.35418701171875, + "completion_length": 473.2708435058594, "epoch": 0.228, - "grad_norm": 1.551465225400232, - "kl": 0.0380859375, + "grad_norm": 1.5759387434813044, + "kl": 0.014068603515625, "learning_rate": 9.55824636882301e-07, - "loss": -0.0127, - "reward": 1.875, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.004, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 228 }, { "clip_ratio": 0.0, - "completion_length": 530.2083587646484, + "completion_length": 471.50001525878906, "epoch": 0.229, - "grad_norm": 1.4584264987595335, - "kl": 0.04052734375, + "grad_norm": 0.21396356843199574, + "kl": 0.02099609375, "learning_rate": 9.55143447130987e-07, - "loss": 0.0373, - "reward": 1.7291666865348816, - "reward_std": 0.05689104273915291, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 229 }, { "clip_ratio": 0.0, - "completion_length": 528.2708435058594, + "completion_length": 479.2083435058594, "epoch": 0.23, - "grad_norm": 0.10447992335108418, - "kl": 0.0389404296875, + "grad_norm": 0.11709533688651282, + "kl": 0.0179443359375, "learning_rate": 9.54457320834625e-07, - "loss": 0.0016, - "reward": 2.0, + "loss": 0.0007, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 230 }, { "clip_ratio": 0.0, - "completion_length": 505.6458435058594, + "completion_length": 478.8541717529297, "epoch": 0.231, - "grad_norm": 1.6160413927645534, - "kl": 0.0386962890625, + "grad_norm": 0.10027144130859894, + "kl": 0.018463134765625, "learning_rate": 9.537662663534477e-07, - "loss": 0.0344, - "reward": 1.75, - "reward_std": 0.26111647486686707, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 231 }, { "clip_ratio": 0.0, - "completion_length": 555.7083435058594, + "completion_length": 479.9583435058594, "epoch": 0.232, - "grad_norm": 1.7431877877211548, - "kl": 0.041015625, + "grad_norm": 0.09671371423542396, + "kl": 0.01788330078125, "learning_rate": 9.530702921077358e-07, - "loss": 0.0615, - "reward": 1.8281250596046448, - "reward_std": 0.25984111800789833, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 232 }, { "clip_ratio": 0.0, - "completion_length": 582.0625152587891, + "completion_length": 491.8958435058594, "epoch": 0.233, - "grad_norm": 1.957784810698847, - "kl": 0.049072265625, + "grad_norm": 0.14545470402187322, + "kl": 0.01849365234375, "learning_rate": 9.523694065777156e-07, - "loss": 0.0183, - "reward": 1.6979166865348816, - "reward_std": 0.3233904391527176, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 233 }, { "clip_ratio": 0.0, - "completion_length": 525.2916870117188, + "completion_length": 496.72918701171875, "epoch": 0.234, - "grad_norm": 1.6270752049452997, - "kl": 0.042236328125, + "grad_norm": 1.198899919328826, + "kl": 0.0164794921875, "learning_rate": 9.516636183034564e-07, - "loss": 0.0157, - "reward": 1.9166666865348816, - "reward_std": 0.18523553758859634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0263, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 234 }, { "clip_ratio": 0.0, - "completion_length": 517.9375, + "completion_length": 433.12501525878906, "epoch": 0.235, - "grad_norm": 1.092987074367016, - "kl": 0.041015625, + "grad_norm": 0.09162266424217855, + "kl": 0.01885986328125, "learning_rate": 9.509529358847654e-07, - "loss": -0.0014, - "reward": 1.9947916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 235 }, { "clip_ratio": 0.0, - "completion_length": 639.6666870117188, + "completion_length": 595.0208435058594, "epoch": 0.236, - "grad_norm": 1.6337329373617202, - "kl": 0.044677734375, + "grad_norm": 1.1282448798264841, + "kl": 0.01708984375, "learning_rate": 9.502373679810839e-07, - "loss": -0.1059, - "reward": 1.71875, - "reward_std": 0.13339674472808838, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.1368, + "reward": 0.9739583432674408, + "reward_std": 0.06226281076669693, + "rewards/tag_count_reward": 0.9739583432674408, "step": 236 }, { "clip_ratio": 0.0, - "completion_length": 553.0833435058594, + "completion_length": 524.6666870117188, "epoch": 0.237, - "grad_norm": 1.4020154565818193, - "kl": 0.0416259765625, + "grad_norm": 0.10066116337470642, + "kl": 0.017547607421875, "learning_rate": 9.495169233113806e-07, - "loss": 0.1086, - "reward": 1.6875, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 237 }, { "clip_ratio": 0.0, - "completion_length": 593.5625305175781, + "completion_length": 565.1458435058594, "epoch": 0.238, - "grad_norm": 1.8982847694375606, - "kl": 0.0426025390625, + "grad_norm": 0.08271308069169885, + "kl": 0.0169677734375, "learning_rate": 9.487916106540465e-07, - "loss": 0.0692, - "reward": 1.7812500596046448, - "reward_std": 0.25648826360702515, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 238 }, { "clip_ratio": 0.0, - "completion_length": 558.8750305175781, + "completion_length": 499.3125, "epoch": 0.239, - "grad_norm": 1.9288179052798846, - "kl": 0.0401611328125, + "grad_norm": 0.0834749774046353, + "kl": 0.017578125, "learning_rate": 9.480614388467877e-07, - "loss": -0.0584, - "reward": 1.7916666865348816, - "reward_std": 0.29821331799030304, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 239 }, { "clip_ratio": 0.0, - "completion_length": 490.16668701171875, + "completion_length": 516.7083435058594, "epoch": 0.24, - "grad_norm": 1.6870908384319176, - "kl": 0.04736328125, + "grad_norm": 1.082176127520902, + "kl": 0.016082763671875, "learning_rate": 9.473264167865171e-07, - "loss": 0.0651, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0141, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 240 }, { "clip_ratio": 0.0, - "completion_length": 462.16668701171875, + "completion_length": 512.125, "epoch": 0.241, - "grad_norm": 1.73953429694824, - "kl": 0.03857421875, + "grad_norm": 0.07003007584923018, + "kl": 0.016632080078125, "learning_rate": 9.465865534292464e-07, - "loss": 0.013, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 241 }, { "clip_ratio": 0.0, - "completion_length": 455.4791717529297, + "completion_length": 513.0416870117188, "epoch": 0.242, - "grad_norm": 1.47708785634007, - "kl": 0.039794921875, + "grad_norm": 0.08471190370306036, + "kl": 0.01715087890625, "learning_rate": 9.458418577899774e-07, - "loss": 0.0349, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 242 }, { "clip_ratio": 0.0, - "completion_length": 578.7916717529297, + "completion_length": 575.3333740234375, "epoch": 0.243, - "grad_norm": 0.804017233483098, - "kl": 0.0440673828125, + "grad_norm": 0.07407780109480006, + "kl": 0.014739990234375, "learning_rate": 9.450923389425911e-07, - "loss": 0.0345, - "reward": 1.8333333730697632, - "reward_std": 0.12309150397777557, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 243 }, { "clip_ratio": 0.0, - "completion_length": 541.7916870117188, + "completion_length": 539.125, "epoch": 0.244, - "grad_norm": 0.9217564045439455, - "kl": 0.0423583984375, + "grad_norm": 1.0872363516816363, + "kl": 0.01788330078125, "learning_rate": 9.443380060197385e-07, - "loss": 0.0251, - "reward": 1.8958333730697632, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.006, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 244 }, { "clip_ratio": 0.0, - "completion_length": 594.2291870117188, + "completion_length": 581.9375305175781, "epoch": 0.245, - "grad_norm": 1.9827076097548295, - "kl": 0.046630859375, + "grad_norm": 0.06841999220752508, + "kl": 0.01605224609375, "learning_rate": 9.43578868212728e-07, - "loss": -0.0243, - "reward": 1.6875000596046448, - "reward_std": 0.3258185237646103, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 245 }, { "clip_ratio": 0.0, - "completion_length": 545.0208435058594, + "completion_length": 546.5208435058594, "epoch": 0.246, - "grad_norm": 1.865405804331346, - "kl": 0.047607421875, + "grad_norm": 0.08021586563908664, + "kl": 0.01727294921875, "learning_rate": 9.428149347714143e-07, - "loss": -0.0008, - "reward": 1.5416666865348816, - "reward_std": 0.3251829594373703, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 246 }, { "clip_ratio": 0.0, - "completion_length": 499.4375305175781, + "completion_length": 513.2916870117188, "epoch": 0.247, - "grad_norm": 0.9636491587585947, - "kl": 0.04345703125, + "grad_norm": 1.5957606865057967, + "kl": 0.0167236328125, "learning_rate": 9.420462150040852e-07, - "loss": 0.085, - "reward": 1.875, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0147, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 247 }, { "clip_ratio": 0.0, - "completion_length": 477.8333435058594, + "completion_length": 523.125, "epoch": 0.248, - "grad_norm": 1.8131198477877324, - "kl": 0.04541015625, + "grad_norm": 0.07161150701962184, + "kl": 0.015472412109375, "learning_rate": 9.412727182773486e-07, - "loss": 0.0789, - "reward": 1.8541667461395264, - "reward_std": 0.22604453563690186, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 248 }, { "clip_ratio": 0.0, - "completion_length": 488.43751525878906, + "completion_length": 485.62501525878906, "epoch": 0.249, - "grad_norm": 1.6187929261632488, - "kl": 0.046142578125, + "grad_norm": 0.07697007954058731, + "kl": 0.0164794921875, "learning_rate": 9.404944540160177e-07, - "loss": 0.0422, - "reward": 1.8958333730697632, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 249 }, { "clip_ratio": 0.0, - "completion_length": 486.35418701171875, + "completion_length": 501.62501525878906, "epoch": 0.25, - "grad_norm": 1.5724286696627654, - "kl": 0.04150390625, + "grad_norm": 0.08389406416336188, + "kl": 0.01727294921875, "learning_rate": 9.397114317029974e-07, - "loss": 0.0939, - "reward": 1.8750000596046448, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 250 }, { "clip_ratio": 0.0, - "completion_length": 503.91668701171875, + "completion_length": 487.9375305175781, "epoch": 0.251, - "grad_norm": 1.7327032194423762, - "kl": 0.051025390625, + "grad_norm": 0.0725621860485941, + "kl": 0.015411376953125, "learning_rate": 9.38923660879167e-07, - "loss": 0.0251, - "reward": 1.8750000596046448, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 251 }, { "clip_ratio": 0.0, - "completion_length": 578.1041870117188, + "completion_length": 518.9583435058594, "epoch": 0.252, - "grad_norm": 1.965147285631464, - "kl": 0.0479736328125, + "grad_norm": 0.08955151459635992, + "kl": 0.015899658203125, "learning_rate": 9.381311511432658e-07, - "loss": 0.1045, - "reward": 1.5208333730697632, - "reward_std": 0.2574043273925781, - "rewards/accuracy_reward": 0.520833358168602, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 252 }, { "clip_ratio": 0.0, - "completion_length": 436.50001525878906, + "completion_length": 483.35418701171875, "epoch": 0.253, - "grad_norm": 0.2373916176524351, - "kl": 0.04833984375, + "grad_norm": 0.06650664724173376, + "kl": 0.0152587890625, "learning_rate": 9.373339121517746e-07, - "loss": 0.0019, - "reward": 2.0, + "loss": 0.0006, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 253 }, { "clip_ratio": 0.0, - "completion_length": 475.2083435058594, + "completion_length": 484.68751525878906, "epoch": 0.254, - "grad_norm": 1.531390781511125, - "kl": 0.052978515625, + "grad_norm": 0.08269813073285828, + "kl": 0.018310546875, "learning_rate": 9.36531953618799e-07, - "loss": 0.0513, - "reward": 1.8958333730697632, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 254 }, { "clip_ratio": 0.0, - "completion_length": 497.7708435058594, + "completion_length": 543.6250305175781, "epoch": 0.255, - "grad_norm": 1.7634585239045695, - "kl": 0.0474853515625, + "grad_norm": 0.057157762620809585, + "kl": 0.01470947265625, "learning_rate": 9.357252853159505e-07, - "loss": 0.0247, - "reward": 1.7083333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 255 }, { "clip_ratio": 0.0, - "completion_length": 446.5833435058594, + "completion_length": 464.5, "epoch": 0.256, - "grad_norm": 1.905154732192964, - "kl": 0.049560546875, + "grad_norm": 0.5122966220354302, + "kl": 0.02410888671875, "learning_rate": 9.34913917072228e-07, - "loss": 0.0554, - "reward": 1.9375000596046448, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 256 }, { "clip_ratio": 0.0, - "completion_length": 524.6875, + "completion_length": 485.9375, "epoch": 0.257, - "grad_norm": 1.5618982510486417, - "kl": 0.057373046875, + "grad_norm": 0.06726881896372686, + "kl": 0.01654052734375, "learning_rate": 9.340978587738972e-07, - "loss": -0.0693, - "reward": 1.7864583730697632, - "reward_std": 0.25408679246902466, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 257 }, { "clip_ratio": 0.0, - "completion_length": 456.37501525878906, + "completion_length": 501.54168701171875, "epoch": 0.258, - "grad_norm": 2.1526606311686853, - "kl": 0.0570068359375, + "grad_norm": 0.0748098899740606, + "kl": 0.01788330078125, "learning_rate": 9.332771203643714e-07, - "loss": -0.0614, - "reward": 1.6666667461395264, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 258 }, { "clip_ratio": 0.0, - "completion_length": 469.5833435058594, + "completion_length": 488.72918701171875, "epoch": 0.259, - "grad_norm": 0.12394876363564046, - "kl": 0.05322265625, + "grad_norm": 1.2840455355702072, + "kl": 0.01544189453125, "learning_rate": 9.324517118440888e-07, - "loss": 0.0021, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0321, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 259 }, { "clip_ratio": 0.0, - "completion_length": 448.56251525878906, + "completion_length": 512.3125305175781, "epoch": 0.26, - "grad_norm": 1.5562456504088078, - "kl": 0.0533447265625, + "grad_norm": 0.05847318018525027, + "kl": 0.015411376953125, "learning_rate": 9.316216432703916e-07, - "loss": -0.0118, - "reward": 1.7239583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 260 }, { "clip_ratio": 0.0, - "completion_length": 476.3333435058594, + "completion_length": 469.1458435058594, "epoch": 0.261, - "grad_norm": 1.6065532765180472, - "kl": 0.0513916015625, + "grad_norm": 1.2796299237266675, + "kl": 0.0162353515625, "learning_rate": 9.307869247574038e-07, - "loss": 0.0535, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0266, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 261 }, { "clip_ratio": 0.0, - "completion_length": 404.5833435058594, + "completion_length": 453.1041717529297, "epoch": 0.262, - "grad_norm": 1.1438291803489842, - "kl": 0.05517578125, + "grad_norm": 0.0686759440576299, + "kl": 0.01666259765625, "learning_rate": 9.299475664759068e-07, - "loss": -0.0018, - "reward": 1.7708333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 262 }, { "clip_ratio": 0.0, - "completion_length": 402.25001525878906, + "completion_length": 424.625, "epoch": 0.263, - "grad_norm": 1.3669448886833322, - "kl": 0.045166015625, + "grad_norm": 1.5060360382856361, + "kl": 0.01507568359375, "learning_rate": 9.291035786532163e-07, - "loss": 0.0025, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.032, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 263 }, { "clip_ratio": 0.0, - "completion_length": 463.7916717529297, + "completion_length": 499.81251525878906, "epoch": 0.264, - "grad_norm": 1.4798113577618432, - "kl": 0.0533447265625, + "grad_norm": 2.449862371398939, + "kl": 0.019775390625, "learning_rate": 9.282549715730579e-07, - "loss": 0.0242, - "reward": 1.875, - "reward_std": 0.22040385007858276, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0313, + "reward": 0.9687500298023224, + "reward_std": 0.10825316980481148, + "rewards/tag_count_reward": 0.9687500298023224, "step": 264 }, { "clip_ratio": 0.0, - "completion_length": 581.6250305175781, + "completion_length": 577.3958740234375, "epoch": 0.265, - "grad_norm": 1.7790769508709383, - "kl": 0.0640869140625, + "grad_norm": 0.25453031839063006, + "kl": 0.019256591796875, "learning_rate": 9.274017555754407e-07, - "loss": 0.0064, - "reward": 1.7864583730697632, - "reward_std": 0.3314758837223053, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 265 }, { "clip_ratio": 0.0, - "completion_length": 491.70835876464844, + "completion_length": 455.9166717529297, "epoch": 0.266, - "grad_norm": 1.9599086067310774, - "kl": 0.0693359375, + "grad_norm": 0.06570987453035185, + "kl": 0.015380859375, "learning_rate": 9.265439410565328e-07, - "loss": -0.0725, - "reward": 1.7291666865348816, - "reward_std": 0.259290412068367, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 266 }, { "clip_ratio": 0.0, - "completion_length": 607.8333435058594, + "completion_length": 463.60418701171875, "epoch": 0.267, - "grad_norm": 1.2021326550040026, - "kl": 0.0584716796875, + "grad_norm": 0.07329984438320578, + "kl": 0.015380859375, "learning_rate": 9.256815384685328e-07, - "loss": 0.0381, - "reward": 1.9166666865348816, - "reward_std": 0.13366925716400146, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 267 }, { "clip_ratio": 0.0, - "completion_length": 496.9375305175781, + "completion_length": 467.87501525878906, "epoch": 0.268, - "grad_norm": 1.9459735742989812, - "kl": 0.0562744140625, + "grad_norm": 0.15862357870241905, + "kl": 0.014862060546875, "learning_rate": 9.248145583195447e-07, - "loss": 0.0878, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 268 }, { "clip_ratio": 0.0, - "completion_length": 427.0833435058594, + "completion_length": 426.1041717529297, "epoch": 0.269, - "grad_norm": 1.1949901764547433, - "kl": 0.05419921875, + "grad_norm": 0.06528566167070449, + "kl": 0.015899658203125, "learning_rate": 9.239430111734476e-07, - "loss": -0.0081, - "reward": 1.7708333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 269 }, { "clip_ratio": 0.0, - "completion_length": 486.56251525878906, + "completion_length": 426.5833435058594, "epoch": 0.27, - "grad_norm": 1.4911458628765353, - "kl": 0.0511474609375, + "grad_norm": 0.07168963347611128, + "kl": 0.01611328125, "learning_rate": 9.230669076497687e-07, - "loss": 0.0048, - "reward": 1.8750000596046448, - "reward_std": 0.22040386497974396, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 270 }, { "clip_ratio": 0.0, - "completion_length": 559.1041870117188, + "completion_length": 523.8750305175781, "epoch": 0.271, - "grad_norm": 2.2161858074413225, - "kl": 0.063232421875, + "grad_norm": 0.06544622695773468, + "kl": 0.01708984375, "learning_rate": 9.221862584235526e-07, - "loss": -0.0149, - "reward": 1.6822916865348816, - "reward_std": 0.43450842797756195, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 271 }, { "clip_ratio": 0.0, - "completion_length": 521.5833435058594, + "completion_length": 456.22918701171875, "epoch": 0.272, - "grad_norm": 2.2096450352669637, - "kl": 0.055908203125, + "grad_norm": 0.062094560268822145, + "kl": 0.01666259765625, "learning_rate": 9.213010742252327e-07, - "loss": 0.1146, - "reward": 1.7395833730697632, - "reward_std": 0.25259073078632355, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 272 }, { "clip_ratio": 0.0, - "completion_length": 604.6041717529297, + "completion_length": 474.8125, "epoch": 0.273, - "grad_norm": 1.3559808511488978, - "kl": 0.0498046875, + "grad_norm": 0.05862877618071435, + "kl": 0.01605224609375, "learning_rate": 9.204113658404989e-07, - "loss": 0.0683, - "reward": 1.7760416865348816, - "reward_std": 0.2851714491844177, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 273 }, { "clip_ratio": 0.0, - "completion_length": 496.54168701171875, + "completion_length": 443.9166717529297, "epoch": 0.274, - "grad_norm": 1.3815490224836835, - "kl": 0.04833984375, + "grad_norm": 0.06164062190782835, + "kl": 0.015228271484375, "learning_rate": 9.195171441101668e-07, - "loss": 0.1051, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 274 }, { "clip_ratio": 0.0, - "completion_length": 520.875, + "completion_length": 457.1041717529297, "epoch": 0.275, - "grad_norm": 1.4957151576657022, - "kl": 0.0565185546875, + "grad_norm": 0.05589128390148594, + "kl": 0.014068603515625, "learning_rate": 9.186184199300463e-07, - "loss": 0.0809, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 275 }, { "clip_ratio": 0.0, - "completion_length": 513.9583435058594, + "completion_length": 497.10418701171875, "epoch": 0.276, - "grad_norm": 1.3361311159289526, - "kl": 0.055419921875, + "grad_norm": 0.05590537885725384, + "kl": 0.014373779296875, "learning_rate": 9.177152042508077e-07, "loss": 0.0006, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 276 }, { "clip_ratio": 0.0, - "completion_length": 588.5416870117188, + "completion_length": 486.8125305175781, "epoch": 0.277, - "grad_norm": 2.0258870005968195, - "kl": 0.0640869140625, + "grad_norm": 0.055203648657737074, + "kl": 0.01580810546875, "learning_rate": 9.168075080778494e-07, - "loss": 0.0781, - "reward": 1.9166667461395264, - "reward_std": 0.24164992570877075, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 277 }, { "clip_ratio": 0.0, - "completion_length": 512.8541870117188, + "completion_length": 451.3958435058594, "epoch": 0.278, - "grad_norm": 1.6307241322113504, - "kl": 0.0572509765625, + "grad_norm": 0.054423924961094675, + "kl": 0.015106201171875, "learning_rate": 9.158953424711624e-07, - "loss": -0.0485, - "reward": 1.7708333730697632, - "reward_std": 0.21037912368774414, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 278 }, { "clip_ratio": 0.0, - "completion_length": 530.6458740234375, + "completion_length": 498.6250305175781, "epoch": 0.279, - "grad_norm": 1.279366864903179, - "kl": 0.056640625, + "grad_norm": 0.04977983112824281, + "kl": 0.015594482421875, "learning_rate": 9.149787185451969e-07, - "loss": 0.0604, - "reward": 1.9375, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 279 }, { "clip_ratio": 0.0, - "completion_length": 533.0, + "completion_length": 515.875, "epoch": 0.28, - "grad_norm": 1.6269654380533742, - "kl": 0.061279296875, + "grad_norm": 1.5180700680436665, + "kl": 0.014678955078125, "learning_rate": 9.140576474687263e-07, - "loss": 0.0102, - "reward": 1.7708333730697632, - "reward_std": 0.25182367861270905, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0476, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 280 }, { "clip_ratio": 0.0, - "completion_length": 500.8125, + "completion_length": 501.7708435058594, "epoch": 0.281, - "grad_norm": 1.630677321540645, - "kl": 0.0537109375, + "grad_norm": 0.05324506592064358, + "kl": 0.02032470703125, "learning_rate": 9.131321404647109e-07, - "loss": -0.0096, - "reward": 1.8489583730697632, - "reward_std": 0.14677437022328377, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 281 }, { "clip_ratio": 0.0, - "completion_length": 491.37501525878906, + "completion_length": 433.6666717529297, "epoch": 0.282, - "grad_norm": 1.7192186218145291, - "kl": 0.0693359375, + "grad_norm": 0.0563429105599599, + "kl": 0.01556396484375, "learning_rate": 9.122022088101613e-07, - "loss": 0.0208, - "reward": 1.7708333730697632, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 282 }, { "clip_ratio": 0.0, - "completion_length": 530.8750152587891, + "completion_length": 504.18751525878906, "epoch": 0.283, - "grad_norm": 1.372306346543197, - "kl": 0.052490234375, + "grad_norm": 0.04710493186631686, + "kl": 0.01336669921875, "learning_rate": 9.112678638360015e-07, - "loss": 0.06, - "reward": 1.9166666865348816, - "reward_std": 0.18523554503917694, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 283 }, { "clip_ratio": 0.0, - "completion_length": 401.1041717529297, + "completion_length": 436.2916717529297, "epoch": 0.284, - "grad_norm": 0.11544203473038296, - "kl": 0.0511474609375, + "grad_norm": 0.04673067805273048, + "kl": 0.014251708984375, "learning_rate": 9.103291169269299e-07, - "loss": 0.0021, - "reward": 2.0, + "loss": 0.0006, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 284 }, { "clip_ratio": 0.0, - "completion_length": 428.3541717529297, + "completion_length": 415.6458435058594, "epoch": 0.285, - "grad_norm": 1.1779576749269904, - "kl": 0.06591796875, + "grad_norm": 0.05669767173984359, + "kl": 0.015106201171875, "learning_rate": 9.093859795212817e-07, - "loss": -0.0204, - "reward": 1.9166666865348816, - "reward_std": 0.12309150397777557, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 285 }, { "clip_ratio": 0.0, - "completion_length": 500.45835876464844, + "completion_length": 444.43751525878906, "epoch": 0.286, - "grad_norm": 1.7979919834952898, - "kl": 0.066650390625, + "grad_norm": 0.05594767081328561, + "kl": 0.01556396484375, "learning_rate": 9.084384631108882e-07, - "loss": 0.0741, - "reward": 1.5729166865348816, - "reward_std": 0.21444134414196014, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 286 }, { "clip_ratio": 0.0, - "completion_length": 433.3333435058594, + "completion_length": 439.75001525878906, "epoch": 0.287, - "grad_norm": 1.698247369549637, - "kl": 0.0537109375, + "grad_norm": 0.07322331204423045, + "kl": 0.013763427734375, "learning_rate": 9.074865792409381e-07, - "loss": 0.0431, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 287 }, { "clip_ratio": 0.0, - "completion_length": 451.5416717529297, + "completion_length": 469.0625, "epoch": 0.288, - "grad_norm": 1.3695302771332583, - "kl": 0.0595703125, + "grad_norm": 1.212173860336188, + "kl": 0.0167236328125, "learning_rate": 9.065303395098358e-07, - "loss": 0.0143, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0004, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 288 }, { "clip_ratio": 0.0, - "completion_length": 378.72918701171875, + "completion_length": 376.6875, "epoch": 0.289, - "grad_norm": 0.11760606290581059, - "kl": 0.0582275390625, + "grad_norm": 0.10599496620311394, + "kl": 0.015869140625, "learning_rate": 9.055697555690607e-07, - "loss": 0.0024, - "reward": 1.75, + "loss": 0.0006, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 289 }, { "clip_ratio": 0.0, - "completion_length": 529.0000152587891, + "completion_length": 511.2083435058594, "epoch": 0.29, - "grad_norm": 1.2254215114645115, - "kl": 0.0640869140625, + "grad_norm": 0.04930889731429071, + "kl": 0.014617919921875, "learning_rate": 9.046048391230247e-07, - "loss": 0.0729, - "reward": 1.4739583730697632, - "reward_std": 0.07278125733137131, - "rewards/accuracy_reward": 0.4791666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 290 }, { "clip_ratio": 0.0, - "completion_length": 422.47918701171875, + "completion_length": 457.0416717529297, "epoch": 0.291, - "grad_norm": 1.0097472716960079, - "kl": 0.0528564453125, + "grad_norm": 0.05112757445066342, + "kl": 0.01251220703125, "learning_rate": 9.036356019289309e-07, - "loss": 0.0158, - "reward": 1.7708333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 291 }, { "clip_ratio": 0.0, - "completion_length": 453.5625, + "completion_length": 461.3541717529297, "epoch": 0.292, - "grad_norm": 1.2469136180753613, - "kl": 0.0570068359375, + "grad_norm": 0.06788187986805701, + "kl": 0.0150146484375, "learning_rate": 9.026620557966279e-07, - "loss": 0.0303, - "reward": 1.875, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 292 }, { "clip_ratio": 0.0, - "completion_length": 434.5625, + "completion_length": 443.93751525878906, "epoch": 0.293, - "grad_norm": 0.9532204430113935, - "kl": 0.0572509765625, + "grad_norm": 0.06684481261288397, + "kl": 0.016510009765625, "learning_rate": 9.016842125884684e-07, - "loss": 0.0799, - "reward": 1.828125, - "reward_std": 0.1281561702489853, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 293 }, { "clip_ratio": 0.0, - "completion_length": 463.5208435058594, + "completion_length": 461.7291717529297, "epoch": 0.294, - "grad_norm": 1.733183243712599, - "kl": 0.067626953125, + "grad_norm": 0.05151572062341038, + "kl": 0.01513671875, "learning_rate": 9.007020842191634e-07, - "loss": -0.0534, - "reward": 1.7083333730697632, - "reward_std": 0.29193708300590515, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 294 }, { "clip_ratio": 0.0, - "completion_length": 455.89585876464844, + "completion_length": 503.5208435058594, "epoch": 0.295, - "grad_norm": 0.9847953426021161, - "kl": 0.0587158203125, + "grad_norm": 0.0590936047141315, + "kl": 0.013519287109375, "learning_rate": 8.997156826556369e-07, - "loss": -0.0223, - "reward": 1.625, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 295 }, { "clip_ratio": 0.0, - "completion_length": 616.9583740234375, + "completion_length": 545.0625152587891, "epoch": 0.296, - "grad_norm": 1.3963657945325254, - "kl": 0.07470703125, + "grad_norm": 0.05288289391515388, + "kl": 0.01641845703125, "learning_rate": 8.987250199168808e-07, - "loss": 0.0493, - "reward": 1.9166666865348816, - "reward_std": 0.18523553758859634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 296 }, { "clip_ratio": 0.0, - "completion_length": 463.22918701171875, + "completion_length": 517.7291717529297, "epoch": 0.297, - "grad_norm": 1.1706032627741179, - "kl": 0.064697265625, + "grad_norm": 0.060860807391451464, + "kl": 0.016143798828125, "learning_rate": 8.977301080738079e-07, - "loss": 0.0047, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 297 }, { "clip_ratio": 0.0, - "completion_length": 483.22918701171875, + "completion_length": 473.3958435058594, "epoch": 0.298, - "grad_norm": 1.3839329084946337, - "kl": 0.0732421875, + "grad_norm": 0.05173550443403316, + "kl": 0.017578125, "learning_rate": 8.967309592491052e-07, - "loss": 0.0126, - "reward": 1.7291666865348816, - "reward_std": 0.21037911623716354, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 298 }, { "clip_ratio": 0.0, - "completion_length": 493.9166717529297, + "completion_length": 501.0208435058594, "epoch": 0.299, - "grad_norm": 1.615376030628624, - "kl": 0.0731201171875, + "grad_norm": 1.7229744630333605, + "kl": 0.014373779296875, "learning_rate": 8.957275856170855e-07, - "loss": 0.0587, - "reward": 1.5000000596046448, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.5000000204890966, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0001, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 299 }, { "clip_ratio": 0.0, - "completion_length": 519.4583587646484, + "completion_length": 517.5208435058594, "epoch": 0.3, - "grad_norm": 1.0229873654833457, - "kl": 0.0693359375, + "grad_norm": 0.054848896917061254, + "kl": 0.0145263671875, "learning_rate": 8.9471999940354e-07, - "loss": 0.0167, - "reward": 1.9166666865348816, - "reward_std": 0.12309150397777557, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 300 }, { "clip_ratio": 0.0, - "completion_length": 481.5416717529297, + "completion_length": 473.4791717529297, "epoch": 0.301, - "grad_norm": 1.9010664501913601, - "kl": 0.0703125, + "grad_norm": 1.1842846408033978, + "kl": 0.01910400390625, "learning_rate": 8.937082128855891e-07, - "loss": 0.0487, - "reward": 1.8072917461395264, - "reward_std": 0.21330247819423676, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0054, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 301 }, { "clip_ratio": 0.0, - "completion_length": 548.6458435058594, + "completion_length": 498.5625, "epoch": 0.302, - "grad_norm": 0.82961455607938, - "kl": 0.06884765625, + "grad_norm": 0.0416267862511951, + "kl": 0.01568603515625, "learning_rate": 8.926922383915315e-07, - "loss": 0.0223, - "reward": 1.8125, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 302 }, { "clip_ratio": 0.0, - "completion_length": 413.3333435058594, + "completion_length": 467.87501525878906, "epoch": 0.303, - "grad_norm": 1.1853334238442175, - "kl": 0.0751953125, + "grad_norm": 0.04731658896726055, + "kl": 0.01678466796875, "learning_rate": 8.916720883006963e-07, - "loss": 0.0135, - "reward": 1.7291666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 303 }, { "clip_ratio": 0.0, - "completion_length": 542.5625152587891, + "completion_length": 526.2291870117188, "epoch": 0.304, - "grad_norm": 1.9848921579633452, - "kl": 0.07958984375, + "grad_norm": 1.2303805096767035, + "kl": 0.016326904296875, "learning_rate": 8.906477750432903e-07, - "loss": 0.0365, - "reward": 1.8281250596046448, - "reward_std": 0.29483576118946075, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0044, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, "rewards/tag_count_reward": 0.9947916865348816, "step": 304 }, { "clip_ratio": 0.0, - "completion_length": 603.6458740234375, + "completion_length": 579.3333435058594, "epoch": 0.305, - "grad_norm": 1.4544800013147805, - "kl": 0.089111328125, + "grad_norm": 0.05744770348848511, + "kl": 0.015533447265625, "learning_rate": 8.896193111002475e-07, - "loss": 0.0455, - "reward": 1.6145833730697632, - "reward_std": 0.2064298689365387, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 305 }, { "clip_ratio": 0.0, - "completion_length": 397.9791717529297, + "completion_length": 427.3125, "epoch": 0.306, - "grad_norm": 0.11913652369951816, - "kl": 0.058349609375, + "grad_norm": 0.05850889841453987, + "kl": 0.01361083984375, "learning_rate": 8.88586709003076e-07, - "loss": 0.0024, - "reward": 2.0, + "loss": 0.0005, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 306 }, { "clip_ratio": 0.0, - "completion_length": 534.4583435058594, + "completion_length": 470.9166717529297, "epoch": 0.307, - "grad_norm": 1.3479517861534447, - "kl": 0.077880859375, + "grad_norm": 1.1348449824926166, + "kl": 0.016357421875, "learning_rate": 8.875499813337067e-07, - "loss": 0.0527, - "reward": 1.78125, - "reward_std": 0.24761711061000824, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0164, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 307 }, { "clip_ratio": 0.0, - "completion_length": 438.9375, + "completion_length": 455.3541717529297, "epoch": 0.308, - "grad_norm": 1.133193989574027, - "kl": 0.068359375, + "grad_norm": 0.06661742406273104, + "kl": 0.01873779296875, "learning_rate": 8.865091407243394e-07, - "loss": -0.0027, - "reward": 1.5208333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.520833333954215, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 308 }, { "clip_ratio": 0.0, - "completion_length": 405.97918701171875, + "completion_length": 458.85418701171875, "epoch": 0.309, - "grad_norm": 0.13700408202121145, - "kl": 0.063232421875, + "grad_norm": 0.05716761421750185, + "kl": 0.01800537109375, "learning_rate": 8.85464199857288e-07, - "loss": 0.0027, - "reward": 2.0, + "loss": 0.0007, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 309 }, { "clip_ratio": 0.0, - "completion_length": 526.0625, + "completion_length": 495.95835876464844, "epoch": 0.31, - "grad_norm": 1.7134568131475336, - "kl": 0.08837890625, + "grad_norm": 0.06628759135865589, + "kl": 0.01934814453125, "learning_rate": 8.844151714648274e-07, - "loss": 0.0126, - "reward": 1.7916666865348816, - "reward_std": 0.18523553758859634, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 310 }, { "clip_ratio": 0.0, - "completion_length": 526.1875152587891, + "completion_length": 493.7083435058594, "epoch": 0.311, - "grad_norm": 1.632802431264856, - "kl": 0.082763671875, + "grad_norm": 0.07749485923013408, + "kl": 0.01983642578125, "learning_rate": 8.833620683290375e-07, - "loss": -0.0325, - "reward": 1.9062500596046448, - "reward_std": 0.22131992876529694, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 311 }, { "clip_ratio": 0.0, - "completion_length": 572.6666870117188, + "completion_length": 512.4375305175781, "epoch": 0.312, - "grad_norm": 1.5253663359736243, - "kl": 0.083251953125, + "grad_norm": 0.09232922199419766, + "kl": 0.01904296875, "learning_rate": 8.823049032816478e-07, - "loss": 0.0206, - "reward": 1.8958333730697632, - "reward_std": 0.2574043273925781, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 312 }, { "clip_ratio": 0.0, - "completion_length": 621.6875, + "completion_length": 507.0833435058594, "epoch": 0.313, - "grad_norm": 1.4695273684515788, - "kl": 0.093505859375, + "grad_norm": 0.0710841483167992, + "kl": 0.02093505859375, "learning_rate": 8.812436892038805e-07, - "loss": 0.0134, - "reward": 1.6770833730697632, - "reward_std": 0.2797093912959099, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 313 }, { "clip_ratio": 0.0, - "completion_length": 464.2916717529297, + "completion_length": 459.4583435058594, "epoch": 0.314, - "grad_norm": 1.1033631567573143, - "kl": 0.082275390625, + "grad_norm": 0.3275093589318186, + "kl": 0.018798828125, "learning_rate": 8.801784390262943e-07, - "loss": 0.0216, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 314 }, { "clip_ratio": 0.0, - "completion_length": 574.9166870117188, + "completion_length": 526.5625152587891, "epoch": 0.315, - "grad_norm": 1.5233693323934243, - "kl": 0.08251953125, + "grad_norm": 0.059675318673412456, + "kl": 0.015472412109375, "learning_rate": 8.791091657286267e-07, - "loss": 0.0394, - "reward": 1.8958333730697632, - "reward_std": 0.21037911623716354, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 315 }, { "clip_ratio": 0.0, - "completion_length": 457.9791717529297, + "completion_length": 463.00001525878906, "epoch": 0.316, - "grad_norm": 1.049999392705825, - "kl": 0.0791015625, + "grad_norm": 0.13956820712764417, + "kl": 0.02545166015625, "learning_rate": 8.780358823396352e-07, - "loss": -0.0344, - "reward": 1.7916666865348816, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 316 }, { "clip_ratio": 0.0, - "completion_length": 561.5208435058594, + "completion_length": 514.125, "epoch": 0.317, - "grad_norm": 1.335210375686767, - "kl": 0.0869140625, + "grad_norm": 0.06831673248504883, + "kl": 0.01904296875, "learning_rate": 8.769586019369391e-07, - "loss": 0.027, - "reward": 1.9322917461395264, - "reward_std": 0.1700936183333397, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 317 }, { "clip_ratio": 0.0, - "completion_length": 488.3958435058594, + "completion_length": 523.4583587646484, "epoch": 0.318, - "grad_norm": 1.7608893285689193, - "kl": 0.06982421875, + "grad_norm": 0.15825854582183677, + "kl": 0.01904296875, "learning_rate": 8.758773376468604e-07, - "loss": 0.0186, - "reward": 1.9322916865348816, - "reward_std": 0.2039930783212185, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 318 }, { "clip_ratio": 0.0, - "completion_length": 554.1458587646484, + "completion_length": 482.72918701171875, "epoch": 0.319, - "grad_norm": 2.2483486848963605, - "kl": 0.091552734375, + "grad_norm": 0.053797281866110686, + "kl": 0.0186767578125, "learning_rate": 8.747921026442629e-07, - "loss": 0.154, - "reward": 1.7916666865348816, - "reward_std": 0.26742906868457794, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 319 }, { "clip_ratio": 0.0, - "completion_length": 669.4375305175781, + "completion_length": 502.6458435058594, "epoch": 0.32, - "grad_norm": 0.7334529033556741, - "kl": 0.087158203125, + "grad_norm": 0.04354466180020813, + "kl": 0.015777587890625, "learning_rate": 8.737029101523929e-07, - "loss": 0.0158, - "reward": 1.875, - "reward_std": 0.13055823743343353, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 320 }, { "clip_ratio": 0.0, - "completion_length": 612.3333435058594, + "completion_length": 542.9583587646484, "epoch": 0.321, - "grad_norm": 1.6871249628307925, - "kl": 0.10595703125, + "grad_norm": 0.04820768922800098, + "kl": 0.015869140625, "learning_rate": 8.726097734427172e-07, - "loss": -0.0049, - "reward": 1.6458333730697632, - "reward_std": 0.3749151676893234, - "rewards/accuracy_reward": 0.645833358168602, - "rewards/format_reward": 0.0, + "loss": 0.0006, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 321 }, { "clip_ratio": 0.0, - "completion_length": 597.8333435058594, + "completion_length": 502.04168701171875, "epoch": 0.322, - "grad_norm": 1.8109359792426167, - "kl": 0.0849609375, + "grad_norm": 0.059999763448746204, + "kl": 0.01849365234375, "learning_rate": 8.715127058347614e-07, - "loss": 0.0339, - "reward": 1.7447916865348816, - "reward_std": 0.2955199033021927, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0007, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 322 }, { "clip_ratio": 0.0, - "completion_length": 576.0, + "completion_length": 492.37501525878906, "epoch": 0.323, - "grad_norm": 1.4290500373152046, - "kl": 0.09375, + "grad_norm": 0.43781729225200805, + "kl": 0.02587890625, "learning_rate": 8.704117206959484e-07, - "loss": 0.0079, - "reward": 1.9166667461395264, - "reward_std": 0.19462472200393677, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 323 }, { "clip_ratio": 0.0, - "completion_length": 525.3958435058594, + "completion_length": 505.4791717529297, "epoch": 0.324, - "grad_norm": 1.5722480741458937, - "kl": 0.07861328125, + "grad_norm": 0.05736039229627184, + "kl": 0.0220947265625, "learning_rate": 8.693068314414344e-07, - "loss": 0.025, - "reward": 1.8125000596046448, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 324 }, { "clip_ratio": 0.0, - "completion_length": 545.6458587646484, + "completion_length": 510.1250305175781, "epoch": 0.325, - "grad_norm": 0.14275340915080217, - "kl": 0.08203125, + "grad_norm": 0.05452839261024288, + "kl": 0.020263671875, "learning_rate": 8.681980515339463e-07, - "loss": 0.0036, - "reward": 2.0, + "loss": 0.0008, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 325 }, { "clip_ratio": 0.0, - "completion_length": 644.8125, + "completion_length": 535.3333740234375, "epoch": 0.326, - "grad_norm": 2.03189865058837, - "kl": 0.101318359375, + "grad_norm": 0.8837300899462217, + "kl": 0.018035888671875, "learning_rate": 8.670853944836176e-07, - "loss": 0.1319, - "reward": 1.6510416865348816, - "reward_std": 0.34709176421165466, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": -0.0116, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 326 }, { "clip_ratio": 0.0, - "completion_length": 617.9583740234375, + "completion_length": 586.9375305175781, "epoch": 0.327, - "grad_norm": 1.3715228051278106, - "kl": 0.0947265625, + "grad_norm": 1.6811219122897223, + "kl": 0.02203369140625, "learning_rate": 8.659688738478231e-07, - "loss": 0.0056, - "reward": 1.8541667461395264, - "reward_std": 0.22604453563690186, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1342, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 327 }, { "clip_ratio": 0.0, - "completion_length": 550.3541870117188, + "completion_length": 489.3333435058594, "epoch": 0.328, - "grad_norm": 1.3300261340063773, - "kl": 0.084716796875, + "grad_norm": 1.8690265977611447, + "kl": 0.025390625, "learning_rate": 8.648485032310144e-07, - "loss": 0.0096, - "reward": 1.8177083730697632, - "reward_std": 0.26080769300460815, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0393, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 328 }, { "clip_ratio": 0.0, - "completion_length": 455.16668701171875, + "completion_length": 509.12501525878906, "epoch": 0.329, - "grad_norm": 1.0651360953195264, - "kl": 0.06982421875, + "grad_norm": 2.034164201051351, + "kl": 0.0216064453125, "learning_rate": 8.63724296284554e-07, - "loss": 0.0005, - "reward": 1.8489583730697632, - "reward_std": 0.1234515830874443, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1843, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 329 }, { "clip_ratio": 0.0, - "completion_length": 431.8958435058594, + "completion_length": 444.7916717529297, "epoch": 0.33, - "grad_norm": 1.1527589242621392, - "kl": 0.068115234375, + "grad_norm": 0.05567755862652329, + "kl": 0.0224609375, "learning_rate": 8.625962667065487e-07, - "loss": -0.002, - "reward": 1.9375, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 330 }, { "clip_ratio": 0.0, - "completion_length": 726.5833435058594, + "completion_length": 581.4166870117188, "epoch": 0.331, - "grad_norm": 1.8181438264423686, - "kl": 0.123046875, + "grad_norm": 0.04654044343912161, + "kl": 0.01922607421875, "learning_rate": 8.614644282416831e-07, - "loss": 0.0415, - "reward": 1.5, - "reward_std": 0.4227762967348099, - "rewards/accuracy_reward": 0.5208333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 331 }, { "clip_ratio": 0.0, - "completion_length": 496.125, + "completion_length": 494.9375, "epoch": 0.332, - "grad_norm": 12.999889365887881, - "kl": 0.129638671875, + "grad_norm": 0.06383138430755644, + "kl": 0.0205078125, "learning_rate": 8.603287946810513e-07, - "loss": 0.0498, - "reward": 1.9375000596046448, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 332 }, { "clip_ratio": 0.0, - "completion_length": 608.0625305175781, + "completion_length": 472.4166717529297, "epoch": 0.333, - "grad_norm": 1.3589962087124057, - "kl": 0.09716796875, + "grad_norm": 0.09659644276035662, + "kl": 0.02178955078125, "learning_rate": 8.591893798619903e-07, - "loss": 0.0291, - "reward": 1.8958333730697632, - "reward_std": 0.21037911623716354, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 333 }, { "clip_ratio": 0.0, - "completion_length": 603.6666717529297, + "completion_length": 482.52085876464844, "epoch": 0.334, - "grad_norm": 1.2683173147380378, - "kl": 0.09521484375, + "grad_norm": 0.0633386796003193, + "kl": 0.02069091796875, "learning_rate": 8.580461976679099e-07, - "loss": 0.0854, - "reward": 1.8489583730697632, - "reward_std": 0.2336074709892273, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0008, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 334 }, { "clip_ratio": 0.0, - "completion_length": 583.2291870117188, + "completion_length": 507.1041717529297, "epoch": 0.335, - "grad_norm": 1.4792294059034734, - "kl": 0.09716796875, + "grad_norm": 1.1750271881166172, + "kl": 0.024169921875, "learning_rate": 8.568992620281243e-07, - "loss": 0.026, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0184, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 335 }, { "clip_ratio": 0.0, - "completion_length": 551.0208435058594, + "completion_length": 515.2083435058594, "epoch": 0.336, - "grad_norm": 1.7022289003271576, - "kl": 0.096923828125, + "grad_norm": 0.0705503835518815, + "kl": 0.02325439453125, "learning_rate": 8.557485869176825e-07, - "loss": -0.0263, - "reward": 1.7395833730697632, - "reward_std": 0.25259073823690414, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 336 }, { "clip_ratio": 0.0, - "completion_length": 503.43751525878906, + "completion_length": 526.1041870117188, "epoch": 0.337, - "grad_norm": 1.0781904768916355, - "kl": 0.080078125, + "grad_norm": 0.10934531694748963, + "kl": 0.0225830078125, "learning_rate": 8.545941863571973e-07, - "loss": 0.0162, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 337 }, { "clip_ratio": 0.0, - "completion_length": 508.41668701171875, + "completion_length": 494.85418701171875, "epoch": 0.338, - "grad_norm": 1.173175067685679, - "kl": 0.098388671875, + "grad_norm": 0.06572627733993577, + "kl": 0.0233154296875, "learning_rate": 8.534360744126753e-07, - "loss": 0.0335, - "reward": 1.96875, - "reward_std": 0.07769769430160522, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 338 }, { "clip_ratio": 0.0, - "completion_length": 579.3750152587891, + "completion_length": 470.47918701171875, "epoch": 0.339, - "grad_norm": 1.1744380518666515, - "kl": 0.092529296875, + "grad_norm": 0.06859987776775232, + "kl": 0.0250244140625, "learning_rate": 8.522742651953456e-07, - "loss": 0.0186, - "reward": 1.9635416865348816, - "reward_std": 0.09573988616466522, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 339 }, { "clip_ratio": 0.0, - "completion_length": 511.6041717529297, + "completion_length": 544.3750305175781, "epoch": 0.34, - "grad_norm": 1.3633594610713922, - "kl": 0.084716796875, + "grad_norm": 0.06359343320977029, + "kl": 0.0220947265625, "learning_rate": 8.511087728614862e-07, - "loss": -0.001, - "reward": 1.8541666865348816, - "reward_std": 0.22604453563690186, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0009, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 340 }, { "clip_ratio": 0.0, - "completion_length": 527.2291870117188, + "completion_length": 491.3541717529297, "epoch": 0.341, - "grad_norm": 1.7579493243510367, - "kl": 0.099853515625, + "grad_norm": 0.08259686589085237, + "kl": 0.02557373046875, "learning_rate": 8.499396116122535e-07, - "loss": -0.0084, - "reward": 1.6979167461395264, - "reward_std": 0.2897341400384903, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 341 }, { "clip_ratio": 0.0, - "completion_length": 499.77085876464844, + "completion_length": 484.1458435058594, "epoch": 0.342, - "grad_norm": 1.3486506673603433, - "kl": 0.090576171875, + "grad_norm": 0.09912480037756079, + "kl": 0.02545166015625, "learning_rate": 8.487667956935087e-07, - "loss": 0.0241, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 342 }, { "clip_ratio": 0.0, - "completion_length": 506.8333435058594, + "completion_length": 469.2708435058594, "epoch": 0.343, - "grad_norm": 1.060038239319139, - "kl": 0.088623046875, + "grad_norm": 0.07102065021267147, + "kl": 0.025390625, "learning_rate": 8.475903393956433e-07, - "loss": -0.0124, - "reward": 1.8958333730697632, - "reward_std": 0.12873217463493347, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 343 }, { "clip_ratio": 0.0, - "completion_length": 560.7291870117188, + "completion_length": 533.7083740234375, "epoch": 0.344, - "grad_norm": 1.8897855990332824, - "kl": 0.10986328125, + "grad_norm": 1.1653009469800304, + "kl": 0.02618408203125, "learning_rate": 8.464102570534061e-07, - "loss": -0.0813, - "reward": 1.8333333730697632, - "reward_std": 0.3083270341157913, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0184, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 344 }, { "clip_ratio": 0.0, - "completion_length": 560.375, + "completion_length": 518.3750152587891, "epoch": 0.345, - "grad_norm": 1.5781997371861605, - "kl": 0.099609375, + "grad_norm": 0.0967071028899652, + "kl": 0.0279541015625, "learning_rate": 8.452265630457282e-07, - "loss": 0.1018, - "reward": 1.8593750596046448, - "reward_std": 0.22860805690288544, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 345 }, { "clip_ratio": 0.0, - "completion_length": 595.0208740234375, + "completion_length": 601.8958435058594, "epoch": 0.346, - "grad_norm": 1.549241907434681, - "kl": 0.092041015625, + "grad_norm": 1.1680093291977525, + "kl": 0.02410888671875, "learning_rate": 8.440392717955475e-07, - "loss": 0.0624, - "reward": 1.9375, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0125, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 346 }, { "clip_ratio": 0.0, - "completion_length": 463.2708435058594, + "completion_length": 480.7083435058594, "epoch": 0.347, - "grad_norm": 1.090363468823661, - "kl": 0.085693359375, + "grad_norm": 0.0684180862342716, + "kl": 0.02630615234375, "learning_rate": 8.428483977696328e-07, - "loss": 0.0047, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 347 }, { "clip_ratio": 0.0, - "completion_length": 499.8333435058594, + "completion_length": 517.8958435058594, "epoch": 0.348, - "grad_norm": 1.6064296670377103, - "kl": 0.08056640625, + "grad_norm": 1.1491175892311671, + "kl": 0.0286865234375, "learning_rate": 8.416539554784089e-07, - "loss": 0.0253, - "reward": 1.8541666865348816, - "reward_std": 0.20272701978683472, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0028, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 348 }, { "clip_ratio": 0.0, - "completion_length": 467.31251525878906, + "completion_length": 473.3958435058594, "epoch": 0.349, - "grad_norm": 0.16555077667993479, - "kl": 0.07861328125, + "grad_norm": 0.9962984772350725, + "kl": 0.02545166015625, "learning_rate": 8.404559594757777e-07, - "loss": 0.0034, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0051, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 349 }, { "clip_ratio": 0.0, - "completion_length": 464.54168701171875, + "completion_length": 511.00001525878906, "epoch": 0.35, - "grad_norm": 1.2560318303630802, - "kl": 0.091796875, + "grad_norm": 0.07845658168559723, + "kl": 0.0296630859375, "learning_rate": 8.392544243589427e-07, - "loss": -0.0241, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 350 }, { "clip_ratio": 0.0, - "completion_length": 462.35418701171875, + "completion_length": 525.7708435058594, "epoch": 0.351, - "grad_norm": 0.1530072283362282, - "kl": 0.08984375, + "grad_norm": 0.9282118009818712, + "kl": 0.031005859375, "learning_rate": 8.3804936476823e-07, - "loss": 0.0036, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0192, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 351 }, { "clip_ratio": 0.0, - "completion_length": 446.2291717529297, + "completion_length": 563.8750152587891, "epoch": 0.352, - "grad_norm": 0.14873820420361133, - "kl": 0.07421875, + "grad_norm": 0.06552065313261518, + "kl": 0.02557373046875, "learning_rate": 8.368407953869103e-07, - "loss": 0.0031, - "reward": 2.0, + "loss": 0.001, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 352 }, { "clip_ratio": 0.0, - "completion_length": 539.1875305175781, + "completion_length": 581.0625305175781, "epoch": 0.353, - "grad_norm": 1.1457511579066482, - "kl": 0.08154296875, + "grad_norm": 0.060361965526579735, + "kl": 0.02569580078125, "learning_rate": 8.356287309410204e-07, - "loss": 0.0745, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.001, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 353 }, { "clip_ratio": 0.0, - "completion_length": 573.0000305175781, + "completion_length": 548.6666717529297, "epoch": 0.354, - "grad_norm": 1.2506551209220567, - "kl": 0.10791015625, + "grad_norm": 0.10004136159227021, + "kl": 0.03033447265625, "learning_rate": 8.344131861991828e-07, - "loss": -0.0084, - "reward": 1.7708333730697632, - "reward_std": 0.21037912368774414, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 354 }, { "clip_ratio": 0.0, - "completion_length": 528.4166870117188, + "completion_length": 570.4583587646484, "epoch": 0.355, - "grad_norm": 2.0351339386414735, - "kl": 0.10595703125, + "grad_norm": 0.07661012404336875, + "kl": 0.0269775390625, "learning_rate": 8.331941759724268e-07, - "loss": -0.0391, - "reward": 1.875, - "reward_std": 0.22613351047039032, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 355 }, { "clip_ratio": 0.0, - "completion_length": 519.8958740234375, + "completion_length": 585.1250305175781, "epoch": 0.356, - "grad_norm": 1.753651906481268, - "kl": 0.08349609375, + "grad_norm": 0.06700631082240648, + "kl": 0.02691650390625, "learning_rate": 8.319717151140072e-07, - "loss": -0.0632, - "reward": 1.7916667461395264, - "reward_std": 0.22040386497974396, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 356 }, { "clip_ratio": 0.0, - "completion_length": 537.9166870117188, + "completion_length": 559.2708587646484, "epoch": 0.357, - "grad_norm": 1.4397028938630096, - "kl": 0.0908203125, + "grad_norm": 1.1262725311155874, + "kl": 0.02838134765625, "learning_rate": 8.307458185192238e-07, - "loss": 0.068, - "reward": 1.875, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.0009, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 357 }, { "clip_ratio": 0.0, - "completion_length": 489.6875305175781, + "completion_length": 580.2500305175781, "epoch": 0.358, - "grad_norm": 1.4827034462032442, - "kl": 0.076416015625, + "grad_norm": 0.07699460992208937, + "kl": 0.028564453125, "learning_rate": 8.295165011252396e-07, - "loss": 0.0028, - "reward": 1.8125, - "reward_std": 0.2436249852180481, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 358 }, { "clip_ratio": 0.0, - "completion_length": 413.0833435058594, + "completion_length": 488.4166717529297, "epoch": 0.359, - "grad_norm": 0.12982784855812285, - "kl": 0.073486328125, + "grad_norm": 0.07259581116856176, + "kl": 0.0335693359375, "learning_rate": 8.282837779108993e-07, - "loss": 0.003, - "reward": 2.0, + "loss": 0.0013, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 359 }, { "clip_ratio": 0.0, - "completion_length": 425.62501525878906, + "completion_length": 601.1041870117188, "epoch": 0.36, - "grad_norm": 1.2043950398822112, - "kl": 0.0712890625, + "grad_norm": 1.645882079495026, + "kl": 0.03192138671875, "learning_rate": 8.270476638965461e-07, - "loss": 0.022, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1289, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 360 }, { "clip_ratio": 0.0, - "completion_length": 469.0416717529297, + "completion_length": 604.5208435058594, "epoch": 0.361, - "grad_norm": 1.5294320060286595, - "kl": 0.068603515625, + "grad_norm": 0.06797167589286383, + "kl": 0.02838134765625, "learning_rate": 8.258081741438394e-07, - "loss": 0.0154, - "reward": 1.7291666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 361 }, { "clip_ratio": 0.0, - "completion_length": 484.2916717529297, + "completion_length": 586.7708435058594, "epoch": 0.362, - "grad_norm": 0.9723332963261183, - "kl": 0.072509765625, + "grad_norm": 1.214355109649776, + "kl": 0.03515625, "learning_rate": 8.245653237555705e-07, - "loss": 0.007, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1163, + "reward": 0.9739583432674408, + "reward_std": 0.06226281076669693, + "rewards/tag_count_reward": 0.9739583432674408, "step": 362 }, { "clip_ratio": 0.0, - "completion_length": 432.8333435058594, + "completion_length": 536.7708435058594, "epoch": 0.363, - "grad_norm": 1.5069469978704204, - "kl": 0.068359375, + "grad_norm": 0.063669726734918, + "kl": 0.02801513671875, "learning_rate": 8.23319127875479e-07, - "loss": 0.0232, - "reward": 1.9687500596046448, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0011, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 363 }, { "clip_ratio": 0.0, - "completion_length": 468.3750305175781, + "completion_length": 555.6041717529297, "epoch": 0.364, - "grad_norm": 1.1930864136693693, - "kl": 0.068603515625, + "grad_norm": 0.07180792348851471, + "kl": 0.03021240234375, "learning_rate": 8.220696016880687e-07, - "loss": 0.0686, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 364 }, { "clip_ratio": 0.0, - "completion_length": 485.0000305175781, + "completion_length": 566.625, "epoch": 0.365, - "grad_norm": 1.7817636999623176, - "kl": 0.078857421875, + "grad_norm": 1.5129871887055757, + "kl": 0.031982421875, "learning_rate": 8.208167604184217e-07, - "loss": 0.0397, - "reward": 1.9166666865348816, - "reward_std": 0.18523553758859634, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1088, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 365 }, { "clip_ratio": 0.0, - "completion_length": 482.875, + "completion_length": 488.5833435058594, "epoch": 0.366, - "grad_norm": 2.1766087104472525, - "kl": 0.087158203125, + "grad_norm": 0.1253293797973738, + "kl": 0.0357666015625, "learning_rate": 8.195606193320136e-07, - "loss": 0.0038, - "reward": 1.8125000596046448, - "reward_std": 0.3000393956899643, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0014, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 366 }, { "clip_ratio": 0.0, - "completion_length": 455.81251525878906, + "completion_length": 537.6041870117188, "epoch": 0.367, - "grad_norm": 1.316846946655349, - "kl": 0.065185546875, + "grad_norm": 0.09708604189309158, + "kl": 0.0328369140625, "learning_rate": 8.183011937345271e-07, - "loss": 0.013, - "reward": 1.7083333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0013, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 367 }, { "clip_ratio": 0.0, - "completion_length": 496.1458435058594, + "completion_length": 616.9791870117188, "epoch": 0.368, - "grad_norm": 1.248626414757594, - "kl": 0.08203125, + "grad_norm": 1.034557724842816, + "kl": 0.0343017578125, "learning_rate": 8.170384989716657e-07, - "loss": -0.037, - "reward": 1.9375000596046448, - "reward_std": 0.16948115825653076, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0395, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 368 }, { "clip_ratio": 0.0, - "completion_length": 468.37501525878906, + "completion_length": 541.6041870117188, "epoch": 0.369, - "grad_norm": 1.0199782335615606, - "kl": 0.07373046875, + "grad_norm": 0.06191459889287263, + "kl": 0.0311279296875, "learning_rate": 8.157725504289664e-07, - "loss": -0.0289, - "reward": 1.8333333730697632, - "reward_std": 0.12309150397777557, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 369 }, { "clip_ratio": 0.0, - "completion_length": 499.8958435058594, + "completion_length": 507.9583435058594, "epoch": 0.37, - "grad_norm": 0.94982525136099, - "kl": 0.083984375, + "grad_norm": 1.1542301179234153, + "kl": 0.038330078125, "learning_rate": 8.145033635316128e-07, - "loss": -0.0013, - "reward": 1.7916666865348816, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0442, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 370 }, { "clip_ratio": 0.0, - "completion_length": 517.2083587646484, + "completion_length": 540.1250152587891, "epoch": 0.371, - "grad_norm": 1.8194716028119047, - "kl": 0.083740234375, + "grad_norm": 0.10027568037073406, + "kl": 0.03515625, "learning_rate": 8.13230953744247e-07, - "loss": -0.0724, - "reward": 1.7812500596046448, - "reward_std": 0.2897341623902321, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0014, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 371 }, { "clip_ratio": 0.0, - "completion_length": 397.43751525878906, + "completion_length": 487.6041717529297, "epoch": 0.372, - "grad_norm": 1.7080148183061603, - "kl": 0.0609130859375, + "grad_norm": 0.0874359442204591, + "kl": 0.035400390625, "learning_rate": 8.119553365707802e-07, - "loss": 0.0417, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0014, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 372 }, { "clip_ratio": 0.0, - "completion_length": 496.2708435058594, + "completion_length": 528.9791870117188, "epoch": 0.373, - "grad_norm": 1.0151951615675194, - "kl": 0.066650390625, + "grad_norm": 0.08394294589398749, + "kl": 0.03497314453125, "learning_rate": 8.106765275542053e-07, - "loss": 0.0113, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0013, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 373 }, { "clip_ratio": 0.0, - "completion_length": 571.0625305175781, + "completion_length": 553.1250305175781, "epoch": 0.374, - "grad_norm": 1.8068360284906588, - "kl": 0.099609375, + "grad_norm": 0.09439882091925837, + "kl": 0.0390625, "learning_rate": 8.093945422764069e-07, - "loss": 0.0191, - "reward": 1.8541667461395264, - "reward_std": 0.22604453563690186, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0015, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 374 }, { "clip_ratio": 0.0, - "completion_length": 759.6041870117188, + "completion_length": 559.7708740234375, "epoch": 0.375, - "grad_norm": 1.8369452651781422, - "kl": 0.0927734375, + "grad_norm": 0.05978916201104615, + "kl": 0.0321044921875, "learning_rate": 8.081093963579707e-07, - "loss": 0.0693, - "reward": 1.765625, - "reward_std": 0.27649710327386856, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0013, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 375 }, { "clip_ratio": 0.0, - "completion_length": 541.0000305175781, + "completion_length": 547.3958435058594, "epoch": 0.376, - "grad_norm": 0.9246071259306416, - "kl": 0.08154296875, + "grad_norm": 2.1280601903761647, + "kl": 0.03857421875, "learning_rate": 8.068211054579943e-07, - "loss": 0.0231, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.2465, + "reward": 0.9739583432674408, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583432674408, "step": 376 }, { "clip_ratio": 0.0, - "completion_length": 491.3958435058594, + "completion_length": 422.10418701171875, "epoch": 0.377, - "grad_norm": 1.8088015127518082, - "kl": 0.08984375, + "grad_norm": 0.08127219256788779, + "kl": 0.040283203125, "learning_rate": 8.055296852738956e-07, - "loss": 0.1191, - "reward": 1.9270833730697632, - "reward_std": 0.17128896713256836, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 377 }, { "clip_ratio": 0.0, - "completion_length": 604.0000305175781, + "completion_length": 545.1875305175781, "epoch": 0.378, - "grad_norm": 1.00437824388029, - "kl": 0.08740234375, + "grad_norm": 1.3001640432345112, + "kl": 0.03955078125, "learning_rate": 8.04235151541222e-07, - "loss": 0.0341, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1207, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 378 }, { "clip_ratio": 0.0, - "completion_length": 510.3958435058594, + "completion_length": 581.6458435058594, "epoch": 0.379, - "grad_norm": 1.621569522092847, - "kl": 0.079345703125, + "grad_norm": 0.08477819999323476, + "kl": 0.0411376953125, "learning_rate": 8.029375200334587e-07, - "loss": 0.0558, - "reward": 1.8958333730697632, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 379 }, { "clip_ratio": 0.0, - "completion_length": 526.9375305175781, + "completion_length": 528.8750305175781, "epoch": 0.38, - "grad_norm": 1.7173925029063968, - "kl": 0.076416015625, + "grad_norm": 0.09753995035880399, + "kl": 0.0433349609375, "learning_rate": 8.01636806561836e-07, - "loss": 0.0262, - "reward": 1.9375, - "reward_std": 0.16948114335536957, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 380 }, { "clip_ratio": 0.0, - "completion_length": 535.7291870117188, + "completion_length": 507.5833435058594, "epoch": 0.381, - "grad_norm": 1.2080771599564561, - "kl": 0.08056640625, + "grad_norm": 0.999357167905774, + "kl": 0.049560546875, "learning_rate": 8.003330269751372e-07, - "loss": 0.0129, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.002, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 381 }, { "clip_ratio": 0.0, - "completion_length": 604.3125, + "completion_length": 528.8750152587891, "epoch": 0.382, - "grad_norm": 1.7155137556890026, - "kl": 0.101806640625, + "grad_norm": 0.088362076534267, + "kl": 0.0413818359375, "learning_rate": 7.990261971595048e-07, - "loss": -0.0322, - "reward": 1.8177083730697632, - "reward_std": 0.29577653110027313, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 382 }, { "clip_ratio": 0.0, - "completion_length": 637.7500305175781, + "completion_length": 558.2916870117188, "epoch": 0.383, - "grad_norm": 1.1075346551188545, - "kl": 0.083984375, + "grad_norm": 0.8238863782718631, + "kl": 0.0421142578125, "learning_rate": 7.977163330382479e-07, - "loss": 0.0078, - "reward": 1.9062500596046448, - "reward_std": 0.18150995671749115, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": -0.0087, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 383 }, { "clip_ratio": 0.0, - "completion_length": 602.3125305175781, + "completion_length": 502.7083435058594, "epoch": 0.384, - "grad_norm": 2.0654902674698077, - "kl": 0.10107421875, + "grad_norm": 0.09055795872538838, + "kl": 0.0428466796875, "learning_rate": 7.964034505716476e-07, - "loss": 0.0747, - "reward": 1.8645833730697632, - "reward_std": 0.3186322972178459, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 384 }, { "clip_ratio": 0.0, - "completion_length": 676.6666870117188, + "completion_length": 589.5625305175781, "epoch": 0.385, - "grad_norm": 1.665189890086692, - "kl": 0.1201171875, + "grad_norm": 1.1676528135282047, + "kl": 0.0408935546875, "learning_rate": 7.950875657567621e-07, - "loss": 0.1235, - "reward": 1.7708333730697632, - "reward_std": 0.3117125928401947, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0309, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 385 }, { "clip_ratio": 0.0, - "completion_length": 601.4375152587891, + "completion_length": 556.75, "epoch": 0.386, - "grad_norm": 1.0878552716172778, - "kl": 0.0863037109375, + "grad_norm": 1.1667991725242814, + "kl": 0.04248046875, "learning_rate": 7.93768694627233e-07, - "loss": -0.0737, - "reward": 1.796875, - "reward_std": 0.0965491235256195, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, + "loss": 0.061, + "reward": 0.984375, + "reward_std": 0.05412658676505089, "rewards/tag_count_reward": 0.984375, "step": 386 }, { "clip_ratio": 0.0, - "completion_length": 463.7708435058594, + "completion_length": 501.25001525878906, "epoch": 0.387, - "grad_norm": 1.1884535608368862, - "kl": 0.082275390625, + "grad_norm": 1.552193185021972, + "kl": 0.0416259765625, "learning_rate": 7.924468532530883e-07, - "loss": 0.0019, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.149, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 387 }, { "clip_ratio": 0.0, - "completion_length": 462.6666717529297, + "completion_length": 564.6041870117188, "epoch": 0.388, - "grad_norm": 0.15406182753082623, - "kl": 0.07421875, + "grad_norm": 2.451979385765991, + "kl": 0.04248046875, "learning_rate": 7.911220577405484e-07, - "loss": 0.003, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.2295, + "reward": 0.953125, + "reward_std": 0.16237976029515266, + "rewards/tag_count_reward": 0.953125, "step": 388 }, { "clip_ratio": 0.0, - "completion_length": 629.75, + "completion_length": 474.9166717529297, "epoch": 0.389, - "grad_norm": 1.7379734000173628, - "kl": 0.105224609375, + "grad_norm": 1.4403584875210205, + "kl": 0.0487060546875, "learning_rate": 7.897943242318285e-07, - "loss": -0.086, - "reward": 1.59375, - "reward_std": 0.2785986512899399, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0261, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 389 }, { "clip_ratio": 0.0, - "completion_length": 602.0833435058594, + "completion_length": 568.9791870117188, "epoch": 0.39, - "grad_norm": 1.5363258606007697, - "kl": 0.096435546875, + "grad_norm": 0.09658274855884183, + "kl": 0.0390625, "learning_rate": 7.884636689049422e-07, - "loss": 0.079, - "reward": 1.875, - "reward_std": 0.22613351047039032, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 390 }, { "clip_ratio": 0.0, - "completion_length": 458.9583435058594, + "completion_length": 503.47918701171875, "epoch": 0.391, - "grad_norm": 1.1039669535198295, - "kl": 0.08251953125, + "grad_norm": 0.10687541663572057, + "kl": 0.045654296875, "learning_rate": 7.871301079735049e-07, - "loss": 0.0127, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 391 }, { "clip_ratio": 0.0, - "completion_length": 618.3125305175781, + "completion_length": 554.7708435058594, "epoch": 0.392, - "grad_norm": 2.4777435773224434, - "kl": 0.106689453125, + "grad_norm": 2.133786999649764, + "kl": 0.035888671875, "learning_rate": 7.857936576865356e-07, - "loss": 0.1429, - "reward": 1.765625, - "reward_std": 0.3965885192155838, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1066, + "reward": 0.9739583432674408, + "reward_std": 0.07845467701554298, + "rewards/tag_count_reward": 0.9739583432674408, "step": 392 }, { "clip_ratio": 0.0, - "completion_length": 599.2083435058594, + "completion_length": 487.9583435058594, "epoch": 0.393, - "grad_norm": 1.5336626783219809, - "kl": 0.096435546875, + "grad_norm": 1.6908693537415456, + "kl": 0.0416259765625, "learning_rate": 7.844543343282595e-07, - "loss": 0.1085, - "reward": 1.8541666865348816, - "reward_std": 0.23615825176239014, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.088, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 393 }, { "clip_ratio": 0.0, - "completion_length": 483.0416717529297, + "completion_length": 486.0208435058594, "epoch": 0.394, - "grad_norm": 1.877874498695064, - "kl": 0.093994140625, + "grad_norm": 0.10411848264951166, + "kl": 0.046630859375, "learning_rate": 7.831121542179086e-07, - "loss": 0.0039, - "reward": 2.0, + "loss": 0.0019, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 394 }, { "clip_ratio": 0.0, - "completion_length": 495.8125305175781, + "completion_length": 484.8541717529297, "epoch": 0.395, - "grad_norm": 1.255686839878163, - "kl": 0.089111328125, + "grad_norm": 0.14391019717031175, + "kl": 0.0447998046875, "learning_rate": 7.817671337095244e-07, - "loss": 0.0842, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 395 }, { "clip_ratio": 0.0, - "completion_length": 523.7500152587891, + "completion_length": 471.0833435058594, "epoch": 0.396, - "grad_norm": 1.5837068023036418, - "kl": 0.09228515625, + "grad_norm": 0.12688373205559, + "kl": 0.0457763671875, "learning_rate": 7.804192891917571e-07, - "loss": 0.0071, - "reward": 1.9739583730697632, - "reward_std": 0.09021097794175148, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 396 }, { "clip_ratio": 0.0, - "completion_length": 576.0416870117188, + "completion_length": 528.9375, "epoch": 0.397, - "grad_norm": 1.9755570049887077, - "kl": 0.087158203125, + "grad_norm": 0.13324985505443265, + "kl": 0.0406494140625, "learning_rate": 7.79068637087667e-07, - "loss": 0.1868, - "reward": 1.625, - "reward_std": 0.24643342196941376, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 397 }, { "clip_ratio": 0.0, - "completion_length": 546.3125, + "completion_length": 519.5208435058594, "epoch": 0.398, - "grad_norm": 1.5961105585901436, - "kl": 0.098388671875, + "grad_norm": 1.0531523617961214, + "kl": 0.05224609375, "learning_rate": 7.777151938545235e-07, - "loss": -0.0368, - "reward": 1.6927083730697632, - "reward_std": 0.2631203308701515, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0243, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 398 }, { "clip_ratio": 0.0, - "completion_length": 573.5833587646484, + "completion_length": 447.6666717529297, "epoch": 0.399, - "grad_norm": 1.4712910394298655, - "kl": 0.106201171875, + "grad_norm": 0.12314730150608288, + "kl": 0.0491943359375, "learning_rate": 7.763589759836058e-07, - "loss": 0.0465, - "reward": 1.8958333730697632, - "reward_std": 0.15674780309200287, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 399 }, { "clip_ratio": 0.0, - "completion_length": 627.5208587646484, + "completion_length": 535.2916870117188, "epoch": 0.4, - "grad_norm": 1.2017833764610608, - "kl": 0.109130859375, + "grad_norm": 0.13208913700559177, + "kl": 0.0465087890625, "learning_rate": 7.75e-07, - "loss": 0.0222, - "reward": 1.9166666865348816, - "reward_std": 0.14862647652626038, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 400 }, { "clip_ratio": 0.0, - "completion_length": 520.9375152587891, + "completion_length": 475.7916717529297, "epoch": 0.401, - "grad_norm": 2.099349865984199, - "kl": 0.09375, + "grad_norm": 0.11631057614821307, + "kl": 0.050537109375, "learning_rate": 7.736382824623999e-07, - "loss": 0.0935, - "reward": 1.8125, - "reward_std": 0.19526028633117676, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, + "loss": 0.002, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 401 }, { "clip_ratio": 0.0, - "completion_length": 555.9791870117188, + "completion_length": 535.6666870117188, "epoch": 0.402, - "grad_norm": 0.1542523461981475, - "kl": 0.0888671875, + "grad_norm": 1.5757509818707738, + "kl": 0.0526123046875, "learning_rate": 7.72273839962904e-07, - "loss": 0.0038, - "reward": 1.75, - "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0746, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 402 }, { "clip_ratio": 0.0, - "completion_length": 514.1666870117188, + "completion_length": 570.4166870117188, "epoch": 0.403, - "grad_norm": 1.2450241946443927, - "kl": 0.094482421875, + "grad_norm": 1.1336688099849115, + "kl": 0.048828125, "learning_rate": 7.709066891268133e-07, - "loss": 0.0305, - "reward": 1.7604166865348816, - "reward_std": 0.08356975018978119, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0292, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 403 }, { "clip_ratio": 0.0, - "completion_length": 497.3333435058594, + "completion_length": 520.75, "epoch": 0.404, - "grad_norm": 1.4572915644406153, - "kl": 0.099853515625, + "grad_norm": 0.10134365577177148, + "kl": 0.046630859375, "learning_rate": 7.695368466124296e-07, - "loss": 0.0089, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 404 }, { "clip_ratio": 0.0, - "completion_length": 553.9375305175781, + "completion_length": 519.0416717529297, "epoch": 0.405, - "grad_norm": 1.0273056796750832, - "kl": 0.093994140625, + "grad_norm": 0.09498459340343707, + "kl": 0.0439453125, "learning_rate": 7.681643291108517e-07, - "loss": -0.0252, - "reward": 1.78125, - "reward_std": 0.07769769430160522, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 405 }, { "clip_ratio": 0.0, - "completion_length": 601.6666870117188, + "completion_length": 509.3125305175781, "epoch": 0.406, - "grad_norm": 1.3604992657768664, - "kl": 0.107421875, + "grad_norm": 0.09249419254408961, + "kl": 0.041259765625, "learning_rate": 7.667891533457718e-07, - "loss": 0.0998, - "reward": 1.6770833730697632, - "reward_std": 0.2500183582305908, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 406 }, { "clip_ratio": 0.0, - "completion_length": 527.9583435058594, + "completion_length": 500.3125305175781, "epoch": 0.407, - "grad_norm": 0.966903909176943, - "kl": 0.1015625, + "grad_norm": 0.1014194854177954, + "kl": 0.0462646484375, "learning_rate": 7.654113360732732e-07, - "loss": 0.0064, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 407 }, { "clip_ratio": 0.0, - "completion_length": 575.1250152587891, + "completion_length": 531.7500152587891, "epoch": 0.408, - "grad_norm": 1.8716219770489841, - "kl": 0.103515625, + "grad_norm": 0.0850335798970897, + "kl": 0.0423583984375, "learning_rate": 7.640308940816239e-07, - "loss": 0.1477, - "reward": 1.9218750596046448, - "reward_std": 0.18833883479237556, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 408 }, { "clip_ratio": 0.0, - "completion_length": 434.2291717529297, + "completion_length": 498.7291717529297, "epoch": 0.409, - "grad_norm": 1.2146316659168699, - "kl": 0.08837890625, + "grad_norm": 0.09839539734571516, + "kl": 0.047119140625, "learning_rate": 7.626478441910744e-07, - "loss": 0.0286, - "reward": 1.9947916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 409 }, { "clip_ratio": 0.0, - "completion_length": 590.5625305175781, + "completion_length": 525.7083435058594, "epoch": 0.41, - "grad_norm": 1.5300992786230176, - "kl": 0.12939453125, + "grad_norm": 0.09239514842845858, + "kl": 0.0416259765625, "learning_rate": 7.612622032536507e-07, - "loss": -0.017, - "reward": 1.5729167461395264, - "reward_std": 0.19669440388679504, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 410 }, { "clip_ratio": 0.0, - "completion_length": 524.2708435058594, + "completion_length": 519.5833435058594, "epoch": 0.411, - "grad_norm": 1.8382570850232949, - "kl": 0.12109375, + "grad_norm": 0.09033270466242478, + "kl": 0.0458984375, "learning_rate": 7.59873988152951e-07, - "loss": 0.1339, - "reward": 1.6979167461395264, - "reward_std": 0.13339675217866898, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 411 }, { "clip_ratio": 0.0, - "completion_length": 478.9791717529297, + "completion_length": 452.8333435058594, "epoch": 0.412, - "grad_norm": 1.929994226152613, - "kl": 0.12646484375, + "grad_norm": 0.0849946940770293, + "kl": 0.046142578125, "learning_rate": 7.584832158039378e-07, - "loss": 0.0131, - "reward": 1.7708333730697632, - "reward_std": 0.21037911623716354, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 412 }, { "clip_ratio": 0.0, - "completion_length": 499.41668701171875, + "completion_length": 479.5833435058594, "epoch": 0.413, - "grad_norm": 1.733834076494595, - "kl": 0.10498046875, + "grad_norm": 0.09480109650231232, + "kl": 0.051025390625, "learning_rate": 7.570899031527332e-07, - "loss": 0.0312, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 413 }, { "clip_ratio": 0.0, - "completion_length": 517.4583587646484, + "completion_length": 512.3958435058594, "epoch": 0.414, - "grad_norm": 1.2957637458925562, - "kl": 0.1064453125, + "grad_norm": 0.08812828615766397, + "kl": 0.045654296875, "learning_rate": 7.556940671764124e-07, - "loss": -0.0028, - "reward": 1.9739583730697632, - "reward_std": 0.0749332383275032, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 414 }, { "clip_ratio": 0.0, - "completion_length": 519.5625, + "completion_length": 528.8541870117188, "epoch": 0.415, - "grad_norm": 0.1899817475176635, - "kl": 0.10595703125, + "grad_norm": 0.08811692768903577, + "kl": 0.0443115234375, "learning_rate": 7.54295724882796e-07, - "loss": 0.0046, - "reward": 1.75, + "loss": 0.0018, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 415 }, { "clip_ratio": 0.0, - "completion_length": 541.7916717529297, + "completion_length": 520.2500305175781, "epoch": 0.416, - "grad_norm": 1.805765847716464, - "kl": 0.12451171875, + "grad_norm": 0.07578652447775983, + "kl": 0.0396728515625, "learning_rate": 7.528948933102438e-07, - "loss": -0.0045, - "reward": 1.7916666865348816, - "reward_std": 0.17538414150476456, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 416 }, { "clip_ratio": 0.0, - "completion_length": 530.1458435058594, + "completion_length": 561.4375305175781, "epoch": 0.417, - "grad_norm": 3.5885449791663024, - "kl": 0.10791015625, + "grad_norm": 0.07895167806367016, + "kl": 0.0433349609375, "learning_rate": 7.514915895274463e-07, - "loss": 0.151, - "reward": 1.9739583730697632, - "reward_std": 0.09021097794175148, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 417 }, { "clip_ratio": 0.0, - "completion_length": 463.25001525878906, + "completion_length": 485.12501525878906, "epoch": 0.418, - "grad_norm": 1.1598781059620817, - "kl": 0.101806640625, + "grad_norm": 0.08518158009411357, + "kl": 0.046142578125, "learning_rate": 7.500858306332172e-07, - "loss": 0.0331, - "reward": 1.9375, - "reward_std": 0.11306675523519516, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 418 }, { "clip_ratio": 0.0, - "completion_length": 644.6250305175781, + "completion_length": 508.97918701171875, "epoch": 0.419, - "grad_norm": 2.6732058964346437, - "kl": 0.13720703125, + "grad_norm": 1.0053918619854647, + "kl": 0.0478515625, "learning_rate": 7.486776337562853e-07, - "loss": 0.2273, - "reward": 1.7552083730697632, - "reward_std": 0.2240002602338791, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0063, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 419 }, { "clip_ratio": 0.0, - "completion_length": 622.8333587646484, + "completion_length": 526.1666870117188, "epoch": 0.42, - "grad_norm": 3.3795722733526685, - "kl": 0.1435546875, + "grad_norm": 1.0912738995624687, + "kl": 0.044677734375, "learning_rate": 7.472670160550848e-07, - "loss": 0.3069, - "reward": 1.5572916865348816, - "reward_std": 0.2243468128144741, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.038, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 420 }, { "clip_ratio": 0.0, - "completion_length": 589.5416870117188, + "completion_length": 515.4791870117188, "epoch": 0.421, - "grad_norm": 1.8704747821193433, - "kl": 0.13134765625, + "grad_norm": 0.10769262285700905, + "kl": 0.0389404296875, "learning_rate": 7.458539947175473e-07, - "loss": 0.1226, - "reward": 1.8802083730697632, - "reward_std": 0.21476569399237633, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 421 }, { "clip_ratio": 0.0, - "completion_length": 684.9375305175781, + "completion_length": 542.8333435058594, "epoch": 0.422, - "grad_norm": 2.8120148648671703, - "kl": 0.1767578125, + "grad_norm": 0.124859076624025, + "kl": 0.0406494140625, "learning_rate": 7.444385869608921e-07, - "loss": 0.2267, - "reward": 1.828125, - "reward_std": 0.3041449449956417, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0016, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 422 }, { "clip_ratio": 0.0, - "completion_length": 784.6458435058594, + "completion_length": 557.9583435058594, "epoch": 0.423, - "grad_norm": 3.175303473262236, - "kl": 0.18017578125, + "grad_norm": 0.06978524202047731, + "kl": 0.037841796875, "learning_rate": 7.430208100314156e-07, - "loss": 0.1409, - "reward": 1.7916666865348816, - "reward_std": 0.33538663387298584, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.0015, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 423 }, { "clip_ratio": 0.0, - "completion_length": 605.8333435058594, + "completion_length": 510.91668701171875, "epoch": 0.424, - "grad_norm": 3.191789569856487, - "kl": 0.1640625, + "grad_norm": 0.9837927706804174, + "kl": 0.0379638671875, "learning_rate": 7.416006812042827e-07, - "loss": 0.248, - "reward": 1.8697916865348816, - "reward_std": 0.3639189898967743, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.007, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 424 }, { "clip_ratio": 0.0, - "completion_length": 598.8541870117188, + "completion_length": 540.6666870117188, "epoch": 0.425, - "grad_norm": 3.5367001886921847, - "kl": 0.1845703125, + "grad_norm": 2.9312181195112146, + "kl": 0.045654296875, "learning_rate": 7.401782177833147e-07, - "loss": 0.1732, - "reward": 1.9375, - "reward_std": 0.1315547227859497, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.213, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, + "rewards/tag_count_reward": 0.9687500298023224, "step": 425 }, { "clip_ratio": 0.0, - "completion_length": 616.5833435058594, + "completion_length": 527.2708435058594, "epoch": 0.426, - "grad_norm": 2.778381457804399, - "kl": 0.220703125, + "grad_norm": 0.09640325328962256, + "kl": 0.0430908203125, "learning_rate": 7.387534371007797e-07, - "loss": 0.1653, - "reward": 1.8750000596046448, - "reward_std": 0.26742906868457794, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 426 }, { "clip_ratio": 0.0, - "completion_length": 574.9375152587891, + "completion_length": 519.8541870117188, "epoch": 0.427, - "grad_norm": 4.587203247007512, - "kl": 0.19140625, + "grad_norm": 0.07472945475941242, + "kl": 0.04296875, "learning_rate": 7.373263565171805e-07, - "loss": 0.3088, - "reward": 1.9062500596046448, - "reward_std": 0.21928749978542328, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 427 }, { "clip_ratio": 0.0, - "completion_length": 482.5416717529297, + "completion_length": 480.4583435058594, "epoch": 0.428, - "grad_norm": 3.2023821836646444, - "kl": 0.17578125, + "grad_norm": 0.1283159299656476, + "kl": 0.046875, "learning_rate": 7.358969934210438e-07, - "loss": 0.1805, - "reward": 1.984375, - "reward_std": 0.05412658676505089, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 428 }, { "clip_ratio": 0.0, - "completion_length": 618.8125152587891, + "completion_length": 527.8125305175781, "epoch": 0.429, - "grad_norm": 2.3688294757006414, - "kl": 0.2431640625, + "grad_norm": 1.2109285264735294, + "kl": 0.0404052734375, "learning_rate": 7.344653652287077e-07, - "loss": 0.1125, - "reward": 1.8437500596046448, - "reward_std": 0.2966437339782715, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0558, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 429 }, { "clip_ratio": 0.0, - "completion_length": 725.875, + "completion_length": 572.6875305175781, "epoch": 0.43, - "grad_norm": 4.953958776306139, - "kl": 0.36328125, + "grad_norm": 0.12680186205193347, + "kl": 0.043701171875, "learning_rate": 7.330314893841101e-07, - "loss": 0.1584, - "reward": 1.765625, - "reward_std": 0.32444924116134644, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 430 }, { "clip_ratio": 0.0, - "completion_length": 773.2083435058594, + "completion_length": 502.0000305175781, "epoch": 0.431, - "grad_norm": 6.659614072971348, - "kl": 0.482421875, + "grad_norm": 0.08913699535865231, + "kl": 0.0457763671875, "learning_rate": 7.315953833585755e-07, - "loss": 0.3406, - "reward": 1.8125, - "reward_std": 0.32941484451293945, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 431 }, { "clip_ratio": 0.0, - "completion_length": 933.6250305175781, + "completion_length": 652.8333435058594, "epoch": 0.432, - "grad_norm": 7.018207690929855, - "kl": 0.69140625, + "grad_norm": 1.1575484907375526, + "kl": 0.0435791015625, "learning_rate": 7.301570646506027e-07, - "loss": 0.3718, - "reward": 1.6770833730697632, - "reward_std": 0.363193154335022, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833432674408, + "loss": 0.0294, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 432 }, { "clip_ratio": 0.0, - "completion_length": 935.875, + "completion_length": 557.9791870117188, "epoch": 0.433, - "grad_norm": 7.878457807327411, - "kl": 0.876953125, + "grad_norm": 1.2239767941332247, + "kl": 0.0479736328125, "learning_rate": 7.287165507856512e-07, - "loss": 0.3271, - "reward": 1.7239583730697632, - "reward_std": 0.3327288329601288, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": 0.0179, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 433 }, { "clip_ratio": 0.0, - "completion_length": 619.625, + "completion_length": 501.93751525878906, "epoch": 0.434, - "grad_norm": 14.996230435121328, - "kl": 0.794921875, + "grad_norm": 1.6103755307461893, + "kl": 0.0455322265625, "learning_rate": 7.27273859315928e-07, - "loss": 0.43, - "reward": 1.9322916865348816, - "reward_std": 0.17774563655257225, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.0154, + "reward": 0.9739583432674408, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583432674408, "step": 434 }, { "clip_ratio": 0.0, - "completion_length": 1145.7083740234375, + "completion_length": 547.8541870117188, "epoch": 0.435, - "grad_norm": 5.61269437367935, - "kl": 1.8984375, + "grad_norm": 1.3375839613733431, + "kl": 0.0462646484375, "learning_rate": 7.258290078201731e-07, - "loss": 0.1768, - "reward": 1.3020833730697632, - "reward_std": 0.48291391134262085, - "rewards/accuracy_reward": 0.5625000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7395833432674408, + "loss": 0.0108, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 435 }, { "clip_ratio": 0.0, - "completion_length": 1187.1458740234375, + "completion_length": 543.0625305175781, "epoch": 0.436, - "grad_norm": 10.215860642931588, - "kl": 2.6875, + "grad_norm": 1.9877666897516044, + "kl": 0.049072265625, "learning_rate": 7.243820139034464e-07, - "loss": 0.3937, - "reward": 1.5104166865348816, - "reward_std": 0.5014172494411469, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7395833432674408, + "loss": 0.1998, + "reward": 0.96875, + "reward_std": 0.07298427820205688, + "rewards/tag_count_reward": 0.96875, "step": 436 }, { "clip_ratio": 0.0, - "completion_length": 1252.6041870117188, + "completion_length": 614.6875305175781, "epoch": 0.437, - "grad_norm": 10.299726808387382, - "kl": 3.8203125, + "grad_norm": 3.393227912741051, + "kl": 0.0947265625, "learning_rate": 7.229328951969115e-07, - "loss": 0.4188, - "reward": 1.5000000596046448, - "reward_std": 0.46128278970718384, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.7083333432674408, + "loss": 0.1849, + "reward": 0.9531250298023224, + "reward_std": 0.14568757265806198, + "rewards/tag_count_reward": 0.9531250298023224, "step": 437 }, { "clip_ratio": 0.0, - "completion_length": 770.9791870117188, + "completion_length": 605.0625, "epoch": 0.438, - "grad_norm": 7.3595374700763365, - "kl": 2.5546875, + "grad_norm": 2.9704332189234335, + "kl": 0.0576171875, "learning_rate": 7.214816693576234e-07, - "loss": 0.3679, - "reward": 1.7916667461395264, - "reward_std": 0.3743432015180588, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8541666865348816, + "loss": 0.3069, + "reward": 0.953125, + "reward_std": 0.14568756893277168, + "rewards/tag_count_reward": 0.953125, "step": 438 }, { "clip_ratio": 0.0, - "completion_length": 1065.0833435058594, + "completion_length": 534.125, "epoch": 0.439, - "grad_norm": 10.461486581235732, - "kl": 2.64453125, + "grad_norm": 1.2044060882375394, + "kl": 0.058837890625, "learning_rate": 7.200283540683102e-07, - "loss": 0.4336, - "reward": 1.5416667461395264, - "reward_std": 0.4429154247045517, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.7916666865348816, + "loss": 0.073, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 439 }, { "clip_ratio": 0.0, - "completion_length": 994.4375, + "completion_length": 648.7500305175781, "epoch": 0.44, - "grad_norm": 15.79461102655438, - "kl": 3.0234375, + "grad_norm": 0.19633187817219797, + "kl": 0.079833984375, "learning_rate": 7.185729670371604e-07, - "loss": 0.5894, - "reward": 1.5000000596046448, - "reward_std": 0.5931048691272736, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7916666865348816, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 440 }, { "clip_ratio": 0.0, - "completion_length": 1215.1666870117188, + "completion_length": 734.0416870117188, "epoch": 0.441, - "grad_norm": 24.80579391824325, - "kl": 5.3125, + "grad_norm": 3.2067961154241917, + "kl": 0.078369140625, "learning_rate": 7.171155259976057e-07, - "loss": 0.5426, - "reward": 1.4947916865348816, - "reward_std": 0.4852359890937805, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.703125, + "loss": 0.3304, + "reward": 0.9635416865348816, + "reward_std": 0.12629536911845207, + "rewards/tag_count_reward": 0.9635416865348816, "step": 441 }, { "clip_ratio": 0.0, - "completion_length": 1074.2083740234375, + "completion_length": 626.3750305175781, "epoch": 0.442, - "grad_norm": 20.63734378009407, - "kl": 4.1953125, + "grad_norm": 2.5378060357093317, + "kl": 0.090087890625, "learning_rate": 7.156560487081051e-07, - "loss": 0.4523, - "reward": 1.5364583730697632, - "reward_std": 0.5057072937488556, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7864583432674408, + "loss": 0.2675, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 442 }, { "clip_ratio": 0.0, - "completion_length": 752.8333740234375, + "completion_length": 566.5625152587891, "epoch": 0.443, - "grad_norm": 44.62788970417622, - "kl": 1.6953125, + "grad_norm": 2.7647180072565956, + "kl": 0.094482421875, "learning_rate": 7.141945529519288e-07, - "loss": 0.5175, - "reward": 1.6562500596046448, - "reward_std": 0.4979724735021591, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.2807, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 443 }, { "clip_ratio": 0.0, - "completion_length": 1017.5208740234375, + "completion_length": 625.8750305175781, "epoch": 0.444, - "grad_norm": 33.114559024092536, - "kl": 2.41796875, + "grad_norm": 2.723551464343659, + "kl": 0.10546875, "learning_rate": 7.127310565369415e-07, - "loss": 0.5509, - "reward": 1.390625, - "reward_std": 0.6407037973403931, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7864583432674408, + "loss": 0.2434, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 444 }, { "clip_ratio": 0.0, - "completion_length": 770.6875305175781, + "completion_length": 662.7291870117188, "epoch": 0.445, - "grad_norm": 23.995774880016523, - "kl": 2.5703125, + "grad_norm": 2.9676862937048174, + "kl": 0.13671875, "learning_rate": 7.11265577295385e-07, - "loss": 0.6869, - "reward": 1.6197916865348816, - "reward_std": 0.4352649301290512, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.239, + "reward": 0.9739583730697632, + "reward_std": 0.07845467701554298, + "rewards/tag_count_reward": 0.9739583730697632, "step": 445 }, { "clip_ratio": 0.0, - "completion_length": 860.3333435058594, + "completion_length": 831.4166870117188, "epoch": 0.446, - "grad_norm": 61.471990783951334, - "kl": 5.3125, + "grad_norm": 1.0131205931227645, + "kl": 0.18359375, "learning_rate": 7.097981330836616e-07, - "loss": 0.6768, - "reward": 1.5833333730697632, - "reward_std": 0.42580851912498474, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8125000298023224, + "loss": 0.0745, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 446 }, { "clip_ratio": 0.0, - "completion_length": 833.8750305175781, + "completion_length": 761.2500305175781, "epoch": 0.447, - "grad_norm": 31.471998864002433, - "kl": 5.0625, + "grad_norm": 3.0412955207595678, + "kl": 0.2373046875, "learning_rate": 7.083287417821157e-07, - "loss": 0.6939, - "reward": 1.6875, - "reward_std": 0.4197809398174286, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8125, + "loss": 0.245, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 447 }, { "clip_ratio": 0.0, - "completion_length": 981.4791870117188, + "completion_length": 750.4166870117188, "epoch": 0.448, - "grad_norm": 9.204685220178225, - "kl": 4.234375, + "grad_norm": 1.7160871685898567, + "kl": 0.224609375, "learning_rate": 7.068574212948169e-07, - "loss": 0.6583, - "reward": 1.6302083730697632, - "reward_std": 0.4747810959815979, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7760416865348816, + "loss": 0.1315, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 448 }, { "clip_ratio": 0.0, - "completion_length": 734.6666870117188, + "completion_length": 891.5208740234375, "epoch": 0.449, - "grad_norm": 8.238615228862814, - "kl": 2.1875, + "grad_norm": 6.204046081883707, + "kl": 0.3203125, "learning_rate": 7.053841895493406e-07, - "loss": 0.3116, - "reward": 1.6875000596046448, - "reward_std": 0.44833993911743164, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.875, + "loss": 0.4129, + "reward": 0.9218750298023224, + "reward_std": 0.17826225608587265, + "rewards/tag_count_reward": 0.9218750298023224, "step": 449 }, { "clip_ratio": 0.0, - "completion_length": 787.7083435058594, + "completion_length": 881.5833740234375, "epoch": 0.45, - "grad_norm": 25.607425007243297, - "kl": 2.03515625, + "grad_norm": 3.2434161686257363, + "kl": 0.3544921875, "learning_rate": 7.039090644965509e-07, - "loss": 0.4583, - "reward": 1.6770833730697632, - "reward_std": 0.4864405393600464, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8229166865348816, + "loss": 0.1983, + "reward": 0.953125, + "reward_std": 0.09235543012619019, + "rewards/tag_count_reward": 0.953125, "step": 450 }, { "clip_ratio": 0.0, - "completion_length": 1097.5208435058594, + "completion_length": 1146.0416870117188, "epoch": 0.451, - "grad_norm": 20.993367285758417, - "kl": 3.7890625, + "grad_norm": 4.693128676564285, + "kl": 0.40234375, "learning_rate": 7.024320641103811e-07, - "loss": 0.3665, - "reward": 1.4479166865348816, - "reward_std": 0.3619709610939026, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.6979166865348816, + "loss": 0.3054, + "reward": 0.9010416865348816, + "reward_std": 0.18217922747135162, + "rewards/tag_count_reward": 0.9010416865348816, "step": 451 }, { "clip_ratio": 0.0, - "completion_length": 632.0625152587891, + "completion_length": 1136.0000610351562, "epoch": 0.452, - "grad_norm": 6.805977485633756, - "kl": 1.734375, + "grad_norm": 8.939444249342907, + "kl": 0.49609375, "learning_rate": 7.009532063876148e-07, - "loss": 0.3024, - "reward": 1.8229166865348816, - "reward_std": 0.2334350235760212, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.4582, + "reward": 0.8697916865348816, + "reward_std": 0.21116822957992554, + "rewards/tag_count_reward": 0.8697916865348816, "step": 452 }, { "clip_ratio": 0.0, - "completion_length": 929.2708435058594, + "completion_length": 1136.5625, "epoch": 0.453, - "grad_norm": 5.9570107818750255, - "kl": 2.138671875, + "grad_norm": 9.611421819429221, + "kl": 0.66015625, "learning_rate": 6.994725093476664e-07, - "loss": 0.25, - "reward": 1.6302083730697632, - "reward_std": 0.36799517273902893, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.7760416865348816, + "loss": 0.4156, + "reward": 0.8645833432674408, + "reward_std": 0.22167008370161057, + "rewards/tag_count_reward": 0.8645833432674408, "step": 453 }, { "clip_ratio": 0.0, - "completion_length": 544.7916870117188, + "completion_length": 1111.6875, "epoch": 0.454, - "grad_norm": 13.701433484445106, - "kl": 0.86328125, + "grad_norm": 10.651125312059772, + "kl": 0.732421875, "learning_rate": 6.979899910323624e-07, - "loss": 0.3326, - "reward": 1.8177083730697632, - "reward_std": 0.333845853805542, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.4229, + "reward": 0.8697916865348816, + "reward_std": 0.219189815223217, + "rewards/tag_count_reward": 0.8697916865348816, "step": 454 }, { "clip_ratio": 0.0, - "completion_length": 667.3333435058594, + "completion_length": 967.6042175292969, "epoch": 0.455, - "grad_norm": 4.453103718617606, - "kl": 1.3828125, + "grad_norm": 10.978588111511952, + "kl": 0.96484375, "learning_rate": 6.965056695057204e-07, - "loss": 0.174, - "reward": 1.6770833730697632, - "reward_std": 0.39637820422649384, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.4329, + "reward": 0.9166666865348816, + "reward_std": 0.17608554661273956, + "rewards/tag_count_reward": 0.9166666865348816, "step": 455 }, { "clip_ratio": 0.0, - "completion_length": 636.7500305175781, + "completion_length": 1440.0000610351562, "epoch": 0.456, - "grad_norm": 14.178791081983867, - "kl": 1.66015625, + "grad_norm": 6.218928657554448, + "kl": 1.31640625, "learning_rate": 6.950195628537299e-07, - "loss": 0.3767, - "reward": 1.7447917461395264, - "reward_std": 0.3697783648967743, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8281250298023224, + "loss": 0.3387, + "reward": 0.859375, + "reward_std": 0.22917009145021439, + "rewards/tag_count_reward": 0.859375, "step": 456 }, { "clip_ratio": 0.0, - "completion_length": 649.5833435058594, + "completion_length": 1184.5625305175781, "epoch": 0.457, - "grad_norm": 12.299325771819511, - "kl": 1.96484375, + "grad_norm": 10.80209930784645, + "kl": 1.1328125, "learning_rate": 6.935316891841315e-07, - "loss": 0.1973, - "reward": 1.7552083730697632, - "reward_std": 0.2891301065683365, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.3945, + "reward": 0.8541666865348816, + "reward_std": 0.21569664776325226, + "rewards/tag_count_reward": 0.8541666865348816, "step": 457 }, { "clip_ratio": 0.0, - "completion_length": 875.3125, + "completion_length": 1348.3125610351562, "epoch": 0.458, - "grad_norm": 10.449805323196285, - "kl": 3.1484375, + "grad_norm": 11.179542694954074, + "kl": 1.064453125, "learning_rate": 6.920420666261961e-07, - "loss": 0.386, - "reward": 1.6822917461395264, - "reward_std": 0.4104303568601608, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.7239583432674408, + "loss": 0.4237, + "reward": 0.8020833432674408, + "reward_std": 0.23151954263448715, + "rewards/tag_count_reward": 0.8020833432674408, "step": 458 }, { "clip_ratio": 0.0, - "completion_length": 534.7083435058594, + "completion_length": 1252.3958740234375, "epoch": 0.459, - "grad_norm": 12.752467269928452, - "kl": 1.0732421875, + "grad_norm": 7.894711801824028, + "kl": 1.07421875, "learning_rate": 6.905507133305047e-07, - "loss": 0.2904, - "reward": 1.9218750596046448, - "reward_std": 0.18188626319169998, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.3084, + "reward": 0.875, + "reward_std": 0.1687939241528511, + "rewards/tag_count_reward": 0.875, "step": 459 }, { "clip_ratio": 0.0, - "completion_length": 782.8125305175781, + "completion_length": 1284.666748046875, "epoch": 0.46, - "grad_norm": 10.833997950231373, - "kl": 2.5390625, + "grad_norm": 10.218837999976117, + "kl": 1.2734375, "learning_rate": 6.890576474687263e-07, - "loss": 0.2904, - "reward": 1.5989583730697632, - "reward_std": 0.3610430806875229, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7656250298023224, + "loss": 0.3575, + "reward": 0.8697916865348816, + "reward_std": 0.19752872735261917, + "rewards/tag_count_reward": 0.8697916865348816, "step": 460 }, { "clip_ratio": 0.0, - "completion_length": 544.5625305175781, + "completion_length": 1536.2916870117188, "epoch": 0.461, - "grad_norm": 10.099772583349912, - "kl": 1.265625, + "grad_norm": 6.28405434545915, + "kl": 1.55078125, "learning_rate": 6.875628872333975e-07, - "loss": 0.0792, - "reward": 1.6302083730697632, - "reward_std": 0.22907259315252304, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.859375, + "loss": 0.293, + "reward": 0.796875, + "reward_std": 0.22817014157772064, + "rewards/tag_count_reward": 0.796875, "step": 461 }, { "clip_ratio": 0.0, - "completion_length": 679.8125, + "completion_length": 1360.75, "epoch": 0.462, - "grad_norm": 29.14241939606568, - "kl": 0.900390625, + "grad_norm": 8.457933807258653, + "kl": 1.58203125, "learning_rate": 6.860664508377001e-07, - "loss": 0.3667, - "reward": 1.5677083730697632, - "reward_std": 0.38647958636283875, - "rewards/accuracy_reward": 0.6875000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083730697632, + "loss": 0.2901, + "reward": 0.8489583432674408, + "reward_std": 0.18343394994735718, + "rewards/tag_count_reward": 0.8489583432674408, "step": 462 }, { "clip_ratio": 0.0, - "completion_length": 562.6250305175781, + "completion_length": 1370.229248046875, "epoch": 0.463, - "grad_norm": 7.41102233413824, - "kl": 0.591796875, + "grad_norm": 6.243040180753141, + "kl": 1.65625, "learning_rate": 6.84568356515239e-07, - "loss": -0.0259, - "reward": 1.8072916865348816, - "reward_std": 0.20932568609714508, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.4428, + "reward": 0.78125, + "reward_std": 0.2227114662528038, + "rewards/tag_count_reward": 0.78125, "step": 463 }, { "clip_ratio": 0.0, - "completion_length": 545.3541870117188, + "completion_length": 1128.0416870117188, "epoch": 0.464, - "grad_norm": 22.41456814629958, - "kl": 0.536376953125, + "grad_norm": 6.440456811434236, + "kl": 1.296875, "learning_rate": 6.83068622519821e-07, - "loss": 0.3116, - "reward": 1.8697916865348816, - "reward_std": 0.26856791973114014, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.5212, + "reward": 0.8437500298023224, + "reward_std": 0.20314887166023254, + "rewards/tag_count_reward": 0.8437500298023224, "step": 464 }, { "clip_ratio": 0.0, - "completion_length": 502.00001525878906, + "completion_length": 864.9375, "epoch": 0.465, - "grad_norm": 6.332288555801555, - "kl": 0.42919921875, + "grad_norm": 6.887498046550961, + "kl": 1.03515625, "learning_rate": 6.815672671252315e-07, - "loss": 0.0859, - "reward": 1.828125, - "reward_std": 0.25555509328842163, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.5915, + "reward": 0.8593750298023224, + "reward_std": 0.17061126232147217, + "rewards/tag_count_reward": 0.8593750298023224, "step": 465 }, { "clip_ratio": 0.0, - "completion_length": 548.0833587646484, + "completion_length": 937.3333435058594, "epoch": 0.466, - "grad_norm": 10.586737760467948, - "kl": 0.71826171875, + "grad_norm": 5.250772904790437, + "kl": 1.197265625, "learning_rate": 6.800643086250121e-07, - "loss": 0.2031, - "reward": 1.5416666865348816, - "reward_std": 0.2056521661579609, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.6632, + "reward": 0.8385416865348816, + "reward_std": 0.1986435353755951, + "rewards/tag_count_reward": 0.8385416865348816, "step": 466 }, { "clip_ratio": 0.0, - "completion_length": 522.5208435058594, + "completion_length": 991.8541870117188, "epoch": 0.467, - "grad_norm": 8.518005613063707, - "kl": 0.359375, + "grad_norm": 11.570194784069281, + "kl": 1.306640625, "learning_rate": 6.78559765332238e-07, - "loss": 0.0813, - "reward": 1.7031250596046448, - "reward_std": 0.15909089893102646, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.5321, + "reward": 0.8177083730697632, + "reward_std": 0.2344372197985649, + "rewards/tag_count_reward": 0.8177083730697632, "step": 467 }, { "clip_ratio": 0.0, - "completion_length": 559.0000305175781, + "completion_length": 824.4375, "epoch": 0.468, - "grad_norm": 4.239765641538712, - "kl": 0.83984375, + "grad_norm": 6.672446715960221, + "kl": 0.978515625, "learning_rate": 6.770536555792944e-07, - "loss": 0.1594, - "reward": 1.9062500596046448, - "reward_std": 0.1668463870882988, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.6408, + "reward": 0.8229166865348816, + "reward_std": 0.23008184880018234, + "rewards/tag_count_reward": 0.8229166865348816, "step": 468 }, { "clip_ratio": 0.0, - "completion_length": 459.3958435058594, + "completion_length": 804.9583740234375, "epoch": 0.469, - "grad_norm": 3.2826243956409935, - "kl": 0.5556640625, + "grad_norm": 7.585398445994687, + "kl": 1.1796875, "learning_rate": 6.755459977176532e-07, - "loss": 0.0948, - "reward": 1.9010416865348816, - "reward_std": 0.1897234059870243, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.5452, + "reward": 0.8489583432674408, + "reward_std": 0.19954737275838852, + "rewards/tag_count_reward": 0.8489583432674408, "step": 469 }, { "clip_ratio": 0.0, - "completion_length": 578.3541870117188, + "completion_length": 623.6458435058594, "epoch": 0.47, - "grad_norm": 7.949356042088438, - "kl": 1.224609375, + "grad_norm": 7.801702501059949, + "kl": 0.6171875, "learning_rate": 6.740368101176495e-07, - "loss": 0.1851, - "reward": 1.7447916865348816, - "reward_std": 0.18040694296360016, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.5407, + "reward": 0.8906250298023224, + "reward_std": 0.20231406390666962, + "rewards/tag_count_reward": 0.8906250298023224, "step": 470 }, { "clip_ratio": 0.0, - "completion_length": 454.97918701171875, + "completion_length": 767.7291870117188, "epoch": 0.471, - "grad_norm": 4.068740890617025, - "kl": 0.7802734375, + "grad_norm": 11.038095688888431, + "kl": 0.828125, "learning_rate": 6.725261111682584e-07, - "loss": 0.1731, - "reward": 1.9479166865348816, - "reward_std": 0.12832656875252724, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.8531, + "reward": 0.8958333432674408, + "reward_std": 0.18303204327821732, + "rewards/tag_count_reward": 0.8958333432674408, "step": 471 }, { "clip_ratio": 0.0, - "completion_length": 663.1875305175781, + "completion_length": 697.7083435058594, "epoch": 0.472, - "grad_norm": 27.03718558919886, - "kl": 3.3515625, + "grad_norm": 12.9836550475911, + "kl": 0.697265625, "learning_rate": 6.710139192768694e-07, - "loss": 0.3514, - "reward": 1.6927083730697632, - "reward_std": 0.287121944129467, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083432674408, + "loss": 0.5237, + "reward": 0.8906250298023224, + "reward_std": 0.2117478996515274, + "rewards/tag_count_reward": 0.8906250298023224, "step": 472 }, { "clip_ratio": 0.0, - "completion_length": 508.2708435058594, + "completion_length": 719.5625305175781, "epoch": 0.473, - "grad_norm": 14.219380205160485, - "kl": 1.291015625, + "grad_norm": 65.68918290296943, + "kl": 0.767578125, "learning_rate": 6.695002528690639e-07, - "loss": 0.3057, - "reward": 1.9427083730697632, - "reward_std": 0.136161208152771, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.7192, + "reward": 0.9166666865348816, + "reward_std": 0.17281758040189743, + "rewards/tag_count_reward": 0.9166666865348816, "step": 473 }, { "clip_ratio": 0.0, - "completion_length": 645.7291870117188, + "completion_length": 815.1458435058594, "epoch": 0.474, - "grad_norm": 18.570958403967257, - "kl": 2.78125, + "grad_norm": 96.15686083214344, + "kl": 2.8984375, "learning_rate": 6.679851303883891e-07, - "loss": 0.2299, - "reward": 1.6979166865348816, - "reward_std": 0.2152324914932251, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833730697632, + "loss": 1.0024, + "reward": 0.8802083730697632, + "reward_std": 0.21328985691070557, + "rewards/tag_count_reward": 0.8802083730697632, "step": 474 }, { "clip_ratio": 0.0, - "completion_length": 522.6666870117188, + "completion_length": 737.3541870117188, "epoch": 0.475, - "grad_norm": 12.644910255118035, - "kl": 0.89599609375, + "grad_norm": 47.91657008217291, + "kl": 2.21875, "learning_rate": 6.664685702961344e-07, - "loss": 0.256, - "reward": 1.9114583730697632, - "reward_std": 0.2113637775182724, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.7382, + "reward": 0.8802083730697632, + "reward_std": 0.19346709549427032, + "rewards/tag_count_reward": 0.8802083730697632, "step": 475 }, { "clip_ratio": 0.0, - "completion_length": 459.4791717529297, + "completion_length": 692.2916870117188, "epoch": 0.476, - "grad_norm": 6.0913074350254846, - "kl": 0.5732421875, + "grad_norm": 257.53632873324773, + "kl": 4.984375, "learning_rate": 6.649505910711058e-07, - "loss": 0.1027, - "reward": 1.9635416865348816, - "reward_std": 0.11453906819224358, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.9263, + "reward": 0.8697916865348816, + "reward_std": 0.21693718433380127, + "rewards/tag_count_reward": 0.8697916865348816, "step": 476 }, { "clip_ratio": 0.0, - "completion_length": 552.3750152587891, + "completion_length": 849.0208435058594, "epoch": 0.477, - "grad_norm": 17.510693211836937, - "kl": 1.119140625, + "grad_norm": 97.85672791835103, + "kl": 2.6875, "learning_rate": 6.634312112094013e-07, - "loss": 0.3838, - "reward": 1.8958333730697632, - "reward_std": 0.2755969874560833, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.5899, + "reward": 0.8177083432674408, + "reward_std": 0.23917385935783386, + "rewards/tag_count_reward": 0.8177083432674408, "step": 477 }, { "clip_ratio": 0.0, - "completion_length": 570.9791870117188, + "completion_length": 730.5625, "epoch": 0.478, - "grad_norm": 6.018246594418881, - "kl": 1.2890625, + "grad_norm": 156.19302478120434, + "kl": 1.4296875, "learning_rate": 6.619104492241847e-07, - "loss": 0.2289, - "reward": 1.5885416865348816, - "reward_std": 0.3315717577934265, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083432674408, + "loss": 0.6196, + "reward": 0.8958333730697632, + "reward_std": 0.19614407420158386, + "rewards/tag_count_reward": 0.8958333730697632, "step": 478 }, { "clip_ratio": 0.0, - "completion_length": 523.3333587646484, + "completion_length": 574.5416870117188, "epoch": 0.479, - "grad_norm": 6.478822229591154, - "kl": 1.5390625, + "grad_norm": 103.28895867513401, + "kl": 1.53125, "learning_rate": 6.603883236454612e-07, - "loss": 0.4294, - "reward": 1.9010416865348816, - "reward_std": 0.2816907614469528, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.4779, + "reward": 0.8645833432674408, + "reward_std": 0.18251947313547134, + "rewards/tag_count_reward": 0.8645833432674408, "step": 479 }, { "clip_ratio": 0.0, - "completion_length": 685.7708435058594, + "completion_length": 679.7708740234375, "epoch": 0.48, - "grad_norm": 24.13751750023251, - "kl": 3.171875, + "grad_norm": 124.83742052045773, + "kl": 4.7421875, "learning_rate": 6.588648530198504e-07, - "loss": 0.4124, - "reward": 1.6302083730697632, - "reward_std": 0.27779868245124817, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083730697632, + "loss": 0.7442, + "reward": 0.8958333432674408, + "reward_std": 0.19233327358961105, + "rewards/tag_count_reward": 0.8958333432674408, "step": 480 }, { "clip_ratio": 0.0, - "completion_length": 496.54168701171875, + "completion_length": 473.1458435058594, "epoch": 0.481, - "grad_norm": 25.308366462199018, - "kl": 2.91796875, + "grad_norm": 35.46585217849751, + "kl": 1.1640625, "learning_rate": 6.573400559103613e-07, - "loss": 0.1922, - "reward": 1.5, - "reward_std": 0.1741945669054985, - "rewards/accuracy_reward": 0.5208333544433117, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.2258, + "reward": 0.9375000298023224, + "reward_std": 0.10045047849416733, + "rewards/tag_count_reward": 0.9375000298023224, "step": 481 }, { "clip_ratio": 0.0, - "completion_length": 574.125, + "completion_length": 436.4583435058594, "epoch": 0.482, - "grad_norm": 15.949086382375398, - "kl": 3.052734375, + "grad_norm": 22.23612829381126, + "kl": 0.796875, "learning_rate": 6.558139508961654e-07, - "loss": 0.2678, - "reward": 1.8333333730697632, - "reward_std": 0.22406283020973206, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.3157, + "reward": 0.953125, + "reward_std": 0.12124212644994259, + "rewards/tag_count_reward": 0.953125, "step": 482 }, { "clip_ratio": 0.0, - "completion_length": 466.41668701171875, + "completion_length": 388.62501525878906, "epoch": 0.483, - "grad_norm": 11.337596070425166, - "kl": 1.94921875, + "grad_norm": 11.049648561597081, + "kl": 0.7451171875, "learning_rate": 6.542865565723707e-07, - "loss": 0.3152, - "reward": 1.9687500596046448, - "reward_std": 0.08030500635504723, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.1719, + "reward": 0.9635416865348816, + "reward_std": 0.1110176295042038, + "rewards/tag_count_reward": 0.9635416865348816, "step": 483 }, { "clip_ratio": 0.0, - "completion_length": 730.1041870117188, + "completion_length": 444.0208435058594, "epoch": 0.484, - "grad_norm": 20.511565289173998, - "kl": 4.0234375, + "grad_norm": 17.789594235685367, + "kl": 0.91552734375, "learning_rate": 6.527578915497951e-07, - "loss": 0.3592, - "reward": 1.2760417461395264, - "reward_std": 0.3052527606487274, - "rewards/accuracy_reward": 0.4166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.2053, + "reward": 0.9739583730697632, + "reward_std": 0.06435108184814453, + "rewards/tag_count_reward": 0.9739583730697632, "step": 484 }, { "clip_ratio": 0.0, - "completion_length": 415.1458435058594, + "completion_length": 368.8541717529297, "epoch": 0.485, - "grad_norm": 3.0229154347861944, - "kl": 0.443359375, + "grad_norm": 2.529096020385133, + "kl": 0.20751953125, "learning_rate": 6.512279744547392e-07, - "loss": 0.0437, - "reward": 1.9687500596046448, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0085, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 485 }, { "clip_ratio": 0.0, - "completion_length": 567.5625305175781, + "completion_length": 415.62501525878906, "epoch": 0.486, - "grad_norm": 7.238574649674555, - "kl": 1.09765625, + "grad_norm": 19.63053426156352, + "kl": 1.0244140625, "learning_rate": 6.496968239287603e-07, - "loss": 0.0855, - "reward": 1.71875, - "reward_std": 0.14083527773618698, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.1081, + "reward": 0.9322916865348816, + "reward_std": 0.14217785745859146, + "rewards/tag_count_reward": 0.9322916865348816, "step": 486 }, { "clip_ratio": 0.0, - "completion_length": 665.0625, + "completion_length": 439.8333435058594, "epoch": 0.487, - "grad_norm": 10.887071644507325, - "kl": 1.56640625, + "grad_norm": 10.97524555021414, + "kl": 1.1123046875, "learning_rate": 6.481644586284442e-07, - "loss": 0.2154, - "reward": 1.6197917461395264, - "reward_std": 0.21968603879213333, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0812, + "reward": 0.8750000298023224, + "reward_std": 0.20938248187303543, + "rewards/tag_count_reward": 0.8750000298023224, "step": 487 }, { "clip_ratio": 0.0, - "completion_length": 438.93751525878906, + "completion_length": 399.47918701171875, "epoch": 0.488, - "grad_norm": 2.6014400076566426, - "kl": 0.39599609375, + "grad_norm": 9.925768908740274, + "kl": 0.724609375, "learning_rate": 6.466308972251785e-07, - "loss": 0.0596, - "reward": 1.953125, - "reward_std": 0.11028437316417694, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.1406, + "reward": 0.8541666865348816, + "reward_std": 0.21116547286510468, + "rewards/tag_count_reward": 0.8541666865348816, "step": 488 }, { "clip_ratio": 0.0, - "completion_length": 844.7083740234375, + "completion_length": 546.2708435058594, "epoch": 0.489, - "grad_norm": 11.515087358230776, - "kl": 0.93359375, + "grad_norm": 9.12846183905699, + "kl": 0.9140625, "learning_rate": 6.45096158404925e-07, - "loss": 0.3163, - "reward": 1.5937500596046448, - "reward_std": 0.38610778748989105, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.1185, + "reward": 0.75, + "reward_std": 0.22956441342830658, + "rewards/tag_count_reward": 0.75, "step": 489 }, { "clip_ratio": 0.0, - "completion_length": 566.8333740234375, + "completion_length": 466.31251525878906, "epoch": 0.49, - "grad_norm": 15.957099154959339, - "kl": 0.5205078125, + "grad_norm": 6.830735300703454, + "kl": 0.9296875, "learning_rate": 6.435602608679916e-07, - "loss": 0.237, - "reward": 1.8958333730697632, - "reward_std": 0.1992821916937828, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0974, + "reward": 0.7760416865348816, + "reward_std": 0.22821441292762756, + "rewards/tag_count_reward": 0.7760416865348816, "step": 490 }, { "clip_ratio": 0.0, - "completion_length": 632.25, + "completion_length": 482.18751525878906, "epoch": 0.491, - "grad_norm": 10.144639023870276, - "kl": 0.83984375, + "grad_norm": 15.773366828026825, + "kl": 0.88671875, "learning_rate": 6.420232233288055e-07, - "loss": 0.339, - "reward": 1.7343750596046448, - "reward_std": 0.3555731326341629, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.1663, + "reward": 0.8177083432674408, + "reward_std": 0.2563057094812393, + "rewards/tag_count_reward": 0.8177083432674408, "step": 491 }, { "clip_ratio": 0.0, - "completion_length": 501.87501525878906, + "completion_length": 426.91668701171875, "epoch": 0.492, - "grad_norm": 9.503038130574328, - "kl": 0.4912109375, + "grad_norm": 32.803915126715765, + "kl": 1.70703125, "learning_rate": 6.404850645156841e-07, - "loss": 0.1848, - "reward": 1.8802083730697632, - "reward_std": 0.18833883479237556, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0887, + "reward": 0.8489583432674408, + "reward_std": 0.20587080717086792, + "rewards/tag_count_reward": 0.8489583432674408, "step": 492 }, { "clip_ratio": 0.0, - "completion_length": 542.3333740234375, + "completion_length": 436.7083435058594, "epoch": 0.493, - "grad_norm": 8.578506476897093, - "kl": 0.623046875, + "grad_norm": 39.13983549538198, + "kl": 2.1015625, "learning_rate": 6.389458031706068e-07, - "loss": 0.2166, - "reward": 1.6770833730697632, - "reward_std": 0.1889241859316826, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.1392, + "reward": 0.8593750298023224, + "reward_std": 0.2021520510315895, + "rewards/tag_count_reward": 0.8593750298023224, "step": 493 }, { "clip_ratio": 0.0, - "completion_length": 444.8958435058594, + "completion_length": 455.3958435058594, "epoch": 0.494, - "grad_norm": 18.133235922465023, - "kl": 0.50048828125, + "grad_norm": 12.80499937349423, + "kl": 1.44140625, "learning_rate": 6.374054580489873e-07, - "loss": -0.0821, - "reward": 2.0, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1327, + "reward": 0.8125000298023224, + "reward_std": 0.2328476831316948, + "rewards/tag_count_reward": 0.8125000298023224, "step": 494 }, { "clip_ratio": 0.0, - "completion_length": 569.2708435058594, + "completion_length": 491.8125, "epoch": 0.495, - "grad_norm": 5.536746511290644, - "kl": 1.0, + "grad_norm": 21.581406879814047, + "kl": 1.51953125, "learning_rate": 6.358640479194451e-07, - "loss": 0.0665, - "reward": 1.6822916865348816, - "reward_std": 0.2431003749370575, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.1651, + "reward": 0.8333333730697632, + "reward_std": 0.22698140144348145, + "rewards/tag_count_reward": 0.8333333730697632, "step": 495 }, { "clip_ratio": 0.0, - "completion_length": 713.8125, + "completion_length": 495.04168701171875, "epoch": 0.496, - "grad_norm": 8.621719433085927, - "kl": 2.59375, + "grad_norm": 11.910719077247448, + "kl": 1.28515625, "learning_rate": 6.343215915635761e-07, - "loss": 0.5833, - "reward": 1.7500000596046448, - "reward_std": 0.36013171076774597, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8333333730697632, + "loss": 0.1314, + "reward": 0.7760416865348816, + "reward_std": 0.22269393503665924, + "rewards/tag_count_reward": 0.7760416865348816, "step": 496 }, { "clip_ratio": 0.0, - "completion_length": 604.2291717529297, + "completion_length": 423.04168701171875, "epoch": 0.497, - "grad_norm": 14.910085461123963, - "kl": 1.90185546875, + "grad_norm": 18.119709925914574, + "kl": 1.55859375, "learning_rate": 6.327781077757241e-07, - "loss": 0.0438, - "reward": 1.8229166865348816, - "reward_std": 0.13010412454605103, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.1089, + "reward": 0.7864583432674408, + "reward_std": 0.22677022963762283, + "rewards/tag_count_reward": 0.7864583432674408, "step": 497 }, { "clip_ratio": 0.0, - "completion_length": 545.8750305175781, + "completion_length": 411.8125, "epoch": 0.498, - "grad_norm": 3.73145441743871, - "kl": 0.86669921875, + "grad_norm": 8.939652278302464, + "kl": 0.693359375, "learning_rate": 6.31233615362752e-07, - "loss": 0.2215, - "reward": 1.9479166865348816, - "reward_std": 0.13466879725456238, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.06, + "reward": 0.90625, + "reward_std": 0.1671900376677513, + "rewards/tag_count_reward": 0.90625, "step": 498 }, { "clip_ratio": 0.0, - "completion_length": 703.8333435058594, + "completion_length": 475.50001525878906, "epoch": 0.499, - "grad_norm": 12.13954312660943, - "kl": 1.96875, + "grad_norm": 242.16416813046348, + "kl": 3.513671875, "learning_rate": 6.296881331438126e-07, - "loss": 0.1925, - "reward": 1.6302083730697632, - "reward_std": 0.34837743639945984, - "rewards/accuracy_reward": 0.7083333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.3017, + "reward": 0.8541666865348816, + "reward_std": 0.21013373881578445, + "rewards/tag_count_reward": 0.8541666865348816, "step": 499 }, { "clip_ratio": 0.0, - "completion_length": 491.54168701171875, + "completion_length": 478.75001525878906, "epoch": 0.5, - "grad_norm": 7.134922053152654, - "kl": 0.888671875, + "grad_norm": 9.289622996609156, + "kl": 1.484375, "learning_rate": 6.281416799501187e-07, - "loss": 0.1284, - "reward": 1.9583333730697632, - "reward_std": 0.08141736686229706, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1325, + "reward": 0.8333333432674408, + "reward_std": 0.2499879226088524, + "rewards/tag_count_reward": 0.8333333432674408, "step": 500 }, { "clip_ratio": 0.0, - "completion_length": 409.7291717529297, + "completion_length": 434.0833435058594, "epoch": 0.501, - "grad_norm": 1.6621261479363783, - "kl": 0.18798828125, + "grad_norm": 72.80609003854057, + "kl": 2.296875, "learning_rate": 6.265942746247146e-07, - "loss": 0.003, - "reward": 1.9947916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.3123, + "reward": 0.90625, + "reward_std": 0.1826242059469223, + "rewards/tag_count_reward": 0.90625, "step": 501 }, { "clip_ratio": 0.0, - "completion_length": 408.2083435058594, + "completion_length": 397.0208435058594, "epoch": 0.502, - "grad_norm": 6.153264949056815, - "kl": 0.39208984375, + "grad_norm": 10.01226903410698, + "kl": 1.0859375, "learning_rate": 6.25045936022246e-07, - "loss": 0.0791, - "reward": 1.9791666865348816, - "reward_std": 0.05689104646444321, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1132, + "reward": 0.9114583432674408, + "reward_std": 0.18020494282245636, + "rewards/tag_count_reward": 0.9114583432674408, "step": 502 }, { "clip_ratio": 0.0, - "completion_length": 417.8333435058594, + "completion_length": 378.6041717529297, "epoch": 0.503, - "grad_norm": 4.7770952919487195, - "kl": 0.4462890625, + "grad_norm": 25.30471139536292, + "kl": 1.4609375, "learning_rate": 6.2349668300873e-07, - "loss": 0.1043, - "reward": 1.9739583730697632, - "reward_std": 0.07278125733137131, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1484, + "reward": 0.9375000298023224, + "reward_std": 0.1506819725036621, + "rewards/tag_count_reward": 0.9375000298023224, "step": 503 }, { "clip_ratio": 0.0, - "completion_length": 495.9166717529297, + "completion_length": 413.00001525878906, "epoch": 0.504, - "grad_norm": 11.547329552573979, - "kl": 1.17138671875, + "grad_norm": 26.653952653018365, + "kl": 2.2890625, "learning_rate": 6.219465344613258e-07, - "loss": 0.0946, - "reward": 1.890625, - "reward_std": 0.11654654145240784, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.2085, + "reward": 0.90625, + "reward_std": 0.17194338142871857, + "rewards/tag_count_reward": 0.90625, "step": 504 }, { "clip_ratio": 0.0, - "completion_length": 713.5208740234375, + "completion_length": 417.0208435058594, "epoch": 0.505, - "grad_norm": 22.12587453073882, - "kl": 4.2734375, + "grad_norm": 18.34064468054739, + "kl": 1.0390625, "learning_rate": 6.203955092681039e-07, - "loss": 0.5008, - "reward": 1.6927083730697632, - "reward_std": 0.3396236300468445, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8177083432674408, + "loss": 0.1238, + "reward": 0.9375000298023224, + "reward_std": 0.13520457595586777, + "rewards/tag_count_reward": 0.9375000298023224, "step": 505 }, { "clip_ratio": 0.0, - "completion_length": 552.4583435058594, + "completion_length": 394.56251525878906, "epoch": 0.506, - "grad_norm": 14.450009773077992, - "kl": 2.3828125, + "grad_norm": 14.306057688537363, + "kl": 0.4296875, "learning_rate": 6.188436263278172e-07, - "loss": 0.3484, - "reward": 1.8437500596046448, - "reward_std": 0.2805076912045479, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.1227, + "reward": 0.9531250298023224, + "reward_std": 0.12172887474298477, + "rewards/tag_count_reward": 0.9531250298023224, "step": 506 }, { "clip_ratio": 0.0, - "completion_length": 431.8541717529297, + "completion_length": 402.9166717529297, "epoch": 0.507, - "grad_norm": 5.3963470752817555, - "kl": 0.787109375, + "grad_norm": 43.426631300255, + "kl": 1.783203125, "learning_rate": 6.172909045496694e-07, - "loss": 0.1626, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1454, + "reward": 0.9479166865348816, + "reward_std": 0.13458874076604843, + "rewards/tag_count_reward": 0.9479166865348816, "step": 507 }, { "clip_ratio": 0.0, - "completion_length": 431.37501525878906, + "completion_length": 410.9791717529297, "epoch": 0.508, - "grad_norm": 4.646295127312127, - "kl": 0.5625, + "grad_norm": 8.009705320931293, + "kl": 0.3251953125, "learning_rate": 6.157373628530852e-07, - "loss": 0.0235, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0487, + "reward": 0.9531250298023224, + "reward_std": 0.14710202813148499, + "rewards/tag_count_reward": 0.9531250298023224, "step": 508 }, { "clip_ratio": 0.0, - "completion_length": 682.4583587646484, + "completion_length": 441.77085876464844, "epoch": 0.509, - "grad_norm": 6.148635226630127, - "kl": 1.7744140625, + "grad_norm": 7.064052309828297, + "kl": 0.5234375, "learning_rate": 6.141830201674802e-07, - "loss": 0.1548, - "reward": 1.7916667461395264, - "reward_std": 0.27450863271951675, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8750000298023224, + "loss": 0.0775, + "reward": 0.96875, + "reward_std": 0.07769769430160522, + "rewards/tag_count_reward": 0.96875, "step": 509 }, { "clip_ratio": 0.0, - "completion_length": 614.5000152587891, + "completion_length": 459.6666717529297, "epoch": 0.51, - "grad_norm": 14.593554160855128, - "kl": 0.9921875, + "grad_norm": 30.244709764967375, + "kl": 1.96875, "learning_rate": 6.126278954320294e-07, - "loss": 0.3892, - "reward": 1.9427083730697632, - "reward_std": 0.20130915194749832, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.2789, + "reward": 0.9270833432674408, + "reward_std": 0.18332497030496597, + "rewards/tag_count_reward": 0.9270833432674408, "step": 510 }, { "clip_ratio": 0.0, - "completion_length": 495.3541717529297, + "completion_length": 415.2083435058594, "epoch": 0.511, - "grad_norm": 5.440825841913161, - "kl": 0.8994140625, + "grad_norm": 115.24412450112247, + "kl": 3.8671875, "learning_rate": 6.11072007595437e-07, - "loss": 0.0888, - "reward": 1.8385416865348816, - "reward_std": 0.18555867671966553, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.2433, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 511 }, { "clip_ratio": 0.0, - "completion_length": 740.6458435058594, + "completion_length": 382.3958435058594, "epoch": 0.512, - "grad_norm": 33.765959368127625, - "kl": 2.3671875, + "grad_norm": 195.61340613748922, + "kl": 8.140625, "learning_rate": 6.095153756157051e-07, - "loss": 0.2329, - "reward": 1.6354166865348816, - "reward_std": 0.3800914138555527, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8229166865348816, + "loss": 0.5186, + "reward": 0.9531250298023224, + "reward_std": 0.11535455286502838, + "rewards/tag_count_reward": 0.9531250298023224, "step": 512 }, { "clip_ratio": 0.0, - "completion_length": 462.4375305175781, + "completion_length": 371.5208435058594, "epoch": 0.513, - "grad_norm": 7.586108548829313, - "kl": 0.41357421875, + "grad_norm": 16.785721604403555, + "kl": 1.45703125, "learning_rate": 6.079580184599032e-07, - "loss": -0.0538, - "reward": 1.7604166865348816, - "reward_std": 0.04485878348350525, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.084, + "reward": 0.9739583432674408, + "reward_std": 0.09021097421646118, + "rewards/tag_count_reward": 0.9739583432674408, "step": 513 }, { "clip_ratio": 0.0, - "completion_length": 472.68751525878906, + "completion_length": 353.4583435058594, "epoch": 0.514, - "grad_norm": 6.163156016133745, - "kl": 0.38037109375, + "grad_norm": 4.699772991934067, + "kl": 0.38330078125, "learning_rate": 6.06399955103937e-07, - "loss": 0.1466, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0608, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 514 }, { "clip_ratio": 0.0, - "completion_length": 395.1458435058594, + "completion_length": 390.2291717529297, "epoch": 0.515, - "grad_norm": 3.2391516961509192, - "kl": 0.117431640625, + "grad_norm": 14.015572428904417, + "kl": 0.80859375, "learning_rate": 6.048412045323164e-07, - "loss": 0.0257, - "reward": 1.9583333730697632, - "reward_std": 0.09731237590312958, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1295, + "reward": 0.9531250298023224, + "reward_std": 0.12172887474298477, + "rewards/tag_count_reward": 0.9531250298023224, "step": 515 }, { "clip_ratio": 0.0, - "completion_length": 473.4583435058594, + "completion_length": 371.0208435058594, "epoch": 0.516, - "grad_norm": 5.042245384135466, - "kl": 0.3955078125, + "grad_norm": 7.072123039382963, + "kl": 0.151611328125, "learning_rate": 6.032817857379256e-07, - "loss": -0.0055, - "reward": 1.578125, - "reward_std": 0.21512985974550247, - "rewards/accuracy_reward": 0.625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0768, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 516 }, { "clip_ratio": 0.0, - "completion_length": 420.56251525878906, + "completion_length": 348.2291717529297, "epoch": 0.517, - "grad_norm": 8.621199763513951, - "kl": 0.29443359375, + "grad_norm": 1.6403662268885855, + "kl": 0.15283203125, "learning_rate": 6.017217177217899e-07, - "loss": 0.1731, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0061, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 517 }, { "clip_ratio": 0.0, - "completion_length": 426.7708435058594, + "completion_length": 354.5833435058594, "epoch": 0.518, - "grad_norm": 1.3715084990602338, - "kl": 0.121826171875, + "grad_norm": 494.9613593566199, + "kl": 7.6015625, "learning_rate": 6.001610194928464e-07, - "loss": 0.0163, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.6701, + "reward": 0.9739583432674408, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583432674408, "step": 518 }, { "clip_ratio": 0.0, - "completion_length": 657.9583740234375, + "completion_length": 435.29168701171875, "epoch": 0.519, - "grad_norm": 10.390490000193223, - "kl": 0.755859375, + "grad_norm": 14.551518722427685, + "kl": 0.8330078125, "learning_rate": 5.985997100677103e-07, - "loss": 0.334, - "reward": 1.8593750596046448, - "reward_std": 0.21416960284113884, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0463, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 519 }, { "clip_ratio": 0.0, - "completion_length": 445.81251525878906, + "completion_length": 378.1458435058594, "epoch": 0.52, - "grad_norm": 9.36630447405393, - "kl": 0.2421875, + "grad_norm": 12.964649241316112, + "kl": 0.915283203125, "learning_rate": 5.97037808470444e-07, - "loss": 0.114, - "reward": 1.7708333730697632, - "reward_std": 0.14031123742461205, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0657, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 520 }, { "clip_ratio": 0.0, - "completion_length": 460.875, + "completion_length": 385.9583435058594, "epoch": 0.521, - "grad_norm": 2.673721551693534, - "kl": 0.322021484375, + "grad_norm": 51.96366845104974, + "kl": 1.982421875, "learning_rate": 5.954753337323259e-07, - "loss": 0.0717, - "reward": 1.984375, - "reward_std": 0.03884884715080261, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1629, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 521 }, { "clip_ratio": 0.0, - "completion_length": 581.8125305175781, + "completion_length": 379.1666717529297, "epoch": 0.522, - "grad_norm": 8.574045801589394, - "kl": 0.9609375, + "grad_norm": 10.509987238838747, + "kl": 0.7578125, "learning_rate": 5.939123048916173e-07, - "loss": 0.0336, - "reward": 1.6041667461395264, - "reward_std": 0.3190048784017563, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0965, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, + "rewards/tag_count_reward": 0.9687500298023224, "step": 522 }, { "clip_ratio": 0.0, - "completion_length": 547.3333587646484, + "completion_length": 366.7916717529297, "epoch": 0.523, - "grad_norm": 12.240069295990747, - "kl": 0.998046875, + "grad_norm": 8.71809976619088, + "kl": 0.42626953125, "learning_rate": 5.923487409933315e-07, - "loss": 0.4171, - "reward": 1.9583333730697632, - "reward_std": 0.1290598213672638, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0853, + "reward": 0.9739583432674408, + "reward_std": 0.09021097421646118, + "rewards/tag_count_reward": 0.9739583432674408, "step": 523 }, { "clip_ratio": 0.0, - "completion_length": 637.1458435058594, + "completion_length": 409.75001525878906, "epoch": 0.524, - "grad_norm": 4.801577378029944, - "kl": 1.60546875, + "grad_norm": 26.138982212177606, + "kl": 1.822265625, "learning_rate": 5.907846610890011e-07, - "loss": 0.3953, - "reward": 1.6718750596046448, - "reward_std": 0.44389602541923523, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1788, + "reward": 0.9583333432674408, + "reward_std": 0.11287746578454971, + "rewards/tag_count_reward": 0.9583333432674408, "step": 524 }, { "clip_ratio": 0.0, - "completion_length": 655.8541870117188, + "completion_length": 403.0416717529297, "epoch": 0.525, - "grad_norm": 8.385477085385283, - "kl": 1.40234375, + "grad_norm": 7.60573189749456, + "kl": 0.4453125, "learning_rate": 5.892200842364462e-07, - "loss": 0.1879, - "reward": 1.6250000596046448, - "reward_std": 0.26600293815135956, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0792, + "reward": 0.9739583432674408, + "reward_std": 0.06317693740129471, + "rewards/tag_count_reward": 0.9739583432674408, "step": 525 }, { "clip_ratio": 0.0, - "completion_length": 656.8333435058594, + "completion_length": 391.6666717529297, "epoch": 0.526, - "grad_norm": 4.900789983639907, - "kl": 1.886962890625, + "grad_norm": 162.43944905543728, + "kl": 5.5390625, "learning_rate": 5.87655029499542e-07, - "loss": 0.3296, - "reward": 1.9010416865348816, - "reward_std": 0.15065115690231323, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.3085, + "reward": 0.9427083432674408, + "reward_std": 0.09432543441653252, + "rewards/tag_count_reward": 0.9427083432674408, "step": 526 }, { "clip_ratio": 0.0, - "completion_length": 715.4791870117188, + "completion_length": 441.8333435058594, "epoch": 0.527, - "grad_norm": 10.10301825544499, - "kl": 2.04296875, + "grad_norm": 222.20817265394626, + "kl": 6.86328125, "learning_rate": 5.860895159479864e-07, - "loss": 0.2304, - "reward": 1.6041666865348816, - "reward_std": 0.2951066195964813, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.3774, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 527 }, { "clip_ratio": 0.0, - "completion_length": 576.2708740234375, + "completion_length": 467.8958435058594, "epoch": 0.528, - "grad_norm": 3.705372796686908, - "kl": 1.12890625, + "grad_norm": 107.24042987401106, + "kl": 5.59375, "learning_rate": 5.845235626570683e-07, - "loss": 0.2558, - "reward": 1.8229166865348816, - "reward_std": 0.33442574739456177, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.3771, + "reward": 0.9427083432674408, + "reward_std": 0.14112106710672379, + "rewards/tag_count_reward": 0.9427083432674408, "step": 528 }, { "clip_ratio": 0.0, - "completion_length": 515.9166870117188, + "completion_length": 457.8333435058594, "epoch": 0.529, - "grad_norm": 2.654063413992035, - "kl": 0.57177734375, + "grad_norm": 31.326251648615383, + "kl": 1.126953125, "learning_rate": 5.829571887074343e-07, - "loss": 0.1588, - "reward": 1.7083333730697632, - "reward_std": 0.08141736686229706, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.2654, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 529 }, { "clip_ratio": 0.0, - "completion_length": 491.8750305175781, + "completion_length": 431.06251525878906, "epoch": 0.53, - "grad_norm": 3.4052549059286603, - "kl": 0.63427734375, + "grad_norm": 8.097526346389188, + "kl": 0.5712890625, "learning_rate": 5.813904131848564e-07, - "loss": 0.0414, - "reward": 1.7864583730697632, - "reward_std": 0.11135252565145493, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0689, + "reward": 0.9635416865348816, + "reward_std": 0.12629536539316177, + "rewards/tag_count_reward": 0.9635416865348816, "step": 530 }, { "clip_ratio": 0.0, - "completion_length": 486.0625305175781, + "completion_length": 451.0208435058594, "epoch": 0.531, - "grad_norm": 4.683290624164214, - "kl": 0.2998046875, + "grad_norm": 18.943146834182464, + "kl": 0.8515625, "learning_rate": 5.798232551800002e-07, - "loss": 0.0549, - "reward": 1.9687500596046448, - "reward_std": 0.10825316980481148, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.3034, + "reward": 0.9166666865348816, + "reward_std": 0.18585222214460373, + "rewards/tag_count_reward": 0.9166666865348816, "step": 531 }, { "clip_ratio": 0.0, - "completion_length": 458.62501525878906, + "completion_length": 364.68751525878906, "epoch": 0.532, - "grad_norm": 6.136399618458228, - "kl": 0.31298828125, + "grad_norm": 8.498932430014783, + "kl": 0.5078125, "learning_rate": 5.78255733788191e-07, - "loss": 0.2051, - "reward": 1.9166666865348816, - "reward_std": 0.22756417095661163, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0566, + "reward": 0.9739583432674408, + "reward_std": 0.0749332383275032, + "rewards/tag_count_reward": 0.9739583432674408, "step": 532 }, { "clip_ratio": 0.0, - "completion_length": 523.3125152587891, + "completion_length": 358.81251525878906, "epoch": 0.533, - "grad_norm": 5.168550237418491, - "kl": 0.5205078125, + "grad_norm": 9.600836590427424, + "kl": 0.34765625, "learning_rate": 5.766878681091828e-07, - "loss": 0.1878, - "reward": 1.9687500596046448, - "reward_std": 0.0908234529197216, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, + "loss": 0.1632, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, "rewards/tag_count_reward": 0.9687500298023224, "step": 533 }, { "clip_ratio": 0.0, - "completion_length": 553.7708435058594, + "completion_length": 396.7708435058594, "epoch": 0.534, - "grad_norm": 6.544521956408023, - "kl": 0.48046875, + "grad_norm": 27.376458471111953, + "kl": 0.9453125, "learning_rate": 5.751196772469237e-07, - "loss": 0.2071, - "reward": 1.6875, - "reward_std": 0.15811091661453247, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.1872, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, + "rewards/tag_count_reward": 0.9687500298023224, "step": 534 }, { "clip_ratio": 0.0, - "completion_length": 495.3125305175781, + "completion_length": 391.5208435058594, "epoch": 0.535, - "grad_norm": 6.335961191604862, - "kl": 0.689453125, + "grad_norm": 25.511749648224892, + "kl": 1.671875, "learning_rate": 5.735511803093248e-07, - "loss": 0.0117, - "reward": 1.7916666865348816, - "reward_std": 0.14031124114990234, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.1309, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 535 }, { "clip_ratio": 0.0, - "completion_length": 412.0416717529297, + "completion_length": 362.8541717529297, "epoch": 0.536, - "grad_norm": 0.6794510063356352, - "kl": 0.1484375, + "grad_norm": 98.56527960040326, + "kl": 4.28125, "learning_rate": 5.71982396408026e-07, - "loss": 0.0062, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.2705, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 536 }, { "clip_ratio": 0.0, - "completion_length": 618.0416870117188, + "completion_length": 395.72918701171875, "epoch": 0.537, - "grad_norm": 12.597082941716934, - "kl": 1.8671875, + "grad_norm": 30.228185583257314, + "kl": 1.845703125, "learning_rate": 5.704133446581642e-07, - "loss": 0.2081, - "reward": 1.7916667461395264, - "reward_std": 0.2311250939965248, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.1388, + "reward": 0.9739583432674408, + "reward_std": 0.04956009238958359, + "rewards/tag_count_reward": 0.9739583432674408, "step": 537 }, { "clip_ratio": 0.0, - "completion_length": 607.9791870117188, + "completion_length": 424.54168701171875, "epoch": 0.538, - "grad_norm": 4.1433656398497405, - "kl": 0.9560546875, + "grad_norm": 55.78191720577316, + "kl": 2.5703125, "learning_rate": 5.688440441781398e-07, - "loss": 0.1982, - "reward": 1.9322916865348816, - "reward_std": 0.16619428992271423, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.1954, + "reward": 0.9687500298023224, + "reward_std": 0.0929754376411438, + "rewards/tag_count_reward": 0.9687500298023224, "step": 538 }, { "clip_ratio": 0.0, - "completion_length": 493.8333435058594, + "completion_length": 392.50001525878906, "epoch": 0.539, - "grad_norm": 6.729068770057365, - "kl": 0.54345703125, + "grad_norm": 12.773537553545584, + "kl": 0.94921875, "learning_rate": 5.672745140893839e-07, - "loss": 0.2218, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1353, + "reward": 0.9687500298023224, + "reward_std": 0.08474056795239449, + "rewards/tag_count_reward": 0.9687500298023224, "step": 539 }, { "clip_ratio": 0.0, - "completion_length": 534.9375, + "completion_length": 420.7291717529297, "epoch": 0.54, - "grad_norm": 14.498748193688524, - "kl": 0.830078125, + "grad_norm": 4.149720436875501, + "kl": 0.828125, "learning_rate": 5.657047735161255e-07, - "loss": 0.3991, - "reward": 1.8281250596046448, - "reward_std": 0.2755082845687866, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, + "loss": 0.067, + "reward": 0.9531250298023224, + "reward_std": 0.11773939803242683, "rewards/tag_count_reward": 0.9531250298023224, "step": 540 }, { "clip_ratio": 0.0, - "completion_length": 491.31251525878906, + "completion_length": 377.4791717529297, "epoch": 0.541, - "grad_norm": 4.973785971547981, - "kl": 0.7734375, + "grad_norm": 5.7930433922410325, + "kl": 0.5126953125, "learning_rate": 5.641348415851577e-07, - "loss": 0.1901, - "reward": 1.8593750596046448, - "reward_std": 0.17305626347661018, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.052, + "reward": 0.9687500298023224, + "reward_std": 0.07298427075147629, + "rewards/tag_count_reward": 0.9687500298023224, "step": 541 }, { "clip_ratio": 0.0, - "completion_length": 664.6250305175781, + "completion_length": 479.1041717529297, "epoch": 0.542, - "grad_norm": 5.871538553513047, - "kl": 1.828125, + "grad_norm": 16.982699499324443, + "kl": 0.515625, "learning_rate": 5.625647374256061e-07, - "loss": 0.2915, - "reward": 1.7552083730697632, - "reward_std": 0.34619835019111633, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.1876, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 542 }, { "clip_ratio": 0.0, - "completion_length": 699.7083740234375, + "completion_length": 356.4375, "epoch": 0.543, - "grad_norm": 7.572949815456705, - "kl": 2.484375, + "grad_norm": 1.042112146020366, + "kl": 0.132568359375, "learning_rate": 5.60994480168694e-07, - "loss": 0.3509, - "reward": 1.6406250596046448, - "reward_std": 0.42194561660289764, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 543 }, { "clip_ratio": 0.0, - "completion_length": 602.6041870117188, + "completion_length": 420.87501525878906, "epoch": 0.544, - "grad_norm": 8.302998278882528, - "kl": 2.06640625, + "grad_norm": 12.484906716391471, + "kl": 0.611328125, "learning_rate": 5.594240889475106e-07, - "loss": 0.2011, - "reward": 1.7343750596046448, - "reward_std": 0.2437637597322464, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.1302, + "reward": 0.9791666865348816, + "reward_std": 0.05689104273915291, + "rewards/tag_count_reward": 0.9791666865348816, "step": 544 }, { "clip_ratio": 0.0, - "completion_length": 716.7083435058594, + "completion_length": 419.4583435058594, "epoch": 0.545, - "grad_norm": 23.266933352107536, - "kl": 3.7265625, + "grad_norm": 8.979713953123971, + "kl": 0.5458984375, "learning_rate": 5.578535828967777e-07, - "loss": 0.53, - "reward": 1.5729167461395264, - "reward_std": 0.4846072643995285, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0264, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 545 }, { "clip_ratio": 0.0, - "completion_length": 686.7916870117188, + "completion_length": 472.7291717529297, "epoch": 0.546, - "grad_norm": 25.051591548627965, - "kl": 2.900390625, + "grad_norm": 11.351801658638019, + "kl": 1.1796875, "learning_rate": 5.562829811526154e-07, - "loss": 0.41, - "reward": 1.7031250596046448, - "reward_std": 0.28520841151475906, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.2698, + "reward": 0.9479166865348816, + "reward_std": 0.13466878980398178, + "rewards/tag_count_reward": 0.9479166865348816, "step": 546 }, { "clip_ratio": 0.0, - "completion_length": 669.0416870117188, + "completion_length": 414.3125, "epoch": 0.547, - "grad_norm": 5.937024042953502, - "kl": 2.2578125, + "grad_norm": 5.840490960004321, + "kl": 0.517578125, "learning_rate": 5.547123028523106e-07, - "loss": 0.4504, - "reward": 1.8541667461395264, - "reward_std": 0.32571033388376236, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.1347, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 547 }, { "clip_ratio": 0.0, - "completion_length": 465.66668701171875, + "completion_length": 421.12501525878906, "epoch": 0.548, - "grad_norm": 7.7019693307956425, - "kl": 0.5009765625, + "grad_norm": 31.369294570677745, + "kl": 1.8203125, "learning_rate": 5.531415671340826e-07, - "loss": 0.145, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1599, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 548 }, { "clip_ratio": 0.0, - "completion_length": 449.1666717529297, + "completion_length": 367.3333435058594, "epoch": 0.549, - "grad_norm": 3.2056006160535917, - "kl": 0.278076171875, + "grad_norm": 3.215178196877606, + "kl": 0.24951171875, "learning_rate": 5.515707931368507e-07, - "loss": 0.0235, - "reward": 2.0, + "loss": 0.011, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 549 }, { "clip_ratio": 0.0, - "completion_length": 731.1250305175781, + "completion_length": 407.7083435058594, "epoch": 0.55, - "grad_norm": 14.16934382823151, - "kl": 2.12890625, + "grad_norm": 4.109487023976539, + "kl": 0.309814453125, "learning_rate": 5.5e-07, - "loss": 0.5906, - "reward": 1.734375, - "reward_std": 0.4276340454816818, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.0139, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 550 }, { "clip_ratio": 0.0, - "completion_length": 744.0000305175781, + "completion_length": 434.8541717529297, "epoch": 0.551, - "grad_norm": 8.356917827415577, - "kl": 2.4296875, + "grad_norm": 5.49398024609441, + "kl": 0.248291015625, "learning_rate": 5.484292068631494e-07, - "loss": 0.5004, - "reward": 1.6927083730697632, - "reward_std": 0.37811143696308136, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8385416865348816, + "loss": 0.0506, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 551 }, { "clip_ratio": 0.0, - "completion_length": 740.6458435058594, + "completion_length": 411.3333435058594, "epoch": 0.552, - "grad_norm": 7.034253450240812, - "kl": 2.484375, + "grad_norm": 10.446442147389755, + "kl": 0.671875, "learning_rate": 5.468584328659172e-07, - "loss": 0.5062, - "reward": 1.7916666865348816, - "reward_std": 0.2923833690583706, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8333333432674408, + "loss": 0.0515, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 552 }, { "clip_ratio": 0.0, - "completion_length": 604.3125305175781, + "completion_length": 407.9583435058594, "epoch": 0.553, - "grad_norm": 7.996495891288037, - "kl": 1.8916015625, + "grad_norm": 11.659490993973444, + "kl": 0.26318359375, "learning_rate": 5.452876971476896e-07, - "loss": 0.1856, - "reward": 1.8385416865348816, - "reward_std": 0.1234515830874443, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.0936, + "reward": 0.9739583432674408, + "reward_std": 0.09021097421646118, + "rewards/tag_count_reward": 0.9739583432674408, "step": 553 }, { "clip_ratio": 0.0, - "completion_length": 633.3541870117188, + "completion_length": 392.25, "epoch": 0.554, - "grad_norm": 16.0086645550577, - "kl": 1.28515625, + "grad_norm": 6.929076125153229, + "kl": 0.462890625, "learning_rate": 5.437170188473847e-07, - "loss": 0.0317, - "reward": 1.7552083730697632, - "reward_std": 0.20322755724191666, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0281, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 554 }, { "clip_ratio": 0.0, - "completion_length": 659.4166870117188, + "completion_length": 400.8958435058594, "epoch": 0.555, - "grad_norm": 10.504024001155507, - "kl": 2.4296875, + "grad_norm": 0.2713283619354236, + "kl": 0.105712890625, "learning_rate": 5.421464171032224e-07, - "loss": 0.3636, - "reward": 1.7395833730697632, - "reward_std": 0.25392141938209534, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.0044, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 555 }, { "clip_ratio": 0.0, - "completion_length": 635.4166870117188, + "completion_length": 384.9791717529297, "epoch": 0.556, - "grad_norm": 5.286799663719701, - "kl": 1.83984375, + "grad_norm": 10.336134198326514, + "kl": 0.837890625, "learning_rate": 5.405759110524894e-07, - "loss": 0.1763, - "reward": 1.7968750596046448, - "reward_std": 0.18324360251426697, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.1197, + "reward": 0.9739583730697632, + "reward_std": 0.06669837608933449, + "rewards/tag_count_reward": 0.9739583730697632, "step": 556 }, { "clip_ratio": 0.0, - "completion_length": 583.0833435058594, + "completion_length": 459.8958435058594, "epoch": 0.557, - "grad_norm": 6.485103886137227, - "kl": 1.6875, + "grad_norm": 106.61184551140086, + "kl": 3.3203125, "learning_rate": 5.390055198313061e-07, - "loss": 0.3552, - "reward": 1.7760416865348816, - "reward_std": 0.31283604353666306, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.256, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 557 }, { "clip_ratio": 0.0, - "completion_length": 639.3750152587891, + "completion_length": 350.85418701171875, "epoch": 0.558, - "grad_norm": 5.735814782993109, - "kl": 1.5927734375, + "grad_norm": 29.507973013028842, + "kl": 1.607421875, "learning_rate": 5.37435262574394e-07, - "loss": 0.2224, - "reward": 1.7291667461395264, - "reward_std": 0.31504253298044205, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8750000298023224, + "loss": 0.1244, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 558 }, { "clip_ratio": 0.0, - "completion_length": 535.4583435058594, + "completion_length": 422.00001525878906, "epoch": 0.559, - "grad_norm": 17.176190412428053, - "kl": 0.716796875, + "grad_norm": 13.652699765057086, + "kl": 1.53125, "learning_rate": 5.358651584148423e-07, - "loss": 0.3629, - "reward": 1.8854167461395264, - "reward_std": 0.21260907500982285, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.171, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 559 }, { "clip_ratio": 0.0, - "completion_length": 687.1250305175781, + "completion_length": 416.2291717529297, "epoch": 0.56, - "grad_norm": 11.538916297682759, - "kl": 1.671875, + "grad_norm": 17.52960900631826, + "kl": 0.5546875, "learning_rate": 5.342952264838747e-07, - "loss": 0.4309, - "reward": 1.640625, - "reward_std": 0.40570642054080963, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 560 }, { "clip_ratio": 0.0, - "completion_length": 857.2708740234375, + "completion_length": 421.68751525878906, "epoch": 0.561, - "grad_norm": 14.259529042888598, - "kl": 3.06640625, + "grad_norm": 13.64501272217535, + "kl": 0.611328125, "learning_rate": 5.32725485910616e-07, - "loss": 0.3609, - "reward": 1.4791666865348816, - "reward_std": 0.4655623733997345, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7708333432674408, + "loss": 0.1599, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 561 }, { "clip_ratio": 0.0, - "completion_length": 607.1666870117188, + "completion_length": 391.875, "epoch": 0.562, - "grad_norm": 12.492586634408255, - "kl": 1.369140625, + "grad_norm": 2.5491679809231127, + "kl": 0.137451171875, "learning_rate": 5.311559558218603e-07, - "loss": 0.4061, - "reward": 1.8229166865348816, - "reward_std": 0.2777213752269745, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0147, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 562 }, { "clip_ratio": 0.0, - "completion_length": 522.4375, + "completion_length": 401.31251525878906, "epoch": 0.563, - "grad_norm": 17.30053081775478, - "kl": 0.65234375, + "grad_norm": 4.48586331295426, + "kl": 0.208251953125, "learning_rate": 5.295866553418358e-07, - "loss": 0.244, - "reward": 1.9270833730697632, - "reward_std": 0.25259073078632355, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, + "loss": 0.0002, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 563 }, { "clip_ratio": 0.0, - "completion_length": 581.0833435058594, + "completion_length": 372.16668701171875, "epoch": 0.564, - "grad_norm": 18.739890619835844, - "kl": 1.396484375, + "grad_norm": 4.886429592648346, + "kl": 0.556640625, "learning_rate": 5.28017603591974e-07, - "loss": 0.5502, - "reward": 1.890625, - "reward_std": 0.20460152626037598, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0217, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 564 }, { "clip_ratio": 0.0, - "completion_length": 662.9375305175781, + "completion_length": 405.7083435058594, "epoch": 0.565, - "grad_norm": 6.615120650795511, - "kl": 2.08984375, + "grad_norm": 14.789814425659378, + "kl": 0.94189453125, "learning_rate": 5.264488196906752e-07, - "loss": 0.371, - "reward": 1.5520833730697632, - "reward_std": 0.32989659160375595, - "rewards/accuracy_reward": 0.7083333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.0884, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 565 }, { "clip_ratio": 0.0, - "completion_length": 533.3125305175781, + "completion_length": 398.25001525878906, "epoch": 0.566, - "grad_norm": 4.711057784606764, - "kl": 1.279296875, + "grad_norm": 32.17134982668231, + "kl": 2.3115234375, "learning_rate": 5.248803227530763e-07, - "loss": 0.2421, - "reward": 1.9010416865348816, - "reward_std": 0.1796237677335739, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1142, + "reward": 0.9739583432674408, + "reward_std": 0.04956009238958359, + "rewards/tag_count_reward": 0.9739583432674408, "step": 566 }, { "clip_ratio": 0.0, - "completion_length": 455.5625, + "completion_length": 383.47918701171875, "epoch": 0.567, - "grad_norm": 5.466315934880269, - "kl": 0.88037109375, + "grad_norm": 12.737701967241774, + "kl": 0.9169921875, "learning_rate": 5.233121318908173e-07, - "loss": 0.1982, - "reward": 1.890625, - "reward_std": 0.2643452286720276, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.1053, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 567 }, { "clip_ratio": 0.0, - "completion_length": 404.0833435058594, + "completion_length": 358.43751525878906, "epoch": 0.568, - "grad_norm": 3.416961245398717, - "kl": 0.3779296875, + "grad_norm": 21.185906835712984, + "kl": 1.016845703125, "learning_rate": 5.21744266211809e-07, - "loss": 0.0587, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.2176, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 568 }, { "clip_ratio": 0.0, - "completion_length": 526.6666717529297, + "completion_length": 415.3333435058594, "epoch": 0.569, - "grad_norm": 14.974074981308137, - "kl": 1.50390625, + "grad_norm": 17.39490993968257, + "kl": 0.84033203125, "learning_rate": 5.2017674482e-07, - "loss": 0.5294, - "reward": 1.7708333730697632, - "reward_std": 0.31909389048814774, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0687, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 569 }, { "clip_ratio": 0.0, - "completion_length": 437.4166717529297, + "completion_length": 397.0208435058594, "epoch": 0.57, - "grad_norm": 3.7415959697333885, - "kl": 0.828125, + "grad_norm": 6.288612720147936, + "kl": 0.385498046875, "learning_rate": 5.186095868151436e-07, - "loss": 0.1659, - "reward": 1.9322916865348816, - "reward_std": 0.18871532380580902, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0489, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 570 }, { "clip_ratio": 0.0, - "completion_length": 555.7916870117188, + "completion_length": 397.31251525878906, "epoch": 0.571, - "grad_norm": 16.4083416445283, - "kl": 4.359375, + "grad_norm": 8.595110401084613, + "kl": 0.478515625, "learning_rate": 5.170428112925659e-07, - "loss": 0.6484, - "reward": 1.5520833730697632, - "reward_std": 0.5498536676168442, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833432674408, + "loss": 0.0841, + "reward": 0.9739583432674408, + "reward_std": 0.0749332383275032, + "rewards/tag_count_reward": 0.9739583432674408, "step": 571 }, { "clip_ratio": 0.0, - "completion_length": 545.8125305175781, + "completion_length": 372.62501525878906, "epoch": 0.572, - "grad_norm": 15.357929481975177, - "kl": 3.7265625, + "grad_norm": 0.5249421203742181, + "kl": 0.10546875, "learning_rate": 5.154764373429315e-07, - "loss": 0.6876, - "reward": 1.8541666865348816, - "reward_std": 0.3646731525659561, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0044, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 572 }, { "clip_ratio": 0.0, - "completion_length": 704.8541870117188, + "completion_length": 359.9583435058594, "epoch": 0.573, - "grad_norm": 51.159738087601475, - "kl": 7.390625, + "grad_norm": 18.64973350155189, + "kl": 0.6201171875, "learning_rate": 5.139104840520135e-07, - "loss": 0.5712, - "reward": 1.4947916865348816, - "reward_std": 0.38124869763851166, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583730697632, + "loss": 0.0493, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 573 }, { "clip_ratio": 0.0, - "completion_length": 406.56251525878906, + "completion_length": 372.29168701171875, "epoch": 0.574, - "grad_norm": 3.6499812578804476, - "kl": 0.80810546875, + "grad_norm": 15.238197765085344, + "kl": 1.044921875, "learning_rate": 5.123449705004581e-07, - "loss": 0.2383, - "reward": 1.984375, - "reward_std": 0.05412658676505089, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1585, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 574 }, { "clip_ratio": 0.0, - "completion_length": 481.7708435058594, + "completion_length": 395.1458435058594, "epoch": 0.575, - "grad_norm": 7.135688087622697, - "kl": 1.65625, + "grad_norm": 3.7259115162429315, + "kl": 0.2841796875, "learning_rate": 5.107799157635538e-07, - "loss": 0.4438, - "reward": 1.9218750596046448, - "reward_std": 0.23052222281694412, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0127, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 575 }, { "clip_ratio": 0.0, - "completion_length": 715.7500457763672, + "completion_length": 383.3541717529297, "epoch": 0.576, - "grad_norm": 25.249821366837935, - "kl": 6.96875, + "grad_norm": 3.6577494997491047, + "kl": 0.208984375, "learning_rate": 5.09215338910999e-07, - "loss": 0.6322, - "reward": 1.5572916865348816, - "reward_std": 0.44661253318190575, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7864583432674408, + "loss": 0.0099, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 576 }, { "clip_ratio": 0.0, - "completion_length": 577.7500305175781, + "completion_length": 409.72918701171875, "epoch": 0.577, - "grad_norm": 11.231151868828828, - "kl": 3.32421875, + "grad_norm": 3.734336816963088, + "kl": 0.22265625, "learning_rate": 5.076512590066685e-07, - "loss": 0.6265, - "reward": 1.8229167461395264, - "reward_std": 0.29137512296438217, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833730697632, + "loss": 0.0127, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 577 }, { "clip_ratio": 0.0, - "completion_length": 512.2083587646484, + "completion_length": 383.0416717529297, "epoch": 0.578, - "grad_norm": 7.3484554208276345, - "kl": 2.51171875, + "grad_norm": 5.111707915314032, + "kl": 0.239501953125, "learning_rate": 5.060876951083828e-07, - "loss": 0.3525, - "reward": 1.8541666865348816, - "reward_std": 0.2698933370411396, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.0149, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 578 }, { "clip_ratio": 0.0, - "completion_length": 600.5625, + "completion_length": 410.5, "epoch": 0.579, - "grad_norm": 22.418179731963274, - "kl": 3.5390625, + "grad_norm": 8.980476325432091, + "kl": 0.2880859375, "learning_rate": 5.045246662676741e-07, - "loss": 0.3676, - "reward": 1.8020833730697632, - "reward_std": 0.2119518145918846, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0689, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 579 }, { "clip_ratio": 0.0, - "completion_length": 498.95835876464844, + "completion_length": 391.3125, "epoch": 0.58, - "grad_norm": 7.3401079741277435, - "kl": 1.681640625, + "grad_norm": 8.677092482156407, + "kl": 0.1748046875, "learning_rate": 5.02962191529556e-07, - "loss": 0.3166, - "reward": 1.9010416865348816, - "reward_std": 0.16761544719338417, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0902, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 580 }, { "clip_ratio": 0.0, - "completion_length": 408.97918701171875, + "completion_length": 356.60418701171875, "epoch": 0.581, - "grad_norm": 9.027696575669205, - "kl": 0.6689453125, + "grad_norm": 0.7969810760307527, + "kl": 0.12744140625, "learning_rate": 5.014002899322896e-07, - "loss": 0.2138, - "reward": 1.9583333730697632, - "reward_std": 0.1259434074163437, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 581 }, { "clip_ratio": 0.0, - "completion_length": 613.6666870117188, + "completion_length": 402.62501525878906, "epoch": 0.582, - "grad_norm": 18.404182637426995, - "kl": 2.51171875, + "grad_norm": 5.255605393691853, + "kl": 0.226806640625, "learning_rate": 4.998389805071536e-07, - "loss": 0.5197, - "reward": 1.8072916865348816, - "reward_std": 0.3174736574292183, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0216, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 582 }, { "clip_ratio": 0.0, - "completion_length": 518.6250305175781, + "completion_length": 408.0, "epoch": 0.583, - "grad_norm": 11.380168419566196, - "kl": 1.390625, + "grad_norm": 4.093041562859633, + "kl": 0.19970703125, "learning_rate": 4.982782822782101e-07, - "loss": 0.2449, - "reward": 1.7135416865348816, - "reward_std": 0.35906873643398285, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0602, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 583 }, { "clip_ratio": 0.0, - "completion_length": 389.7916717529297, + "completion_length": 395.4791717529297, "epoch": 0.584, - "grad_norm": 1.2405382317366134, - "kl": 0.171875, + "grad_norm": 11.262303957873653, + "kl": 0.150390625, "learning_rate": 4.967182142620745e-07, - "loss": 0.0075, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1275, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 584 }, { "clip_ratio": 0.0, - "completion_length": 568.0625305175781, + "completion_length": 410.68751525878906, "epoch": 0.585, - "grad_norm": 7.532605419678999, - "kl": 2.00390625, + "grad_norm": 2.747042042599716, + "kl": 0.076904296875, "learning_rate": 4.951587954676837e-07, - "loss": 0.4504, - "reward": 1.8072917461395264, - "reward_std": 0.2588347792625427, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0212, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 585 }, { "clip_ratio": 0.0, - "completion_length": 418.7708435058594, + "completion_length": 407.6458435058594, "epoch": 0.586, - "grad_norm": 6.109506418281163, - "kl": 0.935546875, + "grad_norm": 3.7904544650583105, + "kl": 0.2137451171875, "learning_rate": 4.93600044896063e-07, - "loss": 0.0927, - "reward": 1.8489583730697632, - "reward_std": 0.17645524814724922, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583730697632, + "loss": 0.0532, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 586 }, { "clip_ratio": 0.0, - "completion_length": 533.9375152587891, + "completion_length": 379.5416717529297, "epoch": 0.587, - "grad_norm": 6.494514320843762, - "kl": 2.20703125, + "grad_norm": 5.346241073417882, + "kl": 0.27197265625, "learning_rate": 4.920419815400968e-07, - "loss": 0.2095, - "reward": 1.7812500596046448, - "reward_std": 0.28840160369873047, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.0151, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 587 }, { "clip_ratio": 0.0, - "completion_length": 458.4583435058594, + "completion_length": 368.1458435058594, "epoch": 0.588, - "grad_norm": 8.207630176695815, - "kl": 1.296875, + "grad_norm": 0.3159785441102848, + "kl": 0.067626953125, "learning_rate": 4.904846243842949e-07, - "loss": 0.3513, - "reward": 1.9114583730697632, - "reward_std": 0.15857338160276413, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0029, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 588 }, { "clip_ratio": 0.0, - "completion_length": 541.6041870117188, + "completion_length": 419.62501525878906, "epoch": 0.589, - "grad_norm": 7.382823799599969, - "kl": 1.5478515625, + "grad_norm": 46.96028559035001, + "kl": 0.9996337890625, "learning_rate": 4.88927992404563e-07, - "loss": 0.2113, - "reward": 1.8958333730697632, - "reward_std": 0.10770505666732788, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.0576, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 589 }, { "clip_ratio": 0.0, - "completion_length": 483.3333435058594, + "completion_length": 397.50001525878906, "epoch": 0.59, - "grad_norm": 4.142851095861234, - "kl": 0.78466796875, + "grad_norm": 9.089838613001081, + "kl": 0.529296875, "learning_rate": 4.873721045679706e-07, - "loss": 0.2612, - "reward": 1.8802083730697632, - "reward_std": 0.24040424823760986, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.1543, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 590 }, { "clip_ratio": 0.0, - "completion_length": 514.0416717529297, + "completion_length": 415.04168701171875, "epoch": 0.591, - "grad_norm": 8.709750803577402, - "kl": 1.0263671875, + "grad_norm": 0.7738897102064202, + "kl": 0.076904296875, "learning_rate": 4.858169798325198e-07, - "loss": 0.3315, - "reward": 1.90625, - "reward_std": 0.21704530715942383, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 591 }, { "clip_ratio": 0.0, - "completion_length": 400.4583435058594, + "completion_length": 363.68751525878906, "epoch": 0.592, - "grad_norm": 4.389997299168009, - "kl": 0.35302734375, + "grad_norm": 0.25596770540476016, + "kl": 0.07470703125, "learning_rate": 4.842626371469149e-07, - "loss": 0.0513, - "reward": 1.9322916865348816, - "reward_std": 0.18752333521842957, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 592 }, { "clip_ratio": 0.0, - "completion_length": 506.04168701171875, + "completion_length": 417.7083435058594, "epoch": 0.593, - "grad_norm": 7.023673404275894, - "kl": 0.767578125, + "grad_norm": 3.0302408863154127, + "kl": 0.30126953125, "learning_rate": 4.827090954503308e-07, - "loss": 0.2847, - "reward": 1.9375000596046448, - "reward_std": 0.13520457595586777, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0114, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 593 }, { "clip_ratio": 0.0, - "completion_length": 596.0208740234375, + "completion_length": 453.8333435058594, "epoch": 0.594, - "grad_norm": 12.067906565975324, - "kl": 1.79296875, + "grad_norm": 4.801912575140599, + "kl": 0.26025390625, "learning_rate": 4.811563736721829e-07, - "loss": 0.3998, - "reward": 1.6406250596046448, - "reward_std": 0.3280720040202141, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0579, + "reward": 0.9739583432674408, + "reward_std": 0.07845468074083328, + "rewards/tag_count_reward": 0.9739583432674408, "step": 594 }, { "clip_ratio": 0.0, - "completion_length": 681.8541870117188, + "completion_length": 461.93751525878906, "epoch": 0.595, - "grad_norm": 5.654211217145191, - "kl": 2.6875, + "grad_norm": 7.675974503977892, + "kl": 0.2900390625, "learning_rate": 4.79604490731896e-07, - "loss": 0.4924, - "reward": 1.5468750596046448, - "reward_std": 0.44048693776130676, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.0444, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 595 }, { "clip_ratio": 0.0, - "completion_length": 394.3958435058594, + "completion_length": 378.12501525878906, "epoch": 0.596, - "grad_norm": 3.0645040373754626, - "kl": 0.43701171875, + "grad_norm": 0.15133686111386602, + "kl": 0.0579833984375, "learning_rate": 4.780534655386743e-07, - "loss": 0.0233, - "reward": 2.0, + "loss": 0.0024, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 596 }, { "clip_ratio": 0.0, - "completion_length": 456.7083435058594, + "completion_length": 429.2083435058594, "epoch": 0.597, - "grad_norm": 4.342970828781974, - "kl": 0.6796875, + "grad_norm": 3.795833682022691, + "kl": 0.302734375, "learning_rate": 4.7650331699127013e-07, - "loss": 0.0481, - "reward": 1.7916666865348816, - "reward_std": 0.19728264957666397, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.1003, + "reward": 0.9791666865348816, + "reward_std": 0.04865618795156479, + "rewards/tag_count_reward": 0.9791666865348816, "step": 597 }, { "clip_ratio": 0.0, - "completion_length": 427.93751525878906, + "completion_length": 367.625, "epoch": 0.598, - "grad_norm": 5.861042865096573, - "kl": 0.978515625, + "grad_norm": 3.9422683198568147, + "kl": 0.3291015625, "learning_rate": 4.749540639777539e-07, - "loss": 0.1564, - "reward": 1.75, - "reward_std": 0.05330018326640129, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.043, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 598 }, { "clip_ratio": 0.0, - "completion_length": 451.4583435058594, + "completion_length": 438.93751525878906, "epoch": 0.599, - "grad_norm": 5.3241744582648165, - "kl": 1.025390625, + "grad_norm": 2.2094625444727685, + "kl": 0.19482421875, "learning_rate": 4.7340572537528547e-07, - "loss": 0.1261, - "reward": 1.8177083730697632, - "reward_std": 0.23352795839309692, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0083, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 599 }, { "clip_ratio": 0.0, - "completion_length": 523.8333587646484, + "completion_length": 390.87501525878906, "epoch": 0.6, - "grad_norm": 26.685709282532063, - "kl": 0.875, + "grad_norm": 0.9972729819698383, + "kl": 0.120849609375, "learning_rate": 4.7185832004988133e-07, - "loss": 0.4357, - "reward": 1.921875, - "reward_std": 0.25320322811603546, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0059, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 600 }, { "clip_ratio": 0.0, - "completion_length": 375.91668701171875, + "completion_length": 360.4791717529297, "epoch": 0.601, - "grad_norm": 0.5783329522424004, - "kl": 0.14208984375, + "grad_norm": 1.4942812379211767, + "kl": 0.1241455078125, "learning_rate": 4.703118668561875e-07, - "loss": 0.006, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": -0.009, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 601 }, { "clip_ratio": 0.0, - "completion_length": 531.5000305175781, + "completion_length": 461.2083435058594, "epoch": 0.602, - "grad_norm": 7.603025544671101, - "kl": 1.03125, + "grad_norm": 0.1762287472453783, + "kl": 0.062744140625, "learning_rate": 4.68766384637248e-07, - "loss": 0.3475, - "reward": 1.9114583730697632, - "reward_std": 0.2396276444196701, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 602 }, { "clip_ratio": 0.0, - "completion_length": 474.87501525878906, + "completion_length": 387.6875, "epoch": 0.603, - "grad_norm": 5.445187278408046, - "kl": 0.93115234375, + "grad_norm": 2.6832838261921457, + "kl": 0.10107421875, "learning_rate": 4.672218922242759e-07, - "loss": 0.2845, - "reward": 1.7864583730697632, - "reward_std": 0.2633417621254921, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0191, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 603 }, { "clip_ratio": 0.0, - "completion_length": 521.0416870117188, + "completion_length": 433.1875, "epoch": 0.604, - "grad_norm": 4.539188426316394, - "kl": 1.61669921875, + "grad_norm": 3.2447151229056708, + "kl": 0.21142578125, "learning_rate": 4.656784084364238e-07, - "loss": 0.2783, - "reward": 1.8697916865348816, - "reward_std": 0.17568521201610565, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": -0.0232, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 604 }, { "clip_ratio": 0.0, - "completion_length": 499.52085876464844, + "completion_length": 389.87501525878906, "epoch": 0.605, - "grad_norm": 6.972452993105158, - "kl": 1.474609375, + "grad_norm": 0.13605051880246277, + "kl": 0.0572509765625, "learning_rate": 4.641359520805548e-07, - "loss": 0.4104, - "reward": 1.8750000596046448, - "reward_std": 0.35279126465320587, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0023, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 605 }, { "clip_ratio": 0.0, - "completion_length": 453.3333435058594, + "completion_length": 392.25001525878906, "epoch": 0.606, - "grad_norm": 8.036077022422743, - "kl": 0.9755859375, + "grad_norm": 5.4369335865443364, + "kl": 0.132080078125, "learning_rate": 4.6259454195101267e-07, - "loss": 0.0702, - "reward": 1.8645833730697632, - "reward_std": 0.18782321363687515, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0267, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 606 }, { "clip_ratio": 0.0, - "completion_length": 489.66668701171875, + "completion_length": 418.87501525878906, "epoch": 0.607, - "grad_norm": 15.217177254721056, - "kl": 1.17822265625, + "grad_norm": 3.092282569169051, + "kl": 0.146484375, "learning_rate": 4.6105419682939316e-07, - "loss": 0.4674, - "reward": 1.953125, - "reward_std": 0.13443158566951752, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0541, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 607 }, { "clip_ratio": 0.0, - "completion_length": 424.3541717529297, + "completion_length": 423.8958435058594, "epoch": 0.608, - "grad_norm": 5.678800343928725, - "kl": 1.06640625, + "grad_norm": 0.24882707008371582, + "kl": 0.0673828125, "learning_rate": 4.59514935484316e-07, - "loss": 0.1803, - "reward": 1.8958333730697632, - "reward_std": 0.2054632604122162, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0028, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 608 }, { "clip_ratio": 0.0, - "completion_length": 556.3541717529297, + "completion_length": 491.43751525878906, "epoch": 0.609, - "grad_norm": 5.2604130264685365, - "kl": 2.39453125, + "grad_norm": 6.451615477059651, + "kl": 0.456298828125, "learning_rate": 4.579767766711944e-07, - "loss": 0.4194, - "reward": 1.7760416865348816, - "reward_std": 0.376691535115242, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.1002, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 609 }, { "clip_ratio": 0.0, - "completion_length": 553.3541870117188, + "completion_length": 403.50001525878906, "epoch": 0.61, - "grad_norm": 25.4539577589348, - "kl": 3.0, + "grad_norm": 0.3635257731764919, + "kl": 0.0567626953125, "learning_rate": 4.5643973913200837e-07, - "loss": 0.3118, - "reward": 1.7083333730697632, - "reward_std": 0.2848246470093727, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0022, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 610 }, { "clip_ratio": 0.0, - "completion_length": 615.4375152587891, + "completion_length": 409.0416717529297, "epoch": 0.611, - "grad_norm": 11.210844254396868, - "kl": 3.03125, + "grad_norm": 12.337782777253606, + "kl": 0.49267578125, "learning_rate": 4.549038415950751e-07, - "loss": 0.5337, - "reward": 1.8072916865348816, - "reward_std": 0.3188003897666931, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.0244, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 611 }, { "clip_ratio": 0.0, - "completion_length": 384.75001525878906, + "completion_length": 408.3333435058594, "epoch": 0.612, - "grad_norm": 4.526455521728452, - "kl": 0.7001953125, + "grad_norm": 0.7775751820487108, + "kl": 0.0877685546875, "learning_rate": 4.5336910277482155e-07, - "loss": 0.0455, - "reward": 1.9270833730697632, - "reward_std": 0.10934117436408997, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 612 }, { "clip_ratio": 0.0, - "completion_length": 628.1458435058594, + "completion_length": 437.9375, "epoch": 0.613, - "grad_norm": 13.619403092991371, - "kl": 3.91796875, + "grad_norm": 1.510417692801011, + "kl": 0.114013671875, "learning_rate": 4.51835541371556e-07, - "loss": 0.4659, - "reward": 1.515625, - "reward_std": 0.47586315125226974, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0048, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 613 }, { "clip_ratio": 0.0, - "completion_length": 717.6458435058594, + "completion_length": 440.04168701171875, "epoch": 0.614, - "grad_norm": 18.047276720524565, - "kl": 4.2421875, + "grad_norm": 8.22785879406297, + "kl": 0.276611328125, "learning_rate": 4.503031760712397e-07, - "loss": 0.4726, - "reward": 1.6354167461395264, - "reward_std": 0.42935848236083984, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.027, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 614 }, { "clip_ratio": 0.0, - "completion_length": 576.6250305175781, + "completion_length": 411.4583435058594, "epoch": 0.615, - "grad_norm": 13.843581211604308, - "kl": 2.75, + "grad_norm": 0.650664392596077, + "kl": 0.07958984375, "learning_rate": 4.4877202554526084e-07, - "loss": 0.5854, - "reward": 1.6354166865348816, - "reward_std": 0.33045031130313873, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 615 }, { "clip_ratio": 0.0, - "completion_length": 384.2916717529297, + "completion_length": 401.1458435058594, "epoch": 0.616, - "grad_norm": 0.7016241996723709, - "kl": 0.1435546875, + "grad_norm": 13.66748173578044, + "kl": 0.49755859375, "learning_rate": 4.4724210845020494e-07, - "loss": 0.0059, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0807, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 616 }, { "clip_ratio": 0.0, - "completion_length": 666.5833435058594, + "completion_length": 402.9583435058594, "epoch": 0.617, - "grad_norm": 13.154950410277756, - "kl": 2.28515625, + "grad_norm": 2.654318753149108, + "kl": 0.2138671875, "learning_rate": 4.457134434276293e-07, - "loss": 0.2011, - "reward": 1.5885417461395264, - "reward_std": 0.24745624512434006, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.0104, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 617 }, { "clip_ratio": 0.0, - "completion_length": 461.9791717529297, + "completion_length": 427.2083435058594, "epoch": 0.618, - "grad_norm": 32.31466025675279, - "kl": 0.681640625, + "grad_norm": 2.214910708774484, + "kl": 0.2333984375, "learning_rate": 4.441860491038345e-07, - "loss": 0.4226, - "reward": 1.9479166865348816, - "reward_std": 0.18042195588350296, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0099, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 618 }, { "clip_ratio": 0.0, - "completion_length": 586.5416870117188, + "completion_length": 470.1458435058594, "epoch": 0.619, - "grad_norm": 5.3624025568550735, - "kl": 1.3125, + "grad_norm": 1.0104438176626016, + "kl": 0.1396484375, "learning_rate": 4.4265994408963867e-07, - "loss": 0.184, - "reward": 1.7291666865348816, - "reward_std": 0.3180026561021805, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0066, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 619 }, { "clip_ratio": 0.0, - "completion_length": 485.25, + "completion_length": 422.62501525878906, "epoch": 0.62, - "grad_norm": 5.583378444676486, - "kl": 0.5654296875, + "grad_norm": 0.7722857506341341, + "kl": 0.1124267578125, "learning_rate": 4.4113514698014953e-07, - "loss": 0.1404, - "reward": 1.8958333730697632, - "reward_std": 0.21894602477550507, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0045, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 620 }, { "clip_ratio": 0.0, - "completion_length": 624.5000305175781, + "completion_length": 459.2291717529297, "epoch": 0.621, - "grad_norm": 9.888865631896893, - "kl": 1.3515625, + "grad_norm": 0.199694627883351, + "kl": 0.0582275390625, "learning_rate": 4.3961167635453876e-07, - "loss": 0.301, - "reward": 1.5520833730697632, - "reward_std": 0.36164598166942596, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 621 }, { "clip_ratio": 0.0, - "completion_length": 563.3750152587891, + "completion_length": 441.2291717529297, "epoch": 0.622, - "grad_norm": 3.861304780440373, - "kl": 1.06689453125, + "grad_norm": 2.593170125654589, + "kl": 0.0628662109375, "learning_rate": 4.3808955077581546e-07, - "loss": 0.1299, - "reward": 1.890625, - "reward_std": 0.11535456031560898, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": -0.0074, + "reward": 0.9843750298023224, + "reward_std": 0.04237028583884239, + "rewards/tag_count_reward": 0.9843750298023224, "step": 622 }, { "clip_ratio": 0.0, - "completion_length": 407.8125, + "completion_length": 421.2291717529297, "epoch": 0.623, - "grad_norm": 2.7524061074059083, - "kl": 0.37158203125, + "grad_norm": 0.28650814113330986, + "kl": 0.0794677734375, "learning_rate": 4.365687887905988e-07, - "loss": 0.0106, - "reward": 1.953125, - "reward_std": 0.08050356805324554, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 623 }, { "clip_ratio": 0.0, - "completion_length": 596.2291870117188, + "completion_length": 429.41668701171875, "epoch": 0.624, - "grad_norm": 15.237602585474976, - "kl": 0.95703125, + "grad_norm": 0.15751595560674495, + "kl": 0.0628662109375, "learning_rate": 4.350494089288943e-07, - "loss": 0.4758, - "reward": 1.8072916865348816, - "reward_std": 0.3118917793035507, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 624 }, { "clip_ratio": 0.0, - "completion_length": 482.97918701171875, + "completion_length": 429.2708435058594, "epoch": 0.625, - "grad_norm": 6.918258531922286, - "kl": 0.609619140625, + "grad_norm": 2.4315463556233707, + "kl": 0.0986328125, "learning_rate": 4.3353142970386557e-07, - "loss": 0.2013, - "reward": 1.609375, - "reward_std": 0.21576672792434692, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": -0.0122, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 625 }, { "clip_ratio": 0.0, - "completion_length": 375.4583435058594, + "completion_length": 445.0, "epoch": 0.626, - "grad_norm": 2.4451266925953905, - "kl": 0.112060546875, + "grad_norm": 3.0278010700726425, + "kl": 0.112548828125, "learning_rate": 4.3201486961161093e-07, - "loss": 0.0122, - "reward": 1.9687500596046448, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0305, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 626 }, { "clip_ratio": 0.0, - "completion_length": 463.31251525878906, + "completion_length": 484.6458435058594, "epoch": 0.627, - "grad_norm": 2.3309225573294947, - "kl": 0.599609375, + "grad_norm": 4.435038587352279, + "kl": 0.173828125, "learning_rate": 4.304997471309361e-07, - "loss": 0.0702, - "reward": 1.9322916865348816, - "reward_std": 0.1290765255689621, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0223, + "reward": 0.9635416865348816, + "reward_std": 0.1110176332294941, + "rewards/tag_count_reward": 0.9635416865348816, "step": 627 }, { "clip_ratio": 0.0, - "completion_length": 432.3333435058594, + "completion_length": 429.66668701171875, "epoch": 0.628, - "grad_norm": 5.713117958811042, - "kl": 0.4013671875, + "grad_norm": 1.6392048454495811, + "kl": 0.093017578125, "learning_rate": 4.2898608072313045e-07, - "loss": 0.1809, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0047, + "reward": 0.984375, + "reward_std": 0.03884884715080261, "rewards/tag_count_reward": 0.984375, "step": 628 }, { "clip_ratio": 0.0, - "completion_length": 378.93751525878906, + "completion_length": 447.2083435058594, "epoch": 0.629, - "grad_norm": 6.085786192044784, - "kl": 0.26708984375, + "grad_norm": 5.466422979992953, + "kl": 0.19580078125, "learning_rate": 4.2747388883174154e-07, - "loss": 0.0442, - "reward": 1.9479166865348816, - "reward_std": 0.18042196705937386, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833730697632, + "loss": 0.0774, + "reward": 0.984375, + "reward_std": 0.04237028583884239, + "rewards/tag_count_reward": 0.984375, "step": 629 }, { "clip_ratio": 0.0, - "completion_length": 658.5416870117188, + "completion_length": 435.5833435058594, "epoch": 0.63, - "grad_norm": 15.319335321641084, - "kl": 2.26953125, + "grad_norm": 2.9526648613508013, + "kl": 0.156982421875, "learning_rate": 4.2596318988235037e-07, - "loss": 0.5449, - "reward": 1.6302083730697632, - "reward_std": 0.5507534742355347, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.859375, + "loss": 0.036, + "reward": 0.9791666865348816, + "reward_std": 0.046308884397149086, + "rewards/tag_count_reward": 0.9791666865348816, "step": 630 }, { "clip_ratio": 0.0, - "completion_length": 380.9583435058594, + "completion_length": 373.5416717529297, "epoch": 0.631, - "grad_norm": 3.2393705594494757, - "kl": 0.3671875, + "grad_norm": 2.932328755604553, + "kl": 0.33056640625, "learning_rate": 4.2445400228234687e-07, - "loss": 0.0381, - "reward": 1.7604167461395264, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, + "loss": 0.0355, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, "rewards/tag_count_reward": 0.9895833432674408, "step": 631 }, { "clip_ratio": 0.0, - "completion_length": 446.47918701171875, + "completion_length": 416.7708435058594, "epoch": 0.632, - "grad_norm": 18.109523778106656, - "kl": 0.7646484375, + "grad_norm": 0.34680124590011835, + "kl": 0.086181640625, "learning_rate": 4.2294634442070553e-07, - "loss": 0.2823, - "reward": 1.90625, - "reward_std": 0.265943706035614, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 632 }, { "clip_ratio": 0.0, - "completion_length": 582.2291870117188, + "completion_length": 479.60418701171875, "epoch": 0.633, - "grad_norm": 7.2865340716766465, - "kl": 1.962890625, + "grad_norm": 0.40169847681205495, + "kl": 0.104736328125, "learning_rate": 4.214402346677619e-07, - "loss": 0.2803, - "reward": 1.796875, - "reward_std": 0.22159092128276825, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0042, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 633 }, { "clip_ratio": 0.0, - "completion_length": 453.8958435058594, + "completion_length": 449.6666717529297, "epoch": 0.634, - "grad_norm": 10.88110433146186, - "kl": 1.3759765625, + "grad_norm": 1.8717955177940753, + "kl": 0.1416015625, "learning_rate": 4.1993569137498776e-07, - "loss": 0.1843, - "reward": 1.90625, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0287, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 634 }, { "clip_ratio": 0.0, - "completion_length": 386.8125, + "completion_length": 398.125, "epoch": 0.635, - "grad_norm": 5.897141006047558, - "kl": 0.68359375, + "grad_norm": 0.14764771310944494, + "kl": 0.0518798828125, "learning_rate": 4.1843273287476854e-07, - "loss": 0.1425, - "reward": 1.9479166865348816, - "reward_std": 0.18042195588350296, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0022, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 635 }, { "clip_ratio": 0.0, - "completion_length": 483.62501525878906, + "completion_length": 445.6875, "epoch": 0.636, - "grad_norm": 13.772772093644015, - "kl": 1.904296875, + "grad_norm": 1.8274048455196004, + "kl": 0.2001953125, "learning_rate": 4.1693137748017915e-07, - "loss": 0.1288, - "reward": 1.8333333730697632, - "reward_std": 0.183121956884861, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0041, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 636 }, { "clip_ratio": 0.0, - "completion_length": 509.1041717529297, + "completion_length": 435.9166717529297, "epoch": 0.637, - "grad_norm": 10.70019037290711, - "kl": 3.07421875, + "grad_norm": 0.2659317168517705, + "kl": 0.08740234375, "learning_rate": 4.15431643484761e-07, - "loss": 0.4424, - "reward": 1.6406250596046448, - "reward_std": 0.37014535069465637, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 637 }, { "clip_ratio": 0.0, - "completion_length": 515.9375152587891, + "completion_length": 440.4166717529297, "epoch": 0.638, - "grad_norm": 46.028322896982, - "kl": 4.0234375, + "grad_norm": 3.071094295279427, + "kl": 0.24169921875, "learning_rate": 4.1393354916230005e-07, - "loss": 0.6939, - "reward": 1.71875, - "reward_std": 0.3373633921146393, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0487, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 638 }, { "clip_ratio": 0.0, - "completion_length": 468.2708435058594, + "completion_length": 418.4583435058594, "epoch": 0.639, - "grad_norm": 15.411064783416274, - "kl": 1.35205078125, + "grad_norm": 7.3329075619624415, + "kl": 0.33447265625, "learning_rate": 4.124371127666024e-07, - "loss": 0.1836, - "reward": 1.8489583730697632, - "reward_std": 0.1175578162074089, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0151, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 639 }, { "clip_ratio": 0.0, - "completion_length": 585.9791870117188, + "completion_length": 514.0625305175781, "epoch": 0.64, - "grad_norm": 16.856454357927625, - "kl": 2.609375, + "grad_norm": 3.3242478064876124, + "kl": 0.24560546875, "learning_rate": 4.1094235253127374e-07, - "loss": 0.3657, - "reward": 1.7447916865348816, - "reward_std": 0.3786095231771469, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0392, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 640 }, { "clip_ratio": 0.0, - "completion_length": 507.0416717529297, + "completion_length": 454.9583435058594, "epoch": 0.641, - "grad_norm": 16.6668571347689, - "kl": 2.15625, + "grad_norm": 3.1850444453646687, + "kl": 0.20361328125, "learning_rate": 4.0944928666949527e-07, - "loss": 0.5477, - "reward": 1.828125, - "reward_std": 0.3752746507525444, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0431, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 641 }, { "clip_ratio": 0.0, - "completion_length": 388.3333435058594, + "completion_length": 436.81251525878906, "epoch": 0.642, - "grad_norm": 7.860208030202798, - "kl": 0.74609375, + "grad_norm": 4.125701151939386, + "kl": 0.35693359375, "learning_rate": 4.079579333738039e-07, - "loss": 0.0989, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1334, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 642 }, { "clip_ratio": 0.0, - "completion_length": 514.3541870117188, + "completion_length": 460.1041717529297, "epoch": 0.643, - "grad_norm": 8.288782834178, - "kl": 1.77490234375, + "grad_norm": 2.7286225905496653, + "kl": 0.12744140625, "learning_rate": 4.064683108158685e-07, - "loss": 0.2371, - "reward": 1.9270833730697632, - "reward_std": 0.18150995671749115, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0701, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 643 }, { "clip_ratio": 0.0, - "completion_length": 622.6875152587891, + "completion_length": 412.87501525878906, "epoch": 0.644, - "grad_norm": 15.999120662762444, - "kl": 3.26953125, + "grad_norm": 10.135554352971921, + "kl": 1.005859375, "learning_rate": 4.0498043714627006e-07, - "loss": 0.5493, - "reward": 1.8177083730697632, - "reward_std": 0.33464595675468445, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.038, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 644 }, { "clip_ratio": 0.0, - "completion_length": 635.4791870117188, + "completion_length": 386.5, "epoch": 0.645, - "grad_norm": 16.396655318849227, - "kl": 3.6875, + "grad_norm": 1.117186995184922, + "kl": 0.076904296875, "learning_rate": 4.034943304942796e-07, - "loss": 0.2983, - "reward": 1.6666666865348816, - "reward_std": 0.22979877889156342, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8541666865348816, + "loss": -0.0208, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 645 }, { "clip_ratio": 0.0, - "completion_length": 484.3333435058594, + "completion_length": 417.50001525878906, "epoch": 0.646, - "grad_norm": 15.35826943411963, - "kl": 1.33984375, + "grad_norm": 1.9492601392707025, + "kl": 0.16943359375, "learning_rate": 4.020100089676376e-07, - "loss": 0.3568, - "reward": 1.9270833730697632, - "reward_std": 0.25259073078632355, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0064, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 646 }, { "clip_ratio": 0.0, - "completion_length": 509.29168701171875, + "completion_length": 440.2708435058594, "epoch": 0.647, - "grad_norm": 7.776151948305263, - "kl": 1.4443359375, + "grad_norm": 0.21179588270881652, + "kl": 0.05908203125, "learning_rate": 4.005274906523336e-07, - "loss": 0.2073, - "reward": 1.9270833730697632, - "reward_std": 0.1995142102241516, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 647 }, { "clip_ratio": 0.0, - "completion_length": 689.3541870117188, + "completion_length": 454.2708435058594, "epoch": 0.648, - "grad_norm": 16.322074117524977, - "kl": 3.8203125, + "grad_norm": 12.073619542101616, + "kl": 0.352294921875, "learning_rate": 3.9904679361238526e-07, - "loss": 0.7114, - "reward": 1.5729167461395264, - "reward_std": 0.508946880698204, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8437500298023224, + "loss": 0.017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 648 }, { "clip_ratio": 0.0, - "completion_length": 425.7083435058594, + "completion_length": 397.60418701171875, "epoch": 0.649, - "grad_norm": 11.227285581188948, - "kl": 0.84765625, + "grad_norm": 1.2740144063670524, + "kl": 0.140625, "learning_rate": 3.975679358896189e-07, - "loss": 0.1605, - "reward": 1.7291666865348816, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0061, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 649 }, { "clip_ratio": 0.0, - "completion_length": 463.7708435058594, + "completion_length": 430.1666717529297, "epoch": 0.65, - "grad_norm": 10.093562627402227, - "kl": 1.1953125, + "grad_norm": 3.3652514648254273, + "kl": 0.37109375, "learning_rate": 3.9609093550344907e-07, - "loss": 0.2881, - "reward": 1.9531250596046448, - "reward_std": 0.12172887474298477, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.0209, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 650 }, { "clip_ratio": 0.0, - "completion_length": 342.7083435058594, + "completion_length": 371.1666717529297, "epoch": 0.651, - "grad_norm": 2.2555843861653404, - "kl": 0.1103515625, + "grad_norm": 6.504464272319793, + "kl": 0.677734375, "learning_rate": 3.946158104506594e-07, - "loss": 0.024, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": -0.0297, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 651 }, { "clip_ratio": 0.0, - "completion_length": 465.0416717529297, + "completion_length": 410.3958435058594, "epoch": 0.652, - "grad_norm": 12.706103031204714, - "kl": 1.41796875, + "grad_norm": 13.816051573602309, + "kl": 0.38525390625, "learning_rate": 3.931425787051832e-07, - "loss": 0.1063, - "reward": 1.6093750596046448, - "reward_std": 0.2300998941063881, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0443, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 652 }, { "clip_ratio": 0.0, - "completion_length": 481.7083435058594, + "completion_length": 434.5833435058594, "epoch": 0.653, - "grad_norm": 9.588244758972122, - "kl": 1.18310546875, + "grad_norm": 13.428180365575258, + "kl": 1.0625, "learning_rate": 3.9167125821788416e-07, - "loss": 0.3146, - "reward": 1.90625, - "reward_std": 0.21309566497802734, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.1626, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 653 }, { "clip_ratio": 0.0, - "completion_length": 421.2291717529297, + "completion_length": 444.7708435058594, "epoch": 0.654, - "grad_norm": 6.787145845687959, - "kl": 0.91845703125, + "grad_norm": 3.7340291643285846, + "kl": 0.38525390625, "learning_rate": 3.902018669163384e-07, - "loss": 0.074, - "reward": 1.9010416865348816, - "reward_std": 0.22076396644115448, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0162, + "reward": 0.9739583432674408, + "reward_std": 0.0749332383275032, + "rewards/tag_count_reward": 0.9739583432674408, "step": 654 }, { "clip_ratio": 0.0, - "completion_length": 451.2916717529297, + "completion_length": 458.60418701171875, "epoch": 0.655, - "grad_norm": 17.444888520800696, - "kl": 0.966796875, + "grad_norm": 0.1697259953337305, + "kl": 0.04541015625, "learning_rate": 3.8873442270461485e-07, - "loss": 0.3744, - "reward": 1.9010417461395264, - "reward_std": 0.3428017199039459, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 655 }, { "clip_ratio": 0.0, - "completion_length": 366.29168701171875, + "completion_length": 389.7708435058594, "epoch": 0.656, - "grad_norm": 3.2018175898420496, - "kl": 0.50634765625, + "grad_norm": 0.8838722155559678, + "kl": 0.07080078125, "learning_rate": 3.872689434630585e-07, - "loss": 0.0742, - "reward": 1.796875, - "reward_std": 0.1281561702489853, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0029, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 656 }, { "clip_ratio": 0.0, - "completion_length": 559.3958435058594, + "completion_length": 452.8958435058594, "epoch": 0.657, - "grad_norm": 8.750666974164382, - "kl": 2.123046875, + "grad_norm": 2.6003850864220164, + "kl": 0.170166015625, "learning_rate": 3.8580544704807117e-07, - "loss": 0.461, - "reward": 1.7447916865348816, - "reward_std": 0.36910194158554077, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0084, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 657 }, { "clip_ratio": 0.0, - "completion_length": 523.5833435058594, + "completion_length": 448.31251525878906, "epoch": 0.658, - "grad_norm": 9.47884007985339, - "kl": 1.66796875, + "grad_norm": 1.4956454431121218, + "kl": 0.13623046875, "learning_rate": 3.843439512918949e-07, - "loss": 0.2826, - "reward": 1.8437500596046448, - "reward_std": 0.23587782680988312, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 658 }, { "clip_ratio": 0.0, - "completion_length": 501.5208435058594, + "completion_length": 488.9583435058594, "epoch": 0.659, - "grad_norm": 11.746202787090173, - "kl": 1.7626953125, + "grad_norm": 2.286765300071012, + "kl": 0.162109375, "learning_rate": 3.8288447400239443e-07, - "loss": 0.2517, - "reward": 1.78125, - "reward_std": 0.11935241520404816, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0211, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 659 }, { "clip_ratio": 0.0, - "completion_length": 525.6250152587891, + "completion_length": 408.0833435058594, "epoch": 0.66, - "grad_norm": 10.034141199711346, - "kl": 2.19921875, + "grad_norm": 0.41200222233589673, + "kl": 0.08740234375, "learning_rate": 3.8142703296283953e-07, - "loss": 0.4662, - "reward": 1.390625, - "reward_std": 0.23617514222860336, - "rewards/accuracy_reward": 0.5, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 660 }, { "clip_ratio": 0.0, - "completion_length": 495.6875305175781, + "completion_length": 492.5833435058594, "epoch": 0.661, - "grad_norm": 4.161344052398567, - "kl": 1.3515625, + "grad_norm": 3.0613933390855377, + "kl": 0.0755615234375, "learning_rate": 3.7997164593168983e-07, - "loss": 0.2163, - "reward": 1.8489583730697632, - "reward_std": 0.23822123557329178, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0422, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 661 }, { "clip_ratio": 0.0, - "completion_length": 427.1041717529297, + "completion_length": 451.00001525878906, "epoch": 0.662, - "grad_norm": 10.265206117027367, - "kl": 0.82421875, + "grad_norm": 0.4471118658004672, + "kl": 0.095458984375, "learning_rate": 3.785183306423767e-07, - "loss": 0.269, - "reward": 1.9479166865348816, - "reward_std": 0.16186628118157387, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 662 }, { "clip_ratio": 0.0, - "completion_length": 514.8125152587891, + "completion_length": 437.5416717529297, "epoch": 0.663, - "grad_norm": 4.778160522745052, - "kl": 1.48828125, + "grad_norm": 4.6981177319338325, + "kl": 0.140625, "learning_rate": 3.7706710480308835e-07, - "loss": 0.2057, - "reward": 1.6927083730697632, - "reward_std": 0.27833671122789383, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0501, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 663 }, { "clip_ratio": 0.0, - "completion_length": 417.1666717529297, + "completion_length": 434.29168701171875, "epoch": 0.664, - "grad_norm": 6.6464583664250805, - "kl": 1.2197265625, + "grad_norm": 3.2404016855128974, + "kl": 0.182861328125, "learning_rate": 3.7561798609655373e-07, - "loss": 0.133, - "reward": 1.7864583730697632, - "reward_std": 0.21303357928991318, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.055, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 664 }, { "clip_ratio": 0.0, - "completion_length": 403.81251525878906, + "completion_length": 383.31251525878906, "epoch": 0.665, - "grad_norm": 7.811845482970346, - "kl": 0.734375, + "grad_norm": 0.42080049592277263, + "kl": 0.08251953125, "learning_rate": 3.7417099217982686e-07, - "loss": 0.054, - "reward": 1.8072916865348816, - "reward_std": 0.09776745736598969, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0036, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 665 }, { "clip_ratio": 0.0, - "completion_length": 432.0833435058594, + "completion_length": 446.3541717529297, "epoch": 0.666, - "grad_norm": 8.758211062001669, - "kl": 2.0185546875, + "grad_norm": 1.2318748164342126, + "kl": 0.15087890625, "learning_rate": 3.72726140684072e-07, - "loss": 0.1164, - "reward": 1.5312500596046448, - "reward_std": 0.3305683881044388, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0063, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 666 }, { "clip_ratio": 0.0, - "completion_length": 465.3333435058594, + "completion_length": 468.75001525878906, "epoch": 0.667, - "grad_norm": 14.633939544314472, - "kl": 0.8671875, + "grad_norm": 1.8456503709812673, + "kl": 0.1396484375, "learning_rate": 3.712834492143487e-07, - "loss": 0.3938, - "reward": 1.9635416865348816, - "reward_std": 0.12629536911845207, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0208, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 667 }, { "clip_ratio": 0.0, - "completion_length": 480.7291717529297, + "completion_length": 437.7708435058594, "epoch": 0.668, - "grad_norm": 8.183379454229799, - "kl": 1.3046875, + "grad_norm": 6.344832823638316, + "kl": 0.118408203125, "learning_rate": 3.6984293534939737e-07, - "loss": 0.2708, - "reward": 1.8385416865348816, - "reward_std": 0.2887519672513008, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.0746, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 668 }, { "clip_ratio": 0.0, - "completion_length": 482.68751525878906, + "completion_length": 435.7708435058594, "epoch": 0.669, - "grad_norm": 16.156518899200908, - "kl": 1.89453125, + "grad_norm": 1.7198633924214835, + "kl": 0.10986328125, "learning_rate": 3.6840461664142444e-07, - "loss": 0.5873, - "reward": 1.8125000596046448, - "reward_std": 0.40520651638507843, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": -0.0091, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 669 }, { "clip_ratio": 0.0, - "completion_length": 390.7708435058594, + "completion_length": 408.87501525878906, "epoch": 0.67, - "grad_norm": 15.532473512027257, - "kl": 0.4365234375, + "grad_norm": 2.643321718727809, + "kl": 0.2587890625, "learning_rate": 3.6696851061588994e-07, - "loss": 0.233, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0438, + "reward": 0.9739583432674408, + "reward_std": 0.04956009238958359, + "rewards/tag_count_reward": 0.9739583432674408, "step": 670 }, { "clip_ratio": 0.0, - "completion_length": 485.7291717529297, + "completion_length": 380.93751525878906, "epoch": 0.671, - "grad_norm": 9.198560157106922, - "kl": 2.109375, + "grad_norm": 3.507375631504847, + "kl": 0.302978515625, "learning_rate": 3.655346347712922e-07, - "loss": 0.2612, - "reward": 1.8385416865348816, - "reward_std": 0.2914777956902981, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0146, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 671 }, { "clip_ratio": 0.0, - "completion_length": 381.7916717529297, + "completion_length": 393.75001525878906, "epoch": 0.672, - "grad_norm": 9.244625026565478, - "kl": 0.556640625, + "grad_norm": 6.501617653915259, + "kl": 0.39306640625, "learning_rate": 3.641030065789562e-07, - "loss": 0.2217, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, + "loss": 0.0163, + "reward": 0.9791666865348816, + "reward_std": 0.05689104273915291, "rewards/tag_count_reward": 0.9791666865348816, "step": 672 }, { "clip_ratio": 0.0, - "completion_length": 425.56251525878906, + "completion_length": 421.2916717529297, "epoch": 0.673, - "grad_norm": 6.287912642621773, - "kl": 1.5625, + "grad_norm": 4.276393202562739, + "kl": 0.5185546875, "learning_rate": 3.6267364348281946e-07, - "loss": 0.2877, - "reward": 1.796875, - "reward_std": 0.30066054314374924, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0333, + "reward": 0.9739583432674408, + "reward_std": 0.0749332383275032, + "rewards/tag_count_reward": 0.9739583432674408, "step": 673 }, { "clip_ratio": 0.0, - "completion_length": 452.27085876464844, + "completion_length": 402.5208435058594, "epoch": 0.674, - "grad_norm": 8.718058879764467, - "kl": 1.38671875, + "grad_norm": 694.7450073397962, + "kl": 20.8046875, "learning_rate": 3.612465628992203e-07, - "loss": 0.1729, - "reward": 1.9375, - "reward_std": 0.09608826041221619, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 1.1834, + "reward": 0.9687500298023224, + "reward_std": 0.0929754413664341, + "rewards/tag_count_reward": 0.9687500298023224, "step": 674 }, { "clip_ratio": 0.0, - "completion_length": 549.625, + "completion_length": 396.68751525878906, "epoch": 0.675, - "grad_norm": 10.474436901773498, - "kl": 2.87646484375, + "grad_norm": 4.273385615664699, + "kl": 0.1492919921875, "learning_rate": 3.5982178221668533e-07, - "loss": 0.2636, - "reward": 1.6979166865348816, - "reward_std": 0.3109816312789917, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0074, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 675 }, { "clip_ratio": 0.0, - "completion_length": 542.9791870117188, + "completion_length": 419.7916717529297, "epoch": 0.676, - "grad_norm": 9.725329664214886, - "kl": 3.0830078125, + "grad_norm": 586.7551051861542, + "kl": 10.14404296875, "learning_rate": 3.5839931879571725e-07, - "loss": 0.4888, - "reward": 1.7604166865348816, - "reward_std": 0.314975380897522, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.7136, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 676 }, { "clip_ratio": 0.0, - "completion_length": 365.60418701171875, + "completion_length": 429.7708435058594, "epoch": 0.677, - "grad_norm": 3.104806476968875, - "kl": 0.3828125, + "grad_norm": 8.04671027131853, + "kl": 0.5595703125, "learning_rate": 3.5697918996858443e-07, - "loss": 0.0175, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0337, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 677 }, { "clip_ratio": 0.0, - "completion_length": 522.7708435058594, + "completion_length": 488.0833435058594, "epoch": 0.678, - "grad_norm": 31.009459384934313, - "kl": 3.4609375, + "grad_norm": 19.4631689907819, + "kl": 1.8203125, "learning_rate": 3.555614130391079e-07, - "loss": 0.6262, - "reward": 1.7916666865348816, - "reward_std": 0.17538414150476456, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.1889, + "reward": 0.9687500298023224, + "reward_std": 0.08474056795239449, + "rewards/tag_count_reward": 0.9687500298023224, "step": 678 }, { "clip_ratio": 0.0, - "completion_length": 375.9375, + "completion_length": 437.9166717529297, "epoch": 0.679, - "grad_norm": 8.065044363113154, - "kl": 0.77001953125, + "grad_norm": 4.207458753364508, + "kl": 0.34423828125, "learning_rate": 3.5414600528245266e-07, - "loss": 0.2675, - "reward": 1.96875, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0715, + "reward": 0.9791666865348816, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.9791666865348816, "step": 679 }, { "clip_ratio": 0.0, - "completion_length": 399.2916717529297, + "completion_length": 472.22918701171875, "epoch": 0.68, - "grad_norm": 4.945598864936947, - "kl": 0.28076171875, + "grad_norm": 8.59331015221391, + "kl": 1.037109375, "learning_rate": 3.5273298394491515e-07, - "loss": 0.0356, - "reward": 1.9947916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0282, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 680 }, { "clip_ratio": 0.0, - "completion_length": 482.47918701171875, + "completion_length": 476.2916717529297, "epoch": 0.681, - "grad_norm": 18.75089337748267, - "kl": 2.4794921875, + "grad_norm": 9.993205923284181, + "kl": 1.244140625, "learning_rate": 3.513223662437147e-07, - "loss": 0.6058, - "reward": 1.8072916865348816, - "reward_std": 0.3214855194091797, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.2014, + "reward": 0.9427083432674408, + "reward_std": 0.12026621401309967, + "rewards/tag_count_reward": 0.9427083432674408, "step": 681 }, { "clip_ratio": 0.0, - "completion_length": 496.8958435058594, + "completion_length": 453.54168701171875, "epoch": 0.682, - "grad_norm": 17.379877040288555, - "kl": 3.326171875, + "grad_norm": 5.448757627782719, + "kl": 0.57421875, "learning_rate": 3.4991416936678276e-07, - "loss": 0.4109, - "reward": 1.8697916865348816, - "reward_std": 0.21021871641278267, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.1092, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 682 }, { "clip_ratio": 0.0, - "completion_length": 404.25001525878906, + "completion_length": 453.25, "epoch": 0.683, - "grad_norm": 5.272063363799655, - "kl": 0.64990234375, + "grad_norm": 8.456088400083507, + "kl": 0.61328125, "learning_rate": 3.4850841047255364e-07, - "loss": 0.0618, - "reward": 1.9479166865348816, - "reward_std": 0.1263538897037506, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0627, + "reward": 0.96875, + "reward_std": 0.0929754339158535, "rewards/tag_count_reward": 0.96875, "step": 683 }, { "clip_ratio": 0.0, - "completion_length": 437.7708435058594, + "completion_length": 419.87501525878906, "epoch": 0.684, - "grad_norm": 6.3373870231815985, - "kl": 1.5986328125, + "grad_norm": 3.6825779293110537, + "kl": 0.357421875, "learning_rate": 3.471051066897562e-07, - "loss": 0.3165, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0295, + "reward": 0.984375, + "reward_std": 0.04237028956413269, + "rewards/tag_count_reward": 0.984375, "step": 684 }, { "clip_ratio": 0.0, - "completion_length": 339.9583435058594, + "completion_length": 371.2083435058594, "epoch": 0.685, - "grad_norm": 7.459585178558405, - "kl": 0.5224609375, + "grad_norm": 10.382180895310823, + "kl": 0.57421875, "learning_rate": 3.45704275117204e-07, - "loss": 0.0775, - "reward": 1.9479167461395264, - "reward_std": 0.13339675217866898, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0874, + "reward": 0.9583333432674408, + "reward_std": 0.11378209292888641, + "rewards/tag_count_reward": 0.9583333432674408, "step": 685 }, { "clip_ratio": 0.0, - "completion_length": 580.3125305175781, + "completion_length": 439.50001525878906, "epoch": 0.686, - "grad_norm": 14.569683148694587, - "kl": 3.615234375, + "grad_norm": 7.030962041723798, + "kl": 0.576171875, "learning_rate": 3.4430593282358777e-07, - "loss": 0.3833, - "reward": 1.8958333730697632, - "reward_std": 0.17306438833475113, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0018, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 686 }, { "clip_ratio": 0.0, - "completion_length": 382.56251525878906, + "completion_length": 492.4791717529297, "epoch": 0.687, - "grad_norm": 7.024535476163044, - "kl": 0.9453125, + "grad_norm": 14.597759514272322, + "kl": 1.191650390625, "learning_rate": 3.429100968472668e-07, - "loss": 0.1215, - "reward": 1.9427083730697632, - "reward_std": 0.17495156079530716, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.1023, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 687 }, { "clip_ratio": 0.0, - "completion_length": 458.10418701171875, + "completion_length": 416.6666717529297, "epoch": 0.688, - "grad_norm": 5.89892561311204, - "kl": 1.736328125, + "grad_norm": 30.769876265900503, + "kl": 2.765625, "learning_rate": 3.4151678419606233e-07, - "loss": 0.3014, - "reward": 1.9427083730697632, - "reward_std": 0.12497352808713913, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.3468, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 688 }, { "clip_ratio": 0.0, - "completion_length": 469.8958435058594, + "completion_length": 437.00001525878906, "epoch": 0.689, - "grad_norm": 7.588207971045676, - "kl": 2.1171875, + "grad_norm": 26.098704511067435, + "kl": 1.56103515625, "learning_rate": 3.4012601184704904e-07, - "loss": 0.4495, - "reward": 1.9166666865348816, - "reward_std": 0.19839920848608017, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.1296, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 689 }, { "clip_ratio": 0.0, - "completion_length": 506.58335876464844, + "completion_length": 439.6666717529297, "epoch": 0.69, - "grad_norm": 18.33227477565721, - "kl": 2.375, + "grad_norm": 33.33379609157588, + "kl": 2.5234375, "learning_rate": 3.387377967463493e-07, - "loss": 0.2341, - "reward": 1.7656250596046448, - "reward_std": 0.10371024534106255, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.2176, + "reward": 0.9791666865348816, + "reward_std": 0.06041248142719269, + "rewards/tag_count_reward": 0.9791666865348816, "step": 690 }, { "clip_ratio": 0.0, - "completion_length": 341.56251525878906, + "completion_length": 415.8333435058594, "epoch": 0.691, - "grad_norm": 0.9699860334723364, - "kl": 0.15478515625, + "grad_norm": 5.181330445699268, + "kl": 0.529296875, "learning_rate": 3.3735215580892575e-07, - "loss": 0.0067, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0972, + "reward": 0.9791666865348816, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.9791666865348816, "step": 691 }, { "clip_ratio": 0.0, - "completion_length": 517.2291717529297, + "completion_length": 454.2291717529297, "epoch": 0.692, - "grad_norm": 7.76152289717968, - "kl": 2.16015625, + "grad_norm": 5.785792471904993, + "kl": 0.380859375, "learning_rate": 3.359691059183761e-07, - "loss": 0.4388, - "reward": 1.7916666865348816, - "reward_std": 0.3108699470758438, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0734, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 692 }, { "clip_ratio": 0.0, - "completion_length": 443.22918701171875, + "completion_length": 442.06251525878906, "epoch": 0.693, - "grad_norm": 7.333262641142574, - "kl": 1.40673828125, + "grad_norm": 17.32190836556746, + "kl": 1.2958984375, "learning_rate": 3.3458866392672694e-07, - "loss": 0.1375, - "reward": 1.8072916865348816, - "reward_std": 0.09776745736598969, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0677, + "reward": 0.9791666865348816, + "reward_std": 0.04070868343114853, + "rewards/tag_count_reward": 0.9791666865348816, "step": 693 }, { "clip_ratio": 0.0, - "completion_length": 485.1458435058594, + "completion_length": 483.12501525878906, "epoch": 0.694, - "grad_norm": 15.76007301171471, - "kl": 2.71484375, + "grad_norm": 3.844960881993056, + "kl": 0.6005859375, "learning_rate": 3.3321084665422803e-07, - "loss": 0.6101, - "reward": 1.9010416865348816, - "reward_std": 0.23052222281694412, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.0806, + "reward": 0.96875, + "reward_std": 0.0625, + "rewards/tag_count_reward": 0.96875, "step": 694 }, { "clip_ratio": 0.0, - "completion_length": 461.8958435058594, + "completion_length": 432.50001525878906, "epoch": 0.695, - "grad_norm": 13.8312684886872, - "kl": 1.9111328125, + "grad_norm": 6.264027723583563, + "kl": 0.557861328125, "learning_rate": 3.3183567088914833e-07, - "loss": 0.7307, - "reward": 1.8802083730697632, - "reward_std": 0.25236696004867554, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.062, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 695 }, { "clip_ratio": 0.0, - "completion_length": 356.125, + "completion_length": 417.62501525878906, "epoch": 0.696, - "grad_norm": 3.142846506316496, - "kl": 0.3759765625, + "grad_norm": 5.851220345933992, + "kl": 0.6953125, "learning_rate": 3.3046315338757026e-07, - "loss": 0.0536, - "reward": 1.984375, - "reward_std": 0.05412658676505089, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0707, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 696 }, { "clip_ratio": 0.0, - "completion_length": 503.3750305175781, + "completion_length": 448.7083435058594, "epoch": 0.697, - "grad_norm": 5.588010924626038, - "kl": 1.66650390625, + "grad_norm": 19.52816493353307, + "kl": 0.5810546875, "learning_rate": 3.290933108731866e-07, - "loss": 0.2265, - "reward": 1.8020833730697632, - "reward_std": 0.21084074676036835, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0484, + "reward": 0.9843750298023224, + "reward_std": 0.04237028583884239, + "rewards/tag_count_reward": 0.9843750298023224, "step": 697 }, { "clip_ratio": 0.0, - "completion_length": 415.2708435058594, + "completion_length": 481.1041717529297, "epoch": 0.698, - "grad_norm": 4.085307118062538, - "kl": 0.992919921875, + "grad_norm": 11.159572266012402, + "kl": 1.13671875, "learning_rate": 3.2772616003709616e-07, - "loss": 0.1462, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1515, + "reward": 0.9583333730697632, + "reward_std": 0.10368667915463448, + "rewards/tag_count_reward": 0.9583333730697632, "step": 698 }, { "clip_ratio": 0.0, - "completion_length": 341.3541717529297, + "completion_length": 401.3541717529297, "epoch": 0.699, - "grad_norm": 1.2279757873288046, - "kl": 0.18212890625, + "grad_norm": 11.300893647697604, + "kl": 1.216796875, "learning_rate": 3.263617175376001e-07, - "loss": 0.0079, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1522, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 699 }, { "clip_ratio": 0.0, - "completion_length": 742.6458587646484, + "completion_length": 503.20835876464844, "epoch": 0.7, - "grad_norm": 10.139398098084365, - "kl": 5.00390625, + "grad_norm": 26.92750609594345, + "kl": 1.82568359375, "learning_rate": 3.250000000000001e-07, - "loss": 0.7811, - "reward": 1.75, - "reward_std": 0.3389703258872032, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8750000298023224, + "loss": 0.1622, + "reward": 0.9791666865348816, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.9791666865348816, "step": 700 }, { "clip_ratio": 0.0, - "completion_length": 481.5208435058594, + "completion_length": 510.06251525878906, "epoch": 0.701, - "grad_norm": 6.657850920838966, - "kl": 2.15625, + "grad_norm": 10.494983415892529, + "kl": 1.27734375, "learning_rate": 3.2364102401639423e-07, - "loss": 0.3804, - "reward": 1.8125000596046448, - "reward_std": 0.35387982428073883, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.1745, + "reward": 0.9583333730697632, + "reward_std": 0.11039762571454048, + "rewards/tag_count_reward": 0.9583333730697632, "step": 701 }, { "clip_ratio": 0.0, - "completion_length": 551.0416870117188, + "completion_length": 446.9583435058594, "epoch": 0.702, - "grad_norm": 29.768138074376726, - "kl": 1.987060546875, + "grad_norm": 11.436039976655804, + "kl": 1.490234375, "learning_rate": 3.222848061454764e-07, - "loss": 0.0773, - "reward": 1.6145833730697632, - "reward_std": 0.156256303191185, - "rewards/accuracy_reward": 0.6875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.2968, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 702 }, { "clip_ratio": 0.0, - "completion_length": 378.10418701171875, + "completion_length": 436.54168701171875, "epoch": 0.703, - "grad_norm": 2.3089932161894913, - "kl": 0.314208984375, + "grad_norm": 3.1017789326327474, + "kl": 0.48046875, "learning_rate": 3.209313629123329e-07, - "loss": 0.0155, - "reward": 1.75, - "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.061, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 703 }, { "clip_ratio": 0.0, - "completion_length": 496.97918701171875, + "completion_length": 422.5833435058594, "epoch": 0.704, - "grad_norm": 30.19692141758088, - "kl": 1.49609375, + "grad_norm": 4.996298099273398, + "kl": 0.57568359375, "learning_rate": 3.195807108082429e-07, - "loss": 0.6103, - "reward": 1.8333333730697632, - "reward_std": 0.44709303975105286, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.0241, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 704 }, { "clip_ratio": 0.0, - "completion_length": 390.2708435058594, + "completion_length": 394.0208435058594, "epoch": 0.705, - "grad_norm": 12.7039868505871, - "kl": 0.44873046875, + "grad_norm": 4.677444483256973, + "kl": 0.2783203125, "learning_rate": 3.182328662904756e-07, - "loss": 0.21, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0153, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 705 }, { "clip_ratio": 0.0, - "completion_length": 606.2916870117188, + "completion_length": 506.2708435058594, "epoch": 0.706, - "grad_norm": 32.55623223008865, - "kl": 1.8828125, + "grad_norm": 5.964023130051764, + "kl": 0.71533203125, "learning_rate": 3.168878457820915e-07, - "loss": 0.101, - "reward": 1.7083333730697632, - "reward_std": 0.23976805061101913, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.0581, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 706 }, { "clip_ratio": 0.0, - "completion_length": 400.5208435058594, + "completion_length": 367.50001525878906, "epoch": 0.707, - "grad_norm": 14.333505597413973, - "kl": 0.614990234375, + "grad_norm": 4.32921957152037, + "kl": 0.15380859375, "learning_rate": 3.155456656717408e-07, - "loss": 0.2913, - "reward": 1.953125, - "reward_std": 0.16237977147102356, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0155, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 707 }, { "clip_ratio": 0.0, - "completion_length": 523.7916717529297, + "completion_length": 439.41668701171875, "epoch": 0.708, - "grad_norm": 9.988399285895824, - "kl": 1.99609375, + "grad_norm": 6.0390044634387765, + "kl": 0.15966796875, "learning_rate": 3.142063423134644e-07, - "loss": 0.4525, - "reward": 1.6458333730697632, - "reward_std": 0.43036723136901855, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0001, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 708 }, { "clip_ratio": 0.0, - "completion_length": 639.0416870117188, + "completion_length": 512.25, "epoch": 0.709, - "grad_norm": 9.923037129609137, - "kl": 3.78125, + "grad_norm": 39.10476230531704, + "kl": 1.29296875, "learning_rate": 3.1286989202649503e-07, - "loss": 0.4593, - "reward": 1.6406250596046448, - "reward_std": 0.32235653698444366, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": 0.5214, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 709 }, { "clip_ratio": 0.0, - "completion_length": 347.1666717529297, + "completion_length": 397.6041717529297, "epoch": 0.71, - "grad_norm": 1.0453519254579182, - "kl": 0.163330078125, + "grad_norm": 1.411237124800069, + "kl": 0.1256103515625, "learning_rate": 3.115363310950578e-07, - "loss": 0.0071, - "reward": 2.0, + "loss": 0.0068, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 710 }, { "clip_ratio": 0.0, - "completion_length": 326.56251525878906, + "completion_length": 382.97918701171875, "epoch": 0.711, - "grad_norm": 3.2849089415625996, - "kl": 0.09765625, + "grad_norm": 11.257531925319636, + "kl": 1.0078125, "learning_rate": 3.102056757681715e-07, - "loss": 0.0039, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1469, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 711 }, { "clip_ratio": 0.0, - "completion_length": 481.56251525878906, + "completion_length": 459.68751525878906, "epoch": 0.712, - "grad_norm": 9.211696561129303, - "kl": 1.8671875, + "grad_norm": 16.25512877311021, + "kl": 1.931640625, "learning_rate": 3.0887794225945143e-07, - "loss": 0.4281, - "reward": 1.7760417461395264, - "reward_std": 0.1180451512336731, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1908, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 712 }, { "clip_ratio": 0.0, - "completion_length": 483.72918701171875, + "completion_length": 470.91668701171875, "epoch": 0.713, - "grad_norm": 10.16731006575068, - "kl": 1.271484375, + "grad_norm": 10.833452938881697, + "kl": 0.4189453125, "learning_rate": 3.075531467469116e-07, - "loss": 0.2044, - "reward": 1.5677083730697632, - "reward_std": 0.15415260195732117, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0206, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 713 }, { "clip_ratio": 0.0, - "completion_length": 591.7291870117188, + "completion_length": 464.5208435058594, "epoch": 0.714, - "grad_norm": 7.644656692644529, - "kl": 2.052734375, + "grad_norm": 2.80201483686987, + "kl": 0.4462890625, "learning_rate": 3.062313053727671e-07, - "loss": 0.2548, - "reward": 1.8906250596046448, - "reward_std": 0.14420080184936523, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0226, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 714 }, { "clip_ratio": 0.0, - "completion_length": 511.6875305175781, + "completion_length": 507.6458435058594, "epoch": 0.715, - "grad_norm": 17.428411727398277, - "kl": 1.3359375, + "grad_norm": 4.41405067914247, + "kl": 0.33935546875, "learning_rate": 3.0491243424323783e-07, - "loss": 0.4885, - "reward": 1.7239583730697632, - "reward_std": 0.35039304196834564, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0414, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 715 }, { "clip_ratio": 0.0, - "completion_length": 367.00001525878906, + "completion_length": 390.1458435058594, "epoch": 0.716, - "grad_norm": 5.795807819669617, - "kl": 0.2197265625, + "grad_norm": 3.201216632549719, + "kl": 0.20703125, "learning_rate": 3.0359654942835247e-07, - "loss": 0.0608, - "reward": 1.5260416865348816, - "reward_std": 0.15143895149230957, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0293, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 716 }, { "clip_ratio": 0.0, - "completion_length": 593.0833587646484, + "completion_length": 460.2708435058594, "epoch": 0.717, - "grad_norm": 17.12159414092038, - "kl": 2.796875, + "grad_norm": 2.3102229608350924, + "kl": 0.1923828125, "learning_rate": 3.02283666961752e-07, - "loss": 0.3553, - "reward": 1.6197916865348816, - "reward_std": 0.32312043011188507, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0096, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 717 }, { "clip_ratio": 0.0, - "completion_length": 479.1875305175781, + "completion_length": 417.3958435058594, "epoch": 0.718, - "grad_norm": 4.42017793182423, - "kl": 1.0888671875, + "grad_norm": 12.582863511927389, + "kl": 0.5849609375, "learning_rate": 3.0097380284049523e-07, - "loss": 0.2418, - "reward": 1.8177083730697632, - "reward_std": 0.29496608674526215, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0379, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 718 }, { "clip_ratio": 0.0, - "completion_length": 516.2916870117188, + "completion_length": 491.47918701171875, "epoch": 0.719, - "grad_norm": 4.066723573849035, - "kl": 1.466796875, + "grad_norm": 1.3415707838315272, + "kl": 0.21337890625, "learning_rate": 2.996669730248628e-07, - "loss": 0.1836, - "reward": 1.9010416865348816, - "reward_std": 0.16091519594192505, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0092, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 719 }, { "clip_ratio": 0.0, - "completion_length": 478.6875, + "completion_length": 443.56251525878906, "epoch": 0.72, - "grad_norm": 13.88675900880161, - "kl": 1.712890625, + "grad_norm": 1.999158876101931, + "kl": 0.093994140625, "learning_rate": 2.9836319343816397e-07, - "loss": 0.2485, - "reward": 1.8385417461395264, - "reward_std": 0.20586540549993515, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0036, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 720 }, { "clip_ratio": 0.0, - "completion_length": 491.1458435058594, + "completion_length": 519.0833435058594, "epoch": 0.721, - "grad_norm": 12.748686066507087, - "kl": 1.05322265625, + "grad_norm": 16.99046812237398, + "kl": 0.5625, "learning_rate": 2.9706247996654134e-07, - "loss": 0.3131, - "reward": 1.8177083730697632, - "reward_std": 0.32699036598205566, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1438, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 721 }, { "clip_ratio": 0.0, - "completion_length": 375.25, + "completion_length": 430.60418701171875, "epoch": 0.722, - "grad_norm": 2.5167647725002316, - "kl": 0.33935546875, + "grad_norm": 4.075599579072145, + "kl": 0.677001953125, "learning_rate": 2.9576484845877793e-07, - "loss": 0.0215, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0711, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 722 }, { "clip_ratio": 0.0, - "completion_length": 394.81251525878906, + "completion_length": 506.35418701171875, "epoch": 0.723, - "grad_norm": 7.251829155077994, - "kl": 0.1787109375, + "grad_norm": 26.315070501082214, + "kl": 1.134765625, "learning_rate": 2.944703147261046e-07, - "loss": 0.0773, - "reward": 1.7447916865348816, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.1994, + "reward": 0.9739583432674408, + "reward_std": 0.06317693740129471, + "rewards/tag_count_reward": 0.9739583432674408, "step": 723 }, { "clip_ratio": 0.0, - "completion_length": 378.8333435058594, + "completion_length": 513.4791717529297, "epoch": 0.724, - "grad_norm": 14.761227216324544, - "kl": 0.609375, + "grad_norm": 16.348844686428375, + "kl": 0.98828125, "learning_rate": 2.931788945420058e-07, - "loss": 0.1776, - "reward": 1.9583333730697632, - "reward_std": 0.10825316980481148, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.2025, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 724 }, { "clip_ratio": 0.0, - "completion_length": 389.5416717529297, + "completion_length": 411.5625, "epoch": 0.725, - "grad_norm": 6.243168978434975, - "kl": 0.25830078125, + "grad_norm": 0.6207821166682146, + "kl": 0.09716796875, "learning_rate": 2.918906036420294e-07, - "loss": 0.0707, - "reward": 1.9791667461395264, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0043, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 725 }, { "clip_ratio": 0.0, - "completion_length": 456.1041717529297, + "completion_length": 483.8333435058594, "epoch": 0.726, - "grad_norm": 5.5600893276951835, - "kl": 0.5478515625, + "grad_norm": 12.43240779138603, + "kl": 0.787109375, "learning_rate": 2.9060545772359305e-07, - "loss": 0.0337, - "reward": 1.8854166865348816, - "reward_std": 0.13458873704075813, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.1083, + "reward": 0.9739583432674408, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583432674408, "step": 726 }, { "clip_ratio": 0.0, - "completion_length": 642.875, + "completion_length": 492.43751525878906, "epoch": 0.727, - "grad_norm": 7.974492121254368, - "kl": 2.0380859375, + "grad_norm": 2.1358171479466765, + "kl": 0.2100830078125, "learning_rate": 2.893234724457946e-07, - "loss": 0.4957, - "reward": 1.6614583730697632, - "reward_std": 0.43694548308849335, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.009, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 727 }, { "clip_ratio": 0.0, - "completion_length": 868.0625305175781, + "completion_length": 521.7083435058594, "epoch": 0.728, - "grad_norm": 18.398151917865814, - "kl": 4.5, + "grad_norm": 12.834481320766775, + "kl": 1.7900390625, "learning_rate": 2.8804466342921987e-07, - "loss": 0.5892, - "reward": 1.5208333730697632, - "reward_std": 0.42187660932540894, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7916666865348816, + "loss": 0.2872, + "reward": 0.9791666865348816, + "reward_std": 0.05689104646444321, + "rewards/tag_count_reward": 0.9791666865348816, "step": 728 }, { "clip_ratio": 0.0, - "completion_length": 666.7500305175781, + "completion_length": 493.7708435058594, "epoch": 0.729, - "grad_norm": 22.690140168287954, - "kl": 2.859375, + "grad_norm": 1.9494794184581465, + "kl": 0.25, "learning_rate": 2.86769046255753e-07, - "loss": 0.1898, - "reward": 1.4270833730697632, - "reward_std": 0.30900222063064575, - "rewards/accuracy_reward": 0.5625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833432674408, + "loss": 0.0108, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 729 }, { "clip_ratio": 0.0, - "completion_length": 581.2708740234375, + "completion_length": 492.79168701171875, "epoch": 0.73, - "grad_norm": 22.900002006250954, - "kl": 1.78515625, + "grad_norm": 15.846940368073113, + "kl": 1.458984375, "learning_rate": 2.854966364683872e-07, - "loss": 0.4223, - "reward": 1.7812500596046448, - "reward_std": 0.3125811293721199, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.2833, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 730 }, { "clip_ratio": 0.0, - "completion_length": 607.5833435058594, + "completion_length": 474.08335876464844, "epoch": 0.731, - "grad_norm": 15.774414896639312, - "kl": 3.4375, + "grad_norm": 7.863889311889127, + "kl": 0.3837890625, "learning_rate": 2.842274495710335e-07, - "loss": 0.3655, - "reward": 1.6875000596046448, - "reward_std": 0.36248913407325745, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8541666865348816, + "loss": 0.0895, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 731 }, { "clip_ratio": 0.0, - "completion_length": 316.1041717529297, + "completion_length": 378.9166717529297, "epoch": 0.732, - "grad_norm": 0.7330881354800509, - "kl": 0.091796875, + "grad_norm": 11.632733691365143, + "kl": 1.029296875, "learning_rate": 2.829615010283344e-07, - "loss": 0.0037, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1566, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 732 }, { "clip_ratio": 0.0, - "completion_length": 629.7291870117188, + "completion_length": 441.50001525878906, "epoch": 0.733, - "grad_norm": 24.723855367343294, - "kl": 2.705078125, + "grad_norm": 3.9288806935544667, + "kl": 0.15625, "learning_rate": 2.8169880626547283e-07, - "loss": 0.2251, - "reward": 1.765625, - "reward_std": 0.20550191029906273, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0414, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 733 }, { "clip_ratio": 0.0, - "completion_length": 429.31251525878906, + "completion_length": 448.0208435058594, "epoch": 0.734, - "grad_norm": 4.581110895233979, - "kl": 0.569580078125, + "grad_norm": 8.40097124257542, + "kl": 0.654296875, "learning_rate": 2.8043938066798645e-07, - "loss": 0.079, - "reward": 1.9427083730697632, - "reward_std": 0.11716237664222717, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1139, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 734 }, { "clip_ratio": 0.0, - "completion_length": 581.7500305175781, + "completion_length": 494.95835876464844, "epoch": 0.735, - "grad_norm": 6.214394260264761, - "kl": 2.03759765625, + "grad_norm": 16.13408884785172, + "kl": 1.859375, "learning_rate": 2.791832395815782e-07, - "loss": 0.3873, - "reward": 1.7031250596046448, - "reward_std": 0.408438578248024, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.2235, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 735 }, { "clip_ratio": 0.0, - "completion_length": 385.18751525878906, + "completion_length": 444.8541717529297, "epoch": 0.736, - "grad_norm": 11.63268791970304, - "kl": 0.23876953125, + "grad_norm": 1.6994100231244034, + "kl": 0.155517578125, "learning_rate": 2.7793039831193133e-07, - "loss": 0.2088, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0063, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 736 }, { "clip_ratio": 0.0, - "completion_length": 607.2083435058594, + "completion_length": 529.7083587646484, "epoch": 0.737, - "grad_norm": 16.775131439725214, - "kl": 2.396484375, + "grad_norm": 28.00915402328376, + "kl": 1.4169921875, "learning_rate": 2.766808721245211e-07, - "loss": 0.6521, - "reward": 1.6614583730697632, - "reward_std": 0.4493499845266342, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.2558, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 737 }, { "clip_ratio": 0.0, - "completion_length": 527.1458435058594, + "completion_length": 464.7083435058594, "epoch": 0.738, - "grad_norm": 6.7290983199941845, - "kl": 1.4765625, + "grad_norm": 31.317454674009355, + "kl": 1.384765625, "learning_rate": 2.7543467624442956e-07, - "loss": 0.2432, - "reward": 1.6979167461395264, - "reward_std": 0.33060067147016525, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.3725, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 738 }, { "clip_ratio": 0.0, - "completion_length": 495.8333435058594, + "completion_length": 483.41668701171875, "epoch": 0.739, - "grad_norm": 5.5907904218550195, - "kl": 1.45556640625, + "grad_norm": 25.5966330668809, + "kl": 1.373046875, "learning_rate": 2.741918258561607e-07, - "loss": 0.3125, - "reward": 1.8281250596046448, - "reward_std": 0.26883548498153687, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.3158, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 739 }, { "clip_ratio": 0.0, - "completion_length": 395.25001525878906, + "completion_length": 427.4791717529297, "epoch": 0.74, - "grad_norm": 12.98191392607371, - "kl": 0.7626953125, + "grad_norm": 28.50096014279551, + "kl": 2.21484375, "learning_rate": 2.729523361034538e-07, - "loss": 0.3345, - "reward": 1.9062500596046448, - "reward_std": 0.25934017449617386, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.4558, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 740 }, { "clip_ratio": 0.0, - "completion_length": 578.0833435058594, + "completion_length": 491.3958435058594, "epoch": 0.741, - "grad_norm": 5.58196058975895, - "kl": 1.925537109375, + "grad_norm": 8.654662460777226, + "kl": 1.49609375, "learning_rate": 2.717162220891007e-07, - "loss": 0.3753, - "reward": 1.875, - "reward_std": 0.1878853738307953, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.875, + "loss": 0.2585, + "reward": 0.9635416865348816, + "reward_std": 0.10278276726603508, + "rewards/tag_count_reward": 0.9635416865348816, "step": 741 }, { "clip_ratio": 0.0, - "completion_length": 453.81251525878906, + "completion_length": 499.25001525878906, "epoch": 0.742, - "grad_norm": 47.61043656749001, - "kl": 0.455078125, + "grad_norm": 11.857589451422239, + "kl": 0.6640625, "learning_rate": 2.7048349887476037e-07, - "loss": 0.234, - "reward": 1.96875, - "reward_std": 0.10825317353010178, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0695, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 742 }, { "clip_ratio": 0.0, - "completion_length": 519.8541870117188, + "completion_length": 464.0208435058594, "epoch": 0.743, - "grad_norm": 21.950415777530985, - "kl": 1.474609375, + "grad_norm": 9.793252984079043, + "kl": 0.4365234375, "learning_rate": 2.692541814807763e-07, - "loss": 0.7038, - "reward": 1.8802083730697632, - "reward_std": 0.28356732428073883, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.164, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 743 }, { "clip_ratio": 0.0, - "completion_length": 697.7708435058594, + "completion_length": 485.8958435058594, "epoch": 0.744, - "grad_norm": 10.980318043858643, - "kl": 2.8125, + "grad_norm": 4.419154289930755, + "kl": 0.2177734375, "learning_rate": 2.6802828488599294e-07, - "loss": 0.6072, - "reward": 1.4895833730697632, - "reward_std": 0.348388459533453, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0177, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 744 }, { "clip_ratio": 0.0, - "completion_length": 765.7708740234375, + "completion_length": 506.64585876464844, "epoch": 0.745, - "grad_norm": 22.71300386629857, - "kl": 4.21875, + "grad_norm": 10.681726054990179, + "kl": 0.51904296875, "learning_rate": 2.6680582402757324e-07, - "loss": 0.4176, - "reward": 1.4687500596046448, - "reward_std": 0.4039232060313225, - "rewards/accuracy_reward": 0.6458333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8229166865348816, + "loss": 0.1505, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 745 }, { "clip_ratio": 0.0, - "completion_length": 534.8333435058594, + "completion_length": 438.7916717529297, "epoch": 0.746, - "grad_norm": 9.28912253634519, - "kl": 2.28515625, + "grad_norm": 5.303740743308652, + "kl": 0.40185546875, "learning_rate": 2.655868138008171e-07, - "loss": 0.6251, - "reward": 1.8906250596046448, - "reward_std": 0.26262669265270233, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.1217, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 746 }, { "clip_ratio": 0.0, - "completion_length": 453.5208435058594, + "completion_length": 400.22918701171875, "epoch": 0.747, - "grad_norm": 9.633261077244628, - "kl": 1.46484375, + "grad_norm": 7.818333629793629, + "kl": 0.388671875, "learning_rate": 2.6437126905897967e-07, - "loss": 0.3679, - "reward": 1.8750000596046448, - "reward_std": 0.2426239252090454, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.1222, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 747 }, { "clip_ratio": 0.0, - "completion_length": 368.6875, + "completion_length": 424.8125, "epoch": 0.748, - "grad_norm": 1.3739521074542798, - "kl": 0.173095703125, + "grad_norm": 4.940523622959794, + "kl": 0.3505859375, "learning_rate": 2.631592046130896e-07, - "loss": 0.007, - "reward": 1.75, + "loss": 0.0192, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 748 }, { "clip_ratio": 0.0, - "completion_length": 372.37501525878906, + "completion_length": 447.5833435058594, "epoch": 0.749, - "grad_norm": 11.453213641212772, - "kl": 0.759765625, + "grad_norm": 14.486858386225327, + "kl": 0.75244140625, "learning_rate": 2.6195063523177e-07, - "loss": 0.2469, - "reward": 1.9479166865348816, - "reward_std": 0.1461983658373356, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9687500298023224, + "loss": 0.0931, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 749 }, { "clip_ratio": 0.0, - "completion_length": 369.62501525878906, + "completion_length": 412.5833435058594, "epoch": 0.75, - "grad_norm": 5.12693301829503, - "kl": 0.9189453125, + "grad_norm": 1.4456840039078567, + "kl": 0.1439208984375, "learning_rate": 2.6074557564105724e-07, - "loss": 0.0712, - "reward": 1.9166666865348816, - "reward_std": 0.18523554503917694, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0067, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 750 }, { "clip_ratio": 0.0, - "completion_length": 417.9375, + "completion_length": 513.5208435058594, "epoch": 0.751, - "grad_norm": 6.286447945604738, - "kl": 1.32421875, + "grad_norm": 35.005057574401924, + "kl": 2.38671875, "learning_rate": 2.595440405242222e-07, - "loss": 0.198, - "reward": 1.9270833730697632, - "reward_std": 0.12164046615362167, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.3071, + "reward": 0.9739583432674408, + "reward_std": 0.07845467701554298, + "rewards/tag_count_reward": 0.9739583432674408, "step": 751 }, { "clip_ratio": 0.0, - "completion_length": 457.3541717529297, + "completion_length": 446.00001525878906, "epoch": 0.752, - "grad_norm": 9.824686585527399, - "kl": 2.8125, + "grad_norm": 22.58672685506717, + "kl": 0.9561767578125, "learning_rate": 2.583460445215911e-07, - "loss": 0.4985, - "reward": 1.8958333730697632, - "reward_std": 0.30491530895233154, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.1269, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 752 }, { "clip_ratio": 0.0, - "completion_length": 427.72918701171875, + "completion_length": 464.62501525878906, "epoch": 0.753, - "grad_norm": 3.5912283463784984, - "kl": 1.06787109375, + "grad_norm": 10.917661375391802, + "kl": 0.528076171875, "learning_rate": 2.571516022303671e-07, - "loss": 0.2436, - "reward": 1.953125, - "reward_std": 0.1281561702489853, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.165, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, "rewards/tag_count_reward": 0.9739583432674408, "step": 753 }, { "clip_ratio": 0.0, - "completion_length": 525.5833435058594, + "completion_length": 547.0208587646484, "epoch": 0.754, - "grad_norm": 10.999543408139116, - "kl": 2.2734375, + "grad_norm": 14.595777318588363, + "kl": 2.20703125, "learning_rate": 2.5596072820445254e-07, - "loss": 0.3437, - "reward": 1.7239583730697632, - "reward_std": 0.36418458819389343, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.475, + "reward": 0.9583333432674408, + "reward_std": 0.1276453733444214, + "rewards/tag_count_reward": 0.9583333432674408, "step": 754 }, { "clip_ratio": 0.0, - "completion_length": 408.5833435058594, + "completion_length": 437.7083435058594, "epoch": 0.755, - "grad_norm": 12.043473083750008, - "kl": 1.248046875, + "grad_norm": 9.72501428549619, + "kl": 0.6739501953125, "learning_rate": 2.547734369542718e-07, - "loss": 0.3779, - "reward": 1.7135416865348816, - "reward_std": 0.12629536911845207, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.1245, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 755 }, { "clip_ratio": 0.0, - "completion_length": 416.62501525878906, + "completion_length": 441.93751525878906, "epoch": 0.756, - "grad_norm": 5.251568437182964, - "kl": 1.882568359375, + "grad_norm": 11.940232882356842, + "kl": 0.75390625, "learning_rate": 2.5358974294659373e-07, - "loss": 0.289, - "reward": 1.8958333730697632, - "reward_std": 0.2655413746833801, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.2374, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 756 }, { "clip_ratio": 0.0, - "completion_length": 413.3541717529297, + "completion_length": 425.1458435058594, "epoch": 0.757, - "grad_norm": 27.723671273560402, - "kl": 1.955078125, + "grad_norm": 21.25957650725657, + "kl": 0.53466796875, "learning_rate": 2.5240966060435674e-07, - "loss": 0.3962, - "reward": 1.9375, - "reward_std": 0.19795067235827446, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0655, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 757 }, { "clip_ratio": 0.0, - "completion_length": 444.31251525878906, + "completion_length": 426.2083435058594, "epoch": 0.758, - "grad_norm": 15.600424673597132, - "kl": 1.95166015625, + "grad_norm": 2.670261773175817, + "kl": 0.182861328125, "learning_rate": 2.512332043064913e-07, - "loss": 0.2562, - "reward": 1.8229166865348816, - "reward_std": 0.1550140678882599, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0105, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 758 }, { "clip_ratio": 0.0, - "completion_length": 431.8125, + "completion_length": 438.1875, "epoch": 0.759, - "grad_norm": 24.009973204121238, - "kl": 2.46484375, + "grad_norm": 3.550420770270506, + "kl": 0.375, "learning_rate": 2.5006038838774647e-07, - "loss": 0.2499, - "reward": 1.765625, - "reward_std": 0.193053238093853, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0534, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 759 }, { "clip_ratio": 0.0, - "completion_length": 353.8958435058594, + "completion_length": 467.70835876464844, "epoch": 0.76, - "grad_norm": 12.827717740116938, - "kl": 0.4296875, + "grad_norm": 9.761605268786091, + "kl": 1.1611328125, "learning_rate": 2.488912271385139e-07, - "loss": -0.0412, - "reward": 1.7708333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.2286, + "reward": 0.9635416865348816, + "reward_std": 0.08307896926999092, + "rewards/tag_count_reward": 0.9635416865348816, "step": 760 }, { "clip_ratio": 0.0, - "completion_length": 505.54168701171875, + "completion_length": 574.4791870117188, "epoch": 0.761, - "grad_norm": 11.366192562927665, - "kl": 2.64453125, + "grad_norm": 24.51004058459727, + "kl": 1.64453125, "learning_rate": 2.4772573480465445e-07, - "loss": 0.3814, - "reward": 1.75, - "reward_std": 0.2551109194755554, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.5824, + "reward": 0.9791666865348816, + "reward_std": 0.06041248142719269, + "rewards/tag_count_reward": 0.9791666865348816, "step": 761 }, { "clip_ratio": 0.0, - "completion_length": 431.1666717529297, + "completion_length": 485.06251525878906, "epoch": 0.762, - "grad_norm": 12.59853091065054, - "kl": 0.998046875, + "grad_norm": 10.02285213561004, + "kl": 0.9228515625, "learning_rate": 2.465639255873246e-07, - "loss": 0.2772, - "reward": 1.6875000596046448, - "reward_std": 0.183121956884861, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1431, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 762 }, { "clip_ratio": 0.0, - "completion_length": 465.7083435058594, + "completion_length": 424.25001525878906, "epoch": 0.763, - "grad_norm": 23.871057557588358, - "kl": 1.45654296875, + "grad_norm": 12.319177988211461, + "kl": 0.7608642578125, "learning_rate": 2.454058136428027e-07, - "loss": 0.0873, - "reward": 1.5364583730697632, - "reward_std": 0.08618465065956116, - "rewards/accuracy_reward": 0.5625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0391, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 763 }, { "clip_ratio": 0.0, - "completion_length": 464.56251525878906, + "completion_length": 467.3125, "epoch": 0.764, - "grad_norm": 9.770183662921154, - "kl": 1.62890625, + "grad_norm": 16.438634033001705, + "kl": 1.4501953125, "learning_rate": 2.4425141308231765e-07, - "loss": 0.4394, - "reward": 1.875, - "reward_std": 0.17368127778172493, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.2977, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 764 }, { "clip_ratio": 0.0, - "completion_length": 543.3333435058594, + "completion_length": 447.9375, "epoch": 0.765, - "grad_norm": 11.883086611599984, - "kl": 2.2734375, + "grad_norm": 1.407361201542416, + "kl": 0.16259765625, "learning_rate": 2.4310073797187573e-07, - "loss": 0.6227, - "reward": 1.5625, - "reward_std": 0.37577906250953674, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0066, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 765 }, { "clip_ratio": 0.0, - "completion_length": 413.31251525878906, + "completion_length": 427.7291717529297, "epoch": 0.766, - "grad_norm": 5.918853257889928, - "kl": 0.80322265625, + "grad_norm": 4.688693636755467, + "kl": 0.31298828125, "learning_rate": 2.4195380233209006e-07, - "loss": 0.1821, - "reward": 1.96875, - "reward_std": 0.07769769430160522, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0387, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, "rewards/tag_count_reward": 0.9895833432674408, "step": 766 }, { "clip_ratio": 0.0, - "completion_length": 592.3750152587891, + "completion_length": 508.93751525878906, "epoch": 0.767, - "grad_norm": 7.72225659937188, - "kl": 3.03369140625, + "grad_norm": 5.683735747294983, + "kl": 1.4609375, "learning_rate": 2.408106201380097e-07, - "loss": 0.5582, - "reward": 1.4947916865348816, - "reward_std": 0.3689124286174774, - "rewards/accuracy_reward": 0.6041666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.1967, + "reward": 0.9791666865348816, + "reward_std": 0.06041248142719269, + "rewards/tag_count_reward": 0.9791666865348816, "step": 767 }, { "clip_ratio": 0.0, - "completion_length": 471.39585876464844, + "completion_length": 535.1041717529297, "epoch": 0.768, - "grad_norm": 13.29949755910264, - "kl": 1.58251953125, + "grad_norm": 9.361688645147792, + "kl": 1.2958984375, "learning_rate": 2.3967120531894857e-07, - "loss": 0.0493, - "reward": 1.7604166865348816, - "reward_std": 0.08771618455648422, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.1311, + "reward": 0.9739583432674408, + "reward_std": 0.058750879019498825, + "rewards/tag_count_reward": 0.9739583432674408, "step": 768 }, { "clip_ratio": 0.0, - "completion_length": 527.5000305175781, + "completion_length": 520.7916870117188, "epoch": 0.769, - "grad_norm": 10.981151982341768, - "kl": 2.3212890625, + "grad_norm": 7.86994387145606, + "kl": 0.72802734375, "learning_rate": 2.38535571758317e-07, - "loss": 0.3413, - "reward": 1.7083333730697632, - "reward_std": 0.166842982172966, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.222, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 769 }, { "clip_ratio": 0.0, - "completion_length": 709.4166870117188, + "completion_length": 425.3333435058594, "epoch": 0.77, - "grad_norm": 10.937949977934677, - "kl": 4.078125, + "grad_norm": 1.8396818228183496, + "kl": 0.186767578125, "learning_rate": 2.374037332934512e-07, - "loss": 0.4597, - "reward": 1.6406250596046448, - "reward_std": 0.37205876410007477, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0035, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 770 }, { "clip_ratio": 0.0, - "completion_length": 442.41668701171875, + "completion_length": 446.5833435058594, "epoch": 0.771, - "grad_norm": 11.113716555269422, - "kl": 1.22119140625, + "grad_norm": 16.397728494583784, + "kl": 0.681640625, "learning_rate": 2.36275703715446e-07, - "loss": 0.1746, - "reward": 1.796875, - "reward_std": 0.0965491235256195, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.3143, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 771 }, { "clip_ratio": 0.0, - "completion_length": 417.18751525878906, + "completion_length": 470.0833435058594, "epoch": 0.772, - "grad_norm": 13.657549444048158, - "kl": 0.87548828125, + "grad_norm": 12.503031358735678, + "kl": 1.130859375, "learning_rate": 2.3515149676898552e-07, - "loss": 0.2508, - "reward": 1.953125, - "reward_std": 0.14568756520748138, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.2804, + "reward": 0.9687500298023224, + "reward_std": 0.08474057167768478, + "rewards/tag_count_reward": 0.9687500298023224, "step": 772 }, { "clip_ratio": 0.0, - "completion_length": 593.2708435058594, + "completion_length": 462.85418701171875, "epoch": 0.773, - "grad_norm": 5.647172563839161, - "kl": 2.490234375, + "grad_norm": 3.9169964655197944, + "kl": 0.38720703125, "learning_rate": 2.3403112615217693e-07, - "loss": 0.276, - "reward": 1.6979167461395264, - "reward_std": 0.3060782104730606, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0518, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 773 }, { "clip_ratio": 0.0, - "completion_length": 534.6458435058594, + "completion_length": 416.9166717529297, "epoch": 0.774, - "grad_norm": 5.380705143919367, - "kl": 1.48828125, + "grad_norm": 0.9880321965045378, + "kl": 0.142822265625, "learning_rate": 2.3291460551638237e-07, - "loss": 0.182, - "reward": 1.9062500596046448, - "reward_std": 0.17639562487602234, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.006, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 774 }, { "clip_ratio": 0.0, - "completion_length": 427.9583435058594, + "completion_length": 540.6666870117188, "epoch": 0.775, - "grad_norm": 7.227834304046939, - "kl": 0.38134765625, + "grad_norm": 11.726121649739454, + "kl": 0.869140625, "learning_rate": 2.3180194846605364e-07, - "loss": 0.1919, - "reward": 1.875, - "reward_std": 0.20090095698833466, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, + "loss": 0.3413, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, "rewards/tag_count_reward": 0.9791666865348816, "step": 775 }, { "clip_ratio": 0.0, - "completion_length": 421.43751525878906, + "completion_length": 471.64585876464844, "epoch": 0.776, - "grad_norm": 8.637535551458807, - "kl": 0.7099609375, + "grad_norm": 6.582729677504788, + "kl": 0.580078125, "learning_rate": 2.306931685585657e-07, - "loss": 0.1274, - "reward": 1.828125, - "reward_std": 0.14113369584083557, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0981, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 776 }, { "clip_ratio": 0.0, - "completion_length": 356.7291717529297, + "completion_length": 454.3541717529297, "epoch": 0.777, - "grad_norm": 5.443953406189849, - "kl": 0.262939453125, + "grad_norm": 30.100361780455177, + "kl": 1.84375, "learning_rate": 2.2958827930405162e-07, - "loss": 0.0884, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.3048, + "reward": 0.9635416865348816, + "reward_std": 0.07955753058195114, + "rewards/tag_count_reward": 0.9635416865348816, "step": 777 }, { "clip_ratio": 0.0, - "completion_length": 736.9166870117188, + "completion_length": 475.72918701171875, "epoch": 0.778, - "grad_norm": 12.488748653101004, - "kl": 3.8671875, + "grad_norm": 6.574018009507352, + "kl": 0.27490234375, "learning_rate": 2.2848729416523859e-07, - "loss": 0.4991, - "reward": 1.4635416865348816, - "reward_std": 0.41394008696079254, - "rewards/accuracy_reward": 0.6458333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8177083730697632, + "loss": 0.0354, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 778 }, { "clip_ratio": 0.0, - "completion_length": 454.31251525878906, + "completion_length": 408.0, "epoch": 0.779, - "grad_norm": 27.421105424501206, - "kl": 0.912109375, + "grad_norm": 0.47326634644492166, + "kl": 0.0831298828125, "learning_rate": 2.2739022655728277e-07, - "loss": 0.5184, - "reward": 1.9010416865348816, - "reward_std": 0.32610950991511345, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083730697632, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 779 }, { "clip_ratio": 0.0, - "completion_length": 449.6875305175781, + "completion_length": 520.0208587646484, "epoch": 0.78, - "grad_norm": 6.444707163031271, - "kl": 0.8994140625, + "grad_norm": 2.9849829875627867, + "kl": 0.51708984375, "learning_rate": 2.2629708984760706e-07, - "loss": 0.1452, - "reward": 1.7864583730697632, - "reward_std": 0.1290765255689621, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0061, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 780 }, { "clip_ratio": 0.0, - "completion_length": 412.3125, + "completion_length": 465.1458435058594, "epoch": 0.781, - "grad_norm": 3.260890956680781, - "kl": 0.52783203125, + "grad_norm": 0.4898002516853844, + "kl": 0.077392578125, "learning_rate": 2.2520789735573704e-07, - "loss": 0.0552, - "reward": 1.9739583730697632, - "reward_std": 0.06226281076669693, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 781 }, { "clip_ratio": 0.0, - "completion_length": 440.72918701171875, + "completion_length": 426.1666717529297, "epoch": 0.782, - "grad_norm": 20.761835302852564, - "kl": 0.396484375, + "grad_norm": 2.203243601671771, + "kl": 0.195556640625, "learning_rate": 2.2412266235313973e-07, - "loss": -0.1212, - "reward": 2.005208373069763, - "reward_std": 0.018042195588350296, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0108, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 782 }, { "clip_ratio": 0.0, - "completion_length": 419.4375, + "completion_length": 412.9583435058594, "epoch": 0.783, - "grad_norm": 5.448229360479665, - "kl": 0.571533203125, + "grad_norm": 2.4466254859091947, + "kl": 0.171875, "learning_rate": 2.230413980630609e-07, - "loss": 0.0445, - "reward": 1.8750000596046448, - "reward_std": 0.18934839963912964, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0147, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 783 }, { "clip_ratio": 0.0, - "completion_length": 548.0208587646484, + "completion_length": 439.8958435058594, "epoch": 0.784, - "grad_norm": 15.37025204894443, - "kl": 1.47265625, + "grad_norm": 4.002690913646053, + "kl": 0.295654296875, "learning_rate": 2.2196411766036487e-07, - "loss": 0.2681, - "reward": 1.3697917461395264, - "reward_std": 0.2829871401190758, - "rewards/accuracy_reward": 0.4583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0801, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 784 }, { "clip_ratio": 0.0, - "completion_length": 524.1250152587891, + "completion_length": 448.16668701171875, "epoch": 0.785, - "grad_norm": 4.217285150765813, - "kl": 1.259033203125, + "grad_norm": 18.272707650400818, + "kl": 0.798828125, "learning_rate": 2.2089083427137329e-07, - "loss": 0.1985, - "reward": 1.90625, - "reward_std": 0.10148052871227264, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.216, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, + "rewards/tag_count_reward": 0.9687500298023224, "step": 785 }, { "clip_ratio": 0.0, - "completion_length": 450.18751525878906, + "completion_length": 504.87501525878906, "epoch": 0.786, - "grad_norm": 12.95383681415401, - "kl": 0.6337890625, + "grad_norm": 4.5120995319694845, + "kl": 0.80712890625, "learning_rate": 2.1982156097370557e-07, - "loss": 0.2129, - "reward": 1.9218750596046448, - "reward_std": 0.1651609167456627, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.1073, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 786 }, { "clip_ratio": 0.0, - "completion_length": 556.0833435058594, + "completion_length": 470.3333435058594, "epoch": 0.787, - "grad_norm": 10.123070320918194, - "kl": 1.146484375, + "grad_norm": 8.290006005097963, + "kl": 0.7265625, "learning_rate": 2.1875631079611956e-07, - "loss": 0.397, - "reward": 1.8958333730697632, - "reward_std": 0.17987383902072906, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0023, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 787 }, { "clip_ratio": 0.0, - "completion_length": 474.7916717529297, + "completion_length": 551.8333740234375, "epoch": 0.788, - "grad_norm": 59.96109238008041, - "kl": 0.58203125, + "grad_norm": 6.163806278693715, + "kl": 0.77197265625, "learning_rate": 2.1769509671835223e-07, - "loss": 0.382, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.1466, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, "rewards/tag_count_reward": 0.9791666865348816, "step": 788 }, { "clip_ratio": 0.0, - "completion_length": 504.66668701171875, + "completion_length": 489.4375, "epoch": 0.789, - "grad_norm": 7.815572673454279, - "kl": 1.296875, + "grad_norm": 15.557960983334132, + "kl": 1.03857421875, "learning_rate": 2.166379316709625e-07, - "loss": 0.2516, - "reward": 1.4791666865348816, - "reward_std": 0.22677228599786758, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.3285, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 789 }, { "clip_ratio": 0.0, - "completion_length": 481.5416717529297, + "completion_length": 477.27085876464844, "epoch": 0.79, - "grad_norm": 18.13324153118153, - "kl": 1.017578125, + "grad_norm": 10.413647380902116, + "kl": 0.8818359375, "learning_rate": 2.1558482853517253e-07, - "loss": 0.5292, - "reward": 1.921875, - "reward_std": 0.20250799879431725, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.2329, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 790 }, { "clip_ratio": 0.0, - "completion_length": 418.25, + "completion_length": 401.9166717529297, "epoch": 0.791, - "grad_norm": 1.0247941528534334, - "kl": 0.219482421875, + "grad_norm": 3.8936337344497334, + "kl": 0.490234375, "learning_rate": 2.1453580014271203e-07, - "loss": 0.0089, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0582, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 791 }, { "clip_ratio": 0.0, - "completion_length": 361.3333435058594, + "completion_length": 406.8333435058594, "epoch": 0.792, - "grad_norm": 3.0229533398948027, - "kl": 0.13916015625, + "grad_norm": 3.8365345601196426, + "kl": 0.2802734375, "learning_rate": 2.134908592756607e-07, - "loss": 0.0196, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0132, + "reward": 1.0, + "reward_std": 0.0, "rewards/tag_count_reward": 1.0, "step": 792 }, { "clip_ratio": 0.0, - "completion_length": 436.7291717529297, + "completion_length": 454.0, "epoch": 0.793, - "grad_norm": 18.3906976488108, - "kl": 0.953125, + "grad_norm": 4.896164030024204, + "kl": 0.490234375, "learning_rate": 2.124500186662932e-07, - "loss": 0.4588, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.068, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 793 }, { "clip_ratio": 0.0, - "completion_length": 492.93751525878906, + "completion_length": 448.1666717529297, "epoch": 0.794, - "grad_norm": 7.771453323186821, - "kl": 1.0390625, + "grad_norm": 7.086922184767771, + "kl": 0.6494140625, "learning_rate": 2.1141329099692406e-07, - "loss": 0.1561, - "reward": 1.7083333730697632, - "reward_std": 0.1779182329773903, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0308, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 794 }, { "clip_ratio": 0.0, - "completion_length": 638.2916870117188, + "completion_length": 475.375, "epoch": 0.795, - "grad_norm": 59.61100557440209, - "kl": 4.12109375, + "grad_norm": 1.7909734564065407, + "kl": 0.15576171875, "learning_rate": 2.1038068889975259e-07, - "loss": 1.012, - "reward": 1.6458333730697632, - "reward_std": 0.5022158622741699, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.0317, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 795 }, { "clip_ratio": 0.0, - "completion_length": 399.6666717529297, + "completion_length": 497.0833435058594, "epoch": 0.796, - "grad_norm": 15.760272223313777, - "kl": 0.4375, + "grad_norm": 12.102755712800246, + "kl": 0.6318359375, "learning_rate": 2.0935222495670968e-07, - "loss": 0.2244, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.1842, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 796 }, { "clip_ratio": 0.0, - "completion_length": 815.5208740234375, + "completion_length": 573.7916870117188, "epoch": 0.797, - "grad_norm": 11.679933547512489, - "kl": 4.65625, + "grad_norm": 12.78765245301671, + "kl": 0.822265625, "learning_rate": 2.0832791169930363e-07, - "loss": 0.501, - "reward": 1.4895833730697632, - "reward_std": 0.4656493365764618, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8229166865348816, + "loss": 0.1833, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 797 }, { "clip_ratio": 0.0, - "completion_length": 602.2500152587891, + "completion_length": 478.3541717529297, "epoch": 0.798, - "grad_norm": 14.854978403567321, - "kl": 2.2578125, + "grad_norm": 1.9918339123953115, + "kl": 0.31591796875, "learning_rate": 2.0730776160846853e-07, - "loss": 0.4897, - "reward": 1.8750000596046448, - "reward_std": 0.15769054368138313, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0254, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 798 }, { "clip_ratio": 0.0, - "completion_length": 640.5625305175781, + "completion_length": 533.3958587646484, "epoch": 0.799, - "grad_norm": 6.275810383361826, - "kl": 3.0751953125, + "grad_norm": 12.82764672497113, + "kl": 0.98779296875, "learning_rate": 2.0629178711441115e-07, - "loss": 0.4361, - "reward": 1.7083333730697632, - "reward_std": 0.31055209040641785, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.2835, + "reward": 0.9687500298023224, + "reward_std": 0.10825317353010178, + "rewards/tag_count_reward": 0.9687500298023224, "step": 799 }, { "clip_ratio": 0.0, - "completion_length": 654.6458435058594, + "completion_length": 481.3958435058594, "epoch": 0.8, - "grad_norm": 7.092189554092769, - "kl": 2.1650390625, + "grad_norm": 3.39840233989013, + "kl": 0.1318359375, "learning_rate": 2.0528000059645995e-07, - "loss": 0.1893, - "reward": 1.6979166865348816, - "reward_std": 0.163921520113945, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0493, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 800 }, { "clip_ratio": 0.0, - "completion_length": 692.4166870117188, + "completion_length": 440.2916717529297, "epoch": 0.801, - "grad_norm": 22.421636074169705, - "kl": 3.359375, + "grad_norm": 1.17039551897165, + "kl": 0.14599609375, "learning_rate": 2.042724143829146e-07, - "loss": 0.1977, - "reward": 1.6510416865348816, - "reward_std": 0.2480478212237358, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083432674408, + "loss": 0.0057, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 801 }, { "clip_ratio": 0.0, - "completion_length": 471.8958435058594, + "completion_length": 441.60418701171875, "epoch": 0.802, - "grad_norm": 16.215249905637823, - "kl": 1.04296875, + "grad_norm": 0.6409417964283548, + "kl": 0.098388671875, "learning_rate": 2.032690407508949e-07, - "loss": 0.0668, - "reward": 1.5208333730697632, - "reward_std": 0.055476587265729904, - "rewards/accuracy_reward": 0.5625, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 802 }, { "clip_ratio": 0.0, - "completion_length": 414.2291717529297, + "completion_length": 438.3125, "epoch": 0.803, - "grad_norm": 11.593000511531278, - "kl": 0.533203125, + "grad_norm": 0.5923060952205192, + "kl": 0.084716796875, "learning_rate": 2.0226989192619204e-07, - "loss": 0.2118, - "reward": 1.9739583730697632, - "reward_std": 0.09021097794175148, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 803 }, { "clip_ratio": 0.0, - "completion_length": 691.2083740234375, + "completion_length": 443.06251525878906, "epoch": 0.804, - "grad_norm": 11.957552251244676, - "kl": 2.46875, + "grad_norm": 1.848745051481109, + "kl": 0.136962890625, "learning_rate": 2.0127498008311922e-07, - "loss": 0.2831, - "reward": 1.7447916865348816, - "reward_std": 0.24248994886875153, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0323, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 804 }, { "clip_ratio": 0.0, - "completion_length": 425.5208435058594, + "completion_length": 420.9166717529297, "epoch": 0.805, - "grad_norm": 6.713224493471999, - "kl": 0.888671875, + "grad_norm": 4.013705556924666, + "kl": 0.34765625, "learning_rate": 2.0028431734436308e-07, - "loss": 0.2372, - "reward": 1.7239583730697632, - "reward_std": 0.30906153470277786, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0543, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 805 }, { "clip_ratio": 0.0, - "completion_length": 760.1041870117188, + "completion_length": 483.97918701171875, "epoch": 0.806, - "grad_norm": 10.984652662962171, - "kl": 3.7734375, + "grad_norm": 3.2232372608499174, + "kl": 0.3173828125, "learning_rate": 1.9929791578083655e-07, - "loss": 0.7889, - "reward": 1.6406250596046448, - "reward_std": 0.37293438613414764, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": 0.0114, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 806 }, { "clip_ratio": 0.0, - "completion_length": 461.7916717529297, + "completion_length": 498.47918701171875, "epoch": 0.807, - "grad_norm": 10.599510437745305, - "kl": 1.21875, + "grad_norm": 5.401087737272918, + "kl": 0.71484375, "learning_rate": 1.9831578741153155e-07, - "loss": 0.3295, - "reward": 1.8854166865348816, - "reward_std": 0.25331807136535645, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.1851, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 807 }, { "clip_ratio": 0.0, - "completion_length": 490.0208435058594, + "completion_length": 440.8333435058594, "epoch": 0.808, - "grad_norm": 6.223489420982448, - "kl": 1.19921875, + "grad_norm": 2.0006096032089475, + "kl": 0.148193359375, "learning_rate": 1.9733794420337213e-07, - "loss": 0.2697, - "reward": 1.9010416865348816, - "reward_std": 0.22727635502815247, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0078, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 808 }, { "clip_ratio": 0.0, - "completion_length": 362.93751525878906, + "completion_length": 394.18751525878906, "epoch": 0.809, - "grad_norm": 5.373605387452282, - "kl": 0.3720703125, + "grad_norm": 0.5031508156464227, + "kl": 0.05029296875, "learning_rate": 1.9636439807106912e-07, - "loss": 0.0168, - "reward": 1.7864583730697632, - "reward_std": 0.176949180662632, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 809 }, { "clip_ratio": 0.0, - "completion_length": 448.33335876464844, + "completion_length": 441.04168701171875, "epoch": 0.81, - "grad_norm": 5.6266109738606644, - "kl": 0.73974609375, + "grad_norm": 0.6605051513844795, + "kl": 0.08056640625, "learning_rate": 1.9539516087697517e-07, - "loss": 0.2143, - "reward": 1.921875, - "reward_std": 0.16881313920021057, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0033, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 810 }, { "clip_ratio": 0.0, - "completion_length": 403.12501525878906, + "completion_length": 422.0625, "epoch": 0.811, - "grad_norm": 9.331811683275411, - "kl": 0.64892578125, + "grad_norm": 3.881878608245351, + "kl": 0.23193359375, "learning_rate": 1.944302444309393e-07, - "loss": 0.2286, - "reward": 1.9635416865348816, - "reward_std": 0.08618465065956116, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0133, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 811 }, { "clip_ratio": 0.0, - "completion_length": 565.7291870117188, + "completion_length": 349.4166717529297, "epoch": 0.812, - "grad_norm": 10.839420100422307, - "kl": 2.130859375, + "grad_norm": 3.3941575923116543, + "kl": 0.067138671875, "learning_rate": 1.934696604901642e-07, - "loss": 0.528, - "reward": 1.7552083730697632, - "reward_std": 0.33489036560058594, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.0356, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 812 }, { "clip_ratio": 0.0, - "completion_length": 376.5625, + "completion_length": 391.625, "epoch": 0.813, - "grad_norm": 2.7936568025493917, - "kl": 0.225830078125, + "grad_norm": 2.354593368951386, + "kl": 0.1708984375, "learning_rate": 1.9251342075906179e-07, - "loss": 0.0012, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0081, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 813 }, { "clip_ratio": 0.0, - "completion_length": 540.5416870117188, + "completion_length": 508.62501525878906, "epoch": 0.814, - "grad_norm": 25.846117412294443, - "kl": 1.064453125, + "grad_norm": 4.684002035921945, + "kl": 0.2940673828125, "learning_rate": 1.915615368891117e-07, - "loss": -0.0598, - "reward": 1.5104166865348816, - "reward_std": 0.1359912659972906, - "rewards/accuracy_reward": 0.5416666865348816, - "rewards/format_reward": 0.02083333395421505, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0818, + "reward": 0.9635416865348816, + "reward_std": 0.07457557320594788, + "rewards/tag_count_reward": 0.9635416865348816, "step": 814 }, { "clip_ratio": 0.0, - "completion_length": 388.2708435058594, + "completion_length": 388.2083435058594, "epoch": 0.815, - "grad_norm": 10.179075425735023, - "kl": 0.45458984375, + "grad_norm": 7.01050645008269, + "kl": 0.387939453125, "learning_rate": 1.9061402047871833e-07, - "loss": 0.242, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, + "loss": 0.0556, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, "rewards/tag_count_reward": 0.9947916865348816, "step": 815 }, { "clip_ratio": 0.0, - "completion_length": 415.3958435058594, + "completion_length": 384.7708435058594, "epoch": 0.816, - "grad_norm": 15.796299729756432, - "kl": 0.826171875, + "grad_norm": 2.2092743578327623, + "kl": 0.06591796875, "learning_rate": 1.8967088307307e-07, - "loss": 0.335, - "reward": 1.953125, - "reward_std": 0.16237976029515266, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0047, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 816 }, { "clip_ratio": 0.0, - "completion_length": 601.5208435058594, + "completion_length": 500.3125305175781, "epoch": 0.817, - "grad_norm": 10.197635027392812, - "kl": 2.78125, + "grad_norm": 0.5326382898866008, + "kl": 0.0927734375, "learning_rate": 1.887321361639985e-07, - "loss": 0.4657, - "reward": 1.7812500596046448, - "reward_std": 0.30305100977420807, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0037, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 817 }, { "clip_ratio": 0.0, - "completion_length": 572.6041717529297, + "completion_length": 463.35418701171875, "epoch": 0.818, - "grad_norm": 15.035395691495074, - "kl": 3.78125, + "grad_norm": 23.222843148752947, + "kl": 1.390625, "learning_rate": 1.8779779118983867e-07, - "loss": 0.642, - "reward": 1.5781250596046448, - "reward_std": 0.33661970496177673, - "rewards/accuracy_reward": 0.6666666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.3004, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 818 }, { "clip_ratio": 0.0, - "completion_length": 548.5000305175781, + "completion_length": 454.2500305175781, "epoch": 0.819, - "grad_norm": 9.166682317132233, - "kl": 2.8984375, + "grad_norm": 2.7247414932960843, + "kl": 0.179443359375, "learning_rate": 1.8686785953528922e-07, - "loss": 0.4114, - "reward": 1.7239583730697632, - "reward_std": 0.4167061299085617, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.008, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 819 }, { "clip_ratio": 0.0, - "completion_length": 414.04168701171875, + "completion_length": 405.9583435058594, "epoch": 0.82, - "grad_norm": 34.78801565486368, - "kl": 1.262939453125, + "grad_norm": 2.4711903035532443, + "kl": 0.107177734375, "learning_rate": 1.8594235253127372e-07, - "loss": 0.4179, - "reward": 1.984375, - "reward_std": 0.03884884715080261, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0236, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 820 }, { "clip_ratio": 0.0, - "completion_length": 551.6458587646484, + "completion_length": 514.6875, "epoch": 0.821, - "grad_norm": 10.01037115756937, - "kl": 2.025390625, + "grad_norm": 6.694991948872537, + "kl": 0.3466796875, "learning_rate": 1.850212814548031e-07, - "loss": 0.2498, - "reward": 1.765625, - "reward_std": 0.15568077564239502, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.1343, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 821 }, { "clip_ratio": 0.0, - "completion_length": 855.6041870117188, + "completion_length": 404.37501525878906, "epoch": 0.822, - "grad_norm": 27.294904036367736, - "kl": 6.21875, + "grad_norm": 0.40739460926817, + "kl": 0.0841064453125, "learning_rate": 1.8410465752883758e-07, - "loss": 0.6579, - "reward": 1.2708333730697632, - "reward_std": 0.43975476920604706, - "rewards/accuracy_reward": 0.5000000149011612, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7708333432674408, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 822 }, { "clip_ratio": 0.0, - "completion_length": 478.2708435058594, + "completion_length": 476.64585876464844, "epoch": 0.823, - "grad_norm": 12.709642111430005, - "kl": 1.65087890625, + "grad_norm": 11.307479964352174, + "kl": 0.95556640625, "learning_rate": 1.8319249192215055e-07, - "loss": 0.4573, - "reward": 1.8802083730697632, - "reward_std": 0.32728417217731476, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.1691, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 823 }, { "clip_ratio": 0.0, - "completion_length": 436.2708435058594, + "completion_length": 405.79168701171875, "epoch": 0.824, - "grad_norm": 12.250211655601001, - "kl": 0.900390625, + "grad_norm": 0.8410365687756945, + "kl": 0.095458984375, "learning_rate": 1.822847957491922e-07, - "loss": 0.325, - "reward": 1.9166666865348816, - "reward_std": 0.21813733875751495, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0045, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 824 }, { "clip_ratio": 0.0, - "completion_length": 453.0833435058594, + "completion_length": 452.5416717529297, "epoch": 0.825, - "grad_norm": 12.43364332178179, - "kl": 1.83984375, + "grad_norm": 5.302986400500268, + "kl": 0.41162109375, "learning_rate": 1.8138158006995363e-07, - "loss": 0.4023, - "reward": 1.8854166865348816, - "reward_std": 0.20827918872237206, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.164, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 825 }, { "clip_ratio": 0.0, - "completion_length": 491.39585876464844, + "completion_length": 511.66668701171875, "epoch": 0.826, - "grad_norm": 4.1635399271298486, - "kl": 1.0009765625, + "grad_norm": 21.20486140542435, + "kl": 2.265625, "learning_rate": 1.804828558898332e-07, - "loss": 0.1676, - "reward": 1.96875, - "reward_std": 0.0625, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.3956, + "reward": 0.9427083432674408, + "reward_std": 0.1291983723640442, + "rewards/tag_count_reward": 0.9427083432674408, "step": 826 }, { "clip_ratio": 0.0, - "completion_length": 476.8333435058594, + "completion_length": 439.2708435058594, "epoch": 0.827, - "grad_norm": 29.78711978021362, - "kl": 1.69140625, + "grad_norm": 0.33843770005717105, + "kl": 0.0599365234375, "learning_rate": 1.7958863415950112e-07, - "loss": 0.6112, - "reward": 1.7187500596046448, - "reward_std": 0.4132007360458374, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 827 }, { "clip_ratio": 0.0, - "completion_length": 465.9583435058594, + "completion_length": 513.2500152587891, "epoch": 0.828, - "grad_norm": 8.71138636681152, - "kl": 0.9208984375, + "grad_norm": 3.3190965305319784, + "kl": 0.54541015625, "learning_rate": 1.7869892577476722e-07, - "loss": 0.1821, - "reward": 1.9687500596046448, - "reward_std": 0.09649687260389328, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0249, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 828 }, { "clip_ratio": 0.0, - "completion_length": 561.2083587646484, + "completion_length": 455.6875, "epoch": 0.829, - "grad_norm": 21.782186202689566, - "kl": 3.39453125, + "grad_norm": 2.3840650360632987, + "kl": 0.14794921875, "learning_rate": 1.7781374157644713e-07, - "loss": 0.4503, - "reward": 1.875, - "reward_std": 0.17475633323192596, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.0412, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 829 }, { "clip_ratio": 0.0, - "completion_length": 379.0208435058594, + "completion_length": 401.0208435058594, "epoch": 0.83, - "grad_norm": 0.5769139807884268, - "kl": 0.1279296875, + "grad_norm": 2.090446063492981, + "kl": 0.228515625, "learning_rate": 1.7693309235023127e-07, - "loss": 0.0053, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.035, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 830 }, { "clip_ratio": 0.0, - "completion_length": 676.8125152587891, + "completion_length": 486.6458435058594, "epoch": 0.831, - "grad_norm": 24.597699285703715, - "kl": 4.888671875, + "grad_norm": 3.0077010271823728, + "kl": 0.3515625, "learning_rate": 1.7605698882655233e-07, - "loss": 0.5869, - "reward": 1.7656250596046448, - "reward_std": 0.25207727029919624, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0079, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 831 }, { "clip_ratio": 0.0, - "completion_length": 439.5625, + "completion_length": 356.75001525878906, "epoch": 0.832, - "grad_norm": 9.356841804065343, - "kl": 1.77978515625, + "grad_norm": 4.850165404763736, + "kl": 0.256591796875, "learning_rate": 1.7518544168045524e-07, - "loss": 0.3146, - "reward": 1.828125, - "reward_std": 0.1809951364994049, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0276, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 832 }, { "clip_ratio": 0.0, - "completion_length": 496.1041717529297, + "completion_length": 447.54168701171875, "epoch": 0.833, - "grad_norm": 9.53708173653762, - "kl": 1.98388671875, + "grad_norm": 0.7953158234930094, + "kl": 0.11376953125, "learning_rate": 1.743184615314671e-07, - "loss": 0.2442, - "reward": 1.8020833730697632, - "reward_std": 0.14310213923454285, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0045, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 833 }, { "clip_ratio": 0.0, - "completion_length": 474.25, + "completion_length": 434.4583435058594, "epoch": 0.834, - "grad_norm": 16.1911003931491, - "kl": 2.2109375, + "grad_norm": 8.22832980491481, + "kl": 0.437744140625, "learning_rate": 1.7345605894346726e-07, - "loss": 0.5803, - "reward": 1.8437500596046448, - "reward_std": 0.3226669132709503, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.1007, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 834 }, { "clip_ratio": 0.0, - "completion_length": 442.68751525878906, + "completion_length": 465.3958435058594, "epoch": 0.835, - "grad_norm": 12.473848302644043, - "kl": 1.08984375, + "grad_norm": 2.144733225418256, + "kl": 0.396240234375, "learning_rate": 1.7259824442455923e-07, - "loss": 0.197, - "reward": 1.8958333730697632, - "reward_std": 0.22079525142908096, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0258, + "reward": 0.96875, + "reward_std": 0.04985775053501129, + "rewards/tag_count_reward": 0.96875, "step": 835 }, { "clip_ratio": 0.0, - "completion_length": 334.06251525878906, + "completion_length": 363.1666717529297, "epoch": 0.836, - "grad_norm": 0.8040340947522091, - "kl": 0.14208984375, + "grad_norm": 0.14361266387976063, + "kl": 0.044677734375, "learning_rate": 1.7174502842694212e-07, - "loss": 0.0057, - "reward": 2.0, + "loss": 0.0019, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 836 }, { "clip_ratio": 0.0, - "completion_length": 660.6875, + "completion_length": 519.4166870117188, "epoch": 0.837, - "grad_norm": 17.6393130534593, - "kl": 3.7109375, + "grad_norm": 2.665693523346552, + "kl": 0.15869140625, "learning_rate": 1.7089642134678364e-07, - "loss": 0.7102, - "reward": 1.6822917461395264, - "reward_std": 0.3129502236843109, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": -0.015, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 837 }, { "clip_ratio": 0.0, - "completion_length": 667.5000305175781, + "completion_length": 448.4791717529297, "epoch": 0.838, - "grad_norm": 40.69298460282286, - "kl": 4.6171875, + "grad_norm": 1.2240220774373272, + "kl": 0.126220703125, "learning_rate": 1.7005243352409333e-07, - "loss": 0.5095, - "reward": 1.5520833730697632, - "reward_std": 0.3984503895044327, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0057, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 838 }, { "clip_ratio": 0.0, - "completion_length": 732.2083435058594, + "completion_length": 452.6875, "epoch": 0.839, - "grad_norm": 13.465857114273433, - "kl": 5.0, + "grad_norm": 1.5444835787344973, + "kl": 0.1484375, "learning_rate": 1.6921307524259625e-07, - "loss": 0.7938, - "reward": 1.453125, - "reward_std": 0.4194927215576172, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8697916865348816, + "loss": 0.0167, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 839 }, { "clip_ratio": 0.0, - "completion_length": 502.5833435058594, + "completion_length": 472.7083435058594, "epoch": 0.84, - "grad_norm": 13.338432547369223, - "kl": 1.6171875, + "grad_norm": 2.461122633133082, + "kl": 0.33251953125, "learning_rate": 1.6837835672960831e-07, - "loss": 0.1181, - "reward": 1.75, - "reward_std": 0.23834703117609024, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0258, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 840 }, { "clip_ratio": 0.0, - "completion_length": 604.3333435058594, + "completion_length": 459.8333435058594, "epoch": 0.841, - "grad_norm": 21.809910975717514, - "kl": 3.04296875, + "grad_norm": 4.059867011479376, + "kl": 0.373779296875, "learning_rate": 1.6754828815591131e-07, - "loss": 0.7128, - "reward": 1.8072917461395264, - "reward_std": 0.24911796301603317, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8906250298023224, + "loss": 0.1308, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 841 }, { "clip_ratio": 0.0, - "completion_length": 419.8958435058594, + "completion_length": 466.0833435058594, "epoch": 0.842, - "grad_norm": 9.375291072934692, - "kl": 0.9140625, + "grad_norm": 13.052029701951339, + "kl": 0.919921875, "learning_rate": 1.6672287963562852e-07, - "loss": 0.195, - "reward": 1.8750000596046448, - "reward_std": 0.29994751513004303, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": 0.2405, + "reward": 0.9635416865348816, + "reward_std": 0.09432543441653252, + "rewards/tag_count_reward": 0.9635416865348816, "step": 842 }, { "clip_ratio": 0.0, - "completion_length": 413.4583435058594, + "completion_length": 417.72918701171875, "epoch": 0.843, - "grad_norm": 13.044639569722202, - "kl": 0.70703125, + "grad_norm": 2.1216607471714966, + "kl": 0.21337890625, "learning_rate": 1.659021412261026e-07, - "loss": 0.2289, - "reward": 1.7135416865348816, - "reward_std": 0.09573988988995552, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9843750298023224, + "loss": 0.0097, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 843 }, { "clip_ratio": 0.0, - "completion_length": 581.125, + "completion_length": 534.6875305175781, "epoch": 0.844, - "grad_norm": 17.01242607550207, - "kl": 2.09765625, + "grad_norm": 34.203907822986416, + "kl": 1.27734375, "learning_rate": 1.6508608292777203e-07, - "loss": 0.5915, - "reward": 1.7395833730697632, - "reward_std": 0.27039821445941925, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.1112, + "reward": 0.96875, + "reward_std": 0.05653337761759758, + "rewards/tag_count_reward": 0.96875, "step": 844 }, { "clip_ratio": 0.0, - "completion_length": 446.2916717529297, + "completion_length": 452.5208435058594, "epoch": 0.845, - "grad_norm": 25.42754929810113, - "kl": 0.884765625, + "grad_norm": 3.0906941460379644, + "kl": 0.2978515625, "learning_rate": 1.6427471468404952e-07, - "loss": 0.4499, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.014, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 845 }, { "clip_ratio": 0.0, - "completion_length": 449.0, + "completion_length": 472.72918701171875, "epoch": 0.846, - "grad_norm": 12.076830805883889, - "kl": 0.53271484375, + "grad_norm": 1.7592547700862164, + "kl": 0.3369140625, "learning_rate": 1.6346804638120098e-07, - "loss": 0.1946, - "reward": 1.9895833730697632, + "loss": 0.007, + "reward": 0.9895833432674408, "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 0.9895833432674408, "step": 846 }, { "clip_ratio": 0.0, - "completion_length": 426.2708435058594, + "completion_length": 465.62501525878906, "epoch": 0.847, - "grad_norm": 7.128662467421689, - "kl": 1.175537109375, + "grad_norm": 1.8111958992652284, + "kl": 0.093017578125, "learning_rate": 1.6266608784822542e-07, - "loss": 0.2181, - "reward": 1.90625, - "reward_std": 0.18331214785575867, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0012, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 847 }, { "clip_ratio": 0.0, - "completion_length": 453.7916717529297, + "completion_length": 494.85418701171875, "epoch": 0.848, - "grad_norm": 8.847459783648889, - "kl": 0.548828125, + "grad_norm": 2.2650504734811405, + "kl": 0.34521484375, "learning_rate": 1.6186884885673413e-07, - "loss": 0.156, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": -0.004, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 848 }, { "clip_ratio": 0.0, - "completion_length": 421.22918701171875, + "completion_length": 428.0625, "epoch": 0.849, - "grad_norm": 15.622644052186043, - "kl": 0.9326171875, + "grad_norm": 2.072160002401503, + "kl": 0.065673828125, "learning_rate": 1.610763391208329e-07, - "loss": 0.3653, - "reward": 1.921875, - "reward_std": 0.1829466074705124, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.016, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 849 }, { "clip_ratio": 0.0, - "completion_length": 574.4166717529297, + "completion_length": 431.9791717529297, "epoch": 0.85, - "grad_norm": 27.82210524236325, - "kl": 3.03125, + "grad_norm": 3.2160210643606466, + "kl": 0.185546875, "learning_rate": 1.6028856829700258e-07, - "loss": 0.5063, - "reward": 1.828125, - "reward_std": 0.36519497632980347, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0443, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 850 }, { "clip_ratio": 0.0, - "completion_length": 439.6666717529297, + "completion_length": 471.625, "epoch": 0.851, - "grad_norm": 17.737456242269808, - "kl": 0.8125, + "grad_norm": 1.6513631639591588, + "kl": 0.24169921875, "learning_rate": 1.5950554598398228e-07, - "loss": 0.224, - "reward": 1.9270833730697632, - "reward_std": 0.20556553453207016, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, + "loss": 0.0202, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, "rewards/tag_count_reward": 0.9895833432674408, "step": 851 }, { "clip_ratio": 0.0, - "completion_length": 646.9375305175781, + "completion_length": 426.5833435058594, "epoch": 0.852, - "grad_norm": 54.5471325400904, - "kl": 5.09375, + "grad_norm": 0.45549423537189326, + "kl": 0.081298828125, "learning_rate": 1.5872728172265146e-07, - "loss": 0.7369, - "reward": 1.5520833730697632, - "reward_std": 0.4047369360923767, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 852 }, { "clip_ratio": 0.0, - "completion_length": 540.5625152587891, + "completion_length": 447.375, "epoch": 0.853, - "grad_norm": 30.106508651874115, - "kl": 2.77734375, + "grad_norm": 0.656777240615989, + "kl": 0.079345703125, "learning_rate": 1.579537849959148e-07, - "loss": 0.1888, - "reward": 1.7708333730697632, - "reward_std": 0.15758132189512253, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 853 }, { "clip_ratio": 0.0, - "completion_length": 523.4166870117188, + "completion_length": 466.2291717529297, "epoch": 0.854, - "grad_norm": 11.040360695362542, - "kl": 2.404296875, + "grad_norm": 2.2810028065560046, + "kl": 0.2255859375, "learning_rate": 1.5718506522858572e-07, - "loss": 0.4808, - "reward": 1.75, - "reward_std": 0.40676697343587875, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0356, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 854 }, { "clip_ratio": 0.0, - "completion_length": 454.43751525878906, + "completion_length": 383.62501525878906, "epoch": 0.855, - "grad_norm": 20.954097171546355, - "kl": 2.66796875, + "grad_norm": 1.6475080603537837, + "kl": 0.099853515625, "learning_rate": 1.5642113178727193e-07, - "loss": 0.319, - "reward": 1.828125, - "reward_std": 0.14868026971817017, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0046, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 855 }, { "clip_ratio": 0.0, - "completion_length": 417.91668701171875, + "completion_length": 438.8958435058594, "epoch": 0.856, - "grad_norm": 15.900808782689902, - "kl": 2.02197265625, + "grad_norm": 5.468548790410516, + "kl": 0.2724609375, "learning_rate": 1.5566199398026147e-07, - "loss": 0.2125, - "reward": 1.8958333730697632, - "reward_std": 0.17856809496879578, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0359, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 856 }, { "clip_ratio": 0.0, - "completion_length": 567.375, + "completion_length": 452.0416717529297, "epoch": 0.857, - "grad_norm": 46.760570722727955, - "kl": 3.9375, + "grad_norm": 0.44153691701559905, + "kl": 0.08203125, "learning_rate": 1.5490766105740876e-07, - "loss": 0.3631, - "reward": 1.6302083730697632, - "reward_std": 0.23975825309753418, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9010416865348816, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 857 }, { "clip_ratio": 0.0, - "completion_length": 543.1041870117188, + "completion_length": 423.1458435058594, "epoch": 0.858, - "grad_norm": 76.25935168085778, - "kl": 4.6728515625, + "grad_norm": 0.26511660327498243, + "kl": 0.0577392578125, "learning_rate": 1.5415814221002265e-07, - "loss": 0.4013, - "reward": 1.7291666865348816, - "reward_std": 0.3407784253358841, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0023, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 858 }, { "clip_ratio": 0.0, - "completion_length": 517.1875152587891, + "completion_length": 476.5833435058594, "epoch": 0.859, - "grad_norm": 12.873109355236965, - "kl": 2.5546875, + "grad_norm": 3.1492680359957843, + "kl": 0.18505859375, "learning_rate": 1.5341344657075354e-07, - "loss": 0.5041, - "reward": 1.7916667461395264, - "reward_std": 0.46449391543865204, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0432, + "reward": 0.9791666865348816, + "reward_std": 0.05689104273915291, + "rewards/tag_count_reward": 0.9791666865348816, "step": 859 }, { "clip_ratio": 0.0, - "completion_length": 511.27085876464844, + "completion_length": 540.2083435058594, "epoch": 0.86, - "grad_norm": 13.180712526003964, - "kl": 2.2265625, + "grad_norm": 3.7943706513929647, + "kl": 0.2841796875, "learning_rate": 1.5267358321348285e-07, - "loss": 0.4, - "reward": 1.8958333730697632, - "reward_std": 0.2564418688416481, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375000298023224, + "loss": 0.0567, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 860 }, { "clip_ratio": 0.0, - "completion_length": 437.6875, + "completion_length": 497.3125, "epoch": 0.861, - "grad_norm": 19.345289784315373, - "kl": 1.71875, + "grad_norm": 0.5809798704119035, + "kl": 0.089111328125, "learning_rate": 1.5193856115321224e-07, - "loss": 0.2868, - "reward": 1.9322916865348816, - "reward_std": 0.19443782418966293, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 861 }, { "clip_ratio": 0.0, - "completion_length": 521.4791717529297, + "completion_length": 413.5208435058594, "epoch": 0.862, - "grad_norm": 10.797697948618687, - "kl": 2.3779296875, + "grad_norm": 0.2934335842153366, + "kl": 0.0626220703125, "learning_rate": 1.5120838934595337e-07, - "loss": 0.1819, - "reward": 1.890625, - "reward_std": 0.10701595991849899, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 862 }, { "clip_ratio": 0.0, - "completion_length": 528.2083435058594, + "completion_length": 470.3333435058594, "epoch": 0.863, - "grad_norm": 11.74105775433942, - "kl": 2.5283203125, + "grad_norm": 1.5502893810310887, + "kl": 0.068115234375, "learning_rate": 1.5048307668861947e-07, - "loss": 0.4643, - "reward": 1.8854166865348816, - "reward_std": 0.24758216738700867, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": -0.0, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 863 }, { "clip_ratio": 0.0, - "completion_length": 539.1875, + "completion_length": 416.5416717529297, "epoch": 0.864, - "grad_norm": 23.36833337000619, - "kl": 2.751953125, + "grad_norm": 0.558178976097742, + "kl": 0.080810546875, "learning_rate": 1.4976263201891613e-07, - "loss": 0.2596, - "reward": 1.5833333730697632, - "reward_std": 0.2119756042957306, - "rewards/accuracy_reward": 0.6666666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0036, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 864 }, { "clip_ratio": 0.0, - "completion_length": 696.9791870117188, + "completion_length": 441.29168701171875, "epoch": 0.865, - "grad_norm": 22.44324840268027, - "kl": 4.8203125, + "grad_norm": 3.341005370129586, + "kl": 0.14990234375, "learning_rate": 1.4904706411523448e-07, - "loss": 0.7422, - "reward": 1.6822916865348816, - "reward_std": 0.47188854217529297, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": -0.0057, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 865 }, { "clip_ratio": 0.0, - "completion_length": 366.12501525878906, + "completion_length": 467.2291717529297, "epoch": 0.866, - "grad_norm": 2.2295244287514624, - "kl": 0.27734375, + "grad_norm": 5.232549834844898, + "kl": 0.2735595703125, "learning_rate": 1.483363816965435e-07, - "loss": 0.0119, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.1932, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 866 }, { "clip_ratio": 0.0, - "completion_length": 449.0833435058594, + "completion_length": 455.97918701171875, "epoch": 0.867, - "grad_norm": 10.933949291056058, - "kl": 1.212890625, + "grad_norm": 3.482765515194214, + "kl": 0.0701904296875, "learning_rate": 1.4763059342228434e-07, - "loss": 0.2937, - "reward": 1.9427083730697632, - "reward_std": 0.15781326591968536, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0014, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 867 }, { "clip_ratio": 0.0, - "completion_length": 445.60418701171875, + "completion_length": 437.3125, "epoch": 0.868, - "grad_norm": 16.89676372758348, - "kl": 1.230224609375, + "grad_norm": 3.6447899326522513, + "kl": 0.25732421875, "learning_rate": 1.469297078922642e-07, - "loss": 0.3921, - "reward": 1.859375, - "reward_std": 0.27307260036468506, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.038, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 868 }, { "clip_ratio": 0.0, - "completion_length": 420.37501525878906, + "completion_length": 392.68751525878906, "epoch": 0.869, - "grad_norm": 9.979243305418649, - "kl": 1.10205078125, + "grad_norm": 2.4560052168914495, + "kl": 0.0963134765625, "learning_rate": 1.4623373364655223e-07, - "loss": 0.3988, - "reward": 1.9479166865348816, - "reward_std": 0.12452562153339386, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": -0.0033, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 869 }, { "clip_ratio": 0.0, - "completion_length": 426.1666717529297, + "completion_length": 423.3958435058594, "epoch": 0.87, - "grad_norm": 15.105806389434207, - "kl": 0.84375, + "grad_norm": 0.29718128547812317, + "kl": 0.04736328125, "learning_rate": 1.4554267916537495e-07, - "loss": 0.0444, - "reward": 1.5208333730697632, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.5416666679084301, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 870 }, { "clip_ratio": 0.0, - "completion_length": 413.35418701171875, + "completion_length": 431.12501525878906, "epoch": 0.871, - "grad_norm": 3.4338103601291023, - "kl": 0.515625, + "grad_norm": 4.346343773847815, + "kl": 0.19873046875, "learning_rate": 1.448565528690129e-07, - "loss": 0.0713, - "reward": 1.9739583730697632, - "reward_std": 0.07278125733137131, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0173, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 871 }, { "clip_ratio": 0.0, - "completion_length": 483.66668701171875, + "completion_length": 400.8333435058594, "epoch": 0.872, - "grad_norm": 9.026083041526881, - "kl": 1.36328125, + "grad_norm": 0.3860896818957767, + "kl": 0.05810546875, "learning_rate": 1.4417536311769885e-07, - "loss": 0.1722, - "reward": 1.7239583730697632, - "reward_std": 0.1843368038535118, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0027, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 872 }, { "clip_ratio": 0.0, - "completion_length": 413.18751525878906, + "completion_length": 453.75001525878906, "epoch": 0.873, - "grad_norm": 4.833114467964448, - "kl": 0.734375, + "grad_norm": 2.6455563018916184, + "kl": 0.0634765625, "learning_rate": 1.4349911821151462e-07, - "loss": 0.1148, - "reward": 1.7395833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0052, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 873 }, { "clip_ratio": 0.0, - "completion_length": 451.89585876464844, + "completion_length": 462.4791717529297, "epoch": 0.874, - "grad_norm": 5.119723718848547, - "kl": 0.724609375, + "grad_norm": 5.504596501408147, + "kl": 0.4541015625, "learning_rate": 1.4282782639029128e-07, - "loss": 0.1015, - "reward": 1.9010416865348816, - "reward_std": 0.13870561122894287, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.003, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 874 }, { "clip_ratio": 0.0, - "completion_length": 588.5833435058594, + "completion_length": 469.7500305175781, "epoch": 0.875, - "grad_norm": 24.091411402326845, - "kl": 2.6171875, + "grad_norm": 3.488122009899489, + "kl": 0.1461181640625, "learning_rate": 1.4216149583350755e-07, - "loss": 0.6931, - "reward": 1.6979166865348816, - "reward_std": 0.37922197580337524, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0022, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 875 }, { "clip_ratio": 0.0, - "completion_length": 583.4583587646484, + "completion_length": 467.31251525878906, "epoch": 0.876, - "grad_norm": 9.87322300886677, - "kl": 2.7734375, + "grad_norm": 2.652800552168764, + "kl": 0.21240234375, "learning_rate": 1.4150013466019114e-07, - "loss": 0.4683, - "reward": 1.5989583730697632, - "reward_std": 0.3443516790866852, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.0464, + "reward": 0.9791666865348816, + "reward_std": 0.04865618050098419, + "rewards/tag_count_reward": 0.9791666865348816, "step": 876 }, { "clip_ratio": 0.0, - "completion_length": 363.0833435058594, + "completion_length": 405.1041717529297, "epoch": 0.877, - "grad_norm": 5.871176631143807, - "kl": 0.1904296875, + "grad_norm": 2.384109778280924, + "kl": 0.12841796875, "learning_rate": 1.4084375092881917e-07, - "loss": 0.0314, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0122, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 877 }, { "clip_ratio": 0.0, - "completion_length": 629.0625, + "completion_length": 493.81251525878906, "epoch": 0.878, - "grad_norm": 15.569073110301932, - "kl": 3.1484375, + "grad_norm": 1.833720375747527, + "kl": 0.16552734375, "learning_rate": 1.4019235263722034e-07, - "loss": 0.5262, - "reward": 1.8645833730697632, - "reward_std": 0.23833908885717392, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0264, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 878 }, { "clip_ratio": 0.0, - "completion_length": 584.6458587646484, + "completion_length": 463.2916717529297, "epoch": 0.879, - "grad_norm": 23.095914353973114, - "kl": 2.900390625, + "grad_norm": 7.251525232629622, + "kl": 0.595703125, "learning_rate": 1.395459477224772e-07, - "loss": 0.7885, - "reward": 1.7968750596046448, - "reward_std": 0.34015803039073944, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8802083432674408, + "loss": 0.0301, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 879 }, { "clip_ratio": 0.0, - "completion_length": 384.3958435058594, + "completion_length": 452.0208435058594, "epoch": 0.88, - "grad_norm": 6.718359472177041, - "kl": 0.3994140625, + "grad_norm": 1.0364271720065523, + "kl": 0.122802734375, "learning_rate": 1.3890454406082956e-07, - "loss": 0.0871, - "reward": 1.9739583730697632, - "reward_std": 0.09021097794175148, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0057, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 880 }, { "clip_ratio": 0.0, - "completion_length": 397.3333435058594, + "completion_length": 412.5833435058594, "epoch": 0.881, - "grad_norm": 14.172598422527075, - "kl": 0.7939453125, + "grad_norm": 9.909144868862827, + "kl": 0.8623046875, "learning_rate": 1.3826814946757888e-07, - "loss": 0.2658, - "reward": 1.9583333730697632, - "reward_std": 0.10202578455209732, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0001, + "reward": 0.9895833730697632, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833730697632, "step": 881 }, { "clip_ratio": 0.0, - "completion_length": 431.9375, + "completion_length": 499.16668701171875, "epoch": 0.882, - "grad_norm": 4.2235129287881366, - "kl": 0.5380859375, + "grad_norm": 11.477329590363407, + "kl": 0.501953125, "learning_rate": 1.3763677169699217e-07, - "loss": 0.0252, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.337, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 882 }, { "clip_ratio": 0.0, - "completion_length": 384.5416717529297, + "completion_length": 416.22918701171875, "epoch": 0.883, - "grad_norm": 4.306578231326638, - "kl": 0.67236328125, + "grad_norm": 2.2472709963991817, + "kl": 0.16015625, "learning_rate": 1.370104184422085e-07, - "loss": 0.0895, - "reward": 1.9635416865348816, - "reward_std": 0.12629537284374237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0062, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 883 }, { "clip_ratio": 0.0, - "completion_length": 499.1875305175781, + "completion_length": 541.6666870117188, "epoch": 0.884, - "grad_norm": 19.75935460962796, - "kl": 1.8193359375, + "grad_norm": 5.7416781883628625, + "kl": 0.939453125, "learning_rate": 1.3638909733514452e-07, - "loss": 0.1092, - "reward": 1.7916666865348816, - "reward_std": 0.08141736686229706, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.1641, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 884 }, { "clip_ratio": 0.0, - "completion_length": 503.7916717529297, + "completion_length": 419.1458435058594, "epoch": 0.885, - "grad_norm": 16.028117617035686, - "kl": 2.009765625, + "grad_norm": 2.5819552151406615, + "kl": 0.236328125, "learning_rate": 1.3577281594640182e-07, - "loss": 0.191, - "reward": 1.8489583730697632, - "reward_std": 0.2143372893333435, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0105, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 885 }, { "clip_ratio": 0.0, - "completion_length": 455.875, + "completion_length": 390.0833435058594, "epoch": 0.886, - "grad_norm": 19.34957246386415, - "kl": 1.66015625, + "grad_norm": 2.2738960276281825, + "kl": 0.17041015625, "learning_rate": 1.351615817851748e-07, - "loss": 0.5261, - "reward": 1.9166667461395264, - "reward_std": 0.2552907392382622, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.0277, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 886 }, { "clip_ratio": 0.0, - "completion_length": 533.4583435058594, + "completion_length": 453.18751525878906, "epoch": 0.887, - "grad_norm": 28.791270831640194, - "kl": 3.03515625, + "grad_norm": 2.318452927213316, + "kl": 0.369140625, "learning_rate": 1.345554022991586e-07, - "loss": 0.5317, - "reward": 1.8020833730697632, - "reward_std": 0.24154765531420708, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.0422, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 887 }, { "clip_ratio": 0.0, - "completion_length": 519.5625152587891, + "completion_length": 440.0416717529297, "epoch": 0.888, - "grad_norm": 36.054010239539075, - "kl": 3.349365234375, + "grad_norm": 0.9147712689098717, + "kl": 0.130615234375, "learning_rate": 1.3395428487445914e-07, - "loss": 0.2319, - "reward": 1.84375, - "reward_std": 0.1291911005973816, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 888 }, { "clip_ratio": 0.0, - "completion_length": 443.35418701171875, + "completion_length": 360.93751525878906, "epoch": 0.889, - "grad_norm": 24.928423460233837, - "kl": 1.230712890625, + "grad_norm": 2.337766683078408, + "kl": 0.178466796875, "learning_rate": 1.3335823683550237e-07, - "loss": 0.3925, - "reward": 1.9166666865348816, - "reward_std": 0.15386436879634857, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0235, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 889 }, { "clip_ratio": 0.0, - "completion_length": 437.5833435058594, + "completion_length": 454.56251525878906, "epoch": 0.89, - "grad_norm": 6.067298175387683, - "kl": 0.9296875, + "grad_norm": 1.726684676479182, + "kl": 0.2177734375, "learning_rate": 1.3276726544494571e-07, - "loss": 0.2063, - "reward": 1.9583333730697632, - "reward_std": 0.09731236100196838, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0089, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 890 }, { "clip_ratio": 0.0, - "completion_length": 545.9375, + "completion_length": 494.18751525878906, "epoch": 0.891, - "grad_norm": 18.626142163111073, - "kl": 3.71875, + "grad_norm": 7.304317674135185, + "kl": 0.42041015625, "learning_rate": 1.3218137790358892e-07, - "loss": 0.6902, - "reward": 1.8645833730697632, - "reward_std": 0.284028522670269, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.1586, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 891 }, { "clip_ratio": 0.0, - "completion_length": 455.10418701171875, + "completion_length": 448.56251525878906, "epoch": 0.892, - "grad_norm": 16.317481328285602, - "kl": 1.50927734375, + "grad_norm": 0.442336781880095, + "kl": 0.08056640625, "learning_rate": 1.316005813502869e-07, - "loss": 0.472, - "reward": 1.90625, - "reward_std": 0.265943706035614, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 892 }, { "clip_ratio": 0.0, - "completion_length": 667.0833435058594, + "completion_length": 410.66668701171875, "epoch": 0.893, - "grad_norm": 14.808109280731902, - "kl": 4.3984375, + "grad_norm": 0.5853659596469007, + "kl": 0.0888671875, "learning_rate": 1.3102488286186234e-07, - "loss": 0.6169, - "reward": 1.7083333730697632, - "reward_std": 0.38184280693531036, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333730697632, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 893 }, { "clip_ratio": 0.0, - "completion_length": 366.75001525878906, + "completion_length": 423.5208435058594, "epoch": 0.894, - "grad_norm": 6.56350379557414, - "kl": 0.40625, + "grad_norm": 9.054535838277454, + "kl": 0.546875, "learning_rate": 1.3045428945301953e-07, - "loss": 0.0803, - "reward": 1.9791666865348816, - "reward_std": 0.07216878235340118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0423, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 894 }, { "clip_ratio": 0.0, - "completion_length": 432.1875, + "completion_length": 359.9583435058594, "epoch": 0.895, - "grad_norm": 15.471222474630176, - "kl": 1.77685546875, + "grad_norm": 4.654534141804648, + "kl": 0.08447265625, "learning_rate": 1.2988880807625927e-07, - "loss": 0.1978, - "reward": 1.8802083730697632, - "reward_std": 0.1175578162074089, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9427083432674408, + "loss": 0.0139, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 895 }, { "clip_ratio": 0.0, - "completion_length": 356.9791717529297, + "completion_length": 418.7083435058594, "epoch": 0.896, - "grad_norm": 1.2767526382040946, - "kl": 0.173828125, + "grad_norm": 4.214225314069966, + "kl": 0.294677734375, "learning_rate": 1.2932844562179352e-07, - "loss": 0.0077, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0797, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 896 }, { "clip_ratio": 0.0, - "completion_length": 471.00001525878906, + "completion_length": 398.3958435058594, "epoch": 0.897, - "grad_norm": 14.911287345066016, - "kl": 1.835205078125, + "grad_norm": 2.90752008700908, + "kl": 0.17138671875, "learning_rate": 1.2877320891746201e-07, - "loss": 0.1014, - "reward": 1.8541666865348816, - "reward_std": 0.1259434074163437, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9375, + "loss": -0.0046, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 897 }, { "clip_ratio": 0.0, - "completion_length": 416.4583435058594, + "completion_length": 420.0833435058594, "epoch": 0.898, - "grad_norm": 12.243165704828495, - "kl": 0.94873046875, + "grad_norm": 0.47940129933285197, + "kl": 0.07275390625, "learning_rate": 1.2822310472864885e-07, - "loss": 0.2108, - "reward": 1.9635416865348816, - "reward_std": 0.10811641067266464, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 898 }, { "clip_ratio": 0.0, - "completion_length": 595.6458740234375, + "completion_length": 467.6458435058594, "epoch": 0.899, - "grad_norm": 9.96728420274615, - "kl": 2.96875, + "grad_norm": 2.3433899793970165, + "kl": 0.1708984375, "learning_rate": 1.2767813975819983e-07, - "loss": 0.491, - "reward": 1.8125, - "reward_std": 0.26586486026644707, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0115, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 899 }, { "clip_ratio": 0.0, - "completion_length": 767.1041870117188, + "completion_length": 465.4583435058594, "epoch": 0.9, - "grad_norm": 35.06484302697034, - "kl": 4.921875, + "grad_norm": 8.787946894065907, + "kl": 0.46923828125, "learning_rate": 1.2713832064634125e-07, - "loss": 0.4687, - "reward": 1.6614583730697632, - "reward_std": 0.24853000044822693, - "rewards/accuracy_reward": 0.8125000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583730697632, + "loss": 0.104, + "reward": 0.9739583432674408, + "reward_std": 0.0749332457780838, + "rewards/tag_count_reward": 0.9739583432674408, "step": 900 }, { "clip_ratio": 0.0, - "completion_length": 567.2708435058594, + "completion_length": 441.9583435058594, "epoch": 0.901, - "grad_norm": 11.291063117476682, - "kl": 2.54296875, + "grad_norm": 1.9942745799790182, + "kl": 0.19189453125, "learning_rate": 1.2660365397059856e-07, - "loss": 0.4214, - "reward": 1.8437500596046448, - "reward_std": 0.17957251518964767, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0118, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 901 }, { "clip_ratio": 0.0, - "completion_length": 535.2291870117188, + "completion_length": 486.6458435058594, "epoch": 0.902, - "grad_norm": 7.6811915825242645, - "kl": 2.24609375, + "grad_norm": 1.4209046426026215, + "kl": 0.152587890625, "learning_rate": 1.260741462457165e-07, - "loss": 0.2827, - "reward": 1.7083333730697632, - "reward_std": 0.42435412108898163, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0074, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 902 }, { "clip_ratio": 0.0, - "completion_length": 372.2916717529297, + "completion_length": 411.00001525878906, "epoch": 0.903, - "grad_norm": 0.985333233721206, - "kl": 0.173095703125, + "grad_norm": 3.3172086388415796, + "kl": 0.2999267578125, "learning_rate": 1.2554980392357956e-07, - "loss": 0.0078, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0598, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 903 }, { "clip_ratio": 0.0, - "completion_length": 554.7291717529297, + "completion_length": 491.4375305175781, "epoch": 0.904, - "grad_norm": 11.317529960741332, - "kl": 1.95947265625, + "grad_norm": 5.075284253325144, + "kl": 0.5068359375, "learning_rate": 1.2503063339313356e-07, - "loss": 0.3405, - "reward": 1.890625, - "reward_std": 0.215882807970047, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.1422, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 904 }, { "clip_ratio": 0.0, - "completion_length": 364.375, + "completion_length": 433.1041717529297, "epoch": 0.905, - "grad_norm": 1.4196218571907042, - "kl": 0.243408203125, + "grad_norm": 4.894805146692723, + "kl": 0.38818359375, "learning_rate": 1.2451664098030743e-07, - "loss": 0.0108, - "reward": 1.75, + "loss": 0.0244, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 905 }, { "clip_ratio": 0.0, - "completion_length": 512.5416870117188, + "completion_length": 433.00001525878906, "epoch": 0.906, - "grad_norm": 26.306931215703628, - "kl": 1.84375, + "grad_norm": 1.6623425457072913, + "kl": 0.134521484375, "learning_rate": 1.2400783294793668e-07, - "loss": 0.5532, - "reward": 1.7187500596046448, - "reward_std": 0.25687965750694275, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9062500298023224, + "loss": 0.0214, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 906 }, { "clip_ratio": 0.0, - "completion_length": 463.60418701171875, + "completion_length": 466.18751525878906, "epoch": 0.907, - "grad_norm": 7.513545666634734, - "kl": 0.9140625, + "grad_norm": 4.517468250608921, + "kl": 0.296875, "learning_rate": 1.235042154956865e-07, - "loss": 0.1246, - "reward": 1.7291666865348816, - "reward_std": 0.18408085405826569, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0592, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 907 }, { "clip_ratio": 0.0, - "completion_length": 682.25, + "completion_length": 466.375, "epoch": 0.908, - "grad_norm": 9.026537672372555, - "kl": 2.296875, + "grad_norm": 1.9574484289394507, + "kl": 0.18359375, "learning_rate": 1.2300579475997657e-07, - "loss": 0.2919, - "reward": 1.765625, - "reward_std": 0.23241043090820312, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8906250298023224, + "loss": 0.0107, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 908 }, { "clip_ratio": 0.0, - "completion_length": 385.875, + "completion_length": 470.8333435058594, "epoch": 0.909, - "grad_norm": 0.6214679345120036, - "kl": 0.123779296875, + "grad_norm": 0.5829037579667905, + "kl": 0.078125, "learning_rate": 1.2251257681390645e-07, - "loss": 0.0054, - "reward": 2.0, + "loss": 0.0039, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 909 }, { "clip_ratio": 0.0, - "completion_length": 503.43751525878906, + "completion_length": 452.35418701171875, "epoch": 0.91, - "grad_norm": 8.389301532922296, - "kl": 0.8125, + "grad_norm": 2.3185634515942803, + "kl": 0.314453125, "learning_rate": 1.220245676671809e-07, - "loss": 0.2653, - "reward": 1.9114583730697632, - "reward_std": 0.16091519594192505, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0325, + "reward": 0.9791666865348816, + "reward_std": 0.05689104273915291, + "rewards/tag_count_reward": 0.9791666865348816, "step": 910 }, { "clip_ratio": 0.0, - "completion_length": 454.93751525878906, + "completion_length": 410.6666717529297, "epoch": 0.911, - "grad_norm": 21.36502879988365, - "kl": 0.943359375, + "grad_norm": 4.990915064622496, + "kl": 0.18603515625, "learning_rate": 1.2154177326603763e-07, - "loss": 0.3842, - "reward": 1.9270833730697632, - "reward_std": 0.19669441878795624, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0313, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 911 }, { "clip_ratio": 0.0, - "completion_length": 405.35418701171875, + "completion_length": 458.22918701171875, "epoch": 0.912, - "grad_norm": 8.427520686620069, - "kl": 0.375, + "grad_norm": 0.9703749274233409, + "kl": 0.1337890625, "learning_rate": 1.2106419949317388e-07, - "loss": 0.0913, - "reward": 1.9322917461395264, - "reward_std": 0.18752333521842957, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 912 }, { "clip_ratio": 0.0, - "completion_length": 507.16668701171875, + "completion_length": 388.3541717529297, "epoch": 0.913, - "grad_norm": 10.704434781606814, - "kl": 1.428466796875, + "grad_norm": 3.0648569871424067, + "kl": 0.1068115234375, "learning_rate": 1.2059185216767543e-07, - "loss": 0.1584, - "reward": 1.9322916865348816, - "reward_std": 0.09406512975692749, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0401, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 913 }, { "clip_ratio": 0.0, - "completion_length": 725.9166870117188, + "completion_length": 467.75, "epoch": 0.914, - "grad_norm": 18.241150444373933, - "kl": 3.9375, + "grad_norm": 4.927041693824767, + "kl": 0.495849609375, "learning_rate": 1.2012473704494537e-07, - "loss": 0.8696, - "reward": 1.5520833730697632, - "reward_std": 0.5469892621040344, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833730697632, + "loss": 0.0826, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 914 }, { "clip_ratio": 0.0, - "completion_length": 461.6250305175781, + "completion_length": 389.6458435058594, "epoch": 0.915, - "grad_norm": 10.87611972257946, - "kl": 1.36328125, + "grad_norm": 0.36372839672372403, + "kl": 0.0556640625, "learning_rate": 1.1966285981663407e-07, - "loss": 0.2484, - "reward": 1.8645833730697632, - "reward_std": 0.21650635078549385, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 915 }, { "clip_ratio": 0.0, - "completion_length": 484.93751525878906, + "completion_length": 432.375, "epoch": 0.916, - "grad_norm": 6.965604737067303, - "kl": 1.25439453125, + "grad_norm": 2.696301347232995, + "kl": 0.184814453125, "learning_rate": 1.1920622611056974e-07, - "loss": 0.2105, - "reward": 1.8541666865348816, - "reward_std": 0.19279150664806366, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0306, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 916 }, { "clip_ratio": 0.0, - "completion_length": 451.8333435058594, + "completion_length": 419.4791717529297, "epoch": 0.917, - "grad_norm": 7.775998883675492, - "kl": 0.721923828125, + "grad_norm": 3.225344987717421, + "kl": 0.382568359375, "learning_rate": 1.1875484149069004e-07, - "loss": 0.2078, - "reward": 1.9270833730697632, - "reward_std": 0.17236930131912231, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.0374, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 917 }, { "clip_ratio": 0.0, - "completion_length": 480.41668701171875, + "completion_length": 494.0416717529297, "epoch": 0.918, - "grad_norm": 10.931268127538916, - "kl": 1.2861328125, + "grad_norm": 1.2396993112021069, + "kl": 0.177001953125, "learning_rate": 1.1830871145697412e-07, - "loss": 0.3782, - "reward": 1.8750000596046448, - "reward_std": 0.27763204276561737, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333730697632, + "loss": 0.0078, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 918 }, { "clip_ratio": 0.0, - "completion_length": 416.29168701171875, + "completion_length": 427.54168701171875, "epoch": 0.919, - "grad_norm": 13.568505638473718, - "kl": 0.58251953125, + "grad_norm": 2.675512597659298, + "kl": 0.29833984375, "learning_rate": 1.1786784144537563e-07, - "loss": 0.2691, - "reward": 1.9375, - "reward_std": 0.15539538860321045, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0868, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 919 }, { "clip_ratio": 0.0, - "completion_length": 419.3333435058594, + "completion_length": 429.8958435058594, "epoch": 0.92, - "grad_norm": 10.840614129376137, - "kl": 0.622314453125, + "grad_norm": 0.5774129184213032, + "kl": 0.07568359375, "learning_rate": 1.1743223682775649e-07, - "loss": 0.1998, - "reward": 1.9166666865348816, - "reward_std": 0.21242958307266235, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0033, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 920 }, { "clip_ratio": 0.0, - "completion_length": 421.2708435058594, + "completion_length": 448.0208435058594, "epoch": 0.921, - "grad_norm": 11.087410692109273, - "kl": 0.779541015625, + "grad_norm": 2.1214438632925203, + "kl": 0.25, "learning_rate": 1.1700190291182158e-07, - "loss": 0.2074, - "reward": 1.9479166865348816, - "reward_std": 0.11254207789897919, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": -0.0132, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 921 }, { "clip_ratio": 0.0, - "completion_length": 647.6458435058594, + "completion_length": 492.10418701171875, "epoch": 0.922, - "grad_norm": 46.246526593355426, - "kl": 4.875, + "grad_norm": 1.9715037166003222, + "kl": 0.11474609375, "learning_rate": 1.1657684494105386e-07, - "loss": 0.4458, - "reward": 1.8125, - "reward_std": 0.28306229412555695, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.0149, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 922 }, { "clip_ratio": 0.0, - "completion_length": 500.72918701171875, + "completion_length": 463.3333435058594, "epoch": 0.923, - "grad_norm": 33.71714763477938, - "kl": 4.23828125, + "grad_norm": 2.997274956666607, + "kl": 0.249267578125, "learning_rate": 1.1615706809465051e-07, - "loss": 0.8338, - "reward": 1.8229166865348816, - "reward_std": 0.44588886201381683, - "rewards/accuracy_reward": 0.8958333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": -0.0124, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 923 }, { "clip_ratio": 0.0, - "completion_length": 444.4791717529297, + "completion_length": 458.1458435058594, "epoch": 0.924, - "grad_norm": 35.364839682175145, - "kl": 2.080078125, + "grad_norm": 2.4679882370570208, + "kl": 0.216064453125, "learning_rate": 1.1574257748745986e-07, - "loss": 0.335, - "reward": 1.9427083730697632, - "reward_std": 0.13446637988090515, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0178, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 924 }, { "clip_ratio": 0.0, - "completion_length": 433.91668701171875, + "completion_length": 425.50001525878906, "epoch": 0.925, - "grad_norm": 12.17609337216219, - "kl": 1.39599609375, + "grad_norm": 2.788486659392232, + "kl": 0.170166015625, "learning_rate": 1.1533337816991931e-07, - "loss": 0.3009, - "reward": 1.96875, - "reward_std": 0.0625, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.96875, + "loss": 0.038, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 925 }, { "clip_ratio": 0.0, - "completion_length": 492.1041717529297, + "completion_length": 448.25, "epoch": 0.926, - "grad_norm": 44.83841268808384, - "kl": 2.869140625, + "grad_norm": 4.4389011117909565, + "kl": 0.376220703125, "learning_rate": 1.1492947512799328e-07, - "loss": 0.1637, - "reward": 1.7864583730697632, - "reward_std": 0.0819607600569725, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0454, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 926 }, { "clip_ratio": 0.0, - "completion_length": 523.0208435058594, + "completion_length": 419.81251525878906, "epoch": 0.927, - "grad_norm": 57.230544256823784, - "kl": 4.568359375, + "grad_norm": 0.3004188751162577, + "kl": 0.060546875, "learning_rate": 1.1453087328311299e-07, - "loss": 0.5799, - "reward": 1.8177083730697632, - "reward_std": 0.34157679229974747, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9218750298023224, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 927 }, { "clip_ratio": 0.0, - "completion_length": 452.4166717529297, + "completion_length": 463.68751525878906, "epoch": 0.928, - "grad_norm": 9.970684227802462, - "kl": 1.05078125, + "grad_norm": 0.5861374247940305, + "kl": 0.09130859375, "learning_rate": 1.1413757749211602e-07, - "loss": 0.1622, - "reward": 1.9635416865348816, - "reward_std": 0.08618465065956116, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.004, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 928 }, { "clip_ratio": 0.0, - "completion_length": 468.2291717529297, + "completion_length": 458.29168701171875, "epoch": 0.929, - "grad_norm": 53.6430965224698, - "kl": 2.1640625, + "grad_norm": 0.22493358475542433, + "kl": 0.0609130859375, "learning_rate": 1.137495925471875e-07, - "loss": 0.3304, - "reward": 1.7187500596046448, - "reward_std": 0.11702756583690643, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 929 }, { "clip_ratio": 0.0, - "completion_length": 695.8750305175781, + "completion_length": 468.3958435058594, "epoch": 0.93, - "grad_norm": 78.68164787219935, - "kl": 7.421875, + "grad_norm": 0.3975409634684941, + "kl": 0.079833984375, "learning_rate": 1.1336692317580158e-07, - "loss": 0.9211, - "reward": 1.6927083730697632, - "reward_std": 0.4273684620857239, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 930 }, { "clip_ratio": 0.0, - "completion_length": 550.5416717529297, + "completion_length": 454.5833435058594, "epoch": 0.931, - "grad_norm": 21.661198522112688, - "kl": 2.9765625, + "grad_norm": 0.7831927134076526, + "kl": 0.11474609375, "learning_rate": 1.1298957404066381e-07, - "loss": 0.4426, - "reward": 1.7864583730697632, - "reward_std": 0.1418427713215351, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 931 }, { "clip_ratio": 0.0, - "completion_length": 512.0208435058594, + "completion_length": 453.79168701171875, "epoch": 0.932, - "grad_norm": 12.007893366838966, - "kl": 1.79736328125, + "grad_norm": 0.2565546048580399, + "kl": 0.0589599609375, "learning_rate": 1.1261754973965422e-07, - "loss": 0.1912, - "reward": 1.8854166865348816, - "reward_std": 0.1823795735836029, - "rewards/accuracy_reward": 0.9375, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 932 }, { "clip_ratio": 0.0, - "completion_length": 436.16668701171875, + "completion_length": 483.3125305175781, "epoch": 0.933, - "grad_norm": 10.535585600961307, - "kl": 1.52734375, + "grad_norm": 5.965932854238622, + "kl": 0.3040771484375, "learning_rate": 1.1225085480577158e-07, - "loss": 0.3373, - "reward": 1.8281250596046448, - "reward_std": 0.3007126897573471, - "rewards/accuracy_reward": 0.8750000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0664, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 933 }, { "clip_ratio": 0.0, - "completion_length": 468.3541717529297, + "completion_length": 401.2083435058594, "epoch": 0.934, - "grad_norm": 24.110368802837204, - "kl": 1.5546875, + "grad_norm": 0.5902863758628627, + "kl": 0.10260009765625, "learning_rate": 1.1188949370707787e-07, - "loss": 0.5719, - "reward": 1.8906250596046448, - "reward_std": 0.2716143727302551, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.0042, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 934 }, { "clip_ratio": 0.0, - "completion_length": 451.66668701171875, + "completion_length": 417.50001525878906, "epoch": 0.935, - "grad_norm": 16.317707309210853, - "kl": 1.220703125, + "grad_norm": 0.27583183911270276, + "kl": 0.0458984375, "learning_rate": 1.1153347084664419e-07, - "loss": 0.3826, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 935 }, { "clip_ratio": 0.0, - "completion_length": 596.1458435058594, + "completion_length": 433.2291717529297, "epoch": 0.936, - "grad_norm": 16.419356322088824, - "kl": 3.71728515625, + "grad_norm": 1.666916279660374, + "kl": 0.111572265625, "learning_rate": 1.1118279056249653e-07, - "loss": 0.5207, - "reward": 1.6770833730697632, - "reward_std": 0.2518305666744709, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833730697632, + "loss": -0.0118, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 936 }, { "clip_ratio": 0.0, - "completion_length": 474.3958435058594, + "completion_length": 393.9166717529297, "epoch": 0.937, - "grad_norm": 24.622125442293328, - "kl": 1.7421875, + "grad_norm": 1.3125799330948598, + "kl": 0.0460205078125, "learning_rate": 1.1083745712756364e-07, - "loss": 0.6163, - "reward": 1.8697916865348816, - "reward_std": 0.2979765832424164, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": -0.0065, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 937 }, { "clip_ratio": 0.0, - "completion_length": 406.6458435058594, + "completion_length": 438.9583435058594, "epoch": 0.938, - "grad_norm": 1.2817567272375725, - "kl": 0.2197265625, + "grad_norm": 2.4309526964560475, + "kl": 0.089599609375, "learning_rate": 1.1049747474962444e-07, - "loss": 0.0102, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0167, + "reward": 0.9895833432674408, + "reward_std": 0.024328090250492096, + "rewards/tag_count_reward": 0.9895833432674408, "step": 938 }, { "clip_ratio": 0.0, - "completion_length": 662.9375305175781, + "completion_length": 403.7083435058594, "epoch": 0.939, - "grad_norm": 12.975852481008392, - "kl": 3.9609375, + "grad_norm": 2.0975552361291014, + "kl": 0.1544189453125, "learning_rate": 1.1016284757125685e-07, - "loss": 0.5205, - "reward": 1.5989583730697632, - "reward_std": 0.3047226071357727, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": 0.0053, + "reward": 0.9895833432674408, + "reward_std": 0.024328093975782394, + "rewards/tag_count_reward": 0.9895833432674408, "step": 939 }, { "clip_ratio": 0.0, - "completion_length": 904.1875305175781, + "completion_length": 454.29168701171875, "epoch": 0.94, - "grad_norm": 39.348365291632554, - "kl": 7.546875, + "grad_norm": 4.360268899437786, + "kl": 0.097412109375, "learning_rate": 1.0983357966978745e-07, - "loss": 0.7768, - "reward": 1.1614583730697632, - "reward_std": 0.5211588144302368, - "rewards/accuracy_reward": 0.4166666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.7447916865348816, + "loss": 0.0512, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 940 }, { "clip_ratio": 0.0, - "completion_length": 529.4375305175781, + "completion_length": 405.1458435058594, "epoch": 0.941, - "grad_norm": 7.732976548485906, - "kl": 2.1875, + "grad_norm": 0.35462463024772617, + "kl": 0.0648193359375, "learning_rate": 1.0950967505724175e-07, - "loss": 0.2953, - "reward": 1.8958333730697632, - "reward_std": 0.16063910722732544, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0029, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 941 }, { "clip_ratio": 0.0, - "completion_length": 544.2291870117188, + "completion_length": 483.66668701171875, "epoch": 0.942, - "grad_norm": 12.823222314178125, - "kl": 2.09375, + "grad_norm": 5.991695051097315, + "kl": 0.1943359375, "learning_rate": 1.0919113768029517e-07, - "loss": 0.4219, - "reward": 1.6770833730697632, - "reward_std": 0.2615434154868126, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.0394, + "reward": 0.9739583730697632, + "reward_std": 0.09021097794175148, + "rewards/tag_count_reward": 0.9739583730697632, "step": 942 }, { "clip_ratio": 0.0, - "completion_length": 573.7291870117188, + "completion_length": 418.2083435058594, "epoch": 0.943, - "grad_norm": 9.946009871267574, - "kl": 3.2109375, + "grad_norm": 0.23605136486972625, + "kl": 0.053955078125, "learning_rate": 1.0887797142022521e-07, - "loss": 0.6418, - "reward": 1.5000000596046448, - "reward_std": 0.36415572464466095, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 943 }, { "clip_ratio": 0.0, - "completion_length": 546.75, + "completion_length": 461.6875, "epoch": 0.944, - "grad_norm": 17.06247731623334, - "kl": 1.8984375, + "grad_norm": 3.0034285302091157, + "kl": 0.3358154296875, "learning_rate": 1.0857018009286381e-07, - "loss": 0.4543, - "reward": 1.6822916865348816, - "reward_std": 0.39797835052013397, - "rewards/accuracy_reward": 0.7500000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0398, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 944 }, { "clip_ratio": 0.0, - "completion_length": 587.4583587646484, + "completion_length": 441.6458435058594, "epoch": 0.945, - "grad_norm": 8.467447039160039, - "kl": 2.4736328125, + "grad_norm": 0.6116769185552291, + "kl": 0.0986328125, "learning_rate": 1.0826776744855121e-07, - "loss": 0.2653, - "reward": 1.6770833730697632, - "reward_std": 0.14059869945049286, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.90625, + "loss": 0.0043, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 945 }, { "clip_ratio": 0.0, - "completion_length": 556.3750305175781, + "completion_length": 448.3541717529297, "epoch": 0.946, - "grad_norm": 15.284145497913336, - "kl": 3.115234375, + "grad_norm": 5.836343401745722, + "kl": 0.224853515625, "learning_rate": 1.0797073717209013e-07, - "loss": 0.5471, - "reward": 1.6197917461395264, - "reward_std": 0.4588644206523895, - "rewards/accuracy_reward": 0.708333358168602, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.0643, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 946 }, { "clip_ratio": 0.0, - "completion_length": 386.72918701171875, + "completion_length": 499.5833435058594, "epoch": 0.947, - "grad_norm": 4.944140941065174, - "kl": 0.2900390625, + "grad_norm": 12.017409443755474, + "kl": 0.9892578125, "learning_rate": 1.0767909288270063e-07, - "loss": 0.0386, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.2575, + "reward": 0.9635416865348816, + "reward_std": 0.1110176369547844, + "rewards/tag_count_reward": 0.9635416865348816, "step": 947 }, { "clip_ratio": 0.0, - "completion_length": 519.1041870117188, + "completion_length": 456.9791717529297, "epoch": 0.948, - "grad_norm": 14.619777897588564, - "kl": 2.0341796875, + "grad_norm": 2.315036725074918, + "kl": 0.12255859375, "learning_rate": 1.0739283813397639e-07, - "loss": 0.4948, - "reward": 1.8802083730697632, - "reward_std": 0.29817967116832733, - "rewards/accuracy_reward": 0.9583333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.921875, + "loss": 0.0218, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 948 }, { "clip_ratio": 0.0, - "completion_length": 370.7083435058594, + "completion_length": 423.37501525878906, "epoch": 0.949, - "grad_norm": 2.501173599100684, - "kl": 0.21728515625, + "grad_norm": 1.684987978433669, + "kl": 0.15966796875, "learning_rate": 1.0711197641384115e-07, - "loss": 0.0165, - "reward": 1.984375, - "reward_std": 0.03884884715080261, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0065, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 949 }, { "clip_ratio": 0.0, - "completion_length": 610.1666717529297, + "completion_length": 433.6666717529297, "epoch": 0.95, - "grad_norm": 32.47022971756504, - "kl": 3.61474609375, + "grad_norm": 0.7242424715971006, + "kl": 0.0859375, "learning_rate": 1.068365111445064e-07, - "loss": 0.1573, - "reward": 1.78125, - "reward_std": 0.13339674472808838, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 950 }, { "clip_ratio": 0.0, - "completion_length": 563.4166870117188, + "completion_length": 439.4166717529297, "epoch": 0.951, - "grad_norm": 8.348756777918963, - "kl": 2.25, + "grad_norm": 1.3350386086684431, + "kl": 0.102294921875, "learning_rate": 1.0656644568242946e-07, - "loss": 0.469, - "reward": 1.9270833730697632, - "reward_std": 0.1290598213672638, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": -0.0403, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 951 }, { "clip_ratio": 0.0, - "completion_length": 448.43751525878906, + "completion_length": 463.31251525878906, "epoch": 0.952, - "grad_norm": 4.094810780690959, - "kl": 1.01953125, + "grad_norm": 4.247740925227692, + "kl": 0.31201171875, "learning_rate": 1.063017833182728e-07, - "loss": 0.1034, - "reward": 1.9427083730697632, - "reward_std": 0.11716237664222717, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0315, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 952 }, { "clip_ratio": 0.0, - "completion_length": 478.62501525878906, + "completion_length": 456.6458435058594, "epoch": 0.953, - "grad_norm": 13.375595929105703, - "kl": 1.3564453125, + "grad_norm": 0.3145352011003649, + "kl": 0.0618896484375, "learning_rate": 1.0604252727686379e-07, - "loss": 0.1365, - "reward": 1.7864583730697632, - "reward_std": 0.08618465065956116, - "rewards/accuracy_reward": 0.8333333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 953 }, { "clip_ratio": 0.0, - "completion_length": 419.25, + "completion_length": 369.1458435058594, "epoch": 0.954, - "grad_norm": 10.841443470170985, - "kl": 0.884765625, + "grad_norm": 0.8741994646212923, + "kl": 0.07708740234375, "learning_rate": 1.0578868071715544e-07, - "loss": 0.2009, - "reward": 1.8802083730697632, - "reward_std": 0.19846417382359505, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0039, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 954 }, { "clip_ratio": 0.0, - "completion_length": 520.0833587646484, + "completion_length": 407.85418701171875, "epoch": 0.955, - "grad_norm": 47.82750066674377, - "kl": 2.34375, + "grad_norm": 1.2543903429135423, + "kl": 0.0997314453125, "learning_rate": 1.0554024673218806e-07, - "loss": 0.3393, - "reward": 1.6197916865348816, - "reward_std": 0.2554234117269516, - "rewards/accuracy_reward": 0.6875000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 955 }, { "clip_ratio": 0.0, - "completion_length": 573.5000305175781, + "completion_length": 503.62501525878906, "epoch": 0.956, - "grad_norm": 9.892517936374505, - "kl": 2.20361328125, + "grad_norm": 3.2277603958784464, + "kl": 0.11083984375, "learning_rate": 1.0529722834905125e-07, - "loss": 0.4362, - "reward": 1.90625, - "reward_std": 0.21248003840446472, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0483, + "reward": 0.984375, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.984375, "step": 956 }, { "clip_ratio": 0.0, - "completion_length": 512.75, + "completion_length": 397.3958435058594, "epoch": 0.957, - "grad_norm": 9.240502736552724, - "kl": 2.07421875, + "grad_norm": 1.613929883243826, + "kl": 0.10498046875, "learning_rate": 1.0505962852884739e-07, - "loss": 0.4272, - "reward": 1.8906250596046448, - "reward_std": 0.19872672855854034, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9322916865348816, + "loss": 0.0085, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 957 }, { "clip_ratio": 0.0, - "completion_length": 465.3333435058594, + "completion_length": 414.41668701171875, "epoch": 0.958, - "grad_norm": 21.66655032203654, - "kl": 1.623046875, + "grad_norm": 1.923232633431884, + "kl": 0.09423828125, "learning_rate": 1.0482745016665526e-07, - "loss": 0.4824, - "reward": 1.9270833730697632, - "reward_std": 0.21660535037517548, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0127, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 958 }, { "clip_ratio": 0.0, - "completion_length": 497.0625305175781, + "completion_length": 469.8125, "epoch": 0.959, - "grad_norm": 24.417894985010946, - "kl": 1.9296875, + "grad_norm": 2.5359963977453805, + "kl": 0.1854248046875, "learning_rate": 1.0460069609149496e-07, - "loss": 0.5459, - "reward": 1.8072917461395264, - "reward_std": 0.31944867968559265, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583432674408, + "loss": 0.0261, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 959 }, { "clip_ratio": 0.0, - "completion_length": 718.7708435058594, + "completion_length": 471.3958435058594, "epoch": 0.96, - "grad_norm": 8.26912439193507, - "kl": 3.578125, + "grad_norm": 1.6343344818979024, + "kl": 0.13623046875, "learning_rate": 1.0437936906629334e-07, - "loss": 0.6486, - "reward": 1.8385416865348816, - "reward_std": 0.21385890245437622, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8385416865348816, + "loss": 0.0068, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 960 }, { "clip_ratio": 0.0, - "completion_length": 634.6666870117188, + "completion_length": 474.8333435058594, "epoch": 0.961, - "grad_norm": 9.318887950309726, - "kl": 2.93359375, + "grad_norm": 2.778614137062019, + "kl": 0.095458984375, "learning_rate": 1.0416347178785039e-07, - "loss": 0.4487, - "reward": 1.6875000596046448, - "reward_std": 0.33738670498132706, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0435, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 961 }, { "clip_ratio": 0.0, - "completion_length": 501.35418701171875, + "completion_length": 495.75001525878906, "epoch": 0.962, - "grad_norm": 5.1992794505105415, - "kl": 0.994140625, + "grad_norm": 2.72097056019789, + "kl": 0.12646484375, "learning_rate": 1.0395300688680625e-07, - "loss": 0.1639, - "reward": 1.9166667461395264, - "reward_std": 0.2452331706881523, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0063, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 962 }, { "clip_ratio": 0.0, - "completion_length": 447.87501525878906, + "completion_length": 395.7708435058594, "epoch": 0.963, - "grad_norm": 28.249343137200498, - "kl": 1.3359375, + "grad_norm": 1.4073298211988017, + "kl": 0.18896484375, "learning_rate": 1.0374797692760933e-07, - "loss": 0.5455, - "reward": 1.7864583730697632, - "reward_std": 0.3407367169857025, - "rewards/accuracy_reward": 0.8333333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": -0.0007, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 963 }, { "clip_ratio": 0.0, - "completion_length": 639.1250305175781, + "completion_length": 442.18751525878906, "epoch": 0.964, - "grad_norm": 13.645803795053776, - "kl": 2.72412109375, + "grad_norm": 9.695014254118218, + "kl": 0.65673828125, "learning_rate": 1.0354838440848501e-07, - "loss": 0.2751, - "reward": 1.7083333730697632, - "reward_std": 0.19094066321849823, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8958333432674408, + "loss": 0.0452, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 964 }, { "clip_ratio": 0.0, - "completion_length": 537.125, + "completion_length": 453.125, "epoch": 0.965, - "grad_norm": 12.061634640743186, - "kl": 1.82421875, + "grad_norm": 5.34938148154237, + "kl": 0.420166015625, "learning_rate": 1.0335423176140511e-07, - "loss": 0.1804, - "reward": 1.7031250596046448, - "reward_std": 0.34469615668058395, - "rewards/accuracy_reward": 0.7916666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9114583730697632, + "loss": 0.043, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 965 }, { "clip_ratio": 0.0, - "completion_length": 590.7291717529297, + "completion_length": 485.85418701171875, "epoch": 0.966, - "grad_norm": 22.711017534648803, - "kl": 2.71484375, + "grad_norm": 4.22238362065506, + "kl": 0.129638671875, "learning_rate": 1.0316552135205837e-07, - "loss": 0.3779, - "reward": 1.8125, - "reward_std": 0.22406284511089325, - "rewards/accuracy_reward": 0.8958333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0346, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 966 }, { "clip_ratio": 0.0, - "completion_length": 387.4791717529297, + "completion_length": 458.81251525878906, "epoch": 0.967, - "grad_norm": 1.949983311660439, - "kl": 0.23681640625, + "grad_norm": 0.6331126151655933, + "kl": 0.096435546875, "learning_rate": 1.029822554798216e-07, - "loss": 0.0101, - "reward": 1.75, + "loss": 0.0043, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 967 }, { "clip_ratio": 0.0, - "completion_length": 616.0625, + "completion_length": 471.25001525878906, "epoch": 0.968, - "grad_norm": 13.628195437126449, - "kl": 3.875, + "grad_norm": 4.328218469990424, + "kl": 0.1728515625, "learning_rate": 1.0280443637773163e-07, - "loss": 0.6001, - "reward": 1.8541666865348816, - "reward_std": 0.20776879414916039, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.875, + "loss": 0.0843, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 968 }, { "clip_ratio": 0.0, - "completion_length": 349.6458435058594, + "completion_length": 389.62501525878906, "epoch": 0.969, - "grad_norm": 2.1705793238290965, - "kl": 0.192626953125, + "grad_norm": 0.705259059622117, + "kl": 0.08544921875, "learning_rate": 1.0263206621245807e-07, - "loss": 0.0086, - "reward": 2.0, + "loss": 0.0039, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 969 }, { "clip_ratio": 0.0, - "completion_length": 483.27085876464844, + "completion_length": 389.9583435058594, "epoch": 0.97, - "grad_norm": 14.676399724823877, - "kl": 1.906494140625, + "grad_norm": 4.207076961902249, + "kl": 0.16650390625, "learning_rate": 1.0246514708427701e-07, - "loss": 0.435, - "reward": 1.8229166865348816, - "reward_std": 0.30924198031425476, - "rewards/accuracy_reward": 0.875, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0082, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 970 }, { "clip_ratio": 0.0, - "completion_length": 479.45835876464844, + "completion_length": 391.4583435058594, "epoch": 0.971, - "grad_norm": 19.263883167544808, - "kl": 1.2978515625, + "grad_norm": 1.2010840276151276, + "kl": 0.11767578125, "learning_rate": 1.0230368102704531e-07, - "loss": 0.1467, - "reward": 1.7656250596046448, - "reward_std": 0.1077597625553608, - "rewards/accuracy_reward": 0.8125, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.0056, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 971 }, { "clip_ratio": 0.0, - "completion_length": 719.7083435058594, + "completion_length": 466.7708435058594, "epoch": 0.972, - "grad_norm": 44.8940755773109, - "kl": 4.109375, + "grad_norm": 1.8836702979327664, + "kl": 0.068603515625, "learning_rate": 1.0214767000817596e-07, - "loss": 0.4737, - "reward": 1.5885416865348816, - "reward_std": 0.3589571416378021, - "rewards/accuracy_reward": 0.7291666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8593750298023224, + "loss": 0.031, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 972 }, { "clip_ratio": 0.0, - "completion_length": 385.0625, + "completion_length": 380.50001525878906, "epoch": 0.973, - "grad_norm": 17.195243288642754, - "kl": 0.4140625, + "grad_norm": 0.19391861126194634, + "kl": 0.0452880859375, "learning_rate": 1.01997115928614e-07, - "loss": 0.1543, - "reward": 1.9583333730697632, - "reward_std": 0.14433756470680237, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 973 }, { "clip_ratio": 0.0, - "completion_length": 613.0625305175781, + "completion_length": 452.9791717529297, "epoch": 0.974, - "grad_norm": 45.23194181951144, - "kl": 3.3359375, + "grad_norm": 7.651648747333932, + "kl": 0.5869140625, "learning_rate": 1.0185202062281336e-07, - "loss": 0.3672, - "reward": 1.7447917461395264, - "reward_std": 0.24208343774080276, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.890625, + "loss": 0.2445, + "reward": 0.9739583730697632, + "reward_std": 0.06669837608933449, + "rewards/tag_count_reward": 0.9739583730697632, "step": 974 }, { "clip_ratio": 0.0, - "completion_length": 374.9583435058594, + "completion_length": 427.4166717529297, "epoch": 0.975, - "grad_norm": 0.5261299178146756, - "kl": 0.115478515625, + "grad_norm": 8.716776035375233, + "kl": 0.171875, "learning_rate": 1.017123858587145e-07, - "loss": 0.0047, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0764, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 975 }, { "clip_ratio": 0.0, - "completion_length": 400.1458435058594, + "completion_length": 405.2708435058594, "epoch": 0.976, - "grad_norm": 2.873656529994595, - "kl": 0.400390625, + "grad_norm": 0.8957252085487097, + "kl": 0.1015625, "learning_rate": 1.0157821333772304e-07, - "loss": 0.0701, - "reward": 1.9739583730697632, - "reward_std": 0.07278125733137131, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0049, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 976 }, { "clip_ratio": 0.0, - "completion_length": 518.3125305175781, + "completion_length": 449.4166717529297, "epoch": 0.977, - "grad_norm": 9.996884165951943, - "kl": 1.54296875, + "grad_norm": 2.145008185762233, + "kl": 0.17578125, "learning_rate": 1.014495046946888e-07, - "loss": 0.328, - "reward": 1.8697916865348816, - "reward_std": 0.23225001990795135, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.953125, + "loss": 0.0096, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 977 }, { "clip_ratio": 0.0, - "completion_length": 338.5833435058594, + "completion_length": 373.8541717529297, "epoch": 0.978, - "grad_norm": 0.2598334557964813, - "kl": 0.07763671875, + "grad_norm": 4.585846477881522, + "kl": 0.301025390625, "learning_rate": 1.013262614978859e-07, - "loss": 0.0032, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0301, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 978 }, { "clip_ratio": 0.0, - "completion_length": 544.6875, + "completion_length": 429.0833435058594, "epoch": 0.979, - "grad_norm": 18.11242954069285, - "kl": 1.84765625, + "grad_norm": 1.6114492254576476, + "kl": 0.0986328125, "learning_rate": 1.0120848524899386e-07, - "loss": 0.6447, - "reward": 1.90625, - "reward_std": 0.20846855640411377, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0046, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 979 }, { "clip_ratio": 0.0, - "completion_length": 430.72918701171875, + "completion_length": 421.7708435058594, "epoch": 0.98, - "grad_norm": 22.787067779066774, - "kl": 0.873046875, + "grad_norm": 0.5571361949186726, + "kl": 0.073486328125, "learning_rate": 1.0109617738307911e-07, - "loss": 0.2653, - "reward": 1.9635416865348816, - "reward_std": 0.12629535794258118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 980 }, { "clip_ratio": 0.0, - "completion_length": 669.7916870117188, + "completion_length": 397.43751525878906, "epoch": 0.981, - "grad_norm": 13.008931776798422, - "kl": 3.09228515625, + "grad_norm": 31.643005792685624, + "kl": 0.489013671875, "learning_rate": 1.0098933926857752e-07, - "loss": 0.4015, - "reward": 1.6145833730697632, - "reward_std": 0.3112960457801819, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8645833432674408, + "loss": 0.0589, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 981 }, { "clip_ratio": 0.0, - "completion_length": 542.4791717529297, + "completion_length": 524.6250305175781, "epoch": 0.982, - "grad_norm": 5.2310215871811305, - "kl": 2.27734375, + "grad_norm": 3.0764806977544823, + "kl": 0.183349609375, "learning_rate": 1.0088797220727779e-07, - "loss": 0.1947, - "reward": 1.65625, - "reward_std": 0.39038464426994324, - "rewards/accuracy_reward": 0.7708333730697632, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8854166865348816, + "loss": 0.0259, + "reward": 0.984375, + "reward_std": 0.03884884715080261, + "rewards/tag_count_reward": 0.984375, "step": 982 }, { "clip_ratio": 0.0, - "completion_length": 408.6041717529297, + "completion_length": 375.1875, "epoch": 0.983, - "grad_norm": 21.841874988628824, - "kl": 0.54248046875, + "grad_norm": 2.6538317352580862, + "kl": 0.0966796875, "learning_rate": 1.007920774343056e-07, - "loss": -0.0968, - "reward": 1.5625000596046448, - "reward_std": 0.22191037237644196, - "rewards/accuracy_reward": 0.5833333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0132, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 983 }, { "clip_ratio": 0.0, - "completion_length": 383.0208435058594, + "completion_length": 420.56251525878906, "epoch": 0.984, - "grad_norm": 0.5021531299713134, - "kl": 0.109375, + "grad_norm": 1.3248447884837615, + "kl": 0.16162109375, "learning_rate": 1.0070165611810855e-07, - "loss": 0.0049, - "reward": 2.0, + "loss": 0.0072, + "reward": 1.0, "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, "rewards/tag_count_reward": 1.0, "step": 984 }, { "clip_ratio": 0.0, - "completion_length": 468.41668701171875, + "completion_length": 413.5208435058594, "epoch": 0.985, - "grad_norm": 28.70103109603422, - "kl": 0.80517578125, + "grad_norm": 1.8074473920007366, + "kl": 0.0712890625, "learning_rate": 1.0061670936044178e-07, - "loss": 0.4167, - "reward": 1.921875, - "reward_std": 0.20952197909355164, - "rewards/accuracy_reward": 0.9583333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9635416865348816, + "loss": 0.0004, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 985 }, { "clip_ratio": 0.0, - "completion_length": 496.10418701171875, + "completion_length": 415.375, "epoch": 0.986, - "grad_norm": 45.586855219651326, - "kl": 1.20703125, + "grad_norm": 2.1831096647312727, + "kl": 0.15283203125, "learning_rate": 1.005372381963547e-07, - "loss": 0.6514, - "reward": 1.6875000596046448, - "reward_std": 0.1885581836104393, - "rewards/accuracy_reward": 0.7291666716337204, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9583333432674408, + "loss": 0.0026, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 986 }, { "clip_ratio": 0.0, - "completion_length": 378.5208435058594, + "completion_length": 462.6041717529297, "epoch": 0.987, - "grad_norm": 0.8445568102034864, - "kl": 0.134765625, + "grad_norm": 3.0665952226528823, + "kl": 0.1796875, "learning_rate": 1.0046324359417842e-07, - "loss": 0.0057, - "reward": 2.0, - "reward_std": 0.0, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 1.0, + "loss": 0.0181, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 987 }, { "clip_ratio": 0.0, - "completion_length": 442.10418701171875, + "completion_length": 465.75001525878906, "epoch": 0.988, - "grad_norm": 6.607655396621452, - "kl": 0.60009765625, + "grad_norm": 0.44700716676946056, + "kl": 0.07275390625, "learning_rate": 1.0039472645551372e-07, - "loss": 0.0827, - "reward": 1.8281250596046448, - "reward_std": 0.25082099437713623, - "rewards/accuracy_reward": 0.8541666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9739583432674408, + "loss": 0.0031, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 988 }, { "clip_ratio": 0.0, - "completion_length": 489.7500305175781, + "completion_length": 417.93751525878906, "epoch": 0.989, - "grad_norm": 18.3159302819461, - "kl": 1.375, + "grad_norm": 7.927309333392955, + "kl": 0.1513671875, "learning_rate": 1.0033168761522048e-07, - "loss": 0.4835, - "reward": 1.8906250596046448, - "reward_std": 0.2560053765773773, - "rewards/accuracy_reward": 0.9375000298023224, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9531250298023224, + "loss": 0.1489, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 989 }, { "clip_ratio": 0.0, - "completion_length": 528.0208587646484, + "completion_length": 407.37501525878906, "epoch": 0.99, - "grad_norm": 9.160553830269919, - "kl": 2.30078125, + "grad_norm": 1.6469239300234044, + "kl": 0.191162109375, "learning_rate": 1.002741278414069e-07, - "loss": 0.3505, - "reward": 1.8333333730697632, - "reward_std": 0.30849485844373703, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9166666865348816, + "loss": 0.0086, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 990 }, { "clip_ratio": 0.0, - "completion_length": 595.2083435058594, + "completion_length": 448.00001525878906, "epoch": 0.991, - "grad_norm": 16.44046495360215, - "kl": 2.19921875, + "grad_norm": 0.21007285113485952, + "kl": 0.0518798828125, "learning_rate": 1.0022204783542078e-07, - "loss": 0.498, - "reward": 1.8437500596046448, - "reward_std": 0.3014799952507019, - "rewards/accuracy_reward": 0.9166666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9270833432674408, + "loss": 0.002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 991 }, { "clip_ratio": 0.0, - "completion_length": 386.81251525878906, + "completion_length": 436.8333435058594, "epoch": 0.992, - "grad_norm": 7.34030608585405, - "kl": 0.2998046875, + "grad_norm": 2.200010988728657, + "kl": 0.115234375, "learning_rate": 1.0017544823184055e-07, - "loss": 0.045, - "reward": 1.9739583730697632, - "reward_std": 0.09021097421646118, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9947916865348816, + "loss": 0.0112, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 992 }, { "clip_ratio": 0.0, - "completion_length": 448.5416717529297, + "completion_length": 538.6250305175781, "epoch": 0.993, - "grad_norm": 6.756200871344645, - "kl": 0.93896484375, + "grad_norm": 4.283023167021855, + "kl": 0.2578125, "learning_rate": 1.001343295984676e-07, - "loss": 0.1362, - "reward": 1.9791666865348816, - "reward_std": 0.04865618050098419, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9791666865348816, + "loss": 0.0217, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 993 }, { "clip_ratio": 0.0, - "completion_length": 441.7083435058594, + "completion_length": 486.56251525878906, "epoch": 0.994, - "grad_norm": 4.712272916213893, - "kl": 0.884033203125, + "grad_norm": 5.259883325340397, + "kl": 0.28125, "learning_rate": 1.0009869243631952e-07, - "loss": 0.0955, - "reward": 1.9635416865348816, - "reward_std": 0.07750703394412994, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.0557, + "reward": 0.9791666865348816, + "reward_std": 0.07216878235340118, + "rewards/tag_count_reward": 0.9791666865348816, "step": 994 }, { "clip_ratio": 0.0, - "completion_length": 681.4166870117188, + "completion_length": 520.25, "epoch": 0.995, - "grad_norm": 46.85479984284838, - "kl": 4.0546875, + "grad_norm": 2.437461178054645, + "kl": 0.203369140625, "learning_rate": 1.0006853717962393e-07, - "loss": 0.4603, - "reward": 1.3177083730697632, - "reward_std": 0.34362873435020447, - "rewards/accuracy_reward": 0.4791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8385416865348816, + "loss": 0.0343, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 995 }, { "clip_ratio": 0.0, - "completion_length": 685.4375152587891, + "completion_length": 455.0, "epoch": 0.996, - "grad_norm": 13.784914209991408, - "kl": 3.890625, + "grad_norm": 1.952234178856873, + "kl": 0.115234375, "learning_rate": 1.000438641958131e-07, - "loss": 0.5216, - "reward": 1.6197917461395264, - "reward_std": 0.39866499602794647, - "rewards/accuracy_reward": 0.7708333432674408, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.8489583432674408, + "loss": -0.0091, + "reward": 0.9895833432674408, + "reward_std": 0.03608439117670059, + "rewards/tag_count_reward": 0.9895833432674408, "step": 996 }, { "clip_ratio": 0.0, - "completion_length": 439.1458435058594, + "completion_length": 427.47918701171875, "epoch": 0.997, - "grad_norm": 66.07264188400963, - "kl": 1.552978515625, + "grad_norm": 4.669011479719766, + "kl": 0.1650390625, "learning_rate": 1.0002467378551954e-07, - "loss": 0.3375, - "reward": 1.9895833730697632, - "reward_std": 0.03608439117670059, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9895833432674408, + "loss": 0.0236, + "reward": 0.9843750298023224, + "reward_std": 0.05412658676505089, + "rewards/tag_count_reward": 0.9843750298023224, "step": 997 }, { "clip_ratio": 0.0, - "completion_length": 501.5833435058594, + "completion_length": 440.4791717529297, "epoch": 0.998, - "grad_norm": 12.910059251806782, - "kl": 1.615234375, + "grad_norm": 0.5694568503474046, + "kl": 0.089111328125, "learning_rate": 1.0001096618257236e-07, - "loss": 0.3175, - "reward": 1.9270833730697632, - "reward_std": 0.10264033079147339, - "rewards/accuracy_reward": 0.9791666865348816, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.9479166865348816, + "loss": 0.0041, + "reward": 1.0, + "reward_std": 0.0, + "rewards/tag_count_reward": 1.0, "step": 998 }, { "clip_ratio": 0.0, - "completion_length": 377.9583435058594, + "completion_length": 420.6458435058594, "epoch": 0.999, - "grad_norm": 14.647500835758157, - "kl": 0.62841796875, + "grad_norm": 7.818586157248591, + "kl": 0.497802734375, "learning_rate": 1.0000274155399433e-07, - "loss": 0.3012, - "reward": 1.984375, - "reward_std": 0.05412658676505089, - "rewards/accuracy_reward": 1.0, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "loss": 0.1502, + "reward": 0.9947916865348816, + "reward_std": 0.018042195588350296, + "rewards/tag_count_reward": 0.9947916865348816, "step": 999 }, { "clip_ratio": 0.0, - "completion_length": 402.34375, + "completion_length": 441.03125, "epoch": 1.0, - "grad_norm": 12.450347895588743, - "kl": 0.5546875, + "grad_norm": 8.490967975746036, + "kl": 0.4716796875, "learning_rate": 1e-07, - "loss": 0.1298, - "reward": 1.734375, + "loss": 0.0229, + "reward": 0.9843750298023224, "reward_std": 0.05412658676505089, - "rewards/accuracy_reward": 0.75, - "rewards/format_reward": 0.0, - "rewards/tag_count_reward": 0.984375, + "rewards/tag_count_reward": 0.9843750298023224, "step": 1000 }, { "epoch": 1.0, "step": 1000, "total_flos": 0.0, - "train_loss": 0.17772822943551, - "train_runtime": 45365.7135, - "train_samples_per_second": 0.088, - "train_steps_per_second": 0.022 + "train_loss": 0.061943143279685954, + "train_runtime": 34538.0147, + "train_samples_per_second": 0.116, + "train_steps_per_second": 0.029 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, - "save_steps": 500, + "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": {