Geo-R1-CI-0620 / trainer_state.json
miniHui's picture
Upload folder using huggingface_hub
2a0d5d9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2547121752419766,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 321.578125,
"epoch": 0.0005094243504839531,
"grad_norm": 21.497011168465292,
"kl": 0.0,
"learning_rate": 9.997452878247579e-07,
"loss": -0.0,
"reward": -0.492842435836792,
"reward_std": 0.7784243226051331,
"rewards/accuracy_reward": -0.4125000238418579,
"rewards/cosine_rewards": -0.08018936403095722,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -0.0001530575300421333,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 211.796875,
"epoch": 0.0010188487009679063,
"grad_norm": 8.570878529351686,
"kl": 0.00115203857421875,
"learning_rate": 9.99490575649516e-07,
"loss": 0.0,
"reward": -0.2021125927567482,
"reward_std": 0.686398446559906,
"rewards/accuracy_reward": -0.18437501601874828,
"rewards/cosine_rewards": -0.01752197090536356,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -0.00021561131143243983,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 242.640625,
"epoch": 0.0015282730514518594,
"grad_norm": 7.698910727869972,
"kl": 0.0014190673828125,
"learning_rate": 9.99235863474274e-07,
"loss": 0.0001,
"reward": -0.6304773092269897,
"reward_std": 0.5950716435909271,
"rewards/accuracy_reward": -0.6093750298023224,
"rewards/cosine_rewards": -0.03664374351501465,
"rewards/format_reward": 0.015625,
"rewards/repetition_rewards": -8.355615136679262e-05,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 192.765625,
"epoch": 0.0020376974019358125,
"grad_norm": 8.264023776311538,
"kl": 0.00258636474609375,
"learning_rate": 9.98981151299032e-07,
"loss": 0.0001,
"reward": -0.4020528346300125,
"reward_std": 0.7227448225021362,
"rewards/accuracy_reward": -0.38750001788139343,
"rewards/cosine_rewards": -0.014348747674375772,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -0.00020408956333994865,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 199.953125,
"epoch": 0.0025471217524197657,
"grad_norm": 9.41735274952485,
"kl": 0.00286865234375,
"learning_rate": 9.9872643912379e-07,
"loss": 0.0001,
"reward": -0.45950669050216675,
"reward_std": 0.6219092607498169,
"rewards/accuracy_reward": -0.4343750476837158,
"rewards/cosine_rewards": -0.02503613755106926,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -9.553764903103001e-05,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 197.9375,
"epoch": 0.003056546102903719,
"grad_norm": 12.944765767909546,
"kl": 0.008697509765625,
"learning_rate": 9.984717269485481e-07,
"loss": 0.0003,
"reward": -0.42242346704006195,
"reward_std": 0.6794147342443466,
"rewards/accuracy_reward": -0.40937504172325134,
"rewards/cosine_rewards": -0.028209966607391834,
"rewards/format_reward": 0.015625,
"rewards/repetition_rewards": -0.0004634863289538771,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 131.859375,
"epoch": 0.003565970453387672,
"grad_norm": 10.259430825273313,
"kl": 0.013763427734375,
"learning_rate": 9.98217014773306e-07,
"loss": 0.0005,
"reward": -0.33318234980106354,
"reward_std": 0.7437820434570312,
"rewards/accuracy_reward": -0.35625000298023224,
"rewards/cosine_rewards": -0.00804880098439753,
"rewards/format_reward": 0.03125,
"rewards/repetition_rewards": -0.00013354701513890177,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 138.0625,
"epoch": 0.004075394803871625,
"grad_norm": 8.664940595308508,
"kl": 0.01800537109375,
"learning_rate": 9.979623025980642e-07,
"loss": 0.0007,
"reward": -0.3353596553206444,
"reward_std": 0.7424190640449524,
"rewards/accuracy_reward": -0.32500002533197403,
"rewards/cosine_rewards": -0.010187382809817791,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -0.00017226976342499256,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 130.609375,
"epoch": 0.004584819154355578,
"grad_norm": 12.906962146752678,
"kl": 0.013641357421875,
"learning_rate": 9.977075904228221e-07,
"loss": 0.0005,
"reward": -0.5576262176036835,
"reward_std": 0.38936011493206024,
"rewards/accuracy_reward": -0.546875,
"rewards/cosine_rewards": -0.010545612312853336,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": -0.00020559210679493845,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 118.375,
"epoch": 0.005094243504839531,
"grad_norm": 12.675664846435772,
"kl": 0.014129638671875,
"learning_rate": 9.974528782475803e-07,
"loss": 0.0006,
"reward": -0.5825353264808655,
"reward_std": 0.32141495356336236,
"rewards/accuracy_reward": -0.5750000178813934,
"rewards/cosine_rewards": -0.0075353041756898165,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 116.5625,
"epoch": 0.0056036678553234845,
"grad_norm": 83.14378275688269,
"kl": 0.011932373046875,
"learning_rate": 9.971981660723382e-07,
"loss": 0.0005,
"reward": -0.4973638355731964,
"reward_std": 0.6479763090610504,
"rewards/accuracy_reward": -0.4906250536441803,
"rewards/cosine_rewards": -0.006738818949088454,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 122.5,
"epoch": 0.006113092205807438,
"grad_norm": 10.015051037156322,
"kl": 0.01776123046875,
"learning_rate": 9.969434538970963e-07,
"loss": 0.0007,
"reward": -0.5842953324317932,
"reward_std": 0.3923248201608658,
"rewards/accuracy_reward": -0.5750000029802322,
"rewards/cosine_rewards": -0.009295305702835321,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 113.984375,
"epoch": 0.006622516556291391,
"grad_norm": 11.394446741932766,
"kl": 0.018157958984375,
"learning_rate": 9.966887417218542e-07,
"loss": 0.0007,
"reward": -0.5545713007450104,
"reward_std": 0.5603736639022827,
"rewards/accuracy_reward": -0.5468750298023224,
"rewards/cosine_rewards": -0.007696274435147643,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 105.8125,
"epoch": 0.007131940906775344,
"grad_norm": 11.774338615514537,
"kl": 0.017730712890625,
"learning_rate": 9.964340295466124e-07,
"loss": 0.0007,
"reward": -0.24103393778204918,
"reward_std": 0.770084798336029,
"rewards/accuracy_reward": -0.23750002682209015,
"rewards/cosine_rewards": -0.0035339330206625164,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 99.5,
"epoch": 0.007641365257259297,
"grad_norm": 12.461454822945,
"kl": 0.01995849609375,
"learning_rate": 9.961793173713703e-07,
"loss": 0.0008,
"reward": -0.7055607736110687,
"reward_std": 0.2303236834704876,
"rewards/accuracy_reward": -0.7156250178813934,
"rewards/cosine_rewards": -0.005560769001021981,
"rewards/format_reward": 0.015625,
"rewards/repetition_rewards": 0.0,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 101.53125,
"epoch": 0.00815078960774325,
"grad_norm": 16.951183982865736,
"kl": 0.0206298828125,
"learning_rate": 9.959246051961282e-07,
"loss": 0.0008,
"reward": -0.3540929928421974,
"reward_std": 0.7245323657989502,
"rewards/accuracy_reward": -0.3500000238418579,
"rewards/cosine_rewards": -0.004092983668670058,
"rewards/format_reward": 0.0,
"rewards/repetition_rewards": 0.0,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 96.578125,
"epoch": 0.008660213958227204,
"grad_norm": 9.458837096460513,
"kl": 0.025634765625,
"learning_rate": 9.956698930208864e-07,
"loss": 0.001,
"reward": -0.36599001288414,
"reward_std": 0.6569808125495911,
"rewards/accuracy_reward": -0.37812502682209015,
"rewards/cosine_rewards": -0.003489995375275612,
"rewards/format_reward": 0.015625,
"rewards/repetition_rewards": 0.0,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 96.625,
"epoch": 0.009169638308711156,
"grad_norm": 11.937426620152417,
"kl": 0.02801513671875,
"learning_rate": 9.954151808456443e-07,
"loss": 0.0011,
"reward": -0.40710097551345825,
"reward_std": 0.7412720322608948,
"rewards/accuracy_reward": -0.43437501788139343,
"rewards/cosine_rewards": -0.003975986503064632,
"rewards/format_reward": 0.03125,
"rewards/repetition_rewards": 0.0,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 97.21875,
"epoch": 0.00967906265919511,
"grad_norm": 12.317962197180934,
"kl": 0.03466796875,
"learning_rate": 9.951604686704024e-07,
"loss": 0.0014,
"reward": -0.25013431906700134,
"reward_std": 0.7123757898807526,
"rewards/accuracy_reward": -0.32500001788139343,
"rewards/cosine_rewards": -0.003259307239204645,
"rewards/format_reward": 0.078125,
"rewards/repetition_rewards": 0.0,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 97.109375,
"epoch": 0.010188487009679063,
"grad_norm": 24.27751022595849,
"kl": 0.037109375,
"learning_rate": 9.949057564951603e-07,
"loss": 0.0015,
"reward": -0.2632312625646591,
"reward_std": 0.6930468529462814,
"rewards/accuracy_reward": -0.4624999985098839,
"rewards/cosine_rewards": -0.003856247873045504,
"rewards/format_reward": 0.203125,
"rewards/repetition_rewards": 0.0,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 109.03125,
"epoch": 0.010697911360163017,
"grad_norm": 12.508736780907405,
"kl": 0.053955078125,
"learning_rate": 9.946510443199185e-07,
"loss": 0.0022,
"reward": -0.010567170567810535,
"reward_std": 0.7874742448329926,
"rewards/accuracy_reward": -0.4125000238418579,
"rewards/cosine_rewards": -0.004317150334827602,
"rewards/format_reward": 0.40625,
"rewards/repetition_rewards": 0.0,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 113.390625,
"epoch": 0.011207335710646969,
"grad_norm": 10.983519481785477,
"kl": 0.073974609375,
"learning_rate": 9.943963321446764e-07,
"loss": 0.003,
"reward": 0.5529356598854065,
"reward_std": 0.9540310502052307,
"rewards/accuracy_reward": -0.2093750163912773,
"rewards/cosine_rewards": -0.003314302652142942,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": 0.0,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 113.796875,
"epoch": 0.011716760061130923,
"grad_norm": 59.13650095831239,
"kl": 0.084716796875,
"learning_rate": 9.941416199694345e-07,
"loss": 0.0034,
"reward": 0.49799469113349915,
"reward_std": 0.6547213792800903,
"rewards/accuracy_reward": -0.43437501788139343,
"rewards/cosine_rewards": -0.005130313569679856,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 114.34375,
"epoch": 0.012226184411614875,
"grad_norm": 33.85321217925331,
"kl": 0.078369140625,
"learning_rate": 9.938869077941925e-07,
"loss": 0.0031,
"reward": 0.5440552532672882,
"reward_std": 0.4689805209636688,
"rewards/accuracy_reward": -0.43437501788139343,
"rewards/cosine_rewards": -0.005944762844592333,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 99.078125,
"epoch": 0.01273560876209883,
"grad_norm": 27.614415529764607,
"kl": 0.23876953125,
"learning_rate": 9.936321956189506e-07,
"loss": 0.0096,
"reward": 0.319291889667511,
"reward_std": 0.2991320895962417,
"rewards/accuracy_reward": -0.659375011920929,
"rewards/cosine_rewards": -0.005708091426640749,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 21.09375,
"epoch": 0.013245033112582781,
"grad_norm": 78.27860464199816,
"kl": 0.810546875,
"learning_rate": 9.933774834437085e-07,
"loss": 0.0324,
"reward": 0.758573591709137,
"reward_std": 0.8151377141475677,
"rewards/accuracy_reward": -0.24062500894069672,
"rewards/cosine_rewards": -0.0008013773494894849,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 18.75,
"epoch": 0.013754457463066735,
"grad_norm": 15.792726008582374,
"kl": 0.876953125,
"learning_rate": 9.931227712684667e-07,
"loss": 0.0351,
"reward": 0.5177058726549149,
"reward_std": 0.6054319739341736,
"rewards/accuracy_reward": -0.46562501788139343,
"rewards/cosine_rewards": -0.000942649960052222,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.000101461038866546,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 15.15625,
"epoch": 0.014263881813550688,
"grad_norm": 28.538085950991302,
"kl": 0.853515625,
"learning_rate": 9.928680590932246e-07,
"loss": 0.0342,
"reward": 0.30591557919979095,
"reward_std": 0.3449897766113281,
"rewards/accuracy_reward": -0.6625000238418579,
"rewards/cosine_rewards": -0.00033439824983361177,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 13.1875,
"epoch": 0.014773306164034642,
"grad_norm": 19.672166251005525,
"kl": 0.939453125,
"learning_rate": 9.926133469179825e-07,
"loss": 0.0375,
"reward": 0.4091247171163559,
"reward_std": 0.46140581369400024,
"rewards/accuracy_reward": -0.5750000476837158,
"rewards/cosine_rewards": -0.0002502501738490537,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 16.171875,
"epoch": 0.015282730514518594,
"grad_norm": 35.96869326683099,
"kl": 1.416015625,
"learning_rate": 9.923586347427406e-07,
"loss": 0.0566,
"reward": 0.5554585456848145,
"reward_std": 0.7011753022670746,
"rewards/accuracy_reward": -0.3812499940395355,
"rewards/cosine_rewards": -0.0007914370798971504,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 16.15625,
"epoch": 0.015792154865002548,
"grad_norm": 24.745191247130563,
"kl": 1.01171875,
"learning_rate": 9.921039225674986e-07,
"loss": 0.0405,
"reward": 0.6306657046079636,
"reward_std": 0.7620185613632202,
"rewards/accuracy_reward": -0.32187502086162567,
"rewards/cosine_rewards": -0.0005842609098181129,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": 0.0,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 39.140625,
"epoch": 0.0163015792154865,
"grad_norm": 12.709104159306316,
"kl": 0.7265625,
"learning_rate": 9.918492103922567e-07,
"loss": 0.0291,
"reward": 0.38606902956962585,
"reward_std": 0.8792209327220917,
"rewards/accuracy_reward": -0.39375001192092896,
"rewards/cosine_rewards": -0.001430943259038031,
"rewards/format_reward": 0.78125,
"rewards/repetition_rewards": 0.0,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 15.03125,
"epoch": 0.016811003565970453,
"grad_norm": 17.976804747397264,
"kl": 0.904296875,
"learning_rate": 9.915944982170146e-07,
"loss": 0.0361,
"reward": 0.4996982365846634,
"reward_std": 0.7649624943733215,
"rewards/accuracy_reward": -0.4687500149011612,
"rewards/cosine_rewards": -0.00030174180574249476,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 22.609375,
"epoch": 0.017320427916454408,
"grad_norm": 39.58286880123024,
"kl": 0.8828125,
"learning_rate": 9.913397860417728e-07,
"loss": 0.0353,
"reward": 0.4207390695810318,
"reward_std": 0.8459209501743317,
"rewards/accuracy_reward": -0.4375000298023224,
"rewards/cosine_rewards": -0.0011358977280906402,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": 0.0,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 13.328125,
"epoch": 0.01782985226693836,
"grad_norm": 19.17123986238676,
"kl": 0.955078125,
"learning_rate": 9.910850738665307e-07,
"loss": 0.0383,
"reward": 0.474868506193161,
"reward_std": 0.6449769139289856,
"rewards/accuracy_reward": -0.49375005066394806,
"rewards/cosine_rewards": -0.0001314536166319158,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 24.328125,
"epoch": 0.018339276617422313,
"grad_norm": 23.5658227799872,
"kl": 0.95703125,
"learning_rate": 9.908303616912888e-07,
"loss": 0.0382,
"reward": 0.4700201153755188,
"reward_std": 0.7454200983047485,
"rewards/accuracy_reward": -0.41875001788139343,
"rewards/cosine_rewards": -0.0018548529915278777,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": 0.0,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 13.921875,
"epoch": 0.018848700967906265,
"grad_norm": 11.872294898362206,
"kl": 1.001953125,
"learning_rate": 9.905756495160467e-07,
"loss": 0.0401,
"reward": 0.5029261708259583,
"reward_std": 0.7742039263248444,
"rewards/accuracy_reward": -0.4343750327825546,
"rewards/cosine_rewards": -0.0001988118929148186,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 16.40625,
"epoch": 0.01935812531839022,
"grad_norm": 14.39070050703297,
"kl": 0.978515625,
"learning_rate": 9.903209373408049e-07,
"loss": 0.0391,
"reward": 0.4616774320602417,
"reward_std": 0.7915183901786804,
"rewards/accuracy_reward": -0.4125000238418579,
"rewards/cosine_rewards": -0.000822544090624433,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": 0.0,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 22.1875,
"epoch": 0.019867549668874173,
"grad_norm": 9.423514240680602,
"kl": 0.9375,
"learning_rate": 9.900662251655628e-07,
"loss": 0.0376,
"reward": 0.5430571883916855,
"reward_std": 0.5502887666225433,
"rewards/accuracy_reward": -0.4062500447034836,
"rewards/cosine_rewards": -0.003600762978749117,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.00021701389050576836,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 12.4375,
"epoch": 0.020376974019358125,
"grad_norm": 25.806979588800377,
"kl": 0.9140625,
"learning_rate": 9.89811512990321e-07,
"loss": 0.0366,
"reward": 0.503069132566452,
"reward_std": 0.6060213148593903,
"rewards/accuracy_reward": -0.4656250327825546,
"rewards/cosine_rewards": -5.582944686466362e-05,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 12.46875,
"epoch": 0.020886398369842078,
"grad_norm": 20.104239930601235,
"kl": 0.9296875,
"learning_rate": 9.895568008150789e-07,
"loss": 0.0372,
"reward": 0.631201758980751,
"reward_std": 0.7148115336894989,
"rewards/accuracy_reward": -0.35312502086162567,
"rewards/cosine_rewards": -4.822800292458851e-05,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 12.828125,
"epoch": 0.021395822720326033,
"grad_norm": 7.720832433302504,
"kl": 0.841796875,
"learning_rate": 9.89302088639837e-07,
"loss": 0.0336,
"reward": 0.5936954319477081,
"reward_std": 0.4961870163679123,
"rewards/accuracy_reward": -0.4062500298023224,
"rewards/cosine_rewards": -5.458852319861762e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 12.984375,
"epoch": 0.021905247070809986,
"grad_norm": 9.831243087089065,
"kl": 0.76953125,
"learning_rate": 9.89047376464595e-07,
"loss": 0.0308,
"reward": 0.6499472558498383,
"reward_std": 0.7755721807479858,
"rewards/accuracy_reward": -0.3500000238418579,
"rewards/cosine_rewards": -5.273178430797998e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.022414671421293938,
"grad_norm": 15.405185965961632,
"kl": 0.79296875,
"learning_rate": 9.88792664289353e-07,
"loss": 0.0318,
"reward": 0.8749629557132721,
"reward_std": 0.8532125055789948,
"rewards/accuracy_reward": -0.1250000149011612,
"rewards/cosine_rewards": -3.706023017002735e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.02292409577177789,
"grad_norm": 73.99173140679622,
"kl": 0.814453125,
"learning_rate": 9.88537952114111e-07,
"loss": 0.0326,
"reward": 0.8468359708786011,
"reward_std": 0.6202812939882278,
"rewards/accuracy_reward": -0.15312501415610313,
"rewards/cosine_rewards": -3.904559889633674e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 13.09375,
"epoch": 0.023433520122261846,
"grad_norm": 57.01467559353994,
"kl": 0.802734375,
"learning_rate": 9.882832399388691e-07,
"loss": 0.0321,
"reward": 0.7187013626098633,
"reward_std": 0.7317405939102173,
"rewards/accuracy_reward": -0.26562502793967724,
"rewards/cosine_rewards": -4.8641444664099254e-05,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 14.8125,
"epoch": 0.023942944472745798,
"grad_norm": 49.94239079179156,
"kl": 0.8125,
"learning_rate": 9.88028527763627e-07,
"loss": 0.0325,
"reward": 0.7904289066791534,
"reward_std": 0.6411640644073486,
"rewards/accuracy_reward": -0.2093750163912773,
"rewards/cosine_rewards": -0.00019609702576417476,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 13.984375,
"epoch": 0.02445236882322975,
"grad_norm": 29.421172213478044,
"kl": 0.8046875,
"learning_rate": 9.877738155883852e-07,
"loss": 0.0322,
"reward": 0.7342777252197266,
"reward_std": 0.3429698422551155,
"rewards/accuracy_reward": -0.2656249850988388,
"rewards/cosine_rewards": -9.731029422255233e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.024961793173713703,
"grad_norm": 30.98665699286148,
"kl": 0.86328125,
"learning_rate": 9.87519103413143e-07,
"loss": 0.0346,
"reward": 1.0437248945236206,
"reward_std": 0.6164620369672775,
"rewards/accuracy_reward": 0.04374997317790985,
"rewards/cosine_rewards": -2.5148013037323835e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.02547121752419766,
"grad_norm": 25.805195350433394,
"kl": 0.787109375,
"learning_rate": 9.872643912379012e-07,
"loss": 0.0315,
"reward": 0.6499470472335815,
"reward_std": 0.4753982424736023,
"rewards/accuracy_reward": -0.3500000238418579,
"rewards/cosine_rewards": -5.2943185437470675e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.02598064187468161,
"grad_norm": 60.37753917289171,
"kl": 0.865234375,
"learning_rate": 9.870096790626592e-07,
"loss": 0.0347,
"reward": 1.0999788641929626,
"reward_std": 0.716822475194931,
"rewards/accuracy_reward": 0.09999999590218067,
"rewards/cosine_rewards": -2.117727399308933e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.026490066225165563,
"grad_norm": 60.29300770886218,
"kl": 0.859375,
"learning_rate": 9.867549668874173e-07,
"loss": 0.0343,
"reward": 1.3249947428703308,
"reward_std": 0.6325759440660477,
"rewards/accuracy_reward": 0.32499997690320015,
"rewards/cosine_rewards": -5.294318725646008e-06,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 14.859375,
"epoch": 0.026999490575649515,
"grad_norm": 33.49731491963465,
"kl": 0.96484375,
"learning_rate": 9.865002547121752e-07,
"loss": 0.0386,
"reward": 0.6497911810874939,
"reward_std": 0.23335448652505875,
"rewards/accuracy_reward": -0.3500000163912773,
"rewards/cosine_rewards": -0.00020882973694824614,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.02750891492613347,
"grad_norm": 21.106152145400298,
"kl": 0.85546875,
"learning_rate": 9.862455425369333e-07,
"loss": 0.0342,
"reward": 1.3812487125396729,
"reward_std": 0.26327238231897354,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": -1.3235799087851774e-06,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.028018339276617423,
"grad_norm": 50.8468456202969,
"kl": 0.767578125,
"learning_rate": 9.859908303616913e-07,
"loss": 0.0307,
"reward": 1.493756651878357,
"reward_std": 0.3182205259799957,
"rewards/accuracy_reward": 0.4937499910593033,
"rewards/cosine_rewards": 6.617898179683834e-06,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.028527763627101375,
"grad_norm": 45.22229728431362,
"kl": 0.833984375,
"learning_rate": 9.857361181864494e-07,
"loss": 0.0334,
"reward": 0.9030899405479431,
"reward_std": 0.2386654019355774,
"rewards/accuracy_reward": -0.09687501937150955,
"rewards/cosine_rewards": -3.507485962472856e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.029037187977585328,
"grad_norm": 251.48941881136554,
"kl": 0.828125,
"learning_rate": 9.854814060112073e-07,
"loss": 0.0331,
"reward": 1.5781376361846924,
"reward_std": 0.3039933070540428,
"rewards/accuracy_reward": 0.5781249701976776,
"rewards/cosine_rewards": 1.2574006632348755e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 14.453125,
"epoch": 0.029546612328069283,
"grad_norm": 36.77436472817121,
"kl": 0.939453125,
"learning_rate": 9.852266938359653e-07,
"loss": 0.0376,
"reward": 1.334273636341095,
"reward_std": 0.34448733925819397,
"rewards/accuracy_reward": 0.34999997913837433,
"rewards/cosine_rewards": -0.00010142281280423049,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 14.4375,
"epoch": 0.030056036678553236,
"grad_norm": 46.54068675299219,
"kl": 0.89453125,
"learning_rate": 9.849719816607234e-07,
"loss": 0.0358,
"reward": 0.9967500269412994,
"reward_std": 0.4488208740949631,
"rewards/accuracy_reward": 0.012499993667006493,
"rewards/cosine_rewards": -0.00012501747096393956,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.030565461029037188,
"grad_norm": 63.09725081890949,
"kl": 0.837890625,
"learning_rate": 9.847172694854813e-07,
"loss": 0.0335,
"reward": 0.9874708652496338,
"reward_std": 0.33707569539546967,
"rewards/accuracy_reward": -0.012500002980232239,
"rewards/cosine_rewards": -2.9118752991053043e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 13.0,
"epoch": 0.03107488537952114,
"grad_norm": 105.45023488529014,
"kl": 0.8359375,
"learning_rate": 9.844625573102394e-07,
"loss": 0.0334,
"reward": 1.1281058490276337,
"reward_std": 0.3039932996034622,
"rewards/accuracy_reward": 0.12812498584389687,
"rewards/cosine_rewards": -1.919190435728524e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 14.640625,
"epoch": 0.031584309730005096,
"grad_norm": 85.12929156788996,
"kl": 0.84375,
"learning_rate": 9.842078451349974e-07,
"loss": 0.0337,
"reward": 0.7748350501060486,
"reward_std": 0.5448895841836929,
"rewards/accuracy_reward": -0.2093750238418579,
"rewards/cosine_rewards": -0.00016493651855853386,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 19.078125,
"epoch": 0.032093734080489045,
"grad_norm": 8.426167428667783,
"kl": 0.814453125,
"learning_rate": 9.839531329597555e-07,
"loss": 0.0326,
"reward": 0.874523401260376,
"reward_std": 0.0010828198865056038,
"rewards/accuracy_reward": -0.1250000149011612,
"rewards/cosine_rewards": -0.0004766158472193638,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 19.34375,
"epoch": 0.032603158430973,
"grad_norm": 86.47647424288853,
"kl": 0.810546875,
"learning_rate": 9.836984207845134e-07,
"loss": 0.0324,
"reward": 1.6625866889953613,
"reward_std": 0.19662056118249893,
"rewards/accuracy_reward": 0.6625000238418579,
"rewards/cosine_rewards": 8.658922160975635e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 17.6875,
"epoch": 0.033112582781456956,
"grad_norm": 59.22678939682069,
"kl": 0.86328125,
"learning_rate": 9.834437086092716e-07,
"loss": 0.0345,
"reward": 0.915763258934021,
"reward_std": 0.082692209049128,
"rewards/accuracy_reward": -0.06875000894069672,
"rewards/cosine_rewards": 0.00013823993504047394,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 24.671875,
"epoch": 0.033622007131940905,
"grad_norm": 170.96252285475958,
"kl": 0.7734375,
"learning_rate": 9.831889964340295e-07,
"loss": 0.0309,
"reward": 1.2965829372406006,
"reward_std": 0.32569222897291183,
"rewards/accuracy_reward": 0.2968750074505806,
"rewards/cosine_rewards": -0.0002921203849837184,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 18.890625,
"epoch": 0.03413143148242486,
"grad_norm": 302.02752287161115,
"kl": 0.84765625,
"learning_rate": 9.829342842587876e-07,
"loss": 0.0339,
"reward": 1.2968038320541382,
"reward_std": 0.27610647678375244,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -7.123823161236942e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 22.984375,
"epoch": 0.034640855832908816,
"grad_norm": 617.6786375620823,
"kl": 0.77734375,
"learning_rate": 9.826795720835456e-07,
"loss": 0.0311,
"reward": 1.4656760096549988,
"reward_std": 0.2886117473244667,
"rewards/accuracy_reward": 0.46562498807907104,
"rewards/cosine_rewards": 5.1008202717639506e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 24.34375,
"epoch": 0.035150280183392765,
"grad_norm": 29.79960642054238,
"kl": 0.728515625,
"learning_rate": 9.824248599083037e-07,
"loss": 0.0292,
"reward": 1.309334635734558,
"reward_std": 0.20424916595220566,
"rewards/accuracy_reward": 0.32500000298023224,
"rewards/cosine_rewards": -4.041045031044632e-05,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 23.90625,
"epoch": 0.03565970453387672,
"grad_norm": 91.00871807242451,
"kl": 0.744140625,
"learning_rate": 9.821701477330616e-07,
"loss": 0.0298,
"reward": 1.2686043679714203,
"reward_std": 0.10558865318307653,
"rewards/accuracy_reward": 0.26874998211860657,
"rewards/cosine_rewards": -0.00014566810568794608,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 36.328125,
"epoch": 0.03616912888436067,
"grad_norm": 159.05100875041754,
"kl": 0.765625,
"learning_rate": 9.819154355578195e-07,
"loss": 0.0306,
"reward": 1.2812767028808594,
"reward_std": 0.6231541335582733,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": 2.6669338694773614e-05,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 38.4375,
"epoch": 0.036678553234844626,
"grad_norm": 97.83490373613579,
"kl": 0.666015625,
"learning_rate": 9.816607233825777e-07,
"loss": 0.0266,
"reward": 1.647216558456421,
"reward_std": 0.32463081181049347,
"rewards/accuracy_reward": 0.6625000089406967,
"rewards/cosine_rewards": 0.00034145097015425563,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 40.046875,
"epoch": 0.03718797758532858,
"grad_norm": 88.5872574021467,
"kl": 0.630859375,
"learning_rate": 9.814060112073356e-07,
"loss": 0.0253,
"reward": 1.7599374055862427,
"reward_std": 0.3454015702009201,
"rewards/accuracy_reward": 0.7750000059604645,
"rewards/cosine_rewards": 0.0005623315373668447,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 60.140625,
"epoch": 0.03769740193581253,
"grad_norm": 16.49274288281998,
"kl": 0.501953125,
"learning_rate": 9.811512990320937e-07,
"loss": 0.0201,
"reward": 1.7728378772735596,
"reward_std": 0.2738931328058243,
"rewards/accuracy_reward": 0.8031250238418579,
"rewards/cosine_rewards": 0.0009629083215259016,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 100.9375,
"epoch": 0.038206826286296486,
"grad_norm": 18.283090697254373,
"kl": 0.18359375,
"learning_rate": 9.808965868568517e-07,
"loss": 0.0074,
"reward": 1.437682330608368,
"reward_std": 0.19817885756492615,
"rewards/accuracy_reward": 0.4375000149011612,
"rewards/cosine_rewards": 0.00018233060836791992,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 107.34375,
"epoch": 0.03871625063678044,
"grad_norm": 21.656183371701722,
"kl": 0.13330078125,
"learning_rate": 9.806418746816098e-07,
"loss": 0.0054,
"reward": 1.2524056434631348,
"reward_std": 0.14976192265748978,
"rewards/accuracy_reward": 0.26874999701976776,
"rewards/cosine_rewards": -0.0007193188357632607,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 109.5,
"epoch": 0.03922567498726439,
"grad_norm": 12.450187143986858,
"kl": 0.1328125,
"learning_rate": 9.803871625063677e-07,
"loss": 0.0053,
"reward": 1.535181999206543,
"reward_std": 0.04510992762516253,
"rewards/accuracy_reward": 0.5499999970197678,
"rewards/cosine_rewards": 0.0008070359472185373,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 112.96875,
"epoch": 0.039735099337748346,
"grad_norm": 96.22924405795379,
"kl": 0.12744140625,
"learning_rate": 9.801324503311258e-07,
"loss": 0.0051,
"reward": 1.4221445322036743,
"reward_std": 0.5387175530195236,
"rewards/accuracy_reward": 0.4374999850988388,
"rewards/cosine_rewards": 0.0002695363436941989,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 117.46875,
"epoch": 0.040244523688232295,
"grad_norm": 22.356744283314942,
"kl": 0.12451171875,
"learning_rate": 9.798777381558838e-07,
"loss": 0.005,
"reward": 0.9280500411987305,
"reward_std": 0.3032594621181488,
"rewards/accuracy_reward": -0.06875001452863216,
"rewards/cosine_rewards": -0.003199932281859219,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 110.09375,
"epoch": 0.04075394803871625,
"grad_norm": 11.587196398677966,
"kl": 0.12353515625,
"learning_rate": 9.79623025980642e-07,
"loss": 0.0049,
"reward": 1.0698014497756958,
"reward_std": 0.306557297706604,
"rewards/accuracy_reward": 0.07187498360872269,
"rewards/cosine_rewards": -0.0020735373545903713,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 119.359375,
"epoch": 0.041263372389200206,
"grad_norm": 15.552459183086345,
"kl": 0.115234375,
"learning_rate": 9.793683138053998e-07,
"loss": 0.0046,
"reward": 1.902881920337677,
"reward_std": 0.2866080105304718,
"rewards/accuracy_reward": 0.9156250059604645,
"rewards/cosine_rewards": 0.0028820185689255595,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 121.28125,
"epoch": 0.041772796739684155,
"grad_norm": 21.56424441435487,
"kl": 0.110107421875,
"learning_rate": 9.79113601630158e-07,
"loss": 0.0044,
"reward": 1.2677271366119385,
"reward_std": 0.10610348492627963,
"rewards/accuracy_reward": 0.26874999701976776,
"rewards/cosine_rewards": -0.0010228125611320138,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 115.484375,
"epoch": 0.04228222109016811,
"grad_norm": 11.018514876954287,
"kl": 0.125244140625,
"learning_rate": 9.788588894549159e-07,
"loss": 0.005,
"reward": 1.2675296068191528,
"reward_std": 0.16161296842619777,
"rewards/accuracy_reward": 0.26874999701976776,
"rewards/cosine_rewards": -0.0012203185469843447,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 119.078125,
"epoch": 0.04279164544065207,
"grad_norm": 20.603407343348504,
"kl": 0.1083984375,
"learning_rate": 9.78604177279674e-07,
"loss": 0.0043,
"reward": 1.1548139452934265,
"reward_std": 0.537171483039856,
"rewards/accuracy_reward": 0.1562499925494194,
"rewards/cosine_rewards": -0.0014360386412590742,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 122.640625,
"epoch": 0.043301069791136015,
"grad_norm": 60.83530697951784,
"kl": 0.18359375,
"learning_rate": 9.78349465104432e-07,
"loss": 0.0073,
"reward": 1.5197246074676514,
"reward_std": 0.5150813460350037,
"rewards/accuracy_reward": 0.518750011920929,
"rewards/cosine_rewards": 0.000974582158960402,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 121.015625,
"epoch": 0.04381049414161997,
"grad_norm": 13.721540678774238,
"kl": 0.12451171875,
"learning_rate": 9.780947529291899e-07,
"loss": 0.005,
"reward": 1.1832407712936401,
"reward_std": 0.18641822785139084,
"rewards/accuracy_reward": 0.18437500298023224,
"rewards/cosine_rewards": -0.001134182559326291,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 119.578125,
"epoch": 0.04431991849210392,
"grad_norm": 215.40977191184217,
"kl": 0.115478515625,
"learning_rate": 9.77840040753948e-07,
"loss": 0.0046,
"reward": 1.2237018644809723,
"reward_std": 0.23010382801294327,
"rewards/accuracy_reward": 0.24062499776482582,
"rewards/cosine_rewards": -0.0012981001054868102,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 127.125,
"epoch": 0.044829342842587876,
"grad_norm": 11.570945117677702,
"kl": 0.110595703125,
"learning_rate": 9.77585328578706e-07,
"loss": 0.0044,
"reward": 1.5510605573654175,
"reward_std": 0.0015471973456442356,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.001060541602782905,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 129.953125,
"epoch": 0.04533876719307183,
"grad_norm": 9.451343704964001,
"kl": 0.104248046875,
"learning_rate": 9.77330616403464e-07,
"loss": 0.0042,
"reward": 1.5073344111442566,
"reward_std": 0.35981758683919907,
"rewards/accuracy_reward": 0.5218750052154064,
"rewards/cosine_rewards": 0.0010844313073903322,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 129.71875,
"epoch": 0.04584819154355578,
"grad_norm": 16.032910799390205,
"kl": 0.094970703125,
"learning_rate": 9.77075904228222e-07,
"loss": 0.0038,
"reward": 1.919905662536621,
"reward_std": 0.24129686888772994,
"rewards/accuracy_reward": 0.9156250059604645,
"rewards/cosine_rewards": 0.0042806623969227076,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 130.390625,
"epoch": 0.046357615894039736,
"grad_norm": 21.086614900693085,
"kl": 0.101806640625,
"learning_rate": 9.768211920529801e-07,
"loss": 0.0041,
"reward": 1.5919697284698486,
"reward_std": 0.2428576573729515,
"rewards/accuracy_reward": 0.6062500029802322,
"rewards/cosine_rewards": 0.0013447333476506174,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 142.734375,
"epoch": 0.04686704024452369,
"grad_norm": 8.093804874468816,
"kl": 0.095458984375,
"learning_rate": 9.76566479877738e-07,
"loss": 0.0038,
"reward": 1.6935226917266846,
"reward_std": 0.1869470328092575,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/cosine_rewards": 0.0028977063193451613,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 143.203125,
"epoch": 0.04737646459500764,
"grad_norm": 10.624707519768712,
"kl": 0.099609375,
"learning_rate": 9.763117677024962e-07,
"loss": 0.004,
"reward": 1.4030739068984985,
"reward_std": 0.3680836334824562,
"rewards/accuracy_reward": 0.43437500298023224,
"rewards/cosine_rewards": -5.1158247515559196e-05,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 150.90625,
"epoch": 0.047885888945491596,
"grad_norm": 25.156466743552734,
"kl": 0.101318359375,
"learning_rate": 9.760570555272541e-07,
"loss": 0.0041,
"reward": 1.5800000429153442,
"reward_std": 0.5084549486637115,
"rewards/accuracy_reward": 0.578125,
"rewards/cosine_rewards": 0.0018750545859802514,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 148.46875,
"epoch": 0.048395313295975545,
"grad_norm": 12.684719084813777,
"kl": 0.10205078125,
"learning_rate": 9.758023433520122e-07,
"loss": 0.0041,
"reward": 1.5234779119491577,
"reward_std": 0.18682076036930084,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.0016028713434934616,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 161.109375,
"epoch": 0.0489047376464595,
"grad_norm": 13.854458043447748,
"kl": 0.108154296875,
"learning_rate": 9.755476311767702e-07,
"loss": 0.0043,
"reward": 1.6655999422073364,
"reward_std": 0.4286635220050812,
"rewards/accuracy_reward": 0.6624999791383743,
"rewards/cosine_rewards": 0.0030998505535535514,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 167.609375,
"epoch": 0.049414161996943456,
"grad_norm": 15.632405914543359,
"kl": 0.098388671875,
"learning_rate": 9.752929190015283e-07,
"loss": 0.0039,
"reward": 1.1535860896110535,
"reward_std": 0.36188751459121704,
"rewards/accuracy_reward": 0.1562499888241291,
"rewards/cosine_rewards": -0.002663849270902574,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 168.453125,
"epoch": 0.049923586347427405,
"grad_norm": 8.747829969093605,
"kl": 0.112060546875,
"learning_rate": 9.750382068262862e-07,
"loss": 0.0045,
"reward": 1.3531205654144287,
"reward_std": 0.18947682529687881,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": -4.528439603745937e-06,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 168.71875,
"epoch": 0.05043301069791136,
"grad_norm": 12.880266278774922,
"kl": 0.112060546875,
"learning_rate": 9.747834946510442e-07,
"loss": 0.0045,
"reward": 1.619386613368988,
"reward_std": 0.5798123776912689,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.0037616335321217775,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": 0.0,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 173.859375,
"epoch": 0.05094243504839532,
"grad_norm": 18.637994264229288,
"kl": 0.109619140625,
"learning_rate": 9.745287824758023e-07,
"loss": 0.0044,
"reward": 1.448248565196991,
"reward_std": 0.4085986465215683,
"rewards/accuracy_reward": 0.4624999687075615,
"rewards/cosine_rewards": 0.0013735336251556873,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 175.375,
"epoch": 0.051451859398879266,
"grad_norm": 26.647397602965935,
"kl": 0.110107421875,
"learning_rate": 9.742740703005602e-07,
"loss": 0.0044,
"reward": 1.0668614506721497,
"reward_std": 0.35026729106903076,
"rewards/accuracy_reward": 0.07187499292194843,
"rewards/cosine_rewards": -0.004894306650385261,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00011927480954909697,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 174.0,
"epoch": 0.05196128374936322,
"grad_norm": 12.612763516820905,
"kl": 0.112060546875,
"learning_rate": 9.740193581253183e-07,
"loss": 0.0045,
"reward": 1.4354371428489685,
"reward_std": 0.20804932340979576,
"rewards/accuracy_reward": 0.46562499552965164,
"rewards/cosine_rewards": 0.0010621265973895788,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 176.984375,
"epoch": 0.05247070809984717,
"grad_norm": 19.00540852037756,
"kl": 0.116455078125,
"learning_rate": 9.737646459500763e-07,
"loss": 0.0047,
"reward": 1.0997494161128998,
"reward_std": 0.5674505531787872,
"rewards/accuracy_reward": 0.1499999761581421,
"rewards/cosine_rewards": -0.003250634763389826,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0001250000059371814,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 187.3125,
"epoch": 0.052980132450331126,
"grad_norm": 8.993773014446798,
"kl": 0.115478515625,
"learning_rate": 9.735099337748344e-07,
"loss": 0.0046,
"reward": 1.547185480594635,
"reward_std": 0.5772347450256348,
"rewards/accuracy_reward": 0.5750000029802322,
"rewards/cosine_rewards": 0.0034354651579633355,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 182.09375,
"epoch": 0.05348955680081508,
"grad_norm": 21.83379704072219,
"kl": 0.11279296875,
"learning_rate": 9.732552215995923e-07,
"loss": 0.0045,
"reward": 0.9665651321411133,
"reward_std": 0.19240357726812363,
"rewards/accuracy_reward": -0.012500010430812836,
"rewards/cosine_rewards": -0.005309856729581952,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 184.296875,
"epoch": 0.05399898115129903,
"grad_norm": 15.74936299058057,
"kl": 0.1240234375,
"learning_rate": 9.730005094243505e-07,
"loss": 0.005,
"reward": 0.8526512682437897,
"reward_std": 0.45641621947288513,
"rewards/accuracy_reward": -0.1250000149011612,
"rewards/cosine_rewards": -0.006723731989040971,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 178.203125,
"epoch": 0.054508405501782986,
"grad_norm": 7.621263779056561,
"kl": 0.116455078125,
"learning_rate": 9.727457972491084e-07,
"loss": 0.0047,
"reward": 1.4213617444038391,
"reward_std": 0.42073580622673035,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": -0.0005132523947395384,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 180.234375,
"epoch": 0.05501782985226694,
"grad_norm": 14.098307573524853,
"kl": 0.119140625,
"learning_rate": 9.724910850738665e-07,
"loss": 0.0048,
"reward": 1.1232723593711853,
"reward_std": 0.45567604154348373,
"rewards/accuracy_reward": 0.12812499329447746,
"rewards/cosine_rewards": -0.004852580255828798,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 183.328125,
"epoch": 0.05552725420275089,
"grad_norm": 10.937546247684887,
"kl": 0.18994140625,
"learning_rate": 9.722363728986245e-07,
"loss": 0.0076,
"reward": 1.8381596803665161,
"reward_std": 0.28626738488674164,
"rewards/accuracy_reward": 0.831250011920929,
"rewards/cosine_rewards": 0.006909639807417989,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 174.375,
"epoch": 0.056036678553234846,
"grad_norm": 16.445027328915916,
"kl": 0.11181640625,
"learning_rate": 9.719816607233826e-07,
"loss": 0.0045,
"reward": 1.2096136808395386,
"reward_std": 0.36066293716430664,
"rewards/accuracy_reward": 0.21249999105930328,
"rewards/cosine_rewards": -0.002886334084905684,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 168.765625,
"epoch": 0.056546102903718795,
"grad_norm": 18.874204475299454,
"kl": 0.106689453125,
"learning_rate": 9.717269485481405e-07,
"loss": 0.0043,
"reward": 1.3519207835197449,
"reward_std": 0.08376272046007216,
"rewards/accuracy_reward": 0.3531250059604645,
"rewards/cosine_rewards": -0.0012042350135743618,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 157.796875,
"epoch": 0.05705552725420275,
"grad_norm": 9.104959106458736,
"kl": 0.121337890625,
"learning_rate": 9.714722363728986e-07,
"loss": 0.0049,
"reward": 1.381228744983673,
"reward_std": 0.16323383897542953,
"rewards/accuracy_reward": 0.3812500238418579,
"rewards/cosine_rewards": -2.1282234229147434e-05,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 162.171875,
"epoch": 0.05756495160468671,
"grad_norm": 13.28611466805594,
"kl": 0.10888671875,
"learning_rate": 9.712175241976566e-07,
"loss": 0.0044,
"reward": 1.3366525173187256,
"reward_std": 0.28694501193240285,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": -0.0008475282229483128,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 159.4375,
"epoch": 0.058074375955170655,
"grad_norm": 20.0350592953945,
"kl": 0.107666015625,
"learning_rate": 9.709628120224145e-07,
"loss": 0.0043,
"reward": 1.4107850790023804,
"reward_std": 0.18804995715618134,
"rewards/accuracy_reward": 0.40937501937150955,
"rewards/cosine_rewards": 0.001410042867064476,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 159.6875,
"epoch": 0.05858380030565461,
"grad_norm": 9.526099983425397,
"kl": 0.10595703125,
"learning_rate": 9.707080998471726e-07,
"loss": 0.0042,
"reward": 1.4221826791763306,
"reward_std": 0.2918977811932564,
"rewards/accuracy_reward": 0.4374999925494194,
"rewards/cosine_rewards": 0.00030758429784327745,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 157.984375,
"epoch": 0.05909322465613857,
"grad_norm": 14.046929728525855,
"kl": 0.11572265625,
"learning_rate": 9.704533876719306e-07,
"loss": 0.0046,
"reward": 1.2389479279518127,
"reward_std": 0.4534989148378372,
"rewards/accuracy_reward": 0.24062498658895493,
"rewards/cosine_rewards": -0.00167706364300102,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 167.390625,
"epoch": 0.059602649006622516,
"grad_norm": 12.733865198375515,
"kl": 0.105224609375,
"learning_rate": 9.701986754966887e-07,
"loss": 0.0042,
"reward": 1.0669120252132416,
"reward_std": 0.319850392639637,
"rewards/accuracy_reward": 0.07187498360872269,
"rewards/cosine_rewards": -0.004963014740496874,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 156.140625,
"epoch": 0.06011207335710647,
"grad_norm": 11.670513012812588,
"kl": 0.093505859375,
"learning_rate": 9.699439633214466e-07,
"loss": 0.0038,
"reward": 1.665140986442566,
"reward_std": 0.12403370253741741,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.0026409668498672545,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 155.828125,
"epoch": 0.06062149770759042,
"grad_norm": 28.993909689663674,
"kl": 0.1015625,
"learning_rate": 9.696892511462047e-07,
"loss": 0.0041,
"reward": 1.0673952102661133,
"reward_std": 0.30907338857650757,
"rewards/accuracy_reward": 0.07187499105930328,
"rewards/cosine_rewards": -0.004479756113141775,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 154.40625,
"epoch": 0.061130922058074376,
"grad_norm": 27.685530130702478,
"kl": 0.104736328125,
"learning_rate": 9.694345389709627e-07,
"loss": 0.0042,
"reward": 1.4373126029968262,
"reward_std": 0.21315501490607858,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": -0.0001873411238193512,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 152.0625,
"epoch": 0.06164034640855833,
"grad_norm": 12.6569610895899,
"kl": 0.121826171875,
"learning_rate": 9.691798267957208e-07,
"loss": 0.0049,
"reward": 1.3240773677825928,
"reward_std": 0.26775629818439484,
"rewards/accuracy_reward": 0.32499999552965164,
"rewards/cosine_rewards": -0.0009226472466252744,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 151.78125,
"epoch": 0.06214977075904228,
"grad_norm": 11.557058687556623,
"kl": 0.103759765625,
"learning_rate": 9.689251146204787e-07,
"loss": 0.0041,
"reward": 1.7503631114959717,
"reward_std": 0.08287379238754511,
"rewards/accuracy_reward": 0.7468750178813934,
"rewards/cosine_rewards": 0.0034881452447734773,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 158.578125,
"epoch": 0.06265919510952624,
"grad_norm": 19.676894670897823,
"kl": 0.1025390625,
"learning_rate": 9.686704024452369e-07,
"loss": 0.0041,
"reward": 1.3520426154136658,
"reward_std": 0.24371477961540222,
"rewards/accuracy_reward": 0.3531249761581421,
"rewards/cosine_rewards": -0.0010823981137946248,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 167.484375,
"epoch": 0.06316861946001019,
"grad_norm": 7.7934253879172894,
"kl": 0.10107421875,
"learning_rate": 9.684156902699948e-07,
"loss": 0.004,
"reward": 1.4387494623661041,
"reward_std": 0.26880691200494766,
"rewards/accuracy_reward": 0.4374999888241291,
"rewards/cosine_rewards": 0.0015281732194125652,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.000278731546131894,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 160.671875,
"epoch": 0.06367804381049415,
"grad_norm": 14.376740722468174,
"kl": 0.1064453125,
"learning_rate": 9.68160978094753e-07,
"loss": 0.0043,
"reward": 1.2107464671134949,
"reward_std": 0.20096861571073532,
"rewards/accuracy_reward": 0.21249999292194843,
"rewards/cosine_rewards": -0.0017535560764372349,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 158.25,
"epoch": 0.06418746816097809,
"grad_norm": 24.20355073697258,
"kl": 0.10693359375,
"learning_rate": 9.679062659195109e-07,
"loss": 0.0043,
"reward": 1.09614896774292,
"reward_std": 0.4781967103481293,
"rewards/accuracy_reward": 0.09999998658895493,
"rewards/cosine_rewards": -0.0038510175654664636,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 165.203125,
"epoch": 0.06469689251146205,
"grad_norm": 8.56510891629326,
"kl": 0.111083984375,
"learning_rate": 9.676515537442688e-07,
"loss": 0.0044,
"reward": 1.5521512031555176,
"reward_std": 0.46633191406726837,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.002151212247554213,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 164.421875,
"epoch": 0.065206316861946,
"grad_norm": 13.379306876395301,
"kl": 0.120849609375,
"learning_rate": 9.67396841569027e-07,
"loss": 0.0048,
"reward": 1.722820222377777,
"reward_std": 0.32213538885116577,
"rewards/accuracy_reward": 0.71875,
"rewards/cosine_rewards": 0.004070190014317632,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 173.703125,
"epoch": 0.06571574121242996,
"grad_norm": 153.40445361752842,
"kl": 0.1162109375,
"learning_rate": 9.67142129393785e-07,
"loss": 0.0047,
"reward": 1.2643532752990723,
"reward_std": 0.23417328391224146,
"rewards/accuracy_reward": 0.2656250037252903,
"rewards/cosine_rewards": -0.0012717264471575618,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 168.5625,
"epoch": 0.06622516556291391,
"grad_norm": 9.275290295194138,
"kl": 0.10302734375,
"learning_rate": 9.66887417218543e-07,
"loss": 0.0041,
"reward": 1.0960015654563904,
"reward_std": 0.21426187455654144,
"rewards/accuracy_reward": 0.09999999403953552,
"rewards/cosine_rewards": -0.003998432832304388,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 176.265625,
"epoch": 0.06673458991339785,
"grad_norm": 19.81840956306445,
"kl": 0.105224609375,
"learning_rate": 9.66632705043301e-07,
"loss": 0.0042,
"reward": 1.6365814805030823,
"reward_std": 0.20723329484462738,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": 0.0022064344957470894,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 174.234375,
"epoch": 0.06724401426388181,
"grad_norm": 8.864924483945437,
"kl": 0.108642578125,
"learning_rate": 9.66377992868059e-07,
"loss": 0.0044,
"reward": 1.3077268600463867,
"reward_std": 0.3278057724237442,
"rewards/accuracy_reward": 0.32500000670552254,
"rewards/cosine_rewards": -0.0012397709069773555,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00040839536814019084,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 175.6875,
"epoch": 0.06775343861436577,
"grad_norm": 14.31694516925494,
"kl": 0.112060546875,
"learning_rate": 9.661232806928172e-07,
"loss": 0.0045,
"reward": 1.3927981853485107,
"reward_std": 0.3101032227277756,
"rewards/accuracy_reward": 0.40937499701976776,
"rewards/cosine_rewards": -0.0007292817026609555,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00022258506942307577,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 174.515625,
"epoch": 0.06826286296484972,
"grad_norm": 13.206286933876525,
"kl": 0.110107421875,
"learning_rate": 9.65868568517575e-07,
"loss": 0.0044,
"reward": 1.4952284097671509,
"reward_std": 0.16513758851215243,
"rewards/accuracy_reward": 0.4937499910593033,
"rewards/cosine_rewards": 0.0014784452505409718,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 188.15625,
"epoch": 0.06877228731533368,
"grad_norm": 8.196723017225267,
"kl": 0.110107421875,
"learning_rate": 9.656138563423332e-07,
"loss": 0.0044,
"reward": 1.3523318767547607,
"reward_std": 0.19119784235954285,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": -0.0007931197178550065,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 184.375,
"epoch": 0.06928171166581763,
"grad_norm": 7.96588213615428,
"kl": 0.10400390625,
"learning_rate": 9.653591441670911e-07,
"loss": 0.0042,
"reward": 1.3809208273887634,
"reward_std": 0.16492938250303268,
"rewards/accuracy_reward": 0.3812500238418579,
"rewards/cosine_rewards": -0.00032906350679695606,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 179.78125,
"epoch": 0.06979113601630157,
"grad_norm": 15.45343619628501,
"kl": 0.1142578125,
"learning_rate": 9.651044319918493e-07,
"loss": 0.0046,
"reward": 1.5364066362380981,
"reward_std": 0.3309681713581085,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.002031611278653145,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 191.484375,
"epoch": 0.07030056036678553,
"grad_norm": 10.658123621710919,
"kl": 0.1142578125,
"learning_rate": 9.648497198166072e-07,
"loss": 0.0046,
"reward": 1.5228744149208069,
"reward_std": 0.08533496968448162,
"rewards/accuracy_reward": 0.5218750089406967,
"rewards/cosine_rewards": 0.0011002181563526392,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00010080645006382838,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 205.734375,
"epoch": 0.07080998471726949,
"grad_norm": 13.847520067525743,
"kl": 0.118408203125,
"learning_rate": 9.645950076413653e-07,
"loss": 0.0047,
"reward": 0.6919489502906799,
"reward_std": 0.29361478984355927,
"rewards/accuracy_reward": -0.29375001788139343,
"rewards/cosine_rewards": -0.014301038347184658,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 187.6875,
"epoch": 0.07131940906775344,
"grad_norm": 32.08146525993247,
"kl": 0.115234375,
"learning_rate": 9.643402954661233e-07,
"loss": 0.0046,
"reward": 1.3814507126808167,
"reward_std": 0.10938079445622861,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": 0.00031310925260186195,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00011241007450735196,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 201.890625,
"epoch": 0.0718288334182374,
"grad_norm": 13.580365765026054,
"kl": 0.12158203125,
"learning_rate": 9.640855832908814e-07,
"loss": 0.0049,
"reward": 1.2904618978500366,
"reward_std": 0.09482555650174618,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -0.006186658749356866,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00022644927958026528,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 196.5,
"epoch": 0.07233825776872134,
"grad_norm": 24.285612348709225,
"kl": 0.113525390625,
"learning_rate": 9.638308711156393e-07,
"loss": 0.0045,
"reward": 1.4387189745903015,
"reward_std": 0.30863603949546814,
"rewards/accuracy_reward": 0.4374999888241291,
"rewards/cosine_rewards": 0.0012189627159386873,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 197.5,
"epoch": 0.0728476821192053,
"grad_norm": 22.094925488371874,
"kl": 0.11669921875,
"learning_rate": 9.635761589403972e-07,
"loss": 0.0047,
"reward": 1.495344638824463,
"reward_std": 0.46879828721284866,
"rewards/accuracy_reward": 0.4937499761581421,
"rewards/cosine_rewards": 0.0015945886261761189,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 195.125,
"epoch": 0.07335710646968925,
"grad_norm": 7.954348731599845,
"kl": 0.1259765625,
"learning_rate": 9.633214467651554e-07,
"loss": 0.005,
"reward": 1.5794875025749207,
"reward_std": 0.2703954949975014,
"rewards/accuracy_reward": 0.6062500178813934,
"rewards/cosine_rewards": 0.0044874417362734675,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 203.234375,
"epoch": 0.0738665308201732,
"grad_norm": 10.240352920236468,
"kl": 0.1240234375,
"learning_rate": 9.630667345899133e-07,
"loss": 0.005,
"reward": 1.323024868965149,
"reward_std": 0.3642221838235855,
"rewards/accuracy_reward": 0.32499999552965164,
"rewards/cosine_rewards": -0.0019751336076296866,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 210.96875,
"epoch": 0.07437595517065716,
"grad_norm": 7.757169620409243,
"kl": 0.1318359375,
"learning_rate": 9.628120224146714e-07,
"loss": 0.0053,
"reward": 1.4797114729881287,
"reward_std": 0.40076301991939545,
"rewards/accuracy_reward": 0.4937500078231096,
"rewards/cosine_rewards": 0.001799287972971797,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00021284000831656158,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 203.015625,
"epoch": 0.0748853795211411,
"grad_norm": 9.791138031684504,
"kl": 0.1123046875,
"learning_rate": 9.625573102394294e-07,
"loss": 0.0045,
"reward": 1.5510019659996033,
"reward_std": 0.27960680425167084,
"rewards/accuracy_reward": 0.5781249850988388,
"rewards/cosine_rewards": 0.0043075907160528,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0001806358341127634,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 211.6875,
"epoch": 0.07539480387162506,
"grad_norm": 7.852116556950569,
"kl": 0.12060546875,
"learning_rate": 9.623025980641875e-07,
"loss": 0.0048,
"reward": 1.2943141460418701,
"reward_std": 0.43064263463020325,
"rewards/accuracy_reward": 0.2968749701976776,
"rewards/cosine_rewards": -0.0023121244739741087,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0002487746678525582,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 220.8125,
"epoch": 0.07590422822210902,
"grad_norm": 19.19937094751421,
"kl": 0.1240234375,
"learning_rate": 9.620478858889454e-07,
"loss": 0.005,
"reward": 1.8400413990020752,
"reward_std": 0.39075249433517456,
"rewards/accuracy_reward": 0.8593749701976776,
"rewards/cosine_rewards": 0.011916308663785458,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 219.890625,
"epoch": 0.07641365257259297,
"grad_norm": 17.532821778348946,
"kl": 0.1376953125,
"learning_rate": 9.617931737137036e-07,
"loss": 0.0055,
"reward": 1.5498095750808716,
"reward_std": 0.29365313798189163,
"rewards/accuracy_reward": 0.5781250298023224,
"rewards/cosine_rewards": 0.0030243303044699132,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -8.979885024018586e-05,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 215.3125,
"epoch": 0.07692307692307693,
"grad_norm": 8.653969191960044,
"kl": 0.120361328125,
"learning_rate": 9.615384615384615e-07,
"loss": 0.0048,
"reward": 1.2203205227851868,
"reward_std": 0.5703159868717194,
"rewards/accuracy_reward": 0.24062499403953552,
"rewards/cosine_rewards": -0.004613903176505119,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -6.565126386703923e-05,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 214.484375,
"epoch": 0.07743250127356088,
"grad_norm": 56.921468540439704,
"kl": 0.119873046875,
"learning_rate": 9.612837493632196e-07,
"loss": 0.0048,
"reward": 1.2310086488723755,
"reward_std": 0.41925153136253357,
"rewards/accuracy_reward": 0.2656249925494194,
"rewards/cosine_rewards": -0.00321156473364681,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.00015470296784769744,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 219.0,
"epoch": 0.07794192562404482,
"grad_norm": 9.400079372239615,
"kl": 0.107666015625,
"learning_rate": 9.610290371879775e-07,
"loss": 0.0043,
"reward": 1.6124141216278076,
"reward_std": 0.487982913851738,
"rewards/accuracy_reward": 0.606249988079071,
"rewards/cosine_rewards": 0.006320342654362321,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00015624999650754035,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 214.90625,
"epoch": 0.07845134997452878,
"grad_norm": 16.320123460893743,
"kl": 0.125732421875,
"learning_rate": 9.607743250127357e-07,
"loss": 0.005,
"reward": 1.5100122094154358,
"reward_std": 0.4279818534851074,
"rewards/accuracy_reward": 0.5218749791383743,
"rewards/cosine_rewards": 0.0037621970986947417,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 226.125,
"epoch": 0.07896077432501274,
"grad_norm": 12.345482711993489,
"kl": 0.163330078125,
"learning_rate": 9.605196128374936e-07,
"loss": 0.0065,
"reward": 0.9318991005420685,
"reward_std": 0.23740804940462112,
"rewards/accuracy_reward": -0.04062497615814209,
"rewards/cosine_rewards": -0.011850890005007386,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 223.671875,
"epoch": 0.07947019867549669,
"grad_norm": 6.991107442023172,
"kl": 0.1240234375,
"learning_rate": 9.602649006622515e-07,
"loss": 0.005,
"reward": 0.9476701319217682,
"reward_std": 0.33865927904844284,
"rewards/accuracy_reward": -0.04062502086162567,
"rewards/cosine_rewards": -0.011704806645866483,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 239.46875,
"epoch": 0.07997962302598065,
"grad_norm": 11.299778697394407,
"kl": 0.117919921875,
"learning_rate": 9.600101884870097e-07,
"loss": 0.0047,
"reward": 1.3649136424064636,
"reward_std": 0.42557042837142944,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": -0.0007113651372492313,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 231.015625,
"epoch": 0.08048904737646459,
"grad_norm": 16.322542283478924,
"kl": 0.12255859375,
"learning_rate": 9.597554763117676e-07,
"loss": 0.0049,
"reward": 1.3941306471824646,
"reward_std": 0.4155275672674179,
"rewards/accuracy_reward": 0.40937498211860657,
"rewards/cosine_rewards": 0.000547687232028693,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00016711230273358524,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 213.3125,
"epoch": 0.08099847172694855,
"grad_norm": 6.393101239111367,
"kl": 0.11865234375,
"learning_rate": 9.595007641365257e-07,
"loss": 0.0047,
"reward": 1.2930153012275696,
"reward_std": 0.2976529533043504,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -0.0038597104139626026,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 215.125,
"epoch": 0.0815078960774325,
"grad_norm": 15.2053157125375,
"kl": 0.119140625,
"learning_rate": 9.592460519612836e-07,
"loss": 0.0048,
"reward": 1.2102863192558289,
"reward_std": 0.43368688225746155,
"rewards/accuracy_reward": 0.2124999836087227,
"rewards/cosine_rewards": -0.0022136420011520386,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 209.109375,
"epoch": 0.08201732042791646,
"grad_norm": 8.230345044429809,
"kl": 0.11376953125,
"learning_rate": 9.589913397860418e-07,
"loss": 0.0045,
"reward": 1.5247125625610352,
"reward_std": 0.313697911798954,
"rewards/accuracy_reward": 0.5218749791383743,
"rewards/cosine_rewards": 0.0028375727706588805,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 208.421875,
"epoch": 0.08252674477840041,
"grad_norm": 7.440050437747031,
"kl": 0.132568359375,
"learning_rate": 9.587366276107997e-07,
"loss": 0.0053,
"reward": 1.4958758354187012,
"reward_std": 0.2720055654644966,
"rewards/accuracy_reward": 0.4937499761581421,
"rewards/cosine_rewards": 0.002125886792782694,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 201.125,
"epoch": 0.08303616912888435,
"grad_norm": 44.81125981696038,
"kl": 0.119384765625,
"learning_rate": 9.584819154355578e-07,
"loss": 0.0048,
"reward": 1.5242316722869873,
"reward_std": 0.6014019548892975,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.0023566827294416726,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 207.046875,
"epoch": 0.08354559347936831,
"grad_norm": 13.052733655899614,
"kl": 0.119140625,
"learning_rate": 9.582272032603158e-07,
"loss": 0.0048,
"reward": 1.6689130067825317,
"reward_std": 0.2884200101252645,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.0064131125109270215,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 204.53125,
"epoch": 0.08405501782985227,
"grad_norm": 45.85990240379708,
"kl": 0.455078125,
"learning_rate": 9.57972491085074e-07,
"loss": 0.0181,
"reward": 1.72576242685318,
"reward_std": 0.48727013170719147,
"rewards/accuracy_reward": 0.7187499701976776,
"rewards/cosine_rewards": 0.007012464571744204,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 203.890625,
"epoch": 0.08456444218033622,
"grad_norm": 62.93710363981597,
"kl": 0.117431640625,
"learning_rate": 9.577177789098318e-07,
"loss": 0.0047,
"reward": 0.9781621694564819,
"reward_std": 0.20786645263433456,
"rewards/accuracy_reward": -0.012500008568167686,
"rewards/cosine_rewards": -0.009250549599528313,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -8.729050023248419e-05,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 201.109375,
"epoch": 0.08507386653082018,
"grad_norm": 11.046432411232663,
"kl": 0.13134765625,
"learning_rate": 9.5746306673459e-07,
"loss": 0.0052,
"reward": 1.3762089014053345,
"reward_std": 0.32985249161720276,
"rewards/accuracy_reward": 0.37812500819563866,
"rewards/cosine_rewards": -0.0019161199452355504,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 201.578125,
"epoch": 0.08558329088130413,
"grad_norm": 5.950394983602238,
"kl": 0.11279296875,
"learning_rate": 9.572083545593479e-07,
"loss": 0.0045,
"reward": 1.0190700888633728,
"reward_std": 0.6297826766967773,
"rewards/accuracy_reward": 0.04062497615814209,
"rewards/cosine_rewards": -0.005929919425398111,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 204.71875,
"epoch": 0.08609271523178808,
"grad_norm": 17.640294707452878,
"kl": 0.11572265625,
"learning_rate": 9.56953642384106e-07,
"loss": 0.0046,
"reward": 0.9798631221055984,
"reward_std": 0.20544240390881896,
"rewards/accuracy_reward": -0.012500017881393433,
"rewards/cosine_rewards": -0.007636879570782185,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 190.34375,
"epoch": 0.08660213958227203,
"grad_norm": 7.762505201079191,
"kl": 0.112548828125,
"learning_rate": 9.56698930208864e-07,
"loss": 0.0045,
"reward": 1.152494490146637,
"reward_std": 0.30975981056690216,
"rewards/accuracy_reward": 0.15625,
"rewards/cosine_rewards": -0.0037555836606770754,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 194.875,
"epoch": 0.08711156393275599,
"grad_norm": 11.506198147130426,
"kl": 0.111083984375,
"learning_rate": 9.564442180336219e-07,
"loss": 0.0045,
"reward": 0.9780029058456421,
"reward_std": 0.6867689490318298,
"rewards/accuracy_reward": -0.012500010430812836,
"rewards/cosine_rewards": -0.009497055783867836,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 189.125,
"epoch": 0.08762098828323994,
"grad_norm": 11.819517524957538,
"kl": 0.10546875,
"learning_rate": 9.5618950585838e-07,
"loss": 0.0042,
"reward": 1.3241556882858276,
"reward_std": 0.3773365914821625,
"rewards/accuracy_reward": 0.32499998807907104,
"rewards/cosine_rewards": -0.0008443233091384172,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 182.34375,
"epoch": 0.0881304126337239,
"grad_norm": 8.416955648144409,
"kl": 0.116943359375,
"learning_rate": 9.55934793683138e-07,
"loss": 0.0047,
"reward": 1.6661878824234009,
"reward_std": 0.20251824986189604,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.003687863936647773,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 191.921875,
"epoch": 0.08863983698420784,
"grad_norm": 7.131693684668596,
"kl": 0.12060546875,
"learning_rate": 9.55680081507896e-07,
"loss": 0.0048,
"reward": 1.0765551328659058,
"reward_std": 0.3972722738981247,
"rewards/accuracy_reward": 0.09999998845160007,
"rewards/cosine_rewards": -0.007819817401468754,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 181.75,
"epoch": 0.0891492613346918,
"grad_norm": 11.781793838489648,
"kl": 0.11083984375,
"learning_rate": 9.55425369332654e-07,
"loss": 0.0044,
"reward": 1.5515506863594055,
"reward_std": 0.3071342632174492,
"rewards/accuracy_reward": 0.5500000268220901,
"rewards/cosine_rewards": 0.0015506702475249767,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 175.609375,
"epoch": 0.08965868568517575,
"grad_norm": 40.566147710572594,
"kl": 0.109130859375,
"learning_rate": 9.551706571574121e-07,
"loss": 0.0044,
"reward": 1.5527549982070923,
"reward_std": 0.39126846194267273,
"rewards/accuracy_reward": 0.5499999895691872,
"rewards/cosine_rewards": 0.0027549704536795616,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 178.53125,
"epoch": 0.09016811003565971,
"grad_norm": 12.817541000164553,
"kl": 0.10595703125,
"learning_rate": 9.5491594498217e-07,
"loss": 0.0042,
"reward": 1.9808745980262756,
"reward_std": 0.08480274910107255,
"rewards/accuracy_reward": 0.971875011920929,
"rewards/cosine_rewards": 0.009283588267862797,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0002840909000951797,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 186.75,
"epoch": 0.09067753438614366,
"grad_norm": 7.914504626786531,
"kl": 0.103759765625,
"learning_rate": 9.546612328069282e-07,
"loss": 0.0041,
"reward": 1.5245178937911987,
"reward_std": 0.34963520616292953,
"rewards/accuracy_reward": 0.5218750089406967,
"rewards/cosine_rewards": 0.0026429439894855022,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 188.9375,
"epoch": 0.0911869587366276,
"grad_norm": 8.759963126963402,
"kl": 0.13037109375,
"learning_rate": 9.544065206316861e-07,
"loss": 0.0052,
"reward": 1.638785481452942,
"reward_std": 0.2284149518236518,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": 0.0044105148408561945,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 197.90625,
"epoch": 0.09169638308711156,
"grad_norm": 6.418887244829745,
"kl": 0.116943359375,
"learning_rate": 9.541518084564442e-07,
"loss": 0.0047,
"reward": 1.3810052275657654,
"reward_std": 0.4032685235142708,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": -0.0001616678200662136,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -8.311169949593022e-05,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 186.078125,
"epoch": 0.09220580743759552,
"grad_norm": 6.304857280986285,
"kl": 0.12890625,
"learning_rate": 9.538970962812022e-07,
"loss": 0.0051,
"reward": 1.2630045115947723,
"reward_std": 0.17365956178400666,
"rewards/accuracy_reward": 0.2656250037252903,
"rewards/cosine_rewards": -0.002620481769554317,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 190.265625,
"epoch": 0.09271523178807947,
"grad_norm": 11.35970580998742,
"kl": 0.11181640625,
"learning_rate": 9.536423841059602e-07,
"loss": 0.0045,
"reward": 1.6366259455680847,
"reward_std": 0.2085256204009056,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": 0.002349784132093191,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -9.889240755001083e-05,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 195.96875,
"epoch": 0.09322465613856343,
"grad_norm": 33.125698049494765,
"kl": 0.118408203125,
"learning_rate": 9.533876719307182e-07,
"loss": 0.0048,
"reward": 1.553468942642212,
"reward_std": 0.16592675540596247,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.003468883689492941,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": 0.0,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 205.171875,
"epoch": 0.09373408048904738,
"grad_norm": 15.866784423418947,
"kl": 0.115234375,
"learning_rate": 9.531329597554763e-07,
"loss": 0.0046,
"reward": 1.1207141280174255,
"reward_std": 0.19428733736276627,
"rewards/accuracy_reward": 0.12812499701976776,
"rewards/cosine_rewards": -0.007209272123873234,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.00020161290012765676,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 205.640625,
"epoch": 0.09424350483953133,
"grad_norm": 20.352317051449248,
"kl": 0.3115234375,
"learning_rate": 9.528782475802343e-07,
"loss": 0.0124,
"reward": 1.6525439023971558,
"reward_std": 0.38362888991832733,
"rewards/accuracy_reward": 0.6625000238418579,
"rewards/cosine_rewards": 0.005779681145213544,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00011081559932790697,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 201.234375,
"epoch": 0.09475292919001528,
"grad_norm": 9.691736527471202,
"kl": 0.124755859375,
"learning_rate": 9.526235354049923e-07,
"loss": 0.005,
"reward": 0.9626118838787079,
"reward_std": 0.40054861456155777,
"rewards/accuracy_reward": -0.015625011175870895,
"rewards/cosine_rewards": -0.006138101452961564,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 223.578125,
"epoch": 0.09526235354049924,
"grad_norm": 9.437447186660206,
"kl": 0.123046875,
"learning_rate": 9.523688232297503e-07,
"loss": 0.0049,
"reward": 1.569740116596222,
"reward_std": 0.1397167220711708,
"rewards/accuracy_reward": 0.5781250298023224,
"rewards/cosine_rewards": 0.007240177597850561,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": 0.0,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 233.625,
"epoch": 0.09577177789098319,
"grad_norm": 14.943967064784786,
"kl": 0.15087890625,
"learning_rate": 9.521141110545084e-07,
"loss": 0.006,
"reward": 1.0288785099983215,
"reward_std": 0.2980290725827217,
"rewards/accuracy_reward": 0.07187499105930328,
"rewards/cosine_rewards": -0.011638639261946082,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.00010775862028822303,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 213.921875,
"epoch": 0.09628120224146715,
"grad_norm": 15.000304282598657,
"kl": 0.12744140625,
"learning_rate": 9.518593988792664e-07,
"loss": 0.0051,
"reward": 1.3486477732658386,
"reward_std": 0.30507488548755646,
"rewards/accuracy_reward": 0.34999997913837433,
"rewards/cosine_rewards": -0.0010681524872779846,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0002840909000951797,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 212.96875,
"epoch": 0.09679062659195109,
"grad_norm": 5.941418391788204,
"kl": 0.13916015625,
"learning_rate": 9.516046867040244e-07,
"loss": 0.0056,
"reward": 1.6924657821655273,
"reward_std": 0.3630830645561218,
"rewards/accuracy_reward": 0.7187499850988388,
"rewards/cosine_rewards": 0.004965720232576132,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 230.421875,
"epoch": 0.09730005094243505,
"grad_norm": 7.708020896616392,
"kl": 0.14453125,
"learning_rate": 9.513499745287824e-07,
"loss": 0.0058,
"reward": 1.250920683145523,
"reward_std": 0.4801155626773834,
"rewards/accuracy_reward": 0.2968749776482582,
"rewards/cosine_rewards": 0.0009206933900713921,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": 0.0,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 247.3125,
"epoch": 0.097809475292919,
"grad_norm": 13.916574397802314,
"kl": 0.13671875,
"learning_rate": 9.510952623535404e-07,
"loss": 0.0055,
"reward": 1.129820704460144,
"reward_std": 0.717576265335083,
"rewards/accuracy_reward": 0.21249999850988388,
"rewards/cosine_rewards": -0.0045542995212599635,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 244.21875,
"epoch": 0.09831889964340296,
"grad_norm": 5.201799859391353,
"kl": 0.13427734375,
"learning_rate": 9.508405501782984e-07,
"loss": 0.0054,
"reward": 1.3024629950523376,
"reward_std": 0.4307016432285309,
"rewards/accuracy_reward": 0.37812498956918716,
"rewards/cosine_rewards": 0.0024630045518279076,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 232.203125,
"epoch": 0.09882832399388691,
"grad_norm": 9.365246983313067,
"kl": 0.12939453125,
"learning_rate": 9.505858380030564e-07,
"loss": 0.0052,
"reward": 0.7090668827295303,
"reward_std": 0.5288920998573303,
"rewards/accuracy_reward": -0.23750002309679985,
"rewards/cosine_rewards": -0.022183137945830822,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 225.8125,
"epoch": 0.09933774834437085,
"grad_norm": 12.601377217707842,
"kl": 0.14501953125,
"learning_rate": 9.503311258278145e-07,
"loss": 0.0058,
"reward": 1.413894236087799,
"reward_std": 0.7254346013069153,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.001518724486231804,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.00012443749437807128,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 226.1875,
"epoch": 0.09984717269485481,
"grad_norm": 6.45339117400833,
"kl": 0.1376953125,
"learning_rate": 9.500764136525725e-07,
"loss": 0.0055,
"reward": 1.5756230354309082,
"reward_std": 0.48759835958480835,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.007316130446270108,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0004430353583302349,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 227.796875,
"epoch": 0.10035659704533877,
"grad_norm": 11.359096408502863,
"kl": 0.2138671875,
"learning_rate": 9.498217014773305e-07,
"loss": 0.0086,
"reward": 1.2498727440834045,
"reward_std": 0.5983296632766724,
"rewards/accuracy_reward": 0.37812499701976776,
"rewards/cosine_rewards": -0.0031815596157684922,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -7.070136052789167e-05,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 232.234375,
"epoch": 0.10086602139582272,
"grad_norm": 13.920508776432966,
"kl": 0.12255859375,
"learning_rate": 9.495669893020886e-07,
"loss": 0.0049,
"reward": 0.6862081587314606,
"reward_std": 0.7485357820987701,
"rewards/accuracy_reward": -0.2656250223517418,
"rewards/cosine_rewards": -0.01686593284830451,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -5.089576370664872e-05,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 205.140625,
"epoch": 0.10137544574630668,
"grad_norm": 8.671114765474545,
"kl": 0.124267578125,
"learning_rate": 9.493122771268466e-07,
"loss": 0.005,
"reward": 1.1818422079086304,
"reward_std": 0.5584293901920319,
"rewards/accuracy_reward": 0.29375000298023224,
"rewards/cosine_rewards": -0.002258662148960866,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0002741228090599179,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 216.875,
"epoch": 0.10188487009679063,
"grad_norm": 10.078840755345926,
"kl": 0.126708984375,
"learning_rate": 9.490575649516046e-07,
"loss": 0.0051,
"reward": 1.3181660771369934,
"reward_std": 0.6019489467144012,
"rewards/accuracy_reward": 0.40937498211860657,
"rewards/cosine_rewards": 0.002541057765483856,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": 0.0,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 199.515625,
"epoch": 0.10239429444727458,
"grad_norm": 5.555372749575627,
"kl": 0.130859375,
"learning_rate": 9.488028527763627e-07,
"loss": 0.0052,
"reward": 1.5905040502548218,
"reward_std": 0.41512130200862885,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.006129102781414986,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 235.03125,
"epoch": 0.10290371879775853,
"grad_norm": 10.052848791937414,
"kl": 0.12890625,
"learning_rate": 9.485481406011207e-07,
"loss": 0.0051,
"reward": 1.1239948272705078,
"reward_std": 0.9119550585746765,
"rewards/accuracy_reward": 0.26875001192092896,
"rewards/cosine_rewards": -0.004009488970041275,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.00012065636838087812,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 243.5,
"epoch": 0.10341314314824249,
"grad_norm": 11.024825341246498,
"kl": 0.115966796875,
"learning_rate": 9.482934284258787e-07,
"loss": 0.0046,
"reward": 0.9447762966156006,
"reward_std": 0.800986647605896,
"rewards/accuracy_reward": 0.12812498584389687,
"rewards/cosine_rewards": -0.011282204184681177,
"rewards/format_reward": 0.828125,
"rewards/repetition_rewards": -0.00019145716942148283,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 232.375,
"epoch": 0.10392256749872644,
"grad_norm": 7.842670844504004,
"kl": 0.118408203125,
"learning_rate": 9.480387162506367e-07,
"loss": 0.0047,
"reward": 1.2975149750709534,
"reward_std": 0.6046717762947083,
"rewards/accuracy_reward": 0.40937497094273567,
"rewards/cosine_rewards": -0.0023666354827582836,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.00011837121564894915,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 224.234375,
"epoch": 0.1044319918492104,
"grad_norm": 20.462186918743633,
"kl": 0.125732421875,
"learning_rate": 9.477840040753947e-07,
"loss": 0.005,
"reward": 1.0097321271896362,
"reward_std": 0.4872446656227112,
"rewards/accuracy_reward": 0.1250000149011612,
"rewards/cosine_rewards": -0.005829372443258762,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -6.351625779643655e-05,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 224.328125,
"epoch": 0.10494141619969434,
"grad_norm": 18.578745554695768,
"kl": 0.1298828125,
"learning_rate": 9.475292919001527e-07,
"loss": 0.0052,
"reward": 0.8584832549095154,
"reward_std": 0.5744369626045227,
"rewards/accuracy_reward": 0.040624991059303284,
"rewards/cosine_rewards": -0.010225818026810884,
"rewards/format_reward": 0.828125,
"rewards/repetition_rewards": -4.101049853488803e-05,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 209.234375,
"epoch": 0.1054508405501783,
"grad_norm": 20.30723423804259,
"kl": 0.11376953125,
"learning_rate": 9.472745797249107e-07,
"loss": 0.0045,
"reward": 1.0917281210422516,
"reward_std": 0.4984763488173485,
"rewards/accuracy_reward": 0.24062497913837433,
"rewards/cosine_rewards": -0.008111415430903435,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0001604560275154654,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 202.28125,
"epoch": 0.10596026490066225,
"grad_norm": 12.813170138938949,
"kl": 0.130859375,
"learning_rate": 9.470198675496688e-07,
"loss": 0.0052,
"reward": 1.258288562297821,
"reward_std": 0.4630318433046341,
"rewards/accuracy_reward": 0.3812500238418579,
"rewards/cosine_rewards": 0.0022279657423496246,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.00018934992840513587,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 199.0,
"epoch": 0.10646968925114621,
"grad_norm": 10.281274330223528,
"kl": 0.15185546875,
"learning_rate": 9.467651553744268e-07,
"loss": 0.0061,
"reward": 1.2817729711532593,
"reward_std": 0.4518425017595291,
"rewards/accuracy_reward": 0.40937501937150955,
"rewards/cosine_rewards": -0.0026020415825769305,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": 0.0,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 191.90625,
"epoch": 0.10697911360163016,
"grad_norm": 5.991490542584776,
"kl": 0.118896484375,
"learning_rate": 9.465104431991848e-07,
"loss": 0.0047,
"reward": 1.537351131439209,
"reward_std": 0.5478895753622055,
"rewards/accuracy_reward": 0.6875000149011612,
"rewards/cosine_rewards": 0.00610114517621696,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": 0.0,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 217.015625,
"epoch": 0.1074885379521141,
"grad_norm": 60.00536554673066,
"kl": 0.115966796875,
"learning_rate": 9.462557310239428e-07,
"loss": 0.0046,
"reward": 0.7785031795501709,
"reward_std": 0.42832519114017487,
"rewards/accuracy_reward": -0.1250000223517418,
"rewards/cosine_rewards": -0.018284045159816742,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -8.778089977568015e-05,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 205.21875,
"epoch": 0.10799796230259806,
"grad_norm": 21.72003474650607,
"kl": 0.1220703125,
"learning_rate": 9.460010188487009e-07,
"loss": 0.0049,
"reward": 1.0551989674568176,
"reward_std": 0.46487441658973694,
"rewards/accuracy_reward": 0.18437497317790985,
"rewards/cosine_rewards": -0.004139983095228672,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -3.600230411393568e-05,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 237.609375,
"epoch": 0.10850738665308202,
"grad_norm": 13.593112035734373,
"kl": 0.118896484375,
"learning_rate": 9.457463066734589e-07,
"loss": 0.0048,
"reward": 1.4092811346054077,
"reward_std": 0.6851305663585663,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": 0.009402429801411927,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": -0.00012127523950766772,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 266.65625,
"epoch": 0.10901681100356597,
"grad_norm": 102.50188740817565,
"kl": 0.114013671875,
"learning_rate": 9.45491594498217e-07,
"loss": 0.0046,
"reward": 1.3722986578941345,
"reward_std": 0.5870523750782013,
"rewards/accuracy_reward": 0.5218750089406967,
"rewards/cosine_rewards": -0.008539619389921427,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.000411735316447448,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 264.796875,
"epoch": 0.10952623535404993,
"grad_norm": 17.000809347446246,
"kl": 0.114013671875,
"learning_rate": 9.452368823229751e-07,
"loss": 0.0046,
"reward": 1.182218611240387,
"reward_std": 0.6600647866725922,
"rewards/accuracy_reward": 0.265625,
"rewards/cosine_rewards": -0.02086095977574587,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -4.542151145869866e-05,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 305.796875,
"epoch": 0.11003565970453388,
"grad_norm": 21.15698667263886,
"kl": 0.107666015625,
"learning_rate": 9.449821701477331e-07,
"loss": 0.0043,
"reward": 1.1834356784820557,
"reward_std": 0.686463937163353,
"rewards/accuracy_reward": 0.2968749925494194,
"rewards/cosine_rewards": -0.019689313136041164,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": 0.0,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 400.296875,
"epoch": 0.11054508405501783,
"grad_norm": 7.9150578217370535,
"kl": 0.09814453125,
"learning_rate": 9.447274579724911e-07,
"loss": 0.0039,
"reward": 1.0662736892700195,
"reward_std": 0.8222787380218506,
"rewards/accuracy_reward": 0.18437499552965164,
"rewards/cosine_rewards": -0.0396728478372097,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.00030338978831423447,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 345.671875,
"epoch": 0.11105450840550178,
"grad_norm": 10.397918174345362,
"kl": 0.1357421875,
"learning_rate": 9.444727457972492e-07,
"loss": 0.0054,
"reward": 1.6620882153511047,
"reward_std": 0.6701975017786026,
"rewards/accuracy_reward": 0.71875,
"rewards/cosine_rewards": 0.021556629799306393,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -9.343791316496208e-05,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 361.984375,
"epoch": 0.11156393275598574,
"grad_norm": 16.664810091620872,
"kl": 0.09619140625,
"learning_rate": 9.442180336220072e-07,
"loss": 0.0038,
"reward": 0.6033791899681091,
"reward_std": 0.47761378437280655,
"rewards/accuracy_reward": -0.2656250149011612,
"rewards/cosine_rewards": -0.0521757323294878,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0006950152310309932,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 359.359375,
"epoch": 0.11207335710646969,
"grad_norm": 6.030171938320145,
"kl": 0.09423828125,
"learning_rate": 9.439633214467651e-07,
"loss": 0.0038,
"reward": 1.0732125043869019,
"reward_std": 0.532948449254036,
"rewards/accuracy_reward": 0.18437499180436134,
"rewards/cosine_rewards": -0.017268475145101547,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.00014401252064999426,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 467.921875,
"epoch": 0.11258278145695365,
"grad_norm": 13.578372073106125,
"kl": 0.08740234375,
"learning_rate": 9.437086092715231e-07,
"loss": 0.0035,
"reward": 1.089949607849121,
"reward_std": 0.7014666199684143,
"rewards/accuracy_reward": 0.18437499180436134,
"rewards/cosine_rewards": -0.04696316970512271,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0005872593028470874,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 295.15625,
"epoch": 0.11309220580743759,
"grad_norm": 9.243544591708364,
"kl": 0.099609375,
"learning_rate": 9.434538970962812e-07,
"loss": 0.004,
"reward": 1.25474151968956,
"reward_std": 0.42495501041412354,
"rewards/accuracy_reward": 0.2968750074505806,
"rewards/cosine_rewards": -0.010176160372793674,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0007073541928548366,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 327.9375,
"epoch": 0.11360163015792155,
"grad_norm": 11.854145682727308,
"kl": 0.09423828125,
"learning_rate": 9.431991849210392e-07,
"loss": 0.0038,
"reward": 1.336020827293396,
"reward_std": 0.5929334163665771,
"rewards/accuracy_reward": 0.3812499977648258,
"rewards/cosine_rewards": 0.0018316814675927162,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0001858295945567079,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 224.109375,
"epoch": 0.1141110545084055,
"grad_norm": 9.830726171785136,
"kl": 0.13330078125,
"learning_rate": 9.429444727457972e-07,
"loss": 0.0053,
"reward": 0.9984832406044006,
"reward_std": 0.45606285333633423,
"rewards/accuracy_reward": 0.043749988079071045,
"rewards/cosine_rewards": -0.014016739558428526,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 213.640625,
"epoch": 0.11462047885888946,
"grad_norm": 9.176338530222994,
"kl": 0.115234375,
"learning_rate": 9.426897605705553e-07,
"loss": 0.0046,
"reward": 1.2019062638282776,
"reward_std": 0.7189642786979675,
"rewards/accuracy_reward": 0.29374999925494194,
"rewards/cosine_rewards": -0.01371871994342655,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 199.71875,
"epoch": 0.11512990320937341,
"grad_norm": 11.349158424596924,
"kl": 0.110107421875,
"learning_rate": 9.424350483953133e-07,
"loss": 0.0044,
"reward": 1.3144216537475586,
"reward_std": 0.4914311468601227,
"rewards/accuracy_reward": 0.32499998807907104,
"rewards/cosine_rewards": 0.005097148037748411,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -5.056634472566657e-05,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 232.390625,
"epoch": 0.11563932755985736,
"grad_norm": 8.61981591225416,
"kl": 0.105712890625,
"learning_rate": 9.421803362200713e-07,
"loss": 0.0042,
"reward": 1.064522534608841,
"reward_std": 0.3509945422410965,
"rewards/accuracy_reward": 0.15312501788139343,
"rewards/cosine_rewards": -0.010458544362336397,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -1.8939394067274407e-05,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 197.75,
"epoch": 0.11614875191034131,
"grad_norm": 12.524545820572607,
"kl": 0.106689453125,
"learning_rate": 9.419256240448294e-07,
"loss": 0.0043,
"reward": 1.3913479149341583,
"reward_std": 0.2862061709165573,
"rewards/accuracy_reward": 0.40937498956918716,
"rewards/cosine_rewards": -0.0021625147201120853,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00023957982193678617,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 193.5,
"epoch": 0.11665817626082527,
"grad_norm": 17.83927533701907,
"kl": 0.13232421875,
"learning_rate": 9.416709118695874e-07,
"loss": 0.0053,
"reward": 1.5334136486053467,
"reward_std": 0.45667168498039246,
"rewards/accuracy_reward": 0.6062500029802322,
"rewards/cosine_rewards": 0.005288586835376918,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 247.671875,
"epoch": 0.11716760061130922,
"grad_norm": 23.81656431934888,
"kl": 0.108642578125,
"learning_rate": 9.414161996943454e-07,
"loss": 0.0043,
"reward": 1.1264008283615112,
"reward_std": 0.6892756521701813,
"rewards/accuracy_reward": 0.21249999105930328,
"rewards/cosine_rewards": -0.02350334101356566,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -9.586199303157628e-05,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 262.59375,
"epoch": 0.11767702496179318,
"grad_norm": 28.467466655884312,
"kl": 0.12109375,
"learning_rate": 9.411614875191034e-07,
"loss": 0.0048,
"reward": 1.4944193363189697,
"reward_std": 0.3786798119544983,
"rewards/accuracy_reward": 0.518750011920929,
"rewards/cosine_rewards": 0.0069862306118011475,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -6.69164874125272e-05,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 248.40625,
"epoch": 0.11818644931227713,
"grad_norm": 9.248494107592876,
"kl": 0.12060546875,
"learning_rate": 9.409067753438615e-07,
"loss": 0.0048,
"reward": 1.2941021919250488,
"reward_std": 0.5259552597999573,
"rewards/accuracy_reward": 0.3749999776482582,
"rewards/cosine_rewards": 0.012875130865722895,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -2.2944199372432195e-05,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 264.15625,
"epoch": 0.11869587366276108,
"grad_norm": 14.194855529249866,
"kl": 0.107666015625,
"learning_rate": 9.406520631686195e-07,
"loss": 0.0043,
"reward": 1.4532509446144104,
"reward_std": 0.47306837141513824,
"rewards/accuracy_reward": 0.46562501788139343,
"rewards/cosine_rewards": 0.003419560845941305,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00016866176156327128,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 376.875,
"epoch": 0.11920529801324503,
"grad_norm": 24.881210594835412,
"kl": 0.0986328125,
"learning_rate": 9.403973509933774e-07,
"loss": 0.0039,
"reward": 0.9971878528594971,
"reward_std": 0.8903799057006836,
"rewards/accuracy_reward": 0.09375,
"rewards/cosine_rewards": -0.018273995257914066,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.00016315293032675982,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 432.28125,
"epoch": 0.11971472236372899,
"grad_norm": 4.4996951306965975,
"kl": 0.08544921875,
"learning_rate": 9.401426388181355e-07,
"loss": 0.0034,
"reward": 1.3378186225891113,
"reward_std": 0.8512288331985474,
"rewards/accuracy_reward": 0.4593750089406967,
"rewards/cosine_rewards": -0.027322867885231972,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.000483501615235582,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 465.765625,
"epoch": 0.12022414671421294,
"grad_norm": 5.402908709299499,
"kl": 0.080322265625,
"learning_rate": 9.398879266428935e-07,
"loss": 0.0032,
"reward": 1.4937435388565063,
"reward_std": 0.35082364082336426,
"rewards/accuracy_reward": 0.5499999821186066,
"rewards/cosine_rewards": -0.024816589895635843,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0001899001763376873,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 431.796875,
"epoch": 0.1207335710646969,
"grad_norm": 11.178072528468537,
"kl": 0.0947265625,
"learning_rate": 9.396332144676515e-07,
"loss": 0.0038,
"reward": 1.1548867225646973,
"reward_std": 0.8218154907226562,
"rewards/accuracy_reward": 0.23749998956918716,
"rewards/cosine_rewards": -0.0042152018286287785,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0002730985652306117,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 506.640625,
"epoch": 0.12124299541518084,
"grad_norm": 3.9623481852584983,
"kl": 0.078857421875,
"learning_rate": 9.393785022924095e-07,
"loss": 0.0032,
"reward": 1.2667301297187805,
"reward_std": 0.8781076371669769,
"rewards/accuracy_reward": 0.40937498211860657,
"rewards/cosine_rewards": -0.04850983805954456,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0003849874483421445,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 443.515625,
"epoch": 0.1217524197656648,
"grad_norm": 5.742693044990549,
"kl": 0.094482421875,
"learning_rate": 9.391237901171676e-07,
"loss": 0.0038,
"reward": 0.6565631031990051,
"reward_std": 0.7225559949874878,
"rewards/accuracy_reward": -0.15312501043081284,
"rewards/cosine_rewards": -0.08046763762831688,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.00046927113726269454,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 347.265625,
"epoch": 0.12226184411614875,
"grad_norm": 7.476762059683771,
"kl": 0.08984375,
"learning_rate": 9.388690779419256e-07,
"loss": 0.0036,
"reward": 1.2646641731262207,
"reward_std": 0.35142165422439575,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -0.016350463964045048,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00023539320682175457,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 241.125,
"epoch": 0.12277126846663271,
"grad_norm": 7.077545719384053,
"kl": 0.101318359375,
"learning_rate": 9.386143657666836e-07,
"loss": 0.0041,
"reward": 0.9919856488704681,
"reward_std": 0.6525652855634689,
"rewards/accuracy_reward": 0.043749988079071045,
"rewards/cosine_rewards": -0.020295456051826477,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.00021890102652832866,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 240.765625,
"epoch": 0.12328069281711666,
"grad_norm": 8.441107538365218,
"kl": 0.1044921875,
"learning_rate": 9.383596535914417e-07,
"loss": 0.0042,
"reward": 1.616421401500702,
"reward_std": 0.3022947758436203,
"rewards/accuracy_reward": 0.6593749970197678,
"rewards/cosine_rewards": 0.004121019504964352,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.00019959894416388124,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 175.796875,
"epoch": 0.1237901171676006,
"grad_norm": 9.380134906229868,
"kl": 0.11083984375,
"learning_rate": 9.381049414161997e-07,
"loss": 0.0044,
"reward": 1.3176445960998535,
"reward_std": 0.3997122645378113,
"rewards/accuracy_reward": 0.32499998807907104,
"rewards/cosine_rewards": -0.0073250585701316595,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -3.028100763913244e-05,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 220.53125,
"epoch": 0.12429954151808456,
"grad_norm": 5.4472083052212765,
"kl": 0.109375,
"learning_rate": 9.378502292409577e-07,
"loss": 0.0044,
"reward": 1.6422365307807922,
"reward_std": 0.2728146519511938,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.01141381449997425,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.00042724609375,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 248.828125,
"epoch": 0.12480896586856852,
"grad_norm": 8.442731808091501,
"kl": 0.1083984375,
"learning_rate": 9.375955170657157e-07,
"loss": 0.0043,
"reward": 1.3157773613929749,
"reward_std": 0.4320952445268631,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": 0.009643017314374447,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.00011561772407731041,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 222.6875,
"epoch": 0.12531839021905247,
"grad_norm": 14.294535330077375,
"kl": 0.1171875,
"learning_rate": 9.373408048904738e-07,
"loss": 0.0047,
"reward": 1.331631362438202,
"reward_std": 0.4216170907020569,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": 0.00982090923935175,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -6.454958565882407e-05,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 219.984375,
"epoch": 0.12582781456953643,
"grad_norm": 9.55932148178992,
"kl": 0.108642578125,
"learning_rate": 9.370860927152318e-07,
"loss": 0.0043,
"reward": 1.340530276298523,
"reward_std": 0.4534093588590622,
"rewards/accuracy_reward": 0.3531250096857548,
"rewards/cosine_rewards": 0.003091069171205163,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -6.0797665355494246e-05,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 325.234375,
"epoch": 0.12633723892002038,
"grad_norm": 17.80951371391722,
"kl": 0.110595703125,
"learning_rate": 9.368313805399897e-07,
"loss": 0.0044,
"reward": 1.2751246690750122,
"reward_std": 0.520209550857544,
"rewards/accuracy_reward": 0.2968749850988388,
"rewards/cosine_rewards": 0.010451191570609808,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0009515111669315957,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 366.625,
"epoch": 0.12684666327050434,
"grad_norm": 12.086775377390172,
"kl": 0.111083984375,
"learning_rate": 9.365766683647478e-07,
"loss": 0.0044,
"reward": 0.8402246385812759,
"reward_std": 0.6177513003349304,
"rewards/accuracy_reward": -0.04062502086162567,
"rewards/cosine_rewards": -0.04077841015532613,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0002469850951456465,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 449.125,
"epoch": 0.1273560876209883,
"grad_norm": 11.748922331365623,
"kl": 0.0869140625,
"learning_rate": 9.363219561895058e-07,
"loss": 0.0035,
"reward": 1.7429784536361694,
"reward_std": 0.6669142842292786,
"rewards/accuracy_reward": 0.746874988079071,
"rewards/cosine_rewards": 0.02791230659931898,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.000558776329853572,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 523.34375,
"epoch": 0.12786551197147222,
"grad_norm": 5.635036318066103,
"kl": 0.073974609375,
"learning_rate": 9.360672440142638e-07,
"loss": 0.003,
"reward": 1.3756027221679688,
"reward_std": 0.3430413454771042,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": -0.01460547186434269,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0004167625156696886,
"step": 251
},
{
"clip_ratio": 0.0,
"completion_length": 583.78125,
"epoch": 0.12837493632195618,
"grad_norm": 3.7721855945126013,
"kl": 0.071533203125,
"learning_rate": 9.358125318390219e-07,
"loss": 0.0029,
"reward": 1.1054343283176422,
"reward_std": 0.9506143927574158,
"rewards/accuracy_reward": 0.23749998211860657,
"rewards/cosine_rewards": -0.06930245459079742,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.00026325164799345657,
"step": 252
},
{
"clip_ratio": 0.0,
"completion_length": 691.8125,
"epoch": 0.12888436067244013,
"grad_norm": 4.515905173052182,
"kl": 0.06494140625,
"learning_rate": 9.355578196637799e-07,
"loss": 0.0026,
"reward": 1.1395662426948547,
"reward_std": 1.164560616016388,
"rewards/accuracy_reward": 0.24062499403953552,
"rewards/cosine_rewards": -0.06932513415813446,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0004836731095565483,
"step": 253
},
{
"clip_ratio": 0.0,
"completion_length": 750.703125,
"epoch": 0.1293937850229241,
"grad_norm": 3.759822074251192,
"kl": 0.0609130859375,
"learning_rate": 9.353031074885379e-07,
"loss": 0.0024,
"reward": 1.3212904930114746,
"reward_std": 0.9738726019859314,
"rewards/accuracy_reward": 0.4625000059604645,
"rewards/cosine_rewards": -0.03070250153541565,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0011320026533212513,
"step": 254
},
{
"clip_ratio": 0.0,
"completion_length": 604.40625,
"epoch": 0.12990320937340805,
"grad_norm": 4.697641184697458,
"kl": 0.082763671875,
"learning_rate": 9.350483953132959e-07,
"loss": 0.0033,
"reward": 1.1115484535694122,
"reward_std": 0.7478219866752625,
"rewards/accuracy_reward": 0.23125000298023224,
"rewards/cosine_rewards": -0.05675292294472456,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.00044860908383270726,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 635.84375,
"epoch": 0.130412633723892,
"grad_norm": 4.028619362240205,
"kl": 0.09765625,
"learning_rate": 9.34793683138054e-07,
"loss": 0.0039,
"reward": 1.457118034362793,
"reward_std": 0.788001298904419,
"rewards/accuracy_reward": 0.5218749791383743,
"rewards/cosine_rewards": -0.0015127966180443764,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0007440973713528365,
"step": 256
},
{
"clip_ratio": 0.0,
"completion_length": 589.40625,
"epoch": 0.13092205807437596,
"grad_norm": 5.63417318829876,
"kl": 0.07275390625,
"learning_rate": 9.34538970962812e-07,
"loss": 0.0029,
"reward": 1.4154019951820374,
"reward_std": 0.815990686416626,
"rewards/accuracy_reward": 0.518750011920929,
"rewards/cosine_rewards": -0.02476619742810726,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.00045683811185881495,
"step": 257
},
{
"clip_ratio": 0.0,
"completion_length": 676.984375,
"epoch": 0.1314314824248599,
"grad_norm": 6.021417699991294,
"kl": 0.065673828125,
"learning_rate": 9.3428425878757e-07,
"loss": 0.0026,
"reward": 0.6927553117275238,
"reward_std": 0.8777336776256561,
"rewards/accuracy_reward": -0.0781250074505806,
"rewards/cosine_rewards": -0.15041033178567886,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0005843567778356373,
"step": 258
},
{
"clip_ratio": 0.0,
"completion_length": 555.203125,
"epoch": 0.13194090677534387,
"grad_norm": 8.306056976533926,
"kl": 0.082763671875,
"learning_rate": 9.340295466123281e-07,
"loss": 0.0033,
"reward": 1.2112269699573517,
"reward_std": 0.9674933552742004,
"rewards/accuracy_reward": 0.43437499552965164,
"rewards/cosine_rewards": -0.08218972198665142,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0003333477216074243,
"step": 259
},
{
"clip_ratio": 0.0,
"completion_length": 694.484375,
"epoch": 0.13245033112582782,
"grad_norm": 6.232070420425315,
"kl": 0.06689453125,
"learning_rate": 9.337748344370861e-07,
"loss": 0.0027,
"reward": 1.0117461681365967,
"reward_std": 0.7802118062973022,
"rewards/accuracy_reward": 0.21249999478459358,
"rewards/cosine_rewards": -0.09079772233963013,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0005810301227029413,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 543.5625,
"epoch": 0.13295975547631178,
"grad_norm": 6.334798789966088,
"kl": 0.08544921875,
"learning_rate": 9.335201222618441e-07,
"loss": 0.0034,
"reward": 1.0542153716087341,
"reward_std": 0.8291297852993011,
"rewards/accuracy_reward": 0.18124999105930328,
"rewards/cosine_rewards": -0.032575659453868866,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0007089868013281375,
"step": 261
},
{
"clip_ratio": 0.0,
"completion_length": 356.015625,
"epoch": 0.1334691798267957,
"grad_norm": 10.460916199726386,
"kl": 0.098388671875,
"learning_rate": 9.33265410086602e-07,
"loss": 0.0039,
"reward": 0.6651052087545395,
"reward_std": 0.9073293209075928,
"rewards/accuracy_reward": -0.09687501192092896,
"rewards/cosine_rewards": -0.03463773522526026,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.000257108491496183,
"step": 262
},
{
"clip_ratio": 0.0,
"completion_length": 345.109375,
"epoch": 0.13397860417727966,
"grad_norm": 105.36035227056938,
"kl": 0.10546875,
"learning_rate": 9.330106979113601e-07,
"loss": 0.0042,
"reward": 1.6959076523780823,
"reward_std": 0.6433850526809692,
"rewards/accuracy_reward": 0.7374999523162842,
"rewards/cosine_rewards": 0.036790573969483376,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0002579164138296619,
"step": 263
},
{
"clip_ratio": 0.0,
"completion_length": 225.671875,
"epoch": 0.13448802852776362,
"grad_norm": 12.269079038415155,
"kl": 0.1044921875,
"learning_rate": 9.327559857361181e-07,
"loss": 0.0042,
"reward": 1.3304521441459656,
"reward_std": 0.7337057292461395,
"rewards/accuracy_reward": 0.40312500298023224,
"rewards/cosine_rewards": -0.009734044317156076,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0004387954395497218,
"step": 264
},
{
"clip_ratio": 0.0,
"completion_length": 191.421875,
"epoch": 0.13499745287824758,
"grad_norm": 5.0495702848173805,
"kl": 0.12451171875,
"learning_rate": 9.325012735608761e-07,
"loss": 0.005,
"reward": 1.5114508867263794,
"reward_std": 0.4991532266139984,
"rewards/accuracy_reward": 0.6031249910593033,
"rewards/cosine_rewards": 0.0021946561755612493,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0001188212918350473,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 212.890625,
"epoch": 0.13550687722873153,
"grad_norm": 8.380147213080475,
"kl": 0.11376953125,
"learning_rate": 9.322465613856342e-07,
"loss": 0.0046,
"reward": 1.3185867071151733,
"reward_std": 0.5081266015768051,
"rewards/accuracy_reward": 0.37812499701976776,
"rewards/cosine_rewards": 0.0029779861215502024,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -1.6225338185904548e-05,
"step": 266
},
{
"clip_ratio": 0.0,
"completion_length": 163.4375,
"epoch": 0.1360163015792155,
"grad_norm": 6.7670554711392725,
"kl": 0.1259765625,
"learning_rate": 9.319918492103922e-07,
"loss": 0.005,
"reward": 1.917210876941681,
"reward_std": 0.2323581874370575,
"rewards/accuracy_reward": 0.96875,
"rewards/cosine_rewards": 0.011114767286926508,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.00015385003644041717,
"step": 267
},
{
"clip_ratio": 0.0,
"completion_length": 156.0,
"epoch": 0.13652572592969944,
"grad_norm": 7.285618988190272,
"kl": 0.119873046875,
"learning_rate": 9.317371370351502e-07,
"loss": 0.0048,
"reward": 1.2626032829284668,
"reward_std": 0.6921159029006958,
"rewards/accuracy_reward": 0.34687499701976776,
"rewards/cosine_rewards": -0.006146675441414118,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": 0.0,
"step": 268
},
{
"clip_ratio": 0.0,
"completion_length": 152.953125,
"epoch": 0.1370351502801834,
"grad_norm": 10.577916333197056,
"kl": 0.140625,
"learning_rate": 9.314824248599083e-07,
"loss": 0.0056,
"reward": 1.2036974430084229,
"reward_std": 0.5991593599319458,
"rewards/accuracy_reward": 0.2968749850988388,
"rewards/cosine_rewards": 0.0007108037825673819,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0001382743357680738,
"step": 269
},
{
"clip_ratio": 0.0,
"completion_length": 158.34375,
"epoch": 0.13754457463066735,
"grad_norm": 14.293416719573916,
"kl": 0.1201171875,
"learning_rate": 9.312277126846663e-07,
"loss": 0.0048,
"reward": 1.2185573279857635,
"reward_std": 0.43015679717063904,
"rewards/accuracy_reward": 0.24062500149011612,
"rewards/cosine_rewards": -0.006263321032747626,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00017934850984602235,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 166.46875,
"epoch": 0.1380539989811513,
"grad_norm": 9.393354579467495,
"kl": 0.1240234375,
"learning_rate": 9.309730005094243e-07,
"loss": 0.005,
"reward": 1.5472444295883179,
"reward_std": 0.5279964953660965,
"rewards/accuracy_reward": 0.606249988079071,
"rewards/cosine_rewards": 0.003494387026876211,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 271
},
{
"clip_ratio": 0.0,
"completion_length": 149.609375,
"epoch": 0.13856342333163527,
"grad_norm": 7.200843784037537,
"kl": 0.117431640625,
"learning_rate": 9.307182883341823e-07,
"loss": 0.0047,
"reward": 1.3769221901893616,
"reward_std": 0.4806235730648041,
"rewards/accuracy_reward": 0.40937501937150955,
"rewards/cosine_rewards": -0.0011419787188060582,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -6.0797665355494246e-05,
"step": 272
},
{
"clip_ratio": 0.0,
"completion_length": 143.828125,
"epoch": 0.1390728476821192,
"grad_norm": 11.003082967675637,
"kl": 0.18359375,
"learning_rate": 9.304635761589404e-07,
"loss": 0.0073,
"reward": 1.3800683617591858,
"reward_std": 0.4095611423254013,
"rewards/accuracy_reward": 0.40937498211860657,
"rewards/cosine_rewards": 0.0019433526322245598,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": 0.0,
"step": 273
},
{
"clip_ratio": 0.0,
"completion_length": 144.453125,
"epoch": 0.13958227203260315,
"grad_norm": 6.9562429559567285,
"kl": 0.130859375,
"learning_rate": 9.302088639836984e-07,
"loss": 0.0052,
"reward": 1.424567699432373,
"reward_std": 0.2551300157792866,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": 0.0028896235453430563,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0001969077275134623,
"step": 274
},
{
"clip_ratio": 0.0,
"completion_length": 149.296875,
"epoch": 0.1400916963830871,
"grad_norm": 9.330566123712217,
"kl": 0.1240234375,
"learning_rate": 9.299541518084564e-07,
"loss": 0.005,
"reward": 1.2650930285453796,
"reward_std": 0.42955365777015686,
"rewards/accuracy_reward": 0.32499999552965164,
"rewards/cosine_rewards": 0.0025930306874215603,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 207.75,
"epoch": 0.14060112073357106,
"grad_norm": 8.495983071057866,
"kl": 0.11962890625,
"learning_rate": 9.296994396332144e-07,
"loss": 0.0048,
"reward": 1.8627826571464539,
"reward_std": 0.2839447557926178,
"rewards/accuracy_reward": 0.859375,
"rewards/cosine_rewards": 0.019193909130990505,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0001612851265235804,
"step": 276
},
{
"clip_ratio": 0.0,
"completion_length": 162.515625,
"epoch": 0.14111054508405502,
"grad_norm": 12.855580556594866,
"kl": 0.14306640625,
"learning_rate": 9.294447274579724e-07,
"loss": 0.0057,
"reward": 1.5162805318832397,
"reward_std": 0.6588033437728882,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": -0.008702149149030447,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -1.7208149074576795e-05,
"step": 277
},
{
"clip_ratio": 0.0,
"completion_length": 281.484375,
"epoch": 0.14161996943453897,
"grad_norm": 10.357622467045976,
"kl": 0.102783203125,
"learning_rate": 9.291900152827304e-07,
"loss": 0.0041,
"reward": 1.1203789710998535,
"reward_std": 0.6814777851104736,
"rewards/accuracy_reward": 0.17812500894069672,
"rewards/cosine_rewards": -0.010577938985079527,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0002930604387074709,
"step": 278
},
{
"clip_ratio": 0.0,
"completion_length": 235.390625,
"epoch": 0.14212939378502293,
"grad_norm": 23.483530147678458,
"kl": 0.114013671875,
"learning_rate": 9.289353031074884e-07,
"loss": 0.0046,
"reward": 1.3522316813468933,
"reward_std": 0.28182537853717804,
"rewards/accuracy_reward": 0.3812500238418579,
"rewards/cosine_rewards": 0.0023158364929258823,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -8.418447396252304e-05,
"step": 279
},
{
"clip_ratio": 0.0,
"completion_length": 271.96875,
"epoch": 0.14263881813550688,
"grad_norm": 5.804128123317947,
"kl": 0.109619140625,
"learning_rate": 9.286805909322465e-07,
"loss": 0.0044,
"reward": 1.2828457355499268,
"reward_std": 0.5574119389057159,
"rewards/accuracy_reward": 0.3500000163912773,
"rewards/cosine_rewards": -0.004654169548302889,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": 0.0,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 346.984375,
"epoch": 0.14314824248599084,
"grad_norm": 9.062411976412948,
"kl": 0.09130859375,
"learning_rate": 9.284258787570045e-07,
"loss": 0.0037,
"reward": 1.9385767579078674,
"reward_std": 0.3151838555932045,
"rewards/accuracy_reward": 0.9437500238418579,
"rewards/cosine_rewards": 0.04203657992184162,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0003348248792462982,
"step": 281
},
{
"clip_ratio": 0.0,
"completion_length": 395.875,
"epoch": 0.1436576668364748,
"grad_norm": 8.221976106115973,
"kl": 0.104248046875,
"learning_rate": 9.281711665817625e-07,
"loss": 0.0042,
"reward": 1.323907494544983,
"reward_std": 0.6098371148109436,
"rewards/accuracy_reward": 0.40312501788139343,
"rewards/cosine_rewards": 0.014727211673744023,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.00019470852021186147,
"step": 282
},
{
"clip_ratio": 0.0,
"completion_length": 603.21875,
"epoch": 0.14416709118695872,
"grad_norm": 7.812929807725803,
"kl": 0.084228515625,
"learning_rate": 9.279164544065206e-07,
"loss": 0.0034,
"reward": 1.3660696744918823,
"reward_std": 0.6207956671714783,
"rewards/accuracy_reward": 0.46562500298023224,
"rewards/cosine_rewards": -0.005538210505619645,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0002670584217412397,
"step": 283
},
{
"clip_ratio": 0.0,
"completion_length": 525.6875,
"epoch": 0.14467651553744268,
"grad_norm": 8.491615227525342,
"kl": 0.08056640625,
"learning_rate": 9.276617422312786e-07,
"loss": 0.0032,
"reward": 1.3353699743747711,
"reward_std": 0.5946642160415649,
"rewards/accuracy_reward": 0.40937500447034836,
"rewards/cosine_rewards": -0.02690817415714264,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0002218634108430706,
"step": 284
},
{
"clip_ratio": 0.0,
"completion_length": 599.984375,
"epoch": 0.14518593988792663,
"grad_norm": 16.68462964732329,
"kl": 0.077880859375,
"learning_rate": 9.274070300560366e-07,
"loss": 0.0031,
"reward": 0.9605185687541962,
"reward_std": 0.7793702185153961,
"rewards/accuracy_reward": 0.09999999031424522,
"rewards/cosine_rewards": -0.06100003980100155,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0003564156068023294,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 649.3125,
"epoch": 0.1456953642384106,
"grad_norm": 10.597989534798653,
"kl": 0.068115234375,
"learning_rate": 9.271523178807946e-07,
"loss": 0.0027,
"reward": 1.1927469968795776,
"reward_std": 1.0099957585334778,
"rewards/accuracy_reward": 0.34999997913837433,
"rewards/cosine_rewards": -0.04736426845192909,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0005137407861184329,
"step": 286
},
{
"clip_ratio": 0.0,
"completion_length": 621.234375,
"epoch": 0.14620478858889455,
"grad_norm": 5.399955402557674,
"kl": 0.072265625,
"learning_rate": 9.268976057055527e-07,
"loss": 0.0029,
"reward": 0.821646511554718,
"reward_std": 0.9464232325553894,
"rewards/accuracy_reward": 0.03749999776482582,
"rewards/cosine_rewards": -0.10573448240756989,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0007440397967002355,
"step": 287
},
{
"clip_ratio": 0.0,
"completion_length": 646.796875,
"epoch": 0.1467142129393785,
"grad_norm": 5.9108976297695355,
"kl": 0.075439453125,
"learning_rate": 9.266428935303107e-07,
"loss": 0.003,
"reward": 1.8053097128868103,
"reward_std": 0.5278272330760956,
"rewards/accuracy_reward": 0.7749999761581421,
"rewards/cosine_rewards": 0.061956772580742836,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0003969733224948868,
"step": 288
},
{
"clip_ratio": 0.0,
"completion_length": 628.484375,
"epoch": 0.14722363728986246,
"grad_norm": 4.280094122642851,
"kl": 0.0692138671875,
"learning_rate": 9.263881813550687e-07,
"loss": 0.0028,
"reward": 0.7580513060092926,
"reward_std": 0.9215057492256165,
"rewards/accuracy_reward": -0.04062502086162567,
"rewards/cosine_rewards": -0.1223737820982933,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0008249446109402925,
"step": 289
},
{
"clip_ratio": 0.0,
"completion_length": 697.03125,
"epoch": 0.1477330616403464,
"grad_norm": 4.704795726585343,
"kl": 0.068359375,
"learning_rate": 9.261334691798267e-07,
"loss": 0.0027,
"reward": 1.0915009379386902,
"reward_std": 0.6004486382007599,
"rewards/accuracy_reward": 0.21249999105930328,
"rewards/cosine_rewards": -0.05759305879473686,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0009060115553438663,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 739.375,
"epoch": 0.14824248599083037,
"grad_norm": 5.468968593755717,
"kl": 0.065185546875,
"learning_rate": 9.258787570045847e-07,
"loss": 0.0026,
"reward": 1.328648567199707,
"reward_std": 0.8502229452133179,
"rewards/accuracy_reward": 0.40312500298023224,
"rewards/cosine_rewards": -0.027191368862986565,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.00041003923979587853,
"step": 291
},
{
"clip_ratio": 0.0,
"completion_length": 800.390625,
"epoch": 0.14875191034131433,
"grad_norm": 2.644913201802289,
"kl": 0.07861328125,
"learning_rate": 9.256240448293427e-07,
"loss": 0.0031,
"reward": 1.5775163769721985,
"reward_std": 0.6978716552257538,
"rewards/accuracy_reward": 0.6562500149011612,
"rewards/cosine_rewards": 0.031187113374471664,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0005458263913169503,
"step": 292
},
{
"clip_ratio": 0.0,
"completion_length": 969.328125,
"epoch": 0.14926133469179828,
"grad_norm": 3.858694298808609,
"kl": 0.0548095703125,
"learning_rate": 9.253693326541008e-07,
"loss": 0.0022,
"reward": 0.39561687409877777,
"reward_std": 1.1356619894504547,
"rewards/accuracy_reward": -0.1625000238418579,
"rewards/cosine_rewards": -0.23805859684944153,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.0006995665607973933,
"step": 293
},
{
"clip_ratio": 0.0,
"completion_length": 1044.703125,
"epoch": 0.1497707590422822,
"grad_norm": 2.0660275837501643,
"kl": 0.0902099609375,
"learning_rate": 9.251146204788588e-07,
"loss": 0.0036,
"reward": 1.0626700818538666,
"reward_std": 1.1662874221801758,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": -0.055325835943222046,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": -0.0007540385995525867,
"step": 294
},
{
"clip_ratio": 0.0,
"completion_length": 934.28125,
"epoch": 0.15028018339276616,
"grad_norm": 7.548230617974183,
"kl": 0.0538330078125,
"learning_rate": 9.248599083036168e-07,
"loss": 0.0022,
"reward": 1.2535955309867859,
"reward_std": 1.0525287985801697,
"rewards/accuracy_reward": 0.3750000223517418,
"rewards/cosine_rewards": -0.04287016252055764,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.00040922046173363924,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 790.09375,
"epoch": 0.15078960774325012,
"grad_norm": 3.7401563979919654,
"kl": 0.0584716796875,
"learning_rate": 9.246051961283748e-07,
"loss": 0.0023,
"reward": 1.1489249467849731,
"reward_std": 0.5376773178577423,
"rewards/accuracy_reward": 0.2937499899417162,
"rewards/cosine_rewards": -0.08189126010984182,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0004337812424637377,
"step": 296
},
{
"clip_ratio": 0.0,
"completion_length": 836.375,
"epoch": 0.15129903209373408,
"grad_norm": 2.8153067499507105,
"kl": 0.0618896484375,
"learning_rate": 9.243504839531329e-07,
"loss": 0.0025,
"reward": 1.3525272011756897,
"reward_std": 0.8126451969146729,
"rewards/accuracy_reward": 0.4906250238418579,
"rewards/cosine_rewards": -0.012518584728240967,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.000579186889808625,
"step": 297
},
{
"clip_ratio": 0.0,
"completion_length": 890.625,
"epoch": 0.15180845644421803,
"grad_norm": 5.316949650260697,
"kl": 0.0552978515625,
"learning_rate": 9.240957717778909e-07,
"loss": 0.0022,
"reward": 1.2640092372894287,
"reward_std": 0.8870376944541931,
"rewards/accuracy_reward": 0.4062499850988388,
"rewards/cosine_rewards": -0.0009442958980798721,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0006714609917253256,
"step": 298
},
{
"clip_ratio": 0.0,
"completion_length": 813.953125,
"epoch": 0.152317880794702,
"grad_norm": 3.8825478721674953,
"kl": 0.0574951171875,
"learning_rate": 9.23841059602649e-07,
"loss": 0.0023,
"reward": 1.2717376947402954,
"reward_std": 0.830648809671402,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": -0.03994514420628548,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0008170758956111968,
"step": 299
},
{
"clip_ratio": 0.0,
"completion_length": 752.140625,
"epoch": 0.15282730514518594,
"grad_norm": 5.06920521582769,
"kl": 0.059814453125,
"learning_rate": 9.235863474274071e-07,
"loss": 0.0024,
"reward": 1.1217154264450073,
"reward_std": 0.8524642586708069,
"rewards/accuracy_reward": 0.24062498286366463,
"rewards/cosine_rewards": -0.04011305421590805,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0006714656192343682,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 684.625,
"epoch": 0.1533367294956699,
"grad_norm": 8.555507127767159,
"kl": 0.0672607421875,
"learning_rate": 9.233316352521651e-07,
"loss": 0.0027,
"reward": 1.1471417546272278,
"reward_std": 0.7909112870693207,
"rewards/accuracy_reward": 0.2656249962747097,
"rewards/cosine_rewards": -0.039835451170802116,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0005227623041719198,
"step": 301
},
{
"clip_ratio": 0.0,
"completion_length": 667.78125,
"epoch": 0.15384615384615385,
"grad_norm": 2.9040824114504877,
"kl": 0.064697265625,
"learning_rate": 9.230769230769231e-07,
"loss": 0.0026,
"reward": 0.9261243343353271,
"reward_std": 0.668161928653717,
"rewards/accuracy_reward": 0.1281249988824129,
"rewards/cosine_rewards": -0.06077958270907402,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0005960852140560746,
"step": 302
},
{
"clip_ratio": 0.0,
"completion_length": 702.484375,
"epoch": 0.1543555781966378,
"grad_norm": 4.4461209381275655,
"kl": 0.06298828125,
"learning_rate": 9.228222109016812e-07,
"loss": 0.0025,
"reward": 1.506935715675354,
"reward_std": 0.6653757691383362,
"rewards/accuracy_reward": 0.5468749850988388,
"rewards/cosine_rewards": 0.03871871158480644,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0005329845298547298,
"step": 303
},
{
"clip_ratio": 0.0,
"completion_length": 633.140625,
"epoch": 0.15486500254712177,
"grad_norm": 3.9254091150548933,
"kl": 0.069091796875,
"learning_rate": 9.225674987264391e-07,
"loss": 0.0028,
"reward": 1.3886016011238098,
"reward_std": 0.9017740190029144,
"rewards/accuracy_reward": 0.5749999731779099,
"rewards/cosine_rewards": -0.02933959849178791,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.000808820070233196,
"step": 304
},
{
"clip_ratio": 0.0,
"completion_length": 648.625,
"epoch": 0.1553744268976057,
"grad_norm": 6.070022774209878,
"kl": 0.068115234375,
"learning_rate": 9.223127865511971e-07,
"loss": 0.0027,
"reward": 1.6925800442695618,
"reward_std": 0.6231902837753296,
"rewards/accuracy_reward": 0.6625000238418579,
"rewards/cosine_rewards": 0.06164960749447346,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0003195497556589544,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 614.15625,
"epoch": 0.15588385124808965,
"grad_norm": 11.091865062468658,
"kl": 0.317138671875,
"learning_rate": 9.220580743759551e-07,
"loss": 0.0127,
"reward": 1.5423057079315186,
"reward_std": 0.3847469687461853,
"rewards/accuracy_reward": 0.5468749962747097,
"rewards/cosine_rewards": 0.05880427733063698,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0008736126183066517,
"step": 306
},
{
"clip_ratio": 0.0,
"completion_length": 612.578125,
"epoch": 0.1563932755985736,
"grad_norm": 3.103459103182676,
"kl": 0.0673828125,
"learning_rate": 9.218033622007132e-07,
"loss": 0.0027,
"reward": 1.6744784712791443,
"reward_std": 0.659433513879776,
"rewards/accuracy_reward": 0.6875,
"rewards/cosine_rewards": 0.06587037723511457,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0007668640464544296,
"step": 307
},
{
"clip_ratio": 0.0,
"completion_length": 657.265625,
"epoch": 0.15690269994905756,
"grad_norm": 4.50781660421839,
"kl": 0.068115234375,
"learning_rate": 9.215486500254712e-07,
"loss": 0.0027,
"reward": 1.145881563425064,
"reward_std": 1.0458006858825684,
"rewards/accuracy_reward": 0.34062499180436134,
"rewards/cosine_rewards": -0.03727734461426735,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0012161528575234115,
"step": 308
},
{
"clip_ratio": 0.0,
"completion_length": 771.5625,
"epoch": 0.15741212429954152,
"grad_norm": 6.920330329994176,
"kl": 0.064208984375,
"learning_rate": 9.212939378502292e-07,
"loss": 0.0026,
"reward": 0.8615269958972931,
"reward_std": 0.9165626764297485,
"rewards/accuracy_reward": 0.140625,
"rewards/cosine_rewards": -0.07544910162687302,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.0005239159800112247,
"step": 309
},
{
"clip_ratio": 0.0,
"completion_length": 758.96875,
"epoch": 0.15792154865002547,
"grad_norm": 11.706103376756111,
"kl": 0.056396484375,
"learning_rate": 9.210392256749873e-07,
"loss": 0.0023,
"reward": 1.567901074886322,
"reward_std": 1.1157508492469788,
"rewards/accuracy_reward": 0.6437499821186066,
"rewards/cosine_rewards": 0.08178849518299103,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0013874000869691372,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 694.390625,
"epoch": 0.15843097300050943,
"grad_norm": 4.42128730127276,
"kl": 0.062255859375,
"learning_rate": 9.207845134997453e-07,
"loss": 0.0025,
"reward": 0.944963201880455,
"reward_std": 0.9125352203845978,
"rewards/accuracy_reward": 0.16249998658895493,
"rewards/cosine_rewards": -0.04496639594435692,
"rewards/format_reward": 0.828125,
"rewards/repetition_rewards": -0.0006953877746127546,
"step": 311
},
{
"clip_ratio": 0.0,
"completion_length": 812.46875,
"epoch": 0.15894039735099338,
"grad_norm": 5.888648418898334,
"kl": 0.0587158203125,
"learning_rate": 9.205298013245033e-07,
"loss": 0.0023,
"reward": 0.6825668215751648,
"reward_std": 1.0514086484909058,
"rewards/accuracy_reward": 0.04999999701976776,
"rewards/cosine_rewards": -0.13240730948746204,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": -0.0006508340884465724,
"step": 312
},
{
"clip_ratio": 0.0,
"completion_length": 703.703125,
"epoch": 0.15944982170147734,
"grad_norm": 5.201587660434957,
"kl": 0.0626220703125,
"learning_rate": 9.202750891492613e-07,
"loss": 0.0025,
"reward": 0.849999725818634,
"reward_std": 1.2490254640579224,
"rewards/accuracy_reward": 0.16249999590218067,
"rewards/cosine_rewards": -0.04634671099483967,
"rewards/format_reward": 0.734375,
"rewards/repetition_rewards": -0.0005285786173772067,
"step": 313
},
{
"clip_ratio": 0.0,
"completion_length": 732.84375,
"epoch": 0.1599592460519613,
"grad_norm": 41.79369545195822,
"kl": 0.0654296875,
"learning_rate": 9.200203769740194e-07,
"loss": 0.0026,
"reward": 1.359117031097412,
"reward_std": 1.1281075477600098,
"rewards/accuracy_reward": 0.49687501788139343,
"rewards/cosine_rewards": 0.06599474605172873,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.0006276974454522133,
"step": 314
},
{
"clip_ratio": 0.0,
"completion_length": 633.609375,
"epoch": 0.16046867040244522,
"grad_norm": 5.659472242303819,
"kl": 0.090087890625,
"learning_rate": 9.197656647987774e-07,
"loss": 0.0036,
"reward": 1.149334043264389,
"reward_std": 1.1551178693771362,
"rewards/accuracy_reward": 0.3593749962747097,
"rewards/cosine_rewards": 0.025284748524427414,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": -0.0009507373906672001,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 704.09375,
"epoch": 0.16097809475292918,
"grad_norm": 5.702114455603425,
"kl": 0.071044921875,
"learning_rate": 9.195109526235354e-07,
"loss": 0.0028,
"reward": 1.3667227029800415,
"reward_std": 0.6237545907497406,
"rewards/accuracy_reward": 0.4031249713152647,
"rewards/cosine_rewards": 0.01146969199180603,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0009969472303055227,
"step": 316
},
{
"clip_ratio": 0.0,
"completion_length": 615.984375,
"epoch": 0.16148751910341314,
"grad_norm": 7.41926766137556,
"kl": 0.072998046875,
"learning_rate": 9.192562404482935e-07,
"loss": 0.0029,
"reward": 1.2655977010726929,
"reward_std": 0.7071200311183929,
"rewards/accuracy_reward": 0.37187498807907104,
"rewards/cosine_rewards": -0.0119027029722929,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.00062458252068609,
"step": 317
},
{
"clip_ratio": 0.0,
"completion_length": 655.65625,
"epoch": 0.1619969434538971,
"grad_norm": 6.476802877604566,
"kl": 0.072265625,
"learning_rate": 9.190015282730514e-07,
"loss": 0.0029,
"reward": 1.4926868677139282,
"reward_std": 0.5651115030050278,
"rewards/accuracy_reward": 0.4906250089406967,
"rewards/cosine_rewards": 0.05030408315360546,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0013672530185431242,
"step": 318
},
{
"clip_ratio": 0.0,
"completion_length": 694.4375,
"epoch": 0.16250636780438105,
"grad_norm": 5.651262358287829,
"kl": 0.078125,
"learning_rate": 9.187468160978094e-07,
"loss": 0.0031,
"reward": 1.6467930674552917,
"reward_std": 0.6130897700786591,
"rewards/accuracy_reward": 0.6343749761581421,
"rewards/cosine_rewards": 0.060116952285170555,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008239042945206165,
"step": 319
},
{
"clip_ratio": 0.0,
"completion_length": 627.1875,
"epoch": 0.163015792154865,
"grad_norm": 8.752414113774137,
"kl": 0.087890625,
"learning_rate": 9.184921039225674e-07,
"loss": 0.0035,
"reward": 1.2824658155441284,
"reward_std": 0.6804981231689453,
"rewards/accuracy_reward": 0.4281250089406967,
"rewards/cosine_rewards": -0.004211767576634884,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0008223777404054999,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 647.421875,
"epoch": 0.16352521650534896,
"grad_norm": 24.555397071153298,
"kl": 0.10791015625,
"learning_rate": 9.182373917473255e-07,
"loss": 0.0043,
"reward": 1.5703404545783997,
"reward_std": 0.6466428339481354,
"rewards/accuracy_reward": 0.5781249850988388,
"rewards/cosine_rewards": 0.023975687101483345,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0005103159819555003,
"step": 321
},
{
"clip_ratio": 0.0,
"completion_length": 679.453125,
"epoch": 0.1640346408558329,
"grad_norm": 10.65571265954387,
"kl": 0.0751953125,
"learning_rate": 9.179826795720835e-07,
"loss": 0.003,
"reward": 1.6510714292526245,
"reward_std": 1.0072646141052246,
"rewards/accuracy_reward": 0.7062499523162842,
"rewards/cosine_rewards": 0.0703657679259777,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0005443187110358849,
"step": 322
},
{
"clip_ratio": 0.0,
"completion_length": 815.84375,
"epoch": 0.16454406520631687,
"grad_norm": 3.0291641455139042,
"kl": 0.0577392578125,
"learning_rate": 9.177279673968415e-07,
"loss": 0.0023,
"reward": 0.7281904220581055,
"reward_std": 0.7364227771759033,
"rewards/accuracy_reward": -0.07187500596046448,
"rewards/cosine_rewards": -0.15170371532440186,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0013558552600443363,
"step": 323
},
{
"clip_ratio": 0.0,
"completion_length": 644.84375,
"epoch": 0.16505348955680083,
"grad_norm": 4.950572350987556,
"kl": 0.081787109375,
"learning_rate": 9.174732552215996e-07,
"loss": 0.0033,
"reward": 1.5456467270851135,
"reward_std": 0.3990190625190735,
"rewards/accuracy_reward": 0.5750000178813934,
"rewards/cosine_rewards": 0.03384638950228691,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.000699635551427491,
"step": 324
},
{
"clip_ratio": 0.0,
"completion_length": 692.96875,
"epoch": 0.16556291390728478,
"grad_norm": 5.615028773473959,
"kl": 0.0675048828125,
"learning_rate": 9.172185430463576e-07,
"loss": 0.0027,
"reward": 1.4643962979316711,
"reward_std": 0.538501039147377,
"rewards/accuracy_reward": 0.4906250089406967,
"rewards/cosine_rewards": 0.021510865539312363,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.000864640751387924,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 748.71875,
"epoch": 0.1660723382577687,
"grad_norm": 12.207464085563803,
"kl": 0.071533203125,
"learning_rate": 9.169638308711156e-07,
"loss": 0.0029,
"reward": 1.1908642947673798,
"reward_std": 0.8150831162929535,
"rewards/accuracy_reward": 0.3156250100582838,
"rewards/cosine_rewards": 0.0008599106222391129,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.000620643695583567,
"step": 326
},
{
"clip_ratio": 0.0,
"completion_length": 692.828125,
"epoch": 0.16658176260825266,
"grad_norm": 4.633110149052728,
"kl": 0.067626953125,
"learning_rate": 9.167091186958737e-07,
"loss": 0.0027,
"reward": 1.3975687623023987,
"reward_std": 0.6602180898189545,
"rewards/accuracy_reward": 0.40937499701976776,
"rewards/cosine_rewards": 0.020020989701151848,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0005772198055638,
"step": 327
},
{
"clip_ratio": 0.0,
"completion_length": 886.234375,
"epoch": 0.16709118695873662,
"grad_norm": 11.480819900317456,
"kl": 0.0567626953125,
"learning_rate": 9.164544065206317e-07,
"loss": 0.0023,
"reward": 1.3322511315345764,
"reward_std": 0.7808408439159393,
"rewards/accuracy_reward": 0.3812499865889549,
"rewards/cosine_rewards": -0.0007541030645370483,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0013698027469217777,
"step": 328
},
{
"clip_ratio": 0.0,
"completion_length": 869.515625,
"epoch": 0.16760061130922058,
"grad_norm": 7.997306620734284,
"kl": 0.0577392578125,
"learning_rate": 9.161996943453897e-07,
"loss": 0.0023,
"reward": 1.1871361136436462,
"reward_std": 0.9155566692352295,
"rewards/accuracy_reward": 0.3218750059604645,
"rewards/cosine_rewards": -0.0396097619086504,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0013791794190183282,
"step": 329
},
{
"clip_ratio": 0.0,
"completion_length": 879.125,
"epoch": 0.16811003565970453,
"grad_norm": 3.8086916601970175,
"kl": 0.05810546875,
"learning_rate": 9.159449821701477e-07,
"loss": 0.0023,
"reward": 1.3816418051719666,
"reward_std": 0.8457719385623932,
"rewards/accuracy_reward": 0.43437500298023224,
"rewards/cosine_rewards": 0.026831649709492922,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0014398820349015296,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 1077.125,
"epoch": 0.1686194600101885,
"grad_norm": 3.3635095209387,
"kl": 0.05029296875,
"learning_rate": 9.156902699949058e-07,
"loss": 0.002,
"reward": 1.4233552813529968,
"reward_std": 0.8923040926456451,
"rewards/accuracy_reward": 0.5718750357627869,
"rewards/cosine_rewards": 0.05618499033153057,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.0015796992811374366,
"step": 331
},
{
"clip_ratio": 0.0,
"completion_length": 1025.625,
"epoch": 0.16912888436067244,
"grad_norm": 2.6795836472345886,
"kl": 0.053955078125,
"learning_rate": 9.154355578196637e-07,
"loss": 0.0022,
"reward": 1.5009884238243103,
"reward_std": 0.7616147696971893,
"rewards/accuracy_reward": 0.46562496945261955,
"rewards/cosine_rewards": 0.08306753821671009,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008290903642773628,
"step": 332
},
{
"clip_ratio": 0.0,
"completion_length": 1130.390625,
"epoch": 0.1696383087111564,
"grad_norm": 2.8897374461041068,
"kl": 0.05615234375,
"learning_rate": 9.151808456444217e-07,
"loss": 0.0022,
"reward": 0.9568201899528503,
"reward_std": 0.883324146270752,
"rewards/accuracy_reward": 0.18437499552965164,
"rewards/cosine_rewards": -0.14675537310540676,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0026744193164631724,
"step": 333
},
{
"clip_ratio": 0.0,
"completion_length": 1140.796875,
"epoch": 0.17014773306164035,
"grad_norm": 3.24780768226595,
"kl": 0.053955078125,
"learning_rate": 9.149261334691798e-07,
"loss": 0.0022,
"reward": 0.4763996750116348,
"reward_std": 1.3050541877746582,
"rewards/accuracy_reward": -0.07187501713633537,
"rewards/cosine_rewards": -0.26269275695085526,
"rewards/format_reward": 0.8125,
"rewards/repetition_rewards": -0.0015325736021623015,
"step": 334
},
{
"clip_ratio": 0.0,
"completion_length": 1162.21875,
"epoch": 0.1706571574121243,
"grad_norm": 7.96370989509143,
"kl": 0.0509033203125,
"learning_rate": 9.146714212939378e-07,
"loss": 0.002,
"reward": 1.0168579816818237,
"reward_std": 1.0622537732124329,
"rewards/accuracy_reward": 0.23749998211860657,
"rewards/cosine_rewards": -0.06289426982402802,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0014976929523982108,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 1223.59375,
"epoch": 0.17116658176260827,
"grad_norm": 5.774523253643062,
"kl": 0.083251953125,
"learning_rate": 9.144167091186958e-07,
"loss": 0.0033,
"reward": 0.9260146915912628,
"reward_std": 1.3471828699111938,
"rewards/accuracy_reward": 0.26249998807907104,
"rewards/cosine_rewards": -0.11641103774309158,
"rewards/format_reward": 0.78125,
"rewards/repetition_rewards": -0.0013242715504020452,
"step": 336
},
{
"clip_ratio": 0.0,
"completion_length": 1069.484375,
"epoch": 0.1716760061130922,
"grad_norm": 7.732047491992381,
"kl": 0.0555419921875,
"learning_rate": 9.141619969434538e-07,
"loss": 0.0022,
"reward": 1.0389263331890106,
"reward_std": 0.9250738620758057,
"rewards/accuracy_reward": 0.20937499403953552,
"rewards/cosine_rewards": -0.09034883230924606,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.001974849379621446,
"step": 337
},
{
"clip_ratio": 0.0,
"completion_length": 846.234375,
"epoch": 0.17218543046357615,
"grad_norm": 6.146520612031918,
"kl": 0.06689453125,
"learning_rate": 9.139072847682119e-07,
"loss": 0.0027,
"reward": 1.5287657380104065,
"reward_std": 0.7281034886837006,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.055092147551476955,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0013264745939522982,
"step": 338
},
{
"clip_ratio": 0.0,
"completion_length": 878.59375,
"epoch": 0.1726948548140601,
"grad_norm": 5.859040533770109,
"kl": 0.059814453125,
"learning_rate": 9.136525725929699e-07,
"loss": 0.0024,
"reward": 1.309591829776764,
"reward_std": 0.8282720148563385,
"rewards/accuracy_reward": 0.3781249839812517,
"rewards/cosine_rewards": 0.02607971802353859,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0008628710638731718,
"step": 339
},
{
"clip_ratio": 0.0,
"completion_length": 706.234375,
"epoch": 0.17320427916454406,
"grad_norm": 4.068095402441544,
"kl": 0.066162109375,
"learning_rate": 9.133978604177279e-07,
"loss": 0.0026,
"reward": 1.1101016998291016,
"reward_std": 0.7019257247447968,
"rewards/accuracy_reward": 0.20624998956918716,
"rewards/cosine_rewards": -0.03270102944225073,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0009472573874518275,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 782.09375,
"epoch": 0.17371370351502802,
"grad_norm": 8.092723935778833,
"kl": 0.07080078125,
"learning_rate": 9.13143148242486e-07,
"loss": 0.0028,
"reward": 1.3624014258384705,
"reward_std": 0.6876442432403564,
"rewards/accuracy_reward": 0.40937499701976776,
"rewards/cosine_rewards": 0.0012194328010082245,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0013180217938497663,
"step": 341
},
{
"clip_ratio": 0.0,
"completion_length": 725.109375,
"epoch": 0.17422312786551197,
"grad_norm": 8.756999335905311,
"kl": 0.130126953125,
"learning_rate": 9.12888436067244e-07,
"loss": 0.0052,
"reward": 1.1084296703338623,
"reward_std": 1.0551597476005554,
"rewards/accuracy_reward": 0.2343750037252903,
"rewards/cosine_rewards": -0.062263866886496544,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0011814486351795495,
"step": 342
},
{
"clip_ratio": 0.0,
"completion_length": 664.5625,
"epoch": 0.17473255221599593,
"grad_norm": 4.670760280414413,
"kl": 0.07275390625,
"learning_rate": 9.12633723892002e-07,
"loss": 0.0029,
"reward": 1.379169523715973,
"reward_std": 0.6884946823120117,
"rewards/accuracy_reward": 0.40937499701976776,
"rewards/cosine_rewards": -0.014050468802452087,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0005300141347106546,
"step": 343
},
{
"clip_ratio": 0.0,
"completion_length": 667.359375,
"epoch": 0.17524197656647988,
"grad_norm": 27.098795999332967,
"kl": 0.08056640625,
"learning_rate": 9.123790117167601e-07,
"loss": 0.0032,
"reward": 1.6130830645561218,
"reward_std": 0.44565099477767944,
"rewards/accuracy_reward": 0.5781249701976776,
"rewards/cosine_rewards": 0.051279583014547825,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0006965193606447428,
"step": 344
},
{
"clip_ratio": 0.0,
"completion_length": 654.65625,
"epoch": 0.17575140091696384,
"grad_norm": 10.133659329005896,
"kl": 0.075439453125,
"learning_rate": 9.121242995415181e-07,
"loss": 0.003,
"reward": 1.6888669729232788,
"reward_std": 0.506424754858017,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/cosine_rewards": 0.06190674379467964,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0011647465871647,
"step": 345
},
{
"clip_ratio": 0.0,
"completion_length": 647.6875,
"epoch": 0.1762608252674478,
"grad_norm": 5.129606334927334,
"kl": 0.07958984375,
"learning_rate": 9.11869587366276e-07,
"loss": 0.0032,
"reward": 1.2593636512756348,
"reward_std": 0.41098763048648834,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -0.005569446831941605,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0006919201114214957,
"step": 346
},
{
"clip_ratio": 0.0,
"completion_length": 696.46875,
"epoch": 0.17677024961793172,
"grad_norm": 11.616711482358948,
"kl": 0.074462890625,
"learning_rate": 9.11614875191034e-07,
"loss": 0.003,
"reward": 1.4469356536865234,
"reward_std": 0.6090122163295746,
"rewards/accuracy_reward": 0.46562500298023224,
"rewards/cosine_rewards": -0.0018481542356312275,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.001216164615470916,
"step": 347
},
{
"clip_ratio": 0.0,
"completion_length": 660.0,
"epoch": 0.17727967396841568,
"grad_norm": 16.70982931235053,
"kl": 0.092041015625,
"learning_rate": 9.113601630157921e-07,
"loss": 0.0037,
"reward": 1.3860605359077454,
"reward_std": 0.5821886360645294,
"rewards/accuracy_reward": 0.40625,
"rewards/cosine_rewards": 0.01206381805241108,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0010032225982286036,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 819.25,
"epoch": 0.17778909831889964,
"grad_norm": 10.312325223777075,
"kl": 0.0694580078125,
"learning_rate": 9.111054508405501e-07,
"loss": 0.0028,
"reward": 1.3597615957260132,
"reward_std": 0.5677385032176971,
"rewards/accuracy_reward": 0.4375000074505806,
"rewards/cosine_rewards": 0.0019306838512420654,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0015441215364262462,
"step": 349
},
{
"clip_ratio": 0.0,
"completion_length": 816.125,
"epoch": 0.1782985226693836,
"grad_norm": 3.8656926374469416,
"kl": 0.07080078125,
"learning_rate": 9.108507386653081e-07,
"loss": 0.0028,
"reward": 1.1428874135017395,
"reward_std": 0.40452495217323303,
"rewards/accuracy_reward": 0.21249999105930328,
"rewards/cosine_rewards": -0.05335182696580887,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0006357444362947717,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 839.09375,
"epoch": 0.17880794701986755,
"grad_norm": 10.545025641089767,
"kl": 0.062744140625,
"learning_rate": 9.105960264900662e-07,
"loss": 0.0025,
"reward": 1.440682828426361,
"reward_std": 0.7122917473316193,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": 0.004494791850447655,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.001311894680839032,
"step": 351
},
{
"clip_ratio": 0.0,
"completion_length": 745.34375,
"epoch": 0.1793173713703515,
"grad_norm": 5.236449549563,
"kl": 0.081787109375,
"learning_rate": 9.103413143148242e-07,
"loss": 0.0033,
"reward": 1.7106852531433105,
"reward_std": 0.4475601017475128,
"rewards/accuracy_reward": 0.6875,
"rewards/cosine_rewards": 0.07085046917200089,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0007901439967099577,
"step": 352
},
{
"clip_ratio": 0.0,
"completion_length": 782.65625,
"epoch": 0.17982679572083546,
"grad_norm": 4.40390803368756,
"kl": 0.07568359375,
"learning_rate": 9.100866021395822e-07,
"loss": 0.003,
"reward": 1.321226179599762,
"reward_std": 0.5729265064001083,
"rewards/accuracy_reward": 0.3812500238418579,
"rewards/cosine_rewards": -0.04316529631614685,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.001233478484209627,
"step": 353
},
{
"clip_ratio": 0.0,
"completion_length": 911.4375,
"epoch": 0.18033622007131941,
"grad_norm": 4.585368026245459,
"kl": 0.083740234375,
"learning_rate": 9.098318899643402e-07,
"loss": 0.0034,
"reward": 1.2610972821712494,
"reward_std": 0.5936008393764496,
"rewards/accuracy_reward": 0.3812499828636646,
"rewards/cosine_rewards": -0.02527322620153427,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0011294231517240405,
"step": 354
},
{
"clip_ratio": 0.0,
"completion_length": 961.59375,
"epoch": 0.18084564442180337,
"grad_norm": 7.408796362493787,
"kl": 0.0693359375,
"learning_rate": 9.095771777890983e-07,
"loss": 0.0028,
"reward": 1.2509925812482834,
"reward_std": 0.5566798448562622,
"rewards/accuracy_reward": 0.3499999940395355,
"rewards/cosine_rewards": -0.034961797297000885,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.001545542269013822,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 875.421875,
"epoch": 0.18135506877228733,
"grad_norm": 6.46350391356739,
"kl": 0.08251953125,
"learning_rate": 9.093224656138563e-07,
"loss": 0.0033,
"reward": 1.1181039810180664,
"reward_std": 0.674926146864891,
"rewards/accuracy_reward": 0.23749998956918716,
"rewards/cosine_rewards": -0.05567748658359051,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0012184783699922264,
"step": 356
},
{
"clip_ratio": 0.0,
"completion_length": 1032.265625,
"epoch": 0.18186449312277128,
"grad_norm": 6.831510020206218,
"kl": 0.0609130859375,
"learning_rate": 9.090677534386143e-07,
"loss": 0.0024,
"reward": 1.559360921382904,
"reward_std": 0.6407117247581482,
"rewards/accuracy_reward": 0.518750011920929,
"rewards/cosine_rewards": 0.057725198566913605,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0014893330517224967,
"step": 357
},
{
"clip_ratio": 0.0,
"completion_length": 1343.859375,
"epoch": 0.1823739174732552,
"grad_norm": 4.523279934522272,
"kl": 0.05419921875,
"learning_rate": 9.088130412633724e-07,
"loss": 0.0022,
"reward": 1.3108936548233032,
"reward_std": 1.3749122023582458,
"rewards/accuracy_reward": 0.4843749850988388,
"rewards/cosine_rewards": -0.015468426048755646,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0017629386857151985,
"step": 358
},
{
"clip_ratio": 0.0,
"completion_length": 1283.875,
"epoch": 0.18288334182373917,
"grad_norm": 2.9278358224956733,
"kl": 0.046875,
"learning_rate": 9.085583290881304e-07,
"loss": 0.0019,
"reward": 0.899000346660614,
"reward_std": 1.2357721328735352,
"rewards/accuracy_reward": 0.20000001043081284,
"rewards/cosine_rewards": -0.12764177471399307,
"rewards/format_reward": 0.828125,
"rewards/repetition_rewards": -0.0014829274150542915,
"step": 359
},
{
"clip_ratio": 0.0,
"completion_length": 1351.765625,
"epoch": 0.18339276617422312,
"grad_norm": 5.747269025294299,
"kl": 0.05029296875,
"learning_rate": 9.083036169128883e-07,
"loss": 0.002,
"reward": 0.685440868139267,
"reward_std": 1.0775729417800903,
"rewards/accuracy_reward": 0.062499986961483955,
"rewards/cosine_rewards": -0.23488027602434158,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0015538162551820278,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 1328.359375,
"epoch": 0.18390219052470708,
"grad_norm": 5.5666444239146,
"kl": 0.046630859375,
"learning_rate": 9.080489047376463e-07,
"loss": 0.0019,
"reward": 1.440912902355194,
"reward_std": 1.3688839673995972,
"rewards/accuracy_reward": 0.546875,
"rewards/cosine_rewards": 0.05198000371456146,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0016921277856454253,
"step": 361
},
{
"clip_ratio": 0.0,
"completion_length": 1222.359375,
"epoch": 0.18441161487519103,
"grad_norm": 3.8469994374841496,
"kl": 0.063720703125,
"learning_rate": 9.077941925624044e-07,
"loss": 0.0025,
"reward": 1.2567678689956665,
"reward_std": 1.0096549689769745,
"rewards/accuracy_reward": 0.3531249761581421,
"rewards/cosine_rewards": -0.06352230161428452,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.001584898098371923,
"step": 362
},
{
"clip_ratio": 0.0,
"completion_length": 1071.90625,
"epoch": 0.184921039225675,
"grad_norm": 10.748025785151507,
"kl": 0.080810546875,
"learning_rate": 9.075394803871624e-07,
"loss": 0.0032,
"reward": 1.4773434400558472,
"reward_std": 0.7611989676952362,
"rewards/accuracy_reward": 0.518750011920929,
"rewards/cosine_rewards": 0.022458821535110474,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0013653661007992923,
"step": 363
},
{
"clip_ratio": 0.0,
"completion_length": 1068.84375,
"epoch": 0.18543046357615894,
"grad_norm": 6.961531569684862,
"kl": 0.0966796875,
"learning_rate": 9.072847682119204e-07,
"loss": 0.0039,
"reward": 1.3342331051826477,
"reward_std": 0.9612607657909393,
"rewards/accuracy_reward": 0.4906249940395355,
"rewards/cosine_rewards": -0.029875734820961952,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0015161921037361026,
"step": 364
},
{
"clip_ratio": 0.0,
"completion_length": 1151.765625,
"epoch": 0.1859398879266429,
"grad_norm": 4.787495962824196,
"kl": 0.0526123046875,
"learning_rate": 9.070300560366785e-07,
"loss": 0.0021,
"reward": 0.44851796329021454,
"reward_std": 0.6482652425765991,
"rewards/accuracy_reward": -0.18125002831220627,
"rewards/cosine_rewards": -0.3225611299276352,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0007959024223964661,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 1087.453125,
"epoch": 0.18644931227712686,
"grad_norm": 3.5882611622656704,
"kl": 0.05517578125,
"learning_rate": 9.067753438614365e-07,
"loss": 0.0022,
"reward": 1.0280417203903198,
"reward_std": 0.8365518152713776,
"rewards/accuracy_reward": 0.2093750163912773,
"rewards/cosine_rewards": -0.10154062137007713,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0016676230588927865,
"step": 366
},
{
"clip_ratio": 0.0,
"completion_length": 921.3125,
"epoch": 0.1869587366276108,
"grad_norm": 10.03430669886217,
"kl": 0.07080078125,
"learning_rate": 9.065206316861945e-07,
"loss": 0.0028,
"reward": 1.1769609451293945,
"reward_std": 0.880241334438324,
"rewards/accuracy_reward": 0.2656249925494194,
"rewards/cosine_rewards": -0.04036855325102806,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0014205531333573163,
"step": 367
},
{
"clip_ratio": 0.0,
"completion_length": 841.6875,
"epoch": 0.18746816097809477,
"grad_norm": 27.23508906311087,
"kl": 0.07861328125,
"learning_rate": 9.062659195109526e-07,
"loss": 0.0031,
"reward": 1.685433030128479,
"reward_std": 0.49864277243614197,
"rewards/accuracy_reward": 0.6875,
"rewards/cosine_rewards": 0.1084844060242176,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0011764070368371904,
"step": 368
},
{
"clip_ratio": 0.0,
"completion_length": 737.03125,
"epoch": 0.1879775853285787,
"grad_norm": 18.41490109995637,
"kl": 0.08740234375,
"learning_rate": 9.060112073357106e-07,
"loss": 0.0035,
"reward": 1.3734083771705627,
"reward_std": 0.4119359850883484,
"rewards/accuracy_reward": 0.37812499701976776,
"rewards/cosine_rewards": 0.01141296117566526,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.000504543146234937,
"step": 369
},
{
"clip_ratio": 0.0,
"completion_length": 707.796875,
"epoch": 0.18848700967906265,
"grad_norm": 35.58611363543668,
"kl": 0.084716796875,
"learning_rate": 9.057564951604686e-07,
"loss": 0.0034,
"reward": 1.6782256960868835,
"reward_std": 0.5156250298023224,
"rewards/accuracy_reward": 0.6343749761581421,
"rewards/cosine_rewards": 0.07835755217820406,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0032568235765211284,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 647.015625,
"epoch": 0.1889964340295466,
"grad_norm": 7.982503076191807,
"kl": 0.086669921875,
"learning_rate": 9.055017829852266e-07,
"loss": 0.0035,
"reward": 1.760904848575592,
"reward_std": 0.49070215225219727,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/cosine_rewards": 0.0864610131829977,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0005561279249377549,
"step": 371
},
{
"clip_ratio": 0.0,
"completion_length": 716.359375,
"epoch": 0.18950585838003056,
"grad_norm": 10.914576943859945,
"kl": 0.077880859375,
"learning_rate": 9.052470708099847e-07,
"loss": 0.0031,
"reward": 1.9701185822486877,
"reward_std": 0.40086938440799713,
"rewards/accuracy_reward": 0.831250011920929,
"rewards/cosine_rewards": 0.1397455483675003,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0008769762353040278,
"step": 372
},
{
"clip_ratio": 0.0,
"completion_length": 738.546875,
"epoch": 0.19001528273051452,
"grad_norm": 6.034059111318338,
"kl": 0.08544921875,
"learning_rate": 9.049923586347427e-07,
"loss": 0.0034,
"reward": 1.8058127164840698,
"reward_std": 0.41330619156360626,
"rewards/accuracy_reward": 0.7468750178813934,
"rewards/cosine_rewards": 0.1067701168358326,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0009574841533321887,
"step": 373
},
{
"clip_ratio": 0.0,
"completion_length": 728.0,
"epoch": 0.19052470708099847,
"grad_norm": 8.532753416320839,
"kl": 0.07861328125,
"learning_rate": 9.047376464595006e-07,
"loss": 0.0031,
"reward": 1.0842646658420563,
"reward_std": 0.44039003551006317,
"rewards/accuracy_reward": 0.15312500298023224,
"rewards/cosine_rewards": -0.052397772669792175,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0008375749748665839,
"step": 374
},
{
"clip_ratio": 0.0,
"completion_length": 804.21875,
"epoch": 0.19103413143148243,
"grad_norm": 7.4599789196405375,
"kl": 0.078125,
"learning_rate": 9.044829342842587e-07,
"loss": 0.0031,
"reward": 0.974018394947052,
"reward_std": 0.5849625766277313,
"rewards/accuracy_reward": 0.09999999403953552,
"rewards/cosine_rewards": -0.10963174607604742,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0007248484616866335,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 767.84375,
"epoch": 0.19154355578196638,
"grad_norm": 7.073657321582574,
"kl": 0.0703125,
"learning_rate": 9.042282221090167e-07,
"loss": 0.0028,
"reward": 0.914261519908905,
"reward_std": 0.7159627079963684,
"rewards/accuracy_reward": 0.09999998658895493,
"rewards/cosine_rewards": -0.13804471492767334,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008187246276065707,
"step": 376
},
{
"clip_ratio": 0.0,
"completion_length": 898.65625,
"epoch": 0.19205298013245034,
"grad_norm": 5.152812978669099,
"kl": 0.060791015625,
"learning_rate": 9.039735099337747e-07,
"loss": 0.0024,
"reward": 1.3070060014724731,
"reward_std": 0.5369542390108109,
"rewards/accuracy_reward": 0.3531250059604645,
"rewards/cosine_rewards": 0.01811320334672928,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0017322039348073304,
"step": 377
},
{
"clip_ratio": 0.0,
"completion_length": 824.65625,
"epoch": 0.1925624044829343,
"grad_norm": 4.4264228290413055,
"kl": 0.071044921875,
"learning_rate": 9.037187977585327e-07,
"loss": 0.0028,
"reward": 1.9969289302825928,
"reward_std": 0.36238182336091995,
"rewards/accuracy_reward": 0.887499988079071,
"rewards/cosine_rewards": 0.1572401076555252,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0009361990523757413,
"step": 378
},
{
"clip_ratio": 0.0,
"completion_length": 1041.234375,
"epoch": 0.19307182883341822,
"grad_norm": 2.966764644317296,
"kl": 0.0531005859375,
"learning_rate": 9.034640855832908e-07,
"loss": 0.0021,
"reward": 1.9203879237174988,
"reward_std": 0.6297050192952156,
"rewards/accuracy_reward": 0.831250011920929,
"rewards/cosine_rewards": 0.1525670364499092,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0009290309972129762,
"step": 379
},
{
"clip_ratio": 0.0,
"completion_length": 1066.578125,
"epoch": 0.19358125318390218,
"grad_norm": 6.45607299399273,
"kl": 0.0604248046875,
"learning_rate": 9.032093734080488e-07,
"loss": 0.0024,
"reward": 1.5978580713272095,
"reward_std": 0.7550583779811859,
"rewards/accuracy_reward": 0.546875,
"rewards/cosine_rewards": 0.0831909030675888,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0009578557801432908,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 1159.84375,
"epoch": 0.19409067753438614,
"grad_norm": 15.218618980246333,
"kl": 0.0557861328125,
"learning_rate": 9.029546612328068e-07,
"loss": 0.0022,
"reward": 1.495898723602295,
"reward_std": 0.8071758449077606,
"rewards/accuracy_reward": 0.5187499970197678,
"rewards/cosine_rewards": 0.04102367162704468,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0013749129138886929,
"step": 381
},
{
"clip_ratio": 0.0,
"completion_length": 1371.671875,
"epoch": 0.1946001018848701,
"grad_norm": 2.5671889777891415,
"kl": 0.0416259765625,
"learning_rate": 9.026999490575649e-07,
"loss": 0.0017,
"reward": 1.4654145240783691,
"reward_std": 0.9137448668479919,
"rewards/accuracy_reward": 0.5468749925494194,
"rewards/cosine_rewards": 0.029642254114151,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0017276888247579336,
"step": 382
},
{
"clip_ratio": 0.0,
"completion_length": 1303.015625,
"epoch": 0.19510952623535405,
"grad_norm": 3.460987727073742,
"kl": 0.0421142578125,
"learning_rate": 9.024452368823229e-07,
"loss": 0.0017,
"reward": 1.348323106765747,
"reward_std": 0.42551596462726593,
"rewards/accuracy_reward": 0.40937499701976776,
"rewards/cosine_rewards": 0.0025482475757598877,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0011001455131918192,
"step": 383
},
{
"clip_ratio": 0.0,
"completion_length": 1507.828125,
"epoch": 0.195618950585838,
"grad_norm": 2.4731324069110165,
"kl": 0.040771484375,
"learning_rate": 9.021905247070809e-07,
"loss": 0.0016,
"reward": 1.2840899229049683,
"reward_std": 1.3389369249343872,
"rewards/accuracy_reward": 0.43437498807907104,
"rewards/cosine_rewards": 0.007216873578727245,
"rewards/format_reward": 0.84375,
"rewards/repetition_rewards": -0.0012519625015556812,
"step": 384
},
{
"clip_ratio": 0.0,
"completion_length": 1392.5625,
"epoch": 0.19612837493632196,
"grad_norm": 3.9847528096417464,
"kl": 0.0401611328125,
"learning_rate": 9.019358125318391e-07,
"loss": 0.0016,
"reward": 0.952269122004509,
"reward_std": 1.110903412103653,
"rewards/accuracy_reward": 0.21249999478459358,
"rewards/cosine_rewards": -0.16534814983606339,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.001132699428126216,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 1509.515625,
"epoch": 0.19663779928680591,
"grad_norm": 1.3917929153423205,
"kl": 0.0386962890625,
"learning_rate": 9.016811003565971e-07,
"loss": 0.0015,
"reward": 1.3951207399368286,
"reward_std": 1.3895853757858276,
"rewards/accuracy_reward": 0.49062497913837433,
"rewards/cosine_rewards": 0.030975546687841415,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0014798620832152665,
"step": 386
},
{
"clip_ratio": 0.0,
"completion_length": 1453.75,
"epoch": 0.19714722363728987,
"grad_norm": 4.5123123100553215,
"kl": 0.040283203125,
"learning_rate": 9.014263881813551e-07,
"loss": 0.0016,
"reward": 1.0250075459480286,
"reward_std": 1.1121925115585327,
"rewards/accuracy_reward": 0.2656249925494194,
"rewards/cosine_rewards": -0.06678299978375435,
"rewards/format_reward": 0.828125,
"rewards/repetition_rewards": -0.001959475106559694,
"step": 387
},
{
"clip_ratio": 0.0,
"completion_length": 1502.953125,
"epoch": 0.19765664798777383,
"grad_norm": 6.3687413391252665,
"kl": 0.0384521484375,
"learning_rate": 9.011716760061131e-07,
"loss": 0.0015,
"reward": 0.6134699061512947,
"reward_std": 0.8420631885528564,
"rewards/accuracy_reward": 0.015624985098838806,
"rewards/cosine_rewards": -0.2909963075071573,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0017838198109529912,
"step": 388
},
{
"clip_ratio": 0.0,
"completion_length": 1398.71875,
"epoch": 0.19816607233825778,
"grad_norm": 3.252398002176852,
"kl": 0.04052734375,
"learning_rate": 9.009169638308711e-07,
"loss": 0.0016,
"reward": 0.6156338006258011,
"reward_std": 1.171474575996399,
"rewards/accuracy_reward": -0.012500010430812836,
"rewards/cosine_rewards": -0.2769355773925781,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0011806105903815478,
"step": 389
},
{
"clip_ratio": 0.0,
"completion_length": 1356.796875,
"epoch": 0.1986754966887417,
"grad_norm": 2.9564973631467386,
"kl": 0.0411376953125,
"learning_rate": 9.006622516556291e-07,
"loss": 0.0016,
"reward": 1.4809187650680542,
"reward_std": 0.4241075813770294,
"rewards/accuracy_reward": 0.4656249899417162,
"rewards/cosine_rewards": 0.0635819137096405,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0014131638454273343,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 1314.8125,
"epoch": 0.19918492103922567,
"grad_norm": 1.9868954496027869,
"kl": 0.040283203125,
"learning_rate": 9.004075394803871e-07,
"loss": 0.0016,
"reward": 0.3289404660463333,
"reward_std": 0.6832451522350311,
"rewards/accuracy_reward": -0.23750004172325134,
"rewards/cosine_rewards": -0.38559940457344055,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0010851426632143557,
"step": 391
},
{
"clip_ratio": 0.0,
"completion_length": 1211.65625,
"epoch": 0.19969434538970962,
"grad_norm": 2.3384012636409524,
"kl": 0.0426025390625,
"learning_rate": 9.001528273051452e-07,
"loss": 0.0017,
"reward": 1.7431849241256714,
"reward_std": 0.5287438631057739,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.0974309928715229,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0011210814118385315,
"step": 392
},
{
"clip_ratio": 0.0,
"completion_length": 1199.40625,
"epoch": 0.20020376974019358,
"grad_norm": 8.037383660003005,
"kl": 0.0426025390625,
"learning_rate": 8.998981151299032e-07,
"loss": 0.0017,
"reward": 1.205706238746643,
"reward_std": 0.5482289791107178,
"rewards/accuracy_reward": 0.2968749925494194,
"rewards/cosine_rewards": -0.09018014371395111,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0009886454208754003,
"step": 393
},
{
"clip_ratio": 0.0,
"completion_length": 1215.25,
"epoch": 0.20071319409067753,
"grad_norm": 2.7015176132022205,
"kl": 0.04150390625,
"learning_rate": 8.996434029546612e-07,
"loss": 0.0017,
"reward": 1.3461086750030518,
"reward_std": 0.36276355385780334,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": -0.033333455212414265,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0018078879220411181,
"step": 394
},
{
"clip_ratio": 0.0,
"completion_length": 1148.140625,
"epoch": 0.2012226184411615,
"grad_norm": 2.4525739585224064,
"kl": 0.0447998046875,
"learning_rate": 8.993886907794193e-07,
"loss": 0.0018,
"reward": 1.6304560899734497,
"reward_std": 0.6783818304538727,
"rewards/accuracy_reward": 0.5781249850988388,
"rewards/cosine_rewards": 0.0690329410135746,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0010768624488264322,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 1234.03125,
"epoch": 0.20173204279164544,
"grad_norm": 2.620518407503657,
"kl": 0.0426025390625,
"learning_rate": 8.991339786041773e-07,
"loss": 0.0017,
"reward": 1.0580366849899292,
"reward_std": 0.45367684960365295,
"rewards/accuracy_reward": 0.18437499552965164,
"rewards/cosine_rewards": -0.09430436789989471,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0007839706668164581,
"step": 396
},
{
"clip_ratio": 0.0,
"completion_length": 1255.140625,
"epoch": 0.2022414671421294,
"grad_norm": 2.848324792333859,
"kl": 0.0416259765625,
"learning_rate": 8.988792664289353e-07,
"loss": 0.0017,
"reward": 1.396336853504181,
"reward_std": 0.6851004362106323,
"rewards/accuracy_reward": 0.40937498584389687,
"rewards/cosine_rewards": 0.003251887857913971,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00066499671083875,
"step": 397
},
{
"clip_ratio": 0.0,
"completion_length": 1243.234375,
"epoch": 0.20275089149261336,
"grad_norm": 2.5122988909394457,
"kl": 0.04150390625,
"learning_rate": 8.986245542536933e-07,
"loss": 0.0017,
"reward": 2.053937077522278,
"reward_std": 0.5187530070543289,
"rewards/accuracy_reward": 0.8312499821186066,
"rewards/cosine_rewards": 0.22372649610042572,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0010393889679107815,
"step": 398
},
{
"clip_ratio": 0.0,
"completion_length": 1395.28125,
"epoch": 0.2032603158430973,
"grad_norm": 8.131421667160394,
"kl": 0.039306640625,
"learning_rate": 8.983698420784514e-07,
"loss": 0.0016,
"reward": 1.9118317365646362,
"reward_std": 0.3381110727787018,
"rewards/accuracy_reward": 0.7187500149011612,
"rewards/cosine_rewards": 0.19487697072327137,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.001795282296370715,
"step": 399
},
{
"clip_ratio": 0.0,
"completion_length": 1477.328125,
"epoch": 0.20376974019358127,
"grad_norm": 2.5663546513961992,
"kl": 0.0489501953125,
"learning_rate": 8.981151299032094e-07,
"loss": 0.002,
"reward": 0.616385743021965,
"reward_std": 0.5365406274795532,
"rewards/accuracy_reward": -0.012500017881393433,
"rewards/cosine_rewards": -0.27611421793699265,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0012499869335442781,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 1641.53125,
"epoch": 0.2042791645440652,
"grad_norm": 2.3476982545455947,
"kl": 0.0382080078125,
"learning_rate": 8.978604177279674e-07,
"loss": 0.0015,
"reward": 0.38990160822868347,
"reward_std": 1.22097048163414,
"rewards/accuracy_reward": -0.06875001266598701,
"rewards/cosine_rewards": -0.352715402841568,
"rewards/format_reward": 0.8125,
"rewards/repetition_rewards": -0.0011329837725497782,
"step": 401
},
{
"clip_ratio": 0.0,
"completion_length": 1740.96875,
"epoch": 0.20478858889454915,
"grad_norm": 1.6789909982175664,
"kl": 0.036376953125,
"learning_rate": 8.976057055527255e-07,
"loss": 0.0015,
"reward": 0.7690124660730362,
"reward_std": 1.7883394956588745,
"rewards/accuracy_reward": 0.24062499403953552,
"rewards/cosine_rewards": -0.15724666975438595,
"rewards/format_reward": 0.6875,
"rewards/repetition_rewards": -0.001865879981778562,
"step": 402
},
{
"clip_ratio": 0.0,
"completion_length": 1715.625,
"epoch": 0.2052980132450331,
"grad_norm": 1.732486740072958,
"kl": 0.035400390625,
"learning_rate": 8.973509933774834e-07,
"loss": 0.0014,
"reward": 0.6791011095046997,
"reward_std": 1.0334843397140503,
"rewards/accuracy_reward": 0.1249999925494194,
"rewards/cosine_rewards": -0.21049801260232925,
"rewards/format_reward": 0.765625,
"rewards/repetition_rewards": -0.0010258048423565924,
"step": 403
},
{
"clip_ratio": 0.0,
"completion_length": 1585.046875,
"epoch": 0.20580743759551706,
"grad_norm": 1.6162362158227377,
"kl": 0.037109375,
"learning_rate": 8.970962812022414e-07,
"loss": 0.0015,
"reward": 0.9881232976913452,
"reward_std": 1.0253838300704956,
"rewards/accuracy_reward": 0.24062499403953552,
"rewards/cosine_rewards": -0.12615075334906578,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.001350913429632783,
"step": 404
},
{
"clip_ratio": 0.0,
"completion_length": 1497.734375,
"epoch": 0.20631686194600102,
"grad_norm": 5.362930427796704,
"kl": 0.039306640625,
"learning_rate": 8.968415690269994e-07,
"loss": 0.0016,
"reward": 1.5187935531139374,
"reward_std": 0.5071015954017639,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.07592727243900299,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0008836896740831435,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 1471.890625,
"epoch": 0.20682628629648497,
"grad_norm": 2.5474971754837896,
"kl": 0.0374755859375,
"learning_rate": 8.965868568517575e-07,
"loss": 0.0015,
"reward": 1.7093470096588135,
"reward_std": 0.26929083466529846,
"rewards/accuracy_reward": 0.6062499955296516,
"rewards/cosine_rewards": 0.13566255569458008,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0013155650231055915,
"step": 406
},
{
"clip_ratio": 0.0,
"completion_length": 1517.765625,
"epoch": 0.20733571064696893,
"grad_norm": 2.3954440093211695,
"kl": 0.0372314453125,
"learning_rate": 8.963321446765155e-07,
"loss": 0.0015,
"reward": 1.6693125367164612,
"reward_std": 0.8508188724517822,
"rewards/accuracy_reward": 0.5781250149011612,
"rewards/cosine_rewards": 0.12336396798491478,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.000926460576010868,
"step": 407
},
{
"clip_ratio": 0.0,
"completion_length": 1427.40625,
"epoch": 0.20784513499745289,
"grad_norm": 4.487502302070771,
"kl": 0.037109375,
"learning_rate": 8.960774325012735e-07,
"loss": 0.0015,
"reward": 1.6373254656791687,
"reward_std": 0.37433764338493347,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.11931294947862625,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0007375250570476055,
"step": 408
},
{
"clip_ratio": 0.0,
"completion_length": 1500.875,
"epoch": 0.20835455934793684,
"grad_norm": 5.555469475832445,
"kl": 0.0374755859375,
"learning_rate": 8.958227203260316e-07,
"loss": 0.0015,
"reward": 1.398006021976471,
"reward_std": 1.336867332458496,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": 0.02401774376630783,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.001011726533761248,
"step": 409
},
{
"clip_ratio": 0.0,
"completion_length": 1434.859375,
"epoch": 0.2088639836984208,
"grad_norm": 3.6143044040934105,
"kl": 0.0435791015625,
"learning_rate": 8.955680081507896e-07,
"loss": 0.0017,
"reward": 1.618862271308899,
"reward_std": 0.7050271332263947,
"rewards/accuracy_reward": 0.5468750074505806,
"rewards/cosine_rewards": 0.1038745865225792,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.000637321179965511,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 1506.828125,
"epoch": 0.20937340804890472,
"grad_norm": 3.854404990598997,
"kl": 0.0361328125,
"learning_rate": 8.953132959755476e-07,
"loss": 0.0014,
"reward": 1.6651726961135864,
"reward_std": 0.45976050198078156,
"rewards/accuracy_reward": 0.5781250074505806,
"rewards/cosine_rewards": 0.11916181445121765,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0008640679297968745,
"step": 411
},
{
"clip_ratio": 0.0,
"completion_length": 1526.96875,
"epoch": 0.20988283239938868,
"grad_norm": 2.3422021364641736,
"kl": 0.03662109375,
"learning_rate": 8.950585838003057e-07,
"loss": 0.0015,
"reward": 0.6352521181106567,
"reward_std": 1.1320685744285583,
"rewards/accuracy_reward": -0.012500017881393433,
"rewards/cosine_rewards": -0.28875819593667984,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0009897005802486092,
"step": 412
},
{
"clip_ratio": 0.0,
"completion_length": 1510.03125,
"epoch": 0.21039225674987264,
"grad_norm": 2.1794044547587275,
"kl": 0.0567626953125,
"learning_rate": 8.948038716250637e-07,
"loss": 0.0023,
"reward": 1.4266446828842163,
"reward_std": 0.8459653854370117,
"rewards/accuracy_reward": 0.4624999910593033,
"rewards/cosine_rewards": 0.07424483820796013,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0007251804636325687,
"step": 413
},
{
"clip_ratio": 0.0,
"completion_length": 1505.8125,
"epoch": 0.2109016811003566,
"grad_norm": 2.019375037035424,
"kl": 0.042236328125,
"learning_rate": 8.945491594498217e-07,
"loss": 0.0017,
"reward": 1.4379878044128418,
"reward_std": 0.6174334287643433,
"rewards/accuracy_reward": 0.4374999888241291,
"rewards/cosine_rewards": 0.04812653362751007,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0007637535745743662,
"step": 414
},
{
"clip_ratio": 0.0,
"completion_length": 1476.9375,
"epoch": 0.21141110545084055,
"grad_norm": 2.647595808512136,
"kl": 0.041259765625,
"learning_rate": 8.942944472745797e-07,
"loss": 0.0016,
"reward": 0.9988905191421509,
"reward_std": 0.6921209692955017,
"rewards/accuracy_reward": 0.20937499403953552,
"rewards/cosine_rewards": -0.1462814100086689,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0017031602037604898,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 1508.578125,
"epoch": 0.2119205298013245,
"grad_norm": 2.6566052420282933,
"kl": 0.03466796875,
"learning_rate": 8.940397350993378e-07,
"loss": 0.0014,
"reward": 1.211571991443634,
"reward_std": 1.0560529828071594,
"rewards/accuracy_reward": 0.32499998807907104,
"rewards/cosine_rewards": -0.0497976616024971,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.00113033052184619,
"step": 416
},
{
"clip_ratio": 0.0,
"completion_length": 1513.40625,
"epoch": 0.21242995415180846,
"grad_norm": 2.084079343038072,
"kl": 0.0411376953125,
"learning_rate": 8.937850229240957e-07,
"loss": 0.0016,
"reward": 0.5206416845321655,
"reward_std": 0.49498558044433594,
"rewards/accuracy_reward": -0.09687501192092896,
"rewards/cosine_rewards": -0.36537329852581024,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0014850463485345244,
"step": 417
},
{
"clip_ratio": 0.0,
"completion_length": 1506.921875,
"epoch": 0.21293937850229241,
"grad_norm": 1.7101934155068432,
"kl": 0.036865234375,
"learning_rate": 8.935303107488537e-07,
"loss": 0.0015,
"reward": 1.16130793094635,
"reward_std": 0.738935075700283,
"rewards/accuracy_reward": 0.2968749850988388,
"rewards/cosine_rewards": -0.08809526264667511,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0005968308250885457,
"step": 418
},
{
"clip_ratio": 0.0,
"completion_length": 1452.859375,
"epoch": 0.21344880285277637,
"grad_norm": 2.6364264984634236,
"kl": 0.037109375,
"learning_rate": 8.932755985736118e-07,
"loss": 0.0015,
"reward": 1.4882609844207764,
"reward_std": 0.6527669131755829,
"rewards/accuracy_reward": 0.4937499836087227,
"rewards/cosine_rewards": 0.042041175067424774,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0006552368577104062,
"step": 419
},
{
"clip_ratio": 0.0,
"completion_length": 1425.0,
"epoch": 0.21395822720326033,
"grad_norm": 22.24294483419425,
"kl": 0.0374755859375,
"learning_rate": 8.930208863983698e-07,
"loss": 0.0015,
"reward": 1.5828353762626648,
"reward_std": 0.6265529096126556,
"rewards/accuracy_reward": 0.5468749850988388,
"rewards/cosine_rewards": 0.08372939098626375,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008939505496528,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 1396.171875,
"epoch": 0.21446765155374428,
"grad_norm": 2.8168572000468366,
"kl": 0.049560546875,
"learning_rate": 8.927661742231278e-07,
"loss": 0.002,
"reward": 1.6206639409065247,
"reward_std": 0.5450826287269592,
"rewards/accuracy_reward": 0.546875,
"rewards/cosine_rewards": 0.12136101722717285,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0006969515234231949,
"step": 421
},
{
"clip_ratio": 0.0,
"completion_length": 1395.9375,
"epoch": 0.2149770759042282,
"grad_norm": 1.840733711397487,
"kl": 0.0379638671875,
"learning_rate": 8.925114620478858e-07,
"loss": 0.0015,
"reward": 1.8798171877861023,
"reward_std": 0.5979900360107422,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/cosine_rewards": 0.18991604819893837,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0007238158723339438,
"step": 422
},
{
"clip_ratio": 0.0,
"completion_length": 1503.703125,
"epoch": 0.21548650025471217,
"grad_norm": 2.327429653832842,
"kl": 0.0377197265625,
"learning_rate": 8.922567498726439e-07,
"loss": 0.0015,
"reward": 1.1887712478637695,
"reward_std": 0.615043044090271,
"rewards/accuracy_reward": 0.2968749850988388,
"rewards/cosine_rewards": -0.09194361418485641,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0005351053987396881,
"step": 423
},
{
"clip_ratio": 0.0,
"completion_length": 1528.765625,
"epoch": 0.21599592460519612,
"grad_norm": 3.1639646848610017,
"kl": 0.0347900390625,
"learning_rate": 8.920020376974019e-07,
"loss": 0.0014,
"reward": 1.1957539916038513,
"reward_std": 1.2394747734069824,
"rewards/accuracy_reward": 0.3531249985098839,
"rewards/cosine_rewards": -0.03115752711892128,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.001213467272464186,
"step": 424
},
{
"clip_ratio": 0.0,
"completion_length": 1674.765625,
"epoch": 0.21650534895568008,
"grad_norm": 2.5043711144165126,
"kl": 0.0338134765625,
"learning_rate": 8.917473255221599e-07,
"loss": 0.0014,
"reward": 1.1731443107128143,
"reward_std": 0.8068048655986786,
"rewards/accuracy_reward": 0.3218749836087227,
"rewards/cosine_rewards": -0.038288604468107224,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0010670205520000309,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 1664.5,
"epoch": 0.21701477330616403,
"grad_norm": 3.6500533940846327,
"kl": 0.03515625,
"learning_rate": 8.91492613346918e-07,
"loss": 0.0014,
"reward": 0.6680706441402435,
"reward_std": 1.1470927596092224,
"rewards/accuracy_reward": 0.012499995529651642,
"rewards/cosine_rewards": -0.28094063699245453,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.000988698098808527,
"step": 426
},
{
"clip_ratio": 0.0,
"completion_length": 1733.359375,
"epoch": 0.217524197656648,
"grad_norm": 1.7264860203774577,
"kl": 0.033203125,
"learning_rate": 8.91237901171676e-07,
"loss": 0.0013,
"reward": 1.151515543460846,
"reward_std": 1.0065627694129944,
"rewards/accuracy_reward": 0.37812499701976776,
"rewards/cosine_rewards": -0.02256488800048828,
"rewards/format_reward": 0.796875,
"rewards/repetition_rewards": -0.0009195689344778657,
"step": 427
},
{
"clip_ratio": 0.0,
"completion_length": 1738.171875,
"epoch": 0.21803362200713194,
"grad_norm": 1.8028678441679502,
"kl": 0.033447265625,
"learning_rate": 8.90983188996434e-07,
"loss": 0.0013,
"reward": 0.33622707426548004,
"reward_std": 1.554500699043274,
"rewards/accuracy_reward": -0.046875011175870895,
"rewards/cosine_rewards": -0.35029861330986023,
"rewards/format_reward": 0.734375,
"rewards/repetition_rewards": -0.0009743365517351776,
"step": 428
},
{
"clip_ratio": 0.0,
"completion_length": 1710.0,
"epoch": 0.2185430463576159,
"grad_norm": 1.7315216890300187,
"kl": 0.0386962890625,
"learning_rate": 8.90728476821192e-07,
"loss": 0.0015,
"reward": 1.2531213760375977,
"reward_std": 1.7619973421096802,
"rewards/accuracy_reward": 0.4624999910593033,
"rewards/cosine_rewards": 0.010814379900693893,
"rewards/format_reward": 0.78125,
"rewards/repetition_rewards": -0.0014429978909902275,
"step": 429
},
{
"clip_ratio": 0.0,
"completion_length": 1681.5625,
"epoch": 0.21905247070809986,
"grad_norm": 1.448581293364632,
"kl": 0.0350341796875,
"learning_rate": 8.904737646459501e-07,
"loss": 0.0014,
"reward": 0.5936174094676971,
"reward_std": 1.1982838213443756,
"rewards/accuracy_reward": 0.015624940395355225,
"rewards/cosine_rewards": -0.2807646095752716,
"rewards/format_reward": 0.859375,
"rewards/repetition_rewards": -0.0006179730116855353,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 1480.1875,
"epoch": 0.2195618950585838,
"grad_norm": 4.830088485887878,
"kl": 0.0394287109375,
"learning_rate": 8.90219052470708e-07,
"loss": 0.0016,
"reward": 1.1779060363769531,
"reward_std": 1.0625053942203522,
"rewards/accuracy_reward": 0.31562499701976776,
"rewards/cosine_rewards": -0.05869085341691971,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0009031399386003613,
"step": 431
},
{
"clip_ratio": 0.0,
"completion_length": 1440.796875,
"epoch": 0.22007131940906777,
"grad_norm": 2.4405157931321124,
"kl": 0.037109375,
"learning_rate": 8.89964340295466e-07,
"loss": 0.0015,
"reward": 0.9002698361873627,
"reward_std": 0.7880153059959412,
"rewards/accuracy_reward": 0.09999998845160007,
"rewards/cosine_rewards": -0.18298358470201492,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0011215846752747893,
"step": 432
},
{
"clip_ratio": 0.0,
"completion_length": 1348.515625,
"epoch": 0.2205807437595517,
"grad_norm": 2.1326390000811806,
"kl": 0.0418701171875,
"learning_rate": 8.897096281202241e-07,
"loss": 0.0017,
"reward": 0.7409723997116089,
"reward_std": 0.7918355762958527,
"rewards/accuracy_reward": 0.015624990686774254,
"rewards/cosine_rewards": -0.21155225485563278,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.000600404484430328,
"step": 433
},
{
"clip_ratio": 0.0,
"completion_length": 1298.125,
"epoch": 0.22109016811003565,
"grad_norm": 3.815337891096848,
"kl": 0.0418701171875,
"learning_rate": 8.894549159449821e-07,
"loss": 0.0017,
"reward": 1.8587952256202698,
"reward_std": 0.6939655542373657,
"rewards/accuracy_reward": 0.7187499701976776,
"rewards/cosine_rewards": 0.1717987135052681,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0005034840432927012,
"step": 434
},
{
"clip_ratio": 0.0,
"completion_length": 1156.0,
"epoch": 0.2215995924605196,
"grad_norm": 4.479065196602373,
"kl": 0.0440673828125,
"learning_rate": 8.892002037697401e-07,
"loss": 0.0018,
"reward": 1.4347090125083923,
"reward_std": 0.3772214949131012,
"rewards/accuracy_reward": 0.43749997206032276,
"rewards/cosine_rewards": -0.0022302046418190002,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0005607931379927322,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 1158.46875,
"epoch": 0.22210901681100356,
"grad_norm": 4.149290240827553,
"kl": 0.0455322265625,
"learning_rate": 8.889454915944982e-07,
"loss": 0.0018,
"reward": 1.0905642956495285,
"reward_std": 0.5234603583812714,
"rewards/accuracy_reward": 0.2124999761581421,
"rewards/cosine_rewards": -0.10600101202726364,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.00030972264357842505,
"step": 436
},
{
"clip_ratio": 0.0,
"completion_length": 1058.734375,
"epoch": 0.22261844116148752,
"grad_norm": 6.6780990750455205,
"kl": 0.046630859375,
"learning_rate": 8.886907794192562e-07,
"loss": 0.0019,
"reward": 0.9731817841529846,
"reward_std": 0.8772869110107422,
"rewards/accuracy_reward": 0.09687498956918716,
"rewards/cosine_rewards": -0.09189720638096333,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0005459659732878208,
"step": 437
},
{
"clip_ratio": 0.0,
"completion_length": 1056.5,
"epoch": 0.22312786551197147,
"grad_norm": 2.737447142182153,
"kl": 0.044189453125,
"learning_rate": 8.884360672440142e-07,
"loss": 0.0018,
"reward": 1.1522070169448853,
"reward_std": 0.6963326930999756,
"rewards/accuracy_reward": 0.24062499403953552,
"rewards/cosine_rewards": -0.07206200063228607,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0007310137443710119,
"step": 438
},
{
"clip_ratio": 0.0,
"completion_length": 1072.171875,
"epoch": 0.22363728986245543,
"grad_norm": 2.6298041678582953,
"kl": 0.046875,
"learning_rate": 8.881813550687722e-07,
"loss": 0.0019,
"reward": 1.4510762691497803,
"reward_std": 0.5045955777168274,
"rewards/accuracy_reward": 0.49375002086162567,
"rewards/cosine_rewards": 0.020084097981452942,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0002578186395112425,
"step": 439
},
{
"clip_ratio": 0.0,
"completion_length": 1094.546875,
"epoch": 0.22414671421293939,
"grad_norm": 2.047145097208152,
"kl": 0.0438232421875,
"learning_rate": 8.879266428935303e-07,
"loss": 0.0018,
"reward": 1.5189008712768555,
"reward_std": 0.34239334613084793,
"rewards/accuracy_reward": 0.4906250089406967,
"rewards/cosine_rewards": 0.0758383758366108,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0006874670943943784,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 1083.09375,
"epoch": 0.22465613856342334,
"grad_norm": 2.966919929118076,
"kl": 0.0457763671875,
"learning_rate": 8.876719307182883e-07,
"loss": 0.0018,
"reward": 1.2056291699409485,
"reward_std": 0.828714907169342,
"rewards/accuracy_reward": 0.29687498696148396,
"rewards/cosine_rewards": -0.043857116252183914,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0005137350672157481,
"step": 441
},
{
"clip_ratio": 0.0,
"completion_length": 1103.375,
"epoch": 0.2251655629139073,
"grad_norm": 3.389083005059361,
"kl": 0.042724609375,
"learning_rate": 8.874172185430463e-07,
"loss": 0.0017,
"reward": 1.483572542667389,
"reward_std": 0.5207121074199677,
"rewards/accuracy_reward": 0.4624999910593033,
"rewards/cosine_rewards": 0.053015733137726784,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0006932187097845599,
"step": 442
},
{
"clip_ratio": 0.0,
"completion_length": 1217.421875,
"epoch": 0.22567498726439122,
"grad_norm": 1.8084576051013326,
"kl": 0.0426025390625,
"learning_rate": 8.871625063678044e-07,
"loss": 0.0017,
"reward": 1.604416847229004,
"reward_std": 0.68864506483078,
"rewards/accuracy_reward": 0.578125,
"rewards/cosine_rewards": 0.10491618514060974,
"rewards/format_reward": 0.921875,
"rewards/repetition_rewards": -0.0004992800822947174,
"step": 443
},
{
"clip_ratio": 0.0,
"completion_length": 1285.484375,
"epoch": 0.22618441161487518,
"grad_norm": 20.129612374292474,
"kl": 0.042236328125,
"learning_rate": 8.869077941925624e-07,
"loss": 0.0017,
"reward": 1.7994786500930786,
"reward_std": 0.3120774105191231,
"rewards/accuracy_reward": 0.6624999940395355,
"rewards/cosine_rewards": 0.15342308580875397,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0008193884277716279,
"step": 444
},
{
"clip_ratio": 0.0,
"completion_length": 1370.578125,
"epoch": 0.22669383596535914,
"grad_norm": 3.234143174544009,
"kl": 0.0433349609375,
"learning_rate": 8.866530820173203e-07,
"loss": 0.0017,
"reward": 1.3398171067237854,
"reward_std": 0.7532171607017517,
"rewards/accuracy_reward": 0.3812500014901161,
"rewards/cosine_rewards": -0.0252380333840847,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0005697726446669549,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 1474.875,
"epoch": 0.2272032603158431,
"grad_norm": 1.7926893560129409,
"kl": 0.040283203125,
"learning_rate": 8.863983698420783e-07,
"loss": 0.0016,
"reward": 1.4312800765037537,
"reward_std": 0.6859093904495239,
"rewards/accuracy_reward": 0.43437500298023224,
"rewards/cosine_rewards": 0.02916320227086544,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0010081499349325895,
"step": 446
},
{
"clip_ratio": 0.0,
"completion_length": 1589.5625,
"epoch": 0.22771268466632705,
"grad_norm": 1.796069908482133,
"kl": 0.036865234375,
"learning_rate": 8.861436576668364e-07,
"loss": 0.0015,
"reward": 1.443231225013733,
"reward_std": 0.6114392578601837,
"rewards/accuracy_reward": 0.4375000074505806,
"rewards/cosine_rewards": 0.038026634603738785,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.001045387762133032,
"step": 447
},
{
"clip_ratio": 0.0,
"completion_length": 1672.09375,
"epoch": 0.228222109016811,
"grad_norm": 2.1300405555940802,
"kl": 0.0377197265625,
"learning_rate": 8.858889454915944e-07,
"loss": 0.0015,
"reward": 1.5443891882896423,
"reward_std": 0.5743480771780014,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.10473084449768066,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0009665640536695719,
"step": 448
},
{
"clip_ratio": 0.0,
"completion_length": 1766.0,
"epoch": 0.22873153336729496,
"grad_norm": 4.287974732646472,
"kl": 0.037841796875,
"learning_rate": 8.856342333163524e-07,
"loss": 0.0015,
"reward": 1.3031042218208313,
"reward_std": 1.7085354328155518,
"rewards/accuracy_reward": 0.4593749940395355,
"rewards/cosine_rewards": 0.06326716393232346,
"rewards/format_reward": 0.78125,
"rewards/repetition_rewards": -0.0007880023040343076,
"step": 449
},
{
"clip_ratio": 0.0,
"completion_length": 1844.640625,
"epoch": 0.22924095771777891,
"grad_norm": 1.6528382190432698,
"kl": 0.0341796875,
"learning_rate": 8.853795211411105e-07,
"loss": 0.0014,
"reward": 0.5885469168424606,
"reward_std": 1.7182486653327942,
"rewards/accuracy_reward": 0.17812500149011612,
"rewards/cosine_rewards": -0.1980201005935669,
"rewards/format_reward": 0.609375,
"rewards/repetition_rewards": -0.0009329892345704138,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 1887.9375,
"epoch": 0.22975038206826287,
"grad_norm": 1.8787523511209072,
"kl": 0.0335693359375,
"learning_rate": 8.851248089658685e-07,
"loss": 0.0013,
"reward": 0.7307622581720352,
"reward_std": 1.600571632385254,
"rewards/accuracy_reward": 0.24062500894069672,
"rewards/cosine_rewards": -0.13401341438293457,
"rewards/format_reward": 0.625,
"rewards/repetition_rewards": -0.0008493586792610586,
"step": 451
},
{
"clip_ratio": 0.0,
"completion_length": 1833.78125,
"epoch": 0.23025980641874683,
"grad_norm": 6.893897138558536,
"kl": 0.0357666015625,
"learning_rate": 8.848700967906265e-07,
"loss": 0.0014,
"reward": 1.0294001996517181,
"reward_std": 1.7062013149261475,
"rewards/accuracy_reward": 0.40312499552965164,
"rewards/cosine_rewards": 0.0025026053190231323,
"rewards/format_reward": 0.625,
"rewards/repetition_rewards": -0.0012274246546439826,
"step": 452
},
{
"clip_ratio": 0.0,
"completion_length": 1943.453125,
"epoch": 0.23076923076923078,
"grad_norm": 2.3588444541934273,
"kl": 0.0322265625,
"learning_rate": 8.846153846153846e-07,
"loss": 0.0013,
"reward": 0.19596866890788078,
"reward_std": 1.8146210312843323,
"rewards/accuracy_reward": -0.02500000223517418,
"rewards/cosine_rewards": -0.3406580686569214,
"rewards/format_reward": 0.5625,
"rewards/repetition_rewards": -0.0008732638962101191,
"step": 453
},
{
"clip_ratio": 0.0,
"completion_length": 1863.109375,
"epoch": 0.2312786551197147,
"grad_norm": 1.5475872027740656,
"kl": 0.0400390625,
"learning_rate": 8.843606724401426e-07,
"loss": 0.0016,
"reward": 0.2073364406824112,
"reward_std": 1.7386137247085571,
"rewards/accuracy_reward": -0.043750010430812836,
"rewards/cosine_rewards": -0.3414689302444458,
"rewards/format_reward": 0.59375,
"rewards/repetition_rewards": -0.0011946168669965118,
"step": 454
},
{
"clip_ratio": 0.0,
"completion_length": 1659.65625,
"epoch": 0.23178807947019867,
"grad_norm": 3.381623642320965,
"kl": 0.0540771484375,
"learning_rate": 8.841059602649006e-07,
"loss": 0.0022,
"reward": 1.5414963960647583,
"reward_std": 1.3864411413669586,
"rewards/accuracy_reward": 0.6218750178813934,
"rewards/cosine_rewards": 0.20184022560715675,
"rewards/format_reward": 0.71875,
"rewards/repetition_rewards": -0.000968798267422244,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 1617.046875,
"epoch": 0.23229750382068262,
"grad_norm": 7.137141646753771,
"kl": 0.0372314453125,
"learning_rate": 8.838512480896586e-07,
"loss": 0.0015,
"reward": 1.1092736423015594,
"reward_std": 0.9871836006641388,
"rewards/accuracy_reward": 0.2687499839812517,
"rewards/cosine_rewards": -0.09614543057978153,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0008309493132401258,
"step": 456
},
{
"clip_ratio": 0.0,
"completion_length": 1511.703125,
"epoch": 0.23280692817116658,
"grad_norm": 2.858407735203425,
"kl": 0.0447998046875,
"learning_rate": 8.835965359144167e-07,
"loss": 0.0018,
"reward": 1.4474474489688873,
"reward_std": 0.8567388504743576,
"rewards/accuracy_reward": 0.4937500078231096,
"rewards/cosine_rewards": 0.06434839963912964,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0012760092504322529,
"step": 457
},
{
"clip_ratio": 0.0,
"completion_length": 1550.828125,
"epoch": 0.23331635252165053,
"grad_norm": 2.432372091234863,
"kl": 0.0408935546875,
"learning_rate": 8.833418237391747e-07,
"loss": 0.0016,
"reward": 1.0046057403087616,
"reward_std": 1.0828097462654114,
"rewards/accuracy_reward": 0.20937498658895493,
"rewards/cosine_rewards": -0.14123845472931862,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0010308316559530795,
"step": 458
},
{
"clip_ratio": 0.0,
"completion_length": 1519.9375,
"epoch": 0.2338257768721345,
"grad_norm": 2.65385943980829,
"kl": 0.0380859375,
"learning_rate": 8.830871115639326e-07,
"loss": 0.0015,
"reward": 1.5737290382385254,
"reward_std": 0.676769882440567,
"rewards/accuracy_reward": 0.5187499821186066,
"rewards/cosine_rewards": 0.10273971408605576,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008856799395289272,
"step": 459
},
{
"clip_ratio": 0.0,
"completion_length": 1412.171875,
"epoch": 0.23433520122261844,
"grad_norm": 7.668944991953695,
"kl": 0.03955078125,
"learning_rate": 8.828323993886907e-07,
"loss": 0.0016,
"reward": 1.2575648427009583,
"reward_std": 0.8219007402658463,
"rewards/accuracy_reward": 0.3499999940395355,
"rewards/cosine_rewards": -0.029430712573230267,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0005044575809733942,
"step": 460
},
{
"clip_ratio": 0.0,
"completion_length": 1428.671875,
"epoch": 0.2348446255731024,
"grad_norm": 3.3174885087942747,
"kl": 0.041259765625,
"learning_rate": 8.825776872134487e-07,
"loss": 0.0017,
"reward": 0.5390121340751648,
"reward_std": 0.6499587297439575,
"rewards/accuracy_reward": -0.09687501192092896,
"rewards/cosine_rewards": -0.3163621127605438,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008757157484069467,
"step": 461
},
{
"clip_ratio": 0.0,
"completion_length": 1389.328125,
"epoch": 0.23535404992358636,
"grad_norm": 1.976395582002063,
"kl": 0.040771484375,
"learning_rate": 8.823229750382067e-07,
"loss": 0.0016,
"reward": 1.6086109280586243,
"reward_std": 0.5066869556903839,
"rewards/accuracy_reward": 0.5218749716877937,
"rewards/cosine_rewards": 0.08817524462938309,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0014392710290849209,
"step": 462
},
{
"clip_ratio": 0.0,
"completion_length": 1436.625,
"epoch": 0.2358634742740703,
"grad_norm": 2.4713376444103146,
"kl": 0.039794921875,
"learning_rate": 8.820682628629647e-07,
"loss": 0.0016,
"reward": 1.111421525478363,
"reward_std": 0.9693822264671326,
"rewards/accuracy_reward": 0.24062500149011612,
"rewards/cosine_rewards": -0.11270357295870781,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0008748299151193351,
"step": 463
},
{
"clip_ratio": 0.0,
"completion_length": 1397.96875,
"epoch": 0.23637289862455427,
"grad_norm": 2.688736588573913,
"kl": 0.0450439453125,
"learning_rate": 8.818135506877228e-07,
"loss": 0.0018,
"reward": 1.0273907780647278,
"reward_std": 0.6092932820320129,
"rewards/accuracy_reward": 0.20937500149011612,
"rewards/cosine_rewards": -0.11841067671775818,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0010735246760305017,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 1502.5,
"epoch": 0.2368823229750382,
"grad_norm": 2.233484328017706,
"kl": 0.03955078125,
"learning_rate": 8.815588385124808e-07,
"loss": 0.0016,
"reward": 2.0390628576278687,
"reward_std": 0.4271709471940994,
"rewards/accuracy_reward": 0.7750000059604645,
"rewards/cosine_rewards": 0.26496873423457146,
"rewards/format_reward": 1.0,
"rewards/repetition_rewards": -0.0009058607101906091,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 1610.734375,
"epoch": 0.23739174732552215,
"grad_norm": 3.1291938497316867,
"kl": 0.0394287109375,
"learning_rate": 8.813041263372388e-07,
"loss": 0.0016,
"reward": 1.7407687306404114,
"reward_std": 0.8337388634681702,
"rewards/accuracy_reward": 0.659375011920929,
"rewards/cosine_rewards": 0.17613628506660461,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0009925005142576993,
"step": 466
},
{
"clip_ratio": 0.0,
"completion_length": 1568.890625,
"epoch": 0.2379011716760061,
"grad_norm": 2.121477514968572,
"kl": 0.0380859375,
"learning_rate": 8.810494141619969e-07,
"loss": 0.0015,
"reward": 1.3490102887153625,
"reward_std": 0.7418502867221832,
"rewards/accuracy_reward": 0.37812500447034836,
"rewards/cosine_rewards": 0.003062829375267029,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0009275085176341236,
"step": 467
},
{
"clip_ratio": 0.0,
"completion_length": 1698.8125,
"epoch": 0.23841059602649006,
"grad_norm": 2.4246076523727025,
"kl": 0.03662109375,
"learning_rate": 8.807947019867549e-07,
"loss": 0.0015,
"reward": 1.3666119575500488,
"reward_std": 1.175959825515747,
"rewards/accuracy_reward": 0.46562501788139343,
"rewards/cosine_rewards": 0.02737235650420189,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0013854140415787697,
"step": 468
},
{
"clip_ratio": 0.0,
"completion_length": 1691.90625,
"epoch": 0.23892002037697402,
"grad_norm": 1.3535821296606787,
"kl": 0.039306640625,
"learning_rate": 8.805399898115129e-07,
"loss": 0.0016,
"reward": 1.2414605617523193,
"reward_std": 1.0560136437416077,
"rewards/accuracy_reward": 0.3812499940395355,
"rewards/cosine_rewards": -0.02948123589158058,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0009331759065389633,
"step": 469
},
{
"clip_ratio": 0.0,
"completion_length": 1686.46875,
"epoch": 0.23942944472745797,
"grad_norm": 1.6084637763929415,
"kl": 0.0467529296875,
"learning_rate": 8.802852776362711e-07,
"loss": 0.0019,
"reward": 2.0340508222579956,
"reward_std": 1.1313848793506622,
"rewards/accuracy_reward": 0.8312499821186066,
"rewards/cosine_rewards": 0.3132530748844147,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.001077289809472859,
"step": 470
},
{
"clip_ratio": 0.0,
"completion_length": 1638.4375,
"epoch": 0.23993886907794193,
"grad_norm": 3.9206215218528553,
"kl": 0.0384521484375,
"learning_rate": 8.800305654610291e-07,
"loss": 0.0015,
"reward": 1.417995810508728,
"reward_std": 0.7844535112380981,
"rewards/accuracy_reward": 0.4375,
"rewards/cosine_rewards": 0.04390082508325577,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0009050128574017435,
"step": 471
},
{
"clip_ratio": 0.0,
"completion_length": 1583.171875,
"epoch": 0.24044829342842589,
"grad_norm": 1.862615408125007,
"kl": 0.0404052734375,
"learning_rate": 8.797758532857871e-07,
"loss": 0.0016,
"reward": 1.3552428185939789,
"reward_std": 0.831163614988327,
"rewards/accuracy_reward": 0.40937500074505806,
"rewards/cosine_rewards": -0.005982518196105957,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.001274671230930835,
"step": 472
},
{
"clip_ratio": 0.0,
"completion_length": 1606.359375,
"epoch": 0.24095771777890984,
"grad_norm": 6.217207017794225,
"kl": 0.039794921875,
"learning_rate": 8.795211411105451e-07,
"loss": 0.0016,
"reward": 1.677711844444275,
"reward_std": 0.8612502366304398,
"rewards/accuracy_reward": 0.5781249701976776,
"rewards/cosine_rewards": 0.11638512089848518,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0011732576531358063,
"step": 473
},
{
"clip_ratio": 0.0,
"completion_length": 1521.84375,
"epoch": 0.2414671421293938,
"grad_norm": 2.9743621746841677,
"kl": 0.0421142578125,
"learning_rate": 8.792664289353031e-07,
"loss": 0.0017,
"reward": 1.578629732131958,
"reward_std": 0.6186130940914154,
"rewards/accuracy_reward": 0.5218749940395355,
"rewards/cosine_rewards": 0.08938230201601982,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0013775942497886717,
"step": 474
},
{
"clip_ratio": 0.0,
"completion_length": 1560.25,
"epoch": 0.24197656647987772,
"grad_norm": 3.931202850710427,
"kl": 0.0396728515625,
"learning_rate": 8.790117167600611e-07,
"loss": 0.0016,
"reward": 1.8553311824798584,
"reward_std": 0.5484062433242798,
"rewards/accuracy_reward": 0.6906249970197678,
"rewards/cosine_rewards": 0.1969544254243374,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0009982050396502018,
"step": 475
},
{
"clip_ratio": 0.0,
"completion_length": 1487.171875,
"epoch": 0.24248599083036168,
"grad_norm": 1.8115703582475124,
"kl": 0.0428466796875,
"learning_rate": 8.787570045848191e-07,
"loss": 0.0017,
"reward": 1.0939862728118896,
"reward_std": 0.652959406375885,
"rewards/accuracy_reward": 0.24062499776482582,
"rewards/cosine_rewards": -0.0988575927913189,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0009061352466233075,
"step": 476
},
{
"clip_ratio": 0.0,
"completion_length": 1421.203125,
"epoch": 0.24299541518084564,
"grad_norm": 50.509798046645564,
"kl": 0.0455322265625,
"learning_rate": 8.785022924095772e-07,
"loss": 0.0018,
"reward": 1.1556105613708496,
"reward_std": 0.8080581426620483,
"rewards/accuracy_reward": 0.26874998956918716,
"rewards/cosine_rewards": -0.06538418680429459,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008802659867797047,
"step": 477
},
{
"clip_ratio": 0.0,
"completion_length": 1423.25,
"epoch": 0.2435048395313296,
"grad_norm": 2.143196092673566,
"kl": 0.042724609375,
"learning_rate": 8.782475802343352e-07,
"loss": 0.0017,
"reward": 1.4646123051643372,
"reward_std": 0.4019291028380394,
"rewards/accuracy_reward": 0.4374999925494194,
"rewards/cosine_rewards": 0.04411640763282776,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0013792455429211259,
"step": 478
},
{
"clip_ratio": 0.0,
"completion_length": 1525.359375,
"epoch": 0.24401426388181355,
"grad_norm": 1.4650345234014313,
"kl": 0.043701171875,
"learning_rate": 8.779928680590932e-07,
"loss": 0.0018,
"reward": 1.7237411737442017,
"reward_std": 0.6819100677967072,
"rewards/accuracy_reward": 0.6031249761581421,
"rewards/cosine_rewards": 0.13805609196424484,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0018149468814954162,
"step": 479
},
{
"clip_ratio": 0.0,
"completion_length": 1518.6875,
"epoch": 0.2445236882322975,
"grad_norm": 2.4194184684625166,
"kl": 0.0440673828125,
"learning_rate": 8.777381558838512e-07,
"loss": 0.0018,
"reward": 1.4634617269039154,
"reward_std": 0.4558331221342087,
"rewards/accuracy_reward": 0.46562497317790985,
"rewards/cosine_rewards": 0.03047458827495575,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0013878352474421263,
"step": 480
},
{
"clip_ratio": 0.0,
"completion_length": 1559.859375,
"epoch": 0.24503311258278146,
"grad_norm": 5.778258363606284,
"kl": 0.041015625,
"learning_rate": 8.774834437086093e-07,
"loss": 0.0016,
"reward": 1.1754435896873474,
"reward_std": 0.629539430141449,
"rewards/accuracy_reward": 0.2968749850988388,
"rewards/cosine_rewards": -0.058087632060050964,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.000843802816234529,
"step": 481
},
{
"clip_ratio": 0.0,
"completion_length": 1613.171875,
"epoch": 0.24554253693326542,
"grad_norm": 1.6143411312623293,
"kl": 0.0389404296875,
"learning_rate": 8.772287315333673e-07,
"loss": 0.0016,
"reward": 0.8586589694023132,
"reward_std": 0.45200832188129425,
"rewards/accuracy_reward": 0.09999999403953552,
"rewards/cosine_rewards": -0.22472049295902252,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0009954352863132954,
"step": 482
},
{
"clip_ratio": 0.0,
"completion_length": 1670.359375,
"epoch": 0.24605196128374937,
"grad_norm": 2.239924603374423,
"kl": 0.0592041015625,
"learning_rate": 8.769740193581253e-07,
"loss": 0.0024,
"reward": 1.522126853466034,
"reward_std": 0.8742709904909134,
"rewards/accuracy_reward": 0.4937499910593033,
"rewards/cosine_rewards": 0.060823358595371246,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0011965514277108014,
"step": 483
},
{
"clip_ratio": 0.0,
"completion_length": 1628.625,
"epoch": 0.24656138563423333,
"grad_norm": 8.106075584628705,
"kl": 0.0419921875,
"learning_rate": 8.767193071828834e-07,
"loss": 0.0017,
"reward": 0.9253878593444824,
"reward_std": 1.307717740535736,
"rewards/accuracy_reward": 0.18437500298023224,
"rewards/cosine_rewards": -0.14811599627137184,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.0014961253036744893,
"step": 484
},
{
"clip_ratio": 0.0,
"completion_length": 1648.875,
"epoch": 0.24707080998471728,
"grad_norm": 1.6847159437413002,
"kl": 0.0389404296875,
"learning_rate": 8.764645950076414e-07,
"loss": 0.0016,
"reward": 1.4973651766777039,
"reward_std": 0.9533334523439407,
"rewards/accuracy_reward": 0.5218749791383743,
"rewards/cosine_rewards": 0.0704129058867693,
"rewards/format_reward": 0.90625,
"rewards/repetition_rewards": -0.0011727037781383842,
"step": 485
},
{
"clip_ratio": 0.0,
"completion_length": 1511.140625,
"epoch": 0.2475802343352012,
"grad_norm": 1.736665525794252,
"kl": 0.0399169921875,
"learning_rate": 8.762098828323994e-07,
"loss": 0.0016,
"reward": 0.7075473368167877,
"reward_std": 0.8960316479206085,
"rewards/accuracy_reward": 0.015624992549419403,
"rewards/cosine_rewards": -0.2443552017211914,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.001222497143317014,
"step": 486
},
{
"clip_ratio": 0.0,
"completion_length": 1567.671875,
"epoch": 0.24808965868568517,
"grad_norm": 3.031931829741236,
"kl": 0.0386962890625,
"learning_rate": 8.759551706571575e-07,
"loss": 0.0015,
"reward": 1.3812061548233032,
"reward_std": 0.8888083398342133,
"rewards/accuracy_reward": 0.4093749672174454,
"rewards/cosine_rewards": 0.004479339346289635,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0013981764786876738,
"step": 487
},
{
"clip_ratio": 0.0,
"completion_length": 1457.359375,
"epoch": 0.24859908303616912,
"grad_norm": 5.524727047487839,
"kl": 0.0506591796875,
"learning_rate": 8.757004584819154e-07,
"loss": 0.002,
"reward": 1.8092041611671448,
"reward_std": 0.5159921646118164,
"rewards/accuracy_reward": 0.6343750059604645,
"rewards/cosine_rewards": 0.19165128469467163,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0011971485218964517,
"step": 488
},
{
"clip_ratio": 0.0,
"completion_length": 1379.078125,
"epoch": 0.24910850738665308,
"grad_norm": 7.278746642143023,
"kl": 0.055419921875,
"learning_rate": 8.754457463066734e-07,
"loss": 0.0022,
"reward": 1.194389447569847,
"reward_std": 0.5000828057527542,
"rewards/accuracy_reward": 0.26874998211860657,
"rewards/cosine_rewards": -0.04208715260028839,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0010234276414848864,
"step": 489
},
{
"clip_ratio": 0.0,
"completion_length": 1397.78125,
"epoch": 0.24961793173713703,
"grad_norm": 2.9100594613125352,
"kl": 0.0543212890625,
"learning_rate": 8.751910341314314e-07,
"loss": 0.0022,
"reward": 1.6665399670600891,
"reward_std": 0.6835527420043945,
"rewards/accuracy_reward": 0.6625000089406967,
"rewards/cosine_rewards": 0.11463410407304764,
"rewards/format_reward": 0.890625,
"rewards/repetition_rewards": -0.00121912601753138,
"step": 490
},
{
"clip_ratio": 0.0,
"completion_length": 1359.703125,
"epoch": 0.250127356087621,
"grad_norm": 12.325171247664876,
"kl": 0.0457763671875,
"learning_rate": 8.749363219561895e-07,
"loss": 0.0018,
"reward": 1.8410940766334534,
"reward_std": 0.4001428484916687,
"rewards/accuracy_reward": 0.690625011920929,
"rewards/cosine_rewards": 0.18309018202126026,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0013711884384974837,
"step": 491
},
{
"clip_ratio": 0.0,
"completion_length": 1470.75,
"epoch": 0.25063678043810494,
"grad_norm": 15.076912789505334,
"kl": 0.041015625,
"learning_rate": 8.746816097809475e-07,
"loss": 0.0016,
"reward": 1.2887136340141296,
"reward_std": 0.8450455367565155,
"rewards/accuracy_reward": 0.3531249761581421,
"rewards/cosine_rewards": -0.03244372457265854,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0007176562794484198,
"step": 492
},
{
"clip_ratio": 0.0,
"completion_length": 1348.140625,
"epoch": 0.2511462047885889,
"grad_norm": 10.32545410862146,
"kl": 0.05810546875,
"learning_rate": 8.744268976057055e-07,
"loss": 0.0023,
"reward": 1.227342277765274,
"reward_std": 1.0202240645885468,
"rewards/accuracy_reward": 0.3500000238418579,
"rewards/cosine_rewards": 0.002939566969871521,
"rewards/format_reward": 0.875,
"rewards/repetition_rewards": -0.0005973072838969529,
"step": 493
},
{
"clip_ratio": 0.0,
"completion_length": 1301.828125,
"epoch": 0.25165562913907286,
"grad_norm": 3.667270735290933,
"kl": 0.0645751953125,
"learning_rate": 8.741721854304636e-07,
"loss": 0.0026,
"reward": 1.2533040046691895,
"reward_std": 0.7434202134609222,
"rewards/accuracy_reward": 0.32500000298023224,
"rewards/cosine_rewards": -0.03934769332408905,
"rewards/format_reward": 0.96875,
"rewards/repetition_rewards": -0.0010982811218127608,
"step": 494
},
{
"clip_ratio": 0.0,
"completion_length": 1290.1875,
"epoch": 0.2521650534895568,
"grad_norm": 4.3465734782527345,
"kl": 0.0535888671875,
"learning_rate": 8.739174732552216e-07,
"loss": 0.0021,
"reward": 0.6353173404932022,
"reward_std": 0.6600025594234467,
"rewards/accuracy_reward": -0.040625013411045074,
"rewards/cosine_rewards": -0.2607284113764763,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0008293068385683,
"step": 495
},
{
"clip_ratio": 0.0,
"completion_length": 1355.375,
"epoch": 0.25267447784004077,
"grad_norm": 4.901543791199254,
"kl": 0.060546875,
"learning_rate": 8.736627610799796e-07,
"loss": 0.0024,
"reward": 1.1404387950897217,
"reward_std": 0.670623242855072,
"rewards/accuracy_reward": 0.26874999701976776,
"rewards/cosine_rewards": -0.11177334189414978,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0009129364043474197,
"step": 496
},
{
"clip_ratio": 0.0,
"completion_length": 1328.6875,
"epoch": 0.2531839021905247,
"grad_norm": 3.9728562721335745,
"kl": 0.0489501953125,
"learning_rate": 8.734080489047376e-07,
"loss": 0.002,
"reward": 1.1998997032642365,
"reward_std": 0.630705714225769,
"rewards/accuracy_reward": 0.32500001788139343,
"rewards/cosine_rewards": -0.06150183826684952,
"rewards/format_reward": 0.9375,
"rewards/repetition_rewards": -0.0010984738764818758,
"step": 497
},
{
"clip_ratio": 0.0,
"completion_length": 1373.671875,
"epoch": 0.2536933265410087,
"grad_norm": 3.325255325148765,
"kl": 0.04833984375,
"learning_rate": 8.731533367294957e-07,
"loss": 0.0019,
"reward": 1.2019822597503662,
"reward_std": 0.38447779417037964,
"rewards/accuracy_reward": 0.296875,
"rewards/cosine_rewards": -0.04712319001555443,
"rewards/format_reward": 0.953125,
"rewards/repetition_rewards": -0.0008945107110776007,
"step": 498
},
{
"clip_ratio": 0.0,
"completion_length": 1309.28125,
"epoch": 0.25420275089149263,
"grad_norm": 5.499632802088616,
"kl": 0.072265625,
"learning_rate": 8.728986245542537e-07,
"loss": 0.0029,
"reward": 1.6111189126968384,
"reward_std": 0.1892632469534874,
"rewards/accuracy_reward": 0.550000011920929,
"rewards/cosine_rewards": 0.07773812115192413,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0009941596072167158,
"step": 499
},
{
"clip_ratio": 0.0,
"completion_length": 1363.625,
"epoch": 0.2547121752419766,
"grad_norm": 7.195546109062687,
"kl": 0.0482177734375,
"learning_rate": 8.726439123790117e-07,
"loss": 0.0019,
"reward": 1.9320534467697144,
"reward_std": 0.4148600548505783,
"rewards/accuracy_reward": 0.7468750178813934,
"rewards/cosine_rewards": 0.20207761228084564,
"rewards/format_reward": 0.984375,
"rewards/repetition_rewards": -0.0012741541431751102,
"step": 500
}
],
"logging_steps": 1.0,
"max_steps": 3926,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}