GRPO / trainer_state.json
LLucass's picture
Model save
439dfa7 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.17142857142857143,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 2693.6875610351562,
"entropy": 0.3662109375,
"epoch": 0.001142857142857143,
"grad_norm": 0.12395373731851578,
"kl": 0.0,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0,
"reward": 0.7708333535119891,
"reward_std": 0.4629540964961052,
"rewards/accuracy_reward": 0.25000001303851604,
"rewards/format_reward": 0.5208333386108279,
"step": 1
},
{
"completion_length": 3127.3958435058594,
"entropy": 0.353515625,
"epoch": 0.002285714285714286,
"grad_norm": 0.14846429228782654,
"kl": 0.0,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"reward": 0.6458333637565374,
"reward_std": 0.4249730706214905,
"rewards/accuracy_reward": 0.2812500102445483,
"rewards/format_reward": 0.3645833386108279,
"step": 2
},
{
"completion_length": 3685.041748046875,
"entropy": 0.4443359375,
"epoch": 0.0034285714285714284,
"grad_norm": 0.10399040579795837,
"kl": 4.1425228118896484e-05,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.23958333674818277,
"reward_std": 0.3668827787041664,
"rewards/accuracy_reward": 0.0729166679084301,
"rewards/format_reward": 0.16666667256504297,
"step": 3
},
{
"completion_length": 2380.291778564453,
"entropy": 0.40478515625,
"epoch": 0.004571428571428572,
"grad_norm": 0.16352659463882446,
"kl": 3.409385681152344e-05,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0,
"reward": 0.8229166865348816,
"reward_std": 0.507609948515892,
"rewards/accuracy_reward": 0.19791667722165585,
"rewards/format_reward": 0.6250000223517418,
"step": 4
},
{
"completion_length": 3441.2188720703125,
"entropy": 0.45458984375,
"epoch": 0.005714285714285714,
"grad_norm": 0.15812984108924866,
"kl": 4.1961669921875e-05,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.42708334885537624,
"reward_std": 0.5058739930391312,
"rewards/accuracy_reward": 0.07291666697710752,
"rewards/format_reward": 0.35416667722165585,
"step": 5
},
{
"completion_length": 3382.3438110351562,
"entropy": 0.45166015625,
"epoch": 0.006857142857142857,
"grad_norm": 0.15454305708408356,
"kl": 4.26173210144043e-05,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.40625000558793545,
"reward_std": 0.5202516540884972,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.3229166744276881,
"step": 6
},
{
"completion_length": 3277.291748046875,
"entropy": 0.39404296875,
"epoch": 0.008,
"grad_norm": 0.13690507411956787,
"kl": 2.562999725341797e-05,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0,
"reward": 0.8854166865348816,
"reward_std": 0.6845719665288925,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.6145833432674408,
"step": 7
},
{
"completion_length": 2841.916748046875,
"entropy": 0.36083984375,
"epoch": 0.009142857142857144,
"grad_norm": 0.1767321527004242,
"kl": 2.4050474166870117e-05,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0,
"reward": 0.8854166967794299,
"reward_std": 0.3672378845512867,
"rewards/accuracy_reward": 0.3958333535119891,
"rewards/format_reward": 0.4895833460614085,
"step": 8
},
{
"completion_length": 3480.6563110351562,
"entropy": 0.4384765625,
"epoch": 0.010285714285714285,
"grad_norm": 0.15406936407089233,
"kl": 3.796815872192383e-05,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.5520833432674408,
"reward_std": 0.6496799141168594,
"rewards/accuracy_reward": 0.17708333488553762,
"rewards/format_reward": 0.3750000074505806,
"step": 9
},
{
"completion_length": 2963.572967529297,
"entropy": 0.3544921875,
"epoch": 0.011428571428571429,
"grad_norm": 0.15688633918762207,
"kl": 2.5287270545959473e-05,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"reward": 0.5937500223517418,
"reward_std": 0.5099271312355995,
"rewards/accuracy_reward": 0.17708333861082792,
"rewards/format_reward": 0.4166666753590107,
"step": 10
},
{
"completion_length": 3573.7500610351562,
"entropy": 0.37890625,
"epoch": 0.012571428571428572,
"grad_norm": 0.12983083724975586,
"kl": 2.5391578674316406e-05,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0,
"reward": 0.3125000111758709,
"reward_std": 0.5802810192108154,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/format_reward": 0.20833334140479565,
"step": 11
},
{
"completion_length": 2520.8958740234375,
"entropy": 0.39111328125,
"epoch": 0.013714285714285714,
"grad_norm": 0.20449091494083405,
"kl": 3.743171691894531e-05,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.8020833656191826,
"reward_std": 0.4411254972219467,
"rewards/accuracy_reward": 0.14583333395421505,
"rewards/format_reward": 0.6562500223517418,
"step": 12
},
{
"completion_length": 3038.041748046875,
"entropy": 0.3828125,
"epoch": 0.014857142857142857,
"grad_norm": 0.14574433863162994,
"kl": 2.5153160095214844e-05,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0,
"reward": 0.6875000298023224,
"reward_std": 0.3254704251885414,
"rewards/accuracy_reward": 0.22916666697710752,
"rewards/format_reward": 0.4583333432674408,
"step": 13
},
{
"completion_length": 3116.3125610351562,
"entropy": 0.37109375,
"epoch": 0.016,
"grad_norm": 0.202586367726326,
"kl": 1.9026920199394226e-05,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0,
"reward": 0.5833333507180214,
"reward_std": 0.4630111753940582,
"rewards/accuracy_reward": 0.21875001024454832,
"rewards/format_reward": 0.3645833395421505,
"step": 14
},
{
"completion_length": 2924.0521240234375,
"entropy": 0.36328125,
"epoch": 0.017142857142857144,
"grad_norm": 0.09130721539258957,
"kl": 1.4469027519226074e-05,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.604166679084301,
"reward_std": 0.22134994342923164,
"rewards/accuracy_reward": 0.1979166716337204,
"rewards/format_reward": 0.4062500074505806,
"step": 15
},
{
"completion_length": 3887.5521850585938,
"entropy": 0.4755859375,
"epoch": 0.018285714285714287,
"grad_norm": 0.11806491017341614,
"kl": 2.9832124710083008e-05,
"learning_rate": 9.998781585307575e-07,
"loss": 0.0,
"reward": 0.11458333674818277,
"reward_std": 0.26997610181570053,
"rewards/accuracy_reward": 0.041666666977107525,
"rewards/format_reward": 0.07291666977107525,
"step": 16
},
{
"completion_length": 2579.625030517578,
"entropy": 0.44091796875,
"epoch": 0.019428571428571427,
"grad_norm": 0.2032601535320282,
"kl": 3.93986701965332e-05,
"learning_rate": 9.99512700102336e-07,
"loss": 0.0,
"reward": 0.7083333507180214,
"reward_std": 0.39187028259038925,
"rewards/accuracy_reward": 0.19791667442768812,
"rewards/format_reward": 0.5104166753590107,
"step": 17
},
{
"completion_length": 3089.104248046875,
"entropy": 0.3671875,
"epoch": 0.02057142857142857,
"grad_norm": 0.11376938223838806,
"kl": 1.2531876564025879e-05,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": 0.5625000251457095,
"reward_std": 0.35285963863134384,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.3958333386108279,
"step": 18
},
{
"completion_length": 3130.760498046875,
"entropy": 0.39111328125,
"epoch": 0.021714285714285714,
"grad_norm": 0.09636794775724411,
"kl": 2.8267502784729004e-05,
"learning_rate": 9.98051855792412e-07,
"loss": 0.0,
"reward": 0.8125000111758709,
"reward_std": 0.3496965616941452,
"rewards/accuracy_reward": 0.36458333395421505,
"rewards/format_reward": 0.44791667722165585,
"step": 19
},
{
"completion_length": 2585.9896545410156,
"entropy": 0.329833984375,
"epoch": 0.022857142857142857,
"grad_norm": 0.15105831623077393,
"kl": 6.628036499023438e-05,
"learning_rate": 9.969572609838744e-07,
"loss": 0.0,
"reward": 0.9791666716337204,
"reward_std": 0.3452813923358917,
"rewards/accuracy_reward": 0.2812500037252903,
"rewards/format_reward": 0.6979166716337204,
"step": 20
},
{
"completion_length": 2804.229248046875,
"entropy": 0.42578125,
"epoch": 0.024,
"grad_norm": 0.2109123021364212,
"kl": 0.00016552209854125977,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0,
"reward": 0.6145833432674408,
"reward_std": 0.4177238382399082,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/format_reward": 0.46875002048909664,
"step": 21
},
{
"completion_length": 1903.1459045410156,
"entropy": 0.419921875,
"epoch": 0.025142857142857144,
"grad_norm": 0.20996998250484467,
"kl": 0.00026351213455200195,
"learning_rate": 9.940426894506606e-07,
"loss": 0.0,
"reward": 1.1041667014360428,
"reward_std": 0.4033822976052761,
"rewards/accuracy_reward": 0.29166667722165585,
"rewards/format_reward": 0.8125000149011612,
"step": 22
},
{
"completion_length": 2714.0729370117188,
"entropy": 0.36865234375,
"epoch": 0.026285714285714287,
"grad_norm": 0.16544093191623688,
"kl": 0.00011658668518066406,
"learning_rate": 9.922242910178859e-07,
"loss": 0.0,
"reward": 0.6770833507180214,
"reward_std": 0.6271640285849571,
"rewards/accuracy_reward": 0.1770833432674408,
"rewards/format_reward": 0.5000000223517418,
"step": 23
},
{
"completion_length": 2834.2396850585938,
"entropy": 0.373046875,
"epoch": 0.027428571428571427,
"grad_norm": 0.10939397662878036,
"kl": 0.0001084059476852417,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0,
"reward": 0.7916666865348816,
"reward_std": 0.5711240321397781,
"rewards/accuracy_reward": 0.2187500074505806,
"rewards/format_reward": 0.572916679084301,
"step": 24
},
{
"completion_length": 2877.1354370117188,
"entropy": 0.4296875,
"epoch": 0.02857142857142857,
"grad_norm": 0.10193013399839401,
"kl": 0.00018364191055297852,
"learning_rate": 9.878701917609207e-07,
"loss": 0.0,
"reward": 0.677083358168602,
"reward_std": 0.2898401468992233,
"rewards/accuracy_reward": 0.2395833432674408,
"rewards/format_reward": 0.4375,
"step": 25
},
{
"completion_length": 3221.2396850585938,
"entropy": 0.4248046875,
"epoch": 0.029714285714285714,
"grad_norm": 0.07458896934986115,
"kl": 3.0487775802612305e-05,
"learning_rate": 9.853368487582886e-07,
"loss": 0.0,
"reward": 0.6562500149011612,
"reward_std": 0.25371449440717697,
"rewards/accuracy_reward": 0.19791666977107525,
"rewards/format_reward": 0.4583333358168602,
"step": 26
},
{
"completion_length": 3297.3959350585938,
"entropy": 0.45703125,
"epoch": 0.030857142857142857,
"grad_norm": 0.0925775095820427,
"kl": 0.00012201815843582153,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"reward": 0.541666679084301,
"reward_std": 0.4426998719573021,
"rewards/accuracy_reward": 0.15625000279396772,
"rewards/format_reward": 0.385416679084301,
"step": 27
},
{
"completion_length": 2984.2188110351562,
"entropy": 0.3994140625,
"epoch": 0.032,
"grad_norm": 0.12723609805107117,
"kl": 0.00015980005264282227,
"learning_rate": 9.795644345114794e-07,
"loss": 0.0,
"reward": 0.8437500447034836,
"reward_std": 0.48607436567544937,
"rewards/accuracy_reward": 0.3333333460614085,
"rewards/format_reward": 0.5104166865348816,
"step": 28
},
{
"completion_length": 3707.2501220703125,
"entropy": 0.43408203125,
"epoch": 0.03314285714285714,
"grad_norm": 0.20130394399166107,
"kl": 0.0003858804702758789,
"learning_rate": 9.76328489131448e-07,
"loss": 0.0,
"reward": 0.2500000027939677,
"reward_std": 0.37919554859399796,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.18750001024454832,
"step": 29
},
{
"completion_length": 3099.9063110351562,
"entropy": 0.384765625,
"epoch": 0.03428571428571429,
"grad_norm": 0.13564985990524292,
"kl": 0.0005750656127929688,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"reward": 0.8854166828095913,
"reward_std": 0.5532158613204956,
"rewards/accuracy_reward": 0.3125000009313226,
"rewards/format_reward": 0.572916679084301,
"step": 30
},
{
"completion_length": 3310.8125610351562,
"entropy": 0.40087890625,
"epoch": 0.03542857142857143,
"grad_norm": 0.14703762531280518,
"kl": 0.0007152557373046875,
"learning_rate": 9.69165882516764e-07,
"loss": 0.0,
"reward": 0.4583333469927311,
"reward_std": 0.4937985762953758,
"rewards/accuracy_reward": 0.16666667070239782,
"rewards/format_reward": 0.2916666716337204,
"step": 31
},
{
"completion_length": 3543.666748046875,
"entropy": 0.4521484375,
"epoch": 0.036571428571428574,
"grad_norm": 0.11301636695861816,
"kl": 0.00032842159271240234,
"learning_rate": 9.65243099959949e-07,
"loss": 0.0,
"reward": 0.6250000223517418,
"reward_std": 0.46596524864435196,
"rewards/accuracy_reward": 0.28125000558793545,
"rewards/format_reward": 0.34375,
"step": 32
},
{
"completion_length": 3395.947998046875,
"entropy": 0.384765625,
"epoch": 0.037714285714285714,
"grad_norm": 0.12107253074645996,
"kl": 0.00042450428009033203,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0,
"reward": 0.604166679084301,
"reward_std": 0.5497709587216377,
"rewards/accuracy_reward": 0.18750000093132257,
"rewards/format_reward": 0.41666667722165585,
"step": 33
},
{
"completion_length": 2621.218780517578,
"entropy": 0.45263671875,
"epoch": 0.038857142857142854,
"grad_norm": 0.15317150950431824,
"kl": 0.0013637542724609375,
"learning_rate": 9.567251964768342e-07,
"loss": 0.0001,
"reward": 0.8541666865348816,
"reward_std": 0.4670567326247692,
"rewards/accuracy_reward": 0.31250001303851604,
"rewards/format_reward": 0.5416666828095913,
"step": 34
},
{
"completion_length": 3166.3958740234375,
"entropy": 0.43115234375,
"epoch": 0.04,
"grad_norm": 0.1469903290271759,
"kl": 0.0011509060859680176,
"learning_rate": 9.521346881455354e-07,
"loss": 0.0,
"reward": 0.6458333656191826,
"reward_std": 0.6130613833665848,
"rewards/accuracy_reward": 0.23958333767950535,
"rewards/format_reward": 0.4062500149011612,
"step": 35
},
{
"completion_length": 3509.697998046875,
"entropy": 0.513671875,
"epoch": 0.04114285714285714,
"grad_norm": 0.11033376306295395,
"kl": 0.0011191368103027344,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0,
"reward": 0.23958333395421505,
"reward_std": 0.24118434637784958,
"rewards/accuracy_reward": 0.031250000931322575,
"rewards/format_reward": 0.20833334140479565,
"step": 36
},
{
"completion_length": 3363.8333740234375,
"entropy": 0.42138671875,
"epoch": 0.04228571428571429,
"grad_norm": 0.11778294295072556,
"kl": 0.0008115768432617188,
"learning_rate": 9.42302986163543e-07,
"loss": 0.0,
"reward": 0.2812500149011612,
"reward_std": 0.13804075866937637,
"rewards/accuracy_reward": 0.031250000931322575,
"rewards/format_reward": 0.25,
"step": 37
},
{
"completion_length": 3610.8438110351562,
"entropy": 0.44677734375,
"epoch": 0.04342857142857143,
"grad_norm": 0.061884235590696335,
"kl": 0.0005736351013183594,
"learning_rate": 9.370671165529144e-07,
"loss": 0.0,
"reward": 0.21875000558793545,
"reward_std": 0.17128896713256836,
"rewards/accuracy_reward": 0.10416666697710752,
"rewards/format_reward": 0.11458333861082792,
"step": 38
},
{
"completion_length": 2926.4063110351562,
"entropy": 0.36669921875,
"epoch": 0.044571428571428574,
"grad_norm": 0.1068028062582016,
"kl": 0.0011527538299560547,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0,
"reward": 0.7708333656191826,
"reward_std": 0.1930682435631752,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.5208333507180214,
"step": 39
},
{
"completion_length": 2785.1146545410156,
"entropy": 0.388671875,
"epoch": 0.045714285714285714,
"grad_norm": 0.17572174966335297,
"kl": 0.0032024383544921875,
"learning_rate": 9.259695151358214e-07,
"loss": 0.0001,
"reward": 0.7291666902601719,
"reward_std": 0.3721684589982033,
"rewards/accuracy_reward": 0.1979166716337204,
"rewards/format_reward": 0.5312500186264515,
"step": 40
},
{
"completion_length": 3123.947998046875,
"entropy": 0.35791015625,
"epoch": 0.046857142857142854,
"grad_norm": 0.144905224442482,
"kl": 0.0007574558258056641,
"learning_rate": 9.20113792876298e-07,
"loss": 0.0,
"reward": 0.5416666865348816,
"reward_std": 0.4893290549516678,
"rewards/accuracy_reward": 0.12500000651925802,
"rewards/format_reward": 0.416666679084301,
"step": 41
},
{
"completion_length": 3056.197998046875,
"entropy": 0.48193359375,
"epoch": 0.048,
"grad_norm": 0.07001210004091263,
"kl": 0.000598907470703125,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0,
"reward": 0.30208333395421505,
"reward_std": 0.15690935403108597,
"rewards/accuracy_reward": 0.02083333395421505,
"rewards/format_reward": 0.2812500009313226,
"step": 42
},
{
"completion_length": 3097.479248046875,
"entropy": 0.4033203125,
"epoch": 0.04914285714285714,
"grad_norm": 0.09348543733358383,
"kl": 0.001148223876953125,
"learning_rate": 9.078043584226815e-07,
"loss": 0.0,
"reward": 0.4895833358168602,
"reward_std": 0.3020758181810379,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.32291666977107525,
"step": 43
},
{
"completion_length": 2797.7084197998047,
"entropy": 0.39013671875,
"epoch": 0.05028571428571429,
"grad_norm": 0.15556485950946808,
"kl": 0.0014767646789550781,
"learning_rate": 9.013573120044966e-07,
"loss": 0.0001,
"reward": 0.8020833386108279,
"reward_std": 0.3607782945036888,
"rewards/accuracy_reward": 0.2708333395421505,
"rewards/format_reward": 0.5312500102445483,
"step": 44
},
{
"completion_length": 3618.791748046875,
"entropy": 0.423828125,
"epoch": 0.05142857142857143,
"grad_norm": 0.09923144429922104,
"kl": 0.0026645660400390625,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0001,
"reward": 0.5833333348855376,
"reward_std": 0.4189528524875641,
"rewards/accuracy_reward": 0.2604166716337204,
"rewards/format_reward": 0.3229166781529784,
"step": 45
},
{
"completion_length": 3482.479248046875,
"entropy": 0.50634765625,
"epoch": 0.052571428571428575,
"grad_norm": 0.12269324064254761,
"kl": 0.0013875961303710938,
"learning_rate": 8.878960148416747e-07,
"loss": 0.0001,
"reward": 0.22916667722165585,
"reward_std": 0.26679350435733795,
"rewards/accuracy_reward": 0.0416666679084301,
"rewards/format_reward": 0.18750000186264515,
"step": 46
},
{
"completion_length": 2958.291748046875,
"entropy": 0.390625,
"epoch": 0.053714285714285714,
"grad_norm": 0.16963143646717072,
"kl": 0.0011917352676391602,
"learning_rate": 8.808890536269229e-07,
"loss": 0.0,
"reward": 0.8854166967794299,
"reward_std": 0.5451135858893394,
"rewards/accuracy_reward": 0.3437500149011612,
"rewards/format_reward": 0.5416666669771075,
"step": 47
},
{
"completion_length": 2956.416717529297,
"entropy": 0.396484375,
"epoch": 0.054857142857142854,
"grad_norm": 0.14105089008808136,
"kl": 0.0033426284790039062,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0001,
"reward": 0.7395833507180214,
"reward_std": 0.5457281768321991,
"rewards/accuracy_reward": 0.29166666977107525,
"rewards/format_reward": 0.4479166679084301,
"step": 48
},
{
"completion_length": 2448.1146850585938,
"entropy": 0.36865234375,
"epoch": 0.056,
"grad_norm": 0.15970121324062347,
"kl": 0.006764888763427734,
"learning_rate": 8.663414758415478e-07,
"loss": 0.0003,
"reward": 0.895833395421505,
"reward_std": 0.464010052382946,
"rewards/accuracy_reward": 0.25000000558793545,
"rewards/format_reward": 0.6458333507180214,
"step": 49
},
{
"completion_length": 3050.1041870117188,
"entropy": 0.34521484375,
"epoch": 0.05714285714285714,
"grad_norm": 0.12386268377304077,
"kl": 0.0011911392211914062,
"learning_rate": 8.588087370409302e-07,
"loss": 0.0,
"reward": 0.6562500325962901,
"reward_std": 0.4570996016263962,
"rewards/accuracy_reward": 0.2916666818782687,
"rewards/format_reward": 0.3645833497866988,
"step": 50
},
{
"completion_length": 2495.2708740234375,
"entropy": 0.44873046875,
"epoch": 0.05828571428571429,
"grad_norm": 0.12384030222892761,
"kl": 0.005596160888671875,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0002,
"reward": 0.6875000149011612,
"reward_std": 0.3033446706831455,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/format_reward": 0.5416666716337204,
"step": 51
},
{
"completion_length": 3027.406280517578,
"entropy": 0.384765625,
"epoch": 0.05942857142857143,
"grad_norm": 0.0901699811220169,
"kl": 0.0021982192993164062,
"learning_rate": 8.432457529696548e-07,
"loss": 0.0001,
"reward": 0.8750000298023224,
"reward_std": 0.5351639539003372,
"rewards/accuracy_reward": 0.3958333507180214,
"rewards/format_reward": 0.4791666865348816,
"step": 52
},
{
"completion_length": 2952.8646850585938,
"entropy": 0.41845703125,
"epoch": 0.060571428571428575,
"grad_norm": 0.09236861765384674,
"kl": 0.0012669563293457031,
"learning_rate": 8.352239353294194e-07,
"loss": 0.0001,
"reward": 0.8541666865348816,
"reward_std": 0.5214647725224495,
"rewards/accuracy_reward": 0.260416679084301,
"rewards/format_reward": 0.5937500074505806,
"step": 53
},
{
"completion_length": 2996.1250610351562,
"entropy": 0.3837890625,
"epoch": 0.061714285714285715,
"grad_norm": 0.15135987102985382,
"kl": 0.0015659332275390625,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0001,
"reward": 0.9479166939854622,
"reward_std": 0.7639089524745941,
"rewards/accuracy_reward": 0.4062500111758709,
"rewards/format_reward": 0.5416666828095913,
"step": 54
},
{
"completion_length": 3076.2500610351562,
"entropy": 0.4130859375,
"epoch": 0.06285714285714286,
"grad_norm": 0.12680813670158386,
"kl": 0.0023276805877685547,
"learning_rate": 8.187213662662538e-07,
"loss": 0.0001,
"reward": 0.6979166865348816,
"reward_std": 0.5675121322274208,
"rewards/accuracy_reward": 0.23958333861082792,
"rewards/format_reward": 0.458333358168602,
"step": 55
},
{
"completion_length": 3058.104248046875,
"entropy": 0.4072265625,
"epoch": 0.064,
"grad_norm": 0.10628776252269745,
"kl": 0.0009794235229492188,
"learning_rate": 8.102495512755938e-07,
"loss": 0.0,
"reward": 0.6562500298023224,
"reward_std": 0.3362164571881294,
"rewards/accuracy_reward": 0.19791666697710752,
"rewards/format_reward": 0.4583333469927311,
"step": 56
},
{
"completion_length": 3532.2813110351562,
"entropy": 0.3369140625,
"epoch": 0.06514285714285714,
"grad_norm": 0.09391733258962631,
"kl": 0.0005993843078613281,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0,
"reward": 0.3854166865348816,
"reward_std": 0.3325711265206337,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.3020833432674408,
"step": 57
},
{
"completion_length": 2239.1145935058594,
"entropy": 0.322998046875,
"epoch": 0.06628571428571428,
"grad_norm": 0.11698172241449356,
"kl": 0.0037631988525390625,
"learning_rate": 7.928877960781808e-07,
"loss": 0.0002,
"reward": 1.0000000223517418,
"reward_std": 0.39449498802423477,
"rewards/accuracy_reward": 0.2604166669771075,
"rewards/format_reward": 0.7395833358168602,
"step": 58
},
{
"completion_length": 3092.3438110351562,
"entropy": 0.3662109375,
"epoch": 0.06742857142857143,
"grad_norm": 0.10882271081209183,
"kl": 0.001026153564453125,
"learning_rate": 7.840072575681468e-07,
"loss": 0.0,
"reward": 0.5625000298023224,
"reward_std": 0.39667778089642525,
"rewards/accuracy_reward": 0.19791667442768812,
"rewards/format_reward": 0.36458333395421505,
"step": 59
},
{
"completion_length": 3120.5209350585938,
"entropy": 0.38037109375,
"epoch": 0.06857142857142857,
"grad_norm": 0.1319083720445633,
"kl": 0.0019273757934570312,
"learning_rate": 7.75e-07,
"loss": 0.0001,
"reward": 0.583333358168602,
"reward_std": 0.49578939378261566,
"rewards/accuracy_reward": 0.13541666977107525,
"rewards/format_reward": 0.4479166902601719,
"step": 60
},
{
"completion_length": 2971.9166870117188,
"entropy": 0.36669921875,
"epoch": 0.06971428571428571,
"grad_norm": 0.17734739184379578,
"kl": 0.0010924339294433594,
"learning_rate": 7.658709009626109e-07,
"loss": 0.0,
"reward": 0.8020833730697632,
"reward_std": 0.5149242952466011,
"rewards/accuracy_reward": 0.2083333358168602,
"rewards/format_reward": 0.5937500149011612,
"step": 61
},
{
"completion_length": 2529.7188720703125,
"entropy": 0.329833984375,
"epoch": 0.07085714285714285,
"grad_norm": 0.26185768842697144,
"kl": 0.016622543334960938,
"learning_rate": 7.566249040241553e-07,
"loss": 0.0007,
"reward": 0.9270833656191826,
"reward_std": 0.4443442225456238,
"rewards/accuracy_reward": 0.27083334885537624,
"rewards/format_reward": 0.6562500149011612,
"step": 62
},
{
"completion_length": 2196.1250610351562,
"entropy": 0.36279296875,
"epoch": 0.072,
"grad_norm": 0.1202726885676384,
"kl": 0.0028791427612304688,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0001,
"reward": 1.1562500596046448,
"reward_std": 0.43789636343717575,
"rewards/accuracy_reward": 0.385416679084301,
"rewards/format_reward": 0.7708333432674408,
"step": 63
},
{
"completion_length": 3074.041717529297,
"entropy": 0.42041015625,
"epoch": 0.07314285714285715,
"grad_norm": 0.10591074079275131,
"kl": 0.0019989013671875,
"learning_rate": 7.37802304516818e-07,
"loss": 0.0001,
"reward": 0.6250000149011612,
"reward_std": 0.4529266282916069,
"rewards/accuracy_reward": 0.18750000651925802,
"rewards/format_reward": 0.4375000074505806,
"step": 64
},
{
"completion_length": 2871.416748046875,
"entropy": 0.365478515625,
"epoch": 0.07428571428571429,
"grad_norm": 0.16383042931556702,
"kl": 0.001850128173828125,
"learning_rate": 7.282358947176205e-07,
"loss": 0.0001,
"reward": 0.6666666865348816,
"reward_std": 0.38071464747190475,
"rewards/accuracy_reward": 0.19791667722165585,
"rewards/format_reward": 0.4687500149011612,
"step": 65
},
{
"completion_length": 2014.6354370117188,
"entropy": 0.33642578125,
"epoch": 0.07542857142857143,
"grad_norm": 0.26954010128974915,
"kl": 0.008558273315429688,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0003,
"reward": 0.9375000447034836,
"reward_std": 0.35192636400461197,
"rewards/accuracy_reward": 0.34375000838190317,
"rewards/format_reward": 0.59375,
"step": 66
},
{
"completion_length": 3587.479248046875,
"entropy": 0.36572265625,
"epoch": 0.07657142857142857,
"grad_norm": 0.08370436728000641,
"kl": 0.001728057861328125,
"learning_rate": 7.08818754121241e-07,
"loss": 0.0001,
"reward": 0.22916667442768812,
"reward_std": 0.2259194478392601,
"rewards/accuracy_reward": 0.010416666977107525,
"rewards/format_reward": 0.21875000279396772,
"step": 67
},
{
"completion_length": 2018.3750457763672,
"entropy": 0.36474609375,
"epoch": 0.07771428571428571,
"grad_norm": 0.18221524357795715,
"kl": 0.0046844482421875,
"learning_rate": 6.989785380482312e-07,
"loss": 0.0002,
"reward": 0.916666716337204,
"reward_std": 0.35510556399822235,
"rewards/accuracy_reward": 0.2604166716337204,
"rewards/format_reward": 0.6562500149011612,
"step": 68
},
{
"completion_length": 2231.1979370117188,
"entropy": 0.41796875,
"epoch": 0.07885714285714286,
"grad_norm": 0.19485469162464142,
"kl": 0.0045928955078125,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0002,
"reward": 0.6666666939854622,
"reward_std": 0.3663952201604843,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.6041666865348816,
"step": 69
},
{
"completion_length": 3180.8854370117188,
"entropy": 0.38232421875,
"epoch": 0.08,
"grad_norm": 0.13257598876953125,
"kl": 0.00274658203125,
"learning_rate": 6.790614547199906e-07,
"loss": 0.0001,
"reward": 0.45833334885537624,
"reward_std": 0.4246904104948044,
"rewards/accuracy_reward": 0.07291666977107525,
"rewards/format_reward": 0.38541666977107525,
"step": 70
},
{
"completion_length": 2643.1771240234375,
"entropy": 0.43798828125,
"epoch": 0.08114285714285714,
"grad_norm": 0.13770414888858795,
"kl": 0.0029726028442382812,
"learning_rate": 6.68995372916741e-07,
"loss": 0.0001,
"reward": 0.7500000149011612,
"reward_std": 0.30269280821084976,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.5000000149011612,
"step": 71
},
{
"completion_length": 2940.6146545410156,
"entropy": 0.4892578125,
"epoch": 0.08228571428571428,
"grad_norm": 0.20104120671749115,
"kl": 0.0030574798583984375,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0001,
"reward": 0.5000000149011612,
"reward_std": 0.47806398570537567,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.4375000111758709,
"step": 72
},
{
"completion_length": 3779.4583740234375,
"entropy": 0.513671875,
"epoch": 0.08342857142857144,
"grad_norm": 0.09181614220142365,
"kl": 0.0015926361083984375,
"learning_rate": 6.486753808845564e-07,
"loss": 0.0001,
"reward": 0.3229166744276881,
"reward_std": 0.4466712549328804,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.19791667070239782,
"step": 73
},
{
"completion_length": 3236.4688720703125,
"entropy": 0.4130859375,
"epoch": 0.08457142857142858,
"grad_norm": 0.1495177149772644,
"kl": 0.0028803348541259766,
"learning_rate": 6.384324742897735e-07,
"loss": 0.0001,
"reward": 0.645833358168602,
"reward_std": 0.4744175747036934,
"rewards/accuracy_reward": 0.26041666977107525,
"rewards/format_reward": 0.385416679084301,
"step": 74
},
{
"completion_length": 3159.635498046875,
"entropy": 0.404296875,
"epoch": 0.08571428571428572,
"grad_norm": 0.11836569011211395,
"kl": 0.0026121139526367188,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0001,
"reward": 0.697916679084301,
"reward_std": 0.46560388058423996,
"rewards/accuracy_reward": 0.22916666697710752,
"rewards/format_reward": 0.4687500111758709,
"step": 75
},
{
"completion_length": 2450.041748046875,
"entropy": 0.412353515625,
"epoch": 0.08685714285714285,
"grad_norm": 0.1628679782152176,
"kl": 0.0018434524536132812,
"learning_rate": 6.178085705122674e-07,
"loss": 0.0001,
"reward": 0.6875,
"reward_std": 0.31900282949209213,
"rewards/accuracy_reward": 0.08333333395421505,
"rewards/format_reward": 0.6041666716337204,
"step": 76
},
{
"completion_length": 3208.3751220703125,
"entropy": 0.453125,
"epoch": 0.088,
"grad_norm": 0.1415146142244339,
"kl": 0.002140045166015625,
"learning_rate": 6.074387415372676e-07,
"loss": 0.0001,
"reward": 0.5104166818782687,
"reward_std": 0.44949568808078766,
"rewards/accuracy_reward": 0.09375000558793545,
"rewards/format_reward": 0.4166666744276881,
"step": 77
},
{
"completion_length": 2896.6771240234375,
"entropy": 0.3759765625,
"epoch": 0.08914285714285715,
"grad_norm": 0.13142429292201996,
"kl": 0.0013580322265625,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0001,
"reward": 0.7500000298023224,
"reward_std": 0.5779594928026199,
"rewards/accuracy_reward": 0.250000006519258,
"rewards/format_reward": 0.5000000223517418,
"step": 78
},
{
"completion_length": 2312.0938110351562,
"entropy": 0.3544921875,
"epoch": 0.09028571428571429,
"grad_norm": 0.1474093347787857,
"kl": 0.0019969940185546875,
"learning_rate": 5.866114036005362e-07,
"loss": 0.0001,
"reward": 0.8125000149011612,
"reward_std": 0.36936958134174347,
"rewards/accuracy_reward": 0.20833333674818277,
"rewards/format_reward": 0.604166679084301,
"step": 79
},
{
"completion_length": 3418.0833740234375,
"entropy": 0.49755859375,
"epoch": 0.09142857142857143,
"grad_norm": 0.14068344235420227,
"kl": 0.002742767333984375,
"learning_rate": 5.761651730097142e-07,
"loss": 0.0001,
"reward": 0.5208333544433117,
"reward_std": 0.4246201291680336,
"rewards/accuracy_reward": 0.16666666697710752,
"rewards/format_reward": 0.3541666716337204,
"step": 80
},
{
"completion_length": 2992.0729370117188,
"entropy": 0.56787109375,
"epoch": 0.09257142857142857,
"grad_norm": 0.1596606820821762,
"kl": 0.005756378173828125,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0002,
"reward": 0.4895833432674408,
"reward_std": 0.3185732662677765,
"rewards/accuracy_reward": 0.11458333395421505,
"rewards/format_reward": 0.3750000074505806,
"step": 81
},
{
"completion_length": 2483.9584350585938,
"entropy": 0.39306640625,
"epoch": 0.09371428571428571,
"grad_norm": 0.13652034103870392,
"kl": 0.002727508544921875,
"learning_rate": 5.552358696106288e-07,
"loss": 0.0001,
"reward": 0.8020833432674408,
"reward_std": 0.24248424544930458,
"rewards/accuracy_reward": 0.3020833395421505,
"rewards/format_reward": 0.5000000074505806,
"step": 82
},
{
"completion_length": 2964.8646850585938,
"entropy": 0.48486328125,
"epoch": 0.09485714285714286,
"grad_norm": 0.10738710314035416,
"kl": 0.00264739990234375,
"learning_rate": 5.447641303893714e-07,
"loss": 0.0001,
"reward": 0.5312500074505806,
"reward_std": 0.3985592797398567,
"rewards/accuracy_reward": 0.1770833395421505,
"rewards/format_reward": 0.3541666716337204,
"step": 83
},
{
"completion_length": 3069.7396240234375,
"entropy": 0.45263671875,
"epoch": 0.096,
"grad_norm": 0.14286305010318756,
"kl": 0.0017604827880859375,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0001,
"reward": 0.739583358168602,
"reward_std": 0.44136959314346313,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.4895833432674408,
"step": 84
},
{
"completion_length": 2664.2188110351562,
"entropy": 0.324951171875,
"epoch": 0.09714285714285714,
"grad_norm": 0.13478516042232513,
"kl": 0.002002716064453125,
"learning_rate": 5.238348269902859e-07,
"loss": 0.0001,
"reward": 0.7604166716337204,
"reward_std": 0.5178688690066338,
"rewards/accuracy_reward": 0.15625000186264515,
"rewards/format_reward": 0.6041666679084301,
"step": 85
},
{
"completion_length": 2774.291748046875,
"entropy": 0.465576171875,
"epoch": 0.09828571428571428,
"grad_norm": 0.16704270243644714,
"kl": 0.00365447998046875,
"learning_rate": 5.133885963994639e-07,
"loss": 0.0001,
"reward": 0.6250000102445483,
"reward_std": 0.2606133744120598,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.4583333386108279,
"step": 86
},
{
"completion_length": 2400.302215576172,
"entropy": 0.4453125,
"epoch": 0.09942857142857142,
"grad_norm": 0.23015545308589935,
"kl": 0.0040435791015625,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0002,
"reward": 0.8125000447034836,
"reward_std": 0.5173326060175896,
"rewards/accuracy_reward": 0.18750001024454832,
"rewards/format_reward": 0.625,
"step": 87
},
{
"completion_length": 2469.7396697998047,
"entropy": 0.41943359375,
"epoch": 0.10057142857142858,
"grad_norm": 0.16470564901828766,
"kl": 0.0041961669921875,
"learning_rate": 4.925612584627324e-07,
"loss": 0.0002,
"reward": 1.0208333730697632,
"reward_std": 0.693773627281189,
"rewards/accuracy_reward": 0.3750000186264515,
"rewards/format_reward": 0.6458333507180214,
"step": 88
},
{
"completion_length": 2834.635498046875,
"entropy": 0.37939453125,
"epoch": 0.10171428571428572,
"grad_norm": 0.1899234652519226,
"kl": 0.003200531005859375,
"learning_rate": 4.821914294877326e-07,
"loss": 0.0001,
"reward": 0.6562500149011612,
"reward_std": 0.5321320816874504,
"rewards/accuracy_reward": 0.17708333395421505,
"rewards/format_reward": 0.479166679084301,
"step": 89
},
{
"completion_length": 2226.3959045410156,
"entropy": 0.59619140625,
"epoch": 0.10285714285714286,
"grad_norm": 0.15134188532829285,
"kl": 0.0078887939453125,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0003,
"reward": 0.6458333563059568,
"reward_std": 0.21650634706020355,
"rewards/accuracy_reward": 0.031250000931322575,
"rewards/format_reward": 0.6145833460614085,
"step": 90
},
{
"completion_length": 2510.8334045410156,
"entropy": 0.423828125,
"epoch": 0.104,
"grad_norm": 0.15344281494617462,
"kl": 0.0041046142578125,
"learning_rate": 4.6156752571022637e-07,
"loss": 0.0002,
"reward": 0.8958333637565374,
"reward_std": 0.40820014476776123,
"rewards/accuracy_reward": 0.2604166669771075,
"rewards/format_reward": 0.6354166818782687,
"step": 91
},
{
"completion_length": 2551.062530517578,
"entropy": 0.390869140625,
"epoch": 0.10514285714285715,
"grad_norm": 0.12282641232013702,
"kl": 0.00506591796875,
"learning_rate": 4.513246191154434e-07,
"loss": 0.0002,
"reward": 0.6770833432674408,
"reward_std": 0.3805246874690056,
"rewards/accuracy_reward": 0.09375000093132257,
"rewards/format_reward": 0.5833333432674408,
"step": 92
},
{
"completion_length": 3784.7188110351562,
"entropy": 0.638671875,
"epoch": 0.10628571428571429,
"grad_norm": 0.2033790946006775,
"kl": 0.0058460235595703125,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0002,
"reward": 0.0729166679084301,
"reward_std": 0.18205293267965317,
"rewards/accuracy_reward": 0.0,
"rewards/format_reward": 0.0729166679084301,
"step": 93
},
{
"completion_length": 2915.9375,
"entropy": 0.5439453125,
"epoch": 0.10742857142857143,
"grad_norm": 0.19546350836753845,
"kl": 0.004344940185546875,
"learning_rate": 4.3100462708325914e-07,
"loss": 0.0002,
"reward": 0.5833333395421505,
"reward_std": 0.427902989089489,
"rewards/accuracy_reward": 0.18750000279396772,
"rewards/format_reward": 0.3958333358168602,
"step": 94
},
{
"completion_length": 3644.4688110351562,
"entropy": 0.47607421875,
"epoch": 0.10857142857142857,
"grad_norm": 0.09118141978979111,
"kl": 0.0022993087768554688,
"learning_rate": 4.209385452800095e-07,
"loss": 0.0001,
"reward": 0.3958333358168602,
"reward_std": 0.4476298391819,
"rewards/accuracy_reward": 0.1041666679084301,
"rewards/format_reward": 0.2916666679084301,
"step": 95
},
{
"completion_length": 2458.0833740234375,
"entropy": 0.37353515625,
"epoch": 0.10971428571428571,
"grad_norm": 0.16451668739318848,
"kl": 0.0038547515869140625,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0002,
"reward": 0.8750000447034836,
"reward_std": 0.4621882885694504,
"rewards/accuracy_reward": 0.2708333348855376,
"rewards/format_reward": 0.6041666865348816,
"step": 96
},
{
"completion_length": 2523.0833740234375,
"entropy": 0.421875,
"epoch": 0.11085714285714286,
"grad_norm": 0.1885625571012497,
"kl": 0.003082275390625,
"learning_rate": 4.0102146195176887e-07,
"loss": 0.0001,
"reward": 0.927083358168602,
"reward_std": 0.5721743106842041,
"rewards/accuracy_reward": 0.2708333348855376,
"rewards/format_reward": 0.6562500223517418,
"step": 97
},
{
"completion_length": 2336.5625915527344,
"entropy": 0.384033203125,
"epoch": 0.112,
"grad_norm": 0.15700186789035797,
"kl": 0.0025310516357421875,
"learning_rate": 3.911812458787591e-07,
"loss": 0.0001,
"reward": 0.7916666865348816,
"reward_std": 0.2110944464802742,
"rewards/accuracy_reward": 0.13541666697710752,
"rewards/format_reward": 0.6562500223517418,
"step": 98
},
{
"completion_length": 2524.8958740234375,
"entropy": 0.379150390625,
"epoch": 0.11314285714285714,
"grad_norm": 0.1743498593568802,
"kl": 0.003711700439453125,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0001,
"reward": 0.8125000204890966,
"reward_std": 0.4803639128804207,
"rewards/accuracy_reward": 0.2708333432674408,
"rewards/format_reward": 0.5416666697710752,
"step": 99
},
{
"completion_length": 2306.0521545410156,
"entropy": 0.35009765625,
"epoch": 0.11428571428571428,
"grad_norm": 0.12883873283863068,
"kl": 0.003726959228515625,
"learning_rate": 3.7176410528237945e-07,
"loss": 0.0001,
"reward": 1.0416666865348816,
"reward_std": 0.44550345838069916,
"rewards/accuracy_reward": 0.3437500074505806,
"rewards/format_reward": 0.6979166716337204,
"step": 100
},
{
"completion_length": 2115.5313110351562,
"entropy": 0.437255859375,
"epoch": 0.11542857142857142,
"grad_norm": 0.1613183170557022,
"kl": 0.00330352783203125,
"learning_rate": 3.62197695483182e-07,
"loss": 0.0001,
"reward": 0.833333358168602,
"reward_std": 0.2652370296418667,
"rewards/accuracy_reward": 0.15625000558793545,
"rewards/format_reward": 0.6770833358168602,
"step": 101
},
{
"completion_length": 1823.4167175292969,
"entropy": 0.369384765625,
"epoch": 0.11657142857142858,
"grad_norm": 0.11039572954177856,
"kl": 0.0045318603515625,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0002,
"reward": 0.9375000298023224,
"reward_std": 0.2463684342801571,
"rewards/accuracy_reward": 0.12500000279396772,
"rewards/format_reward": 0.8125000298023224,
"step": 102
},
{
"completion_length": 2301.593780517578,
"entropy": 0.376953125,
"epoch": 0.11771428571428572,
"grad_norm": 0.27249184250831604,
"kl": 0.00434112548828125,
"learning_rate": 3.433750959758446e-07,
"loss": 0.0002,
"reward": 0.895833358168602,
"reward_std": 0.5925451144576073,
"rewards/accuracy_reward": 0.19791667815297842,
"rewards/format_reward": 0.6979166865348816,
"step": 103
},
{
"completion_length": 2650.4063720703125,
"entropy": 0.45458984375,
"epoch": 0.11885714285714286,
"grad_norm": 0.14005857706069946,
"kl": 0.00519561767578125,
"learning_rate": 3.3412909903738936e-07,
"loss": 0.0002,
"reward": 0.572916679084301,
"reward_std": 0.42935075983405113,
"rewards/accuracy_reward": 0.09375000186264515,
"rewards/format_reward": 0.479166679084301,
"step": 104
},
{
"completion_length": 2248.6666870117188,
"entropy": 0.35205078125,
"epoch": 0.12,
"grad_norm": 0.18584585189819336,
"kl": 0.0032196044921875,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0001,
"reward": 0.885416716337204,
"reward_std": 0.5438356846570969,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.6354166865348816,
"step": 105
},
{
"completion_length": 2213.1250915527344,
"entropy": 0.30859375,
"epoch": 0.12114285714285715,
"grad_norm": 0.13285432755947113,
"kl": 0.003154754638671875,
"learning_rate": 3.159927424318531e-07,
"loss": 0.0001,
"reward": 1.0625000204890966,
"reward_std": 0.4201487675309181,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.6562500055879354,
"step": 106
},
{
"completion_length": 2495.4166870117188,
"entropy": 0.54296875,
"epoch": 0.12228571428571429,
"grad_norm": 0.2279452681541443,
"kl": 0.005664825439453125,
"learning_rate": 3.0711220392181934e-07,
"loss": 0.0002,
"reward": 0.8437500298023224,
"reward_std": 0.39580530673265457,
"rewards/accuracy_reward": 0.19791666697710752,
"rewards/format_reward": 0.645833358168602,
"step": 107
},
{
"completion_length": 2470.9896545410156,
"entropy": 0.41259765625,
"epoch": 0.12342857142857143,
"grad_norm": 0.1852155178785324,
"kl": 0.005123138427734375,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0002,
"reward": 0.7604167014360428,
"reward_std": 0.44793232530355453,
"rewards/accuracy_reward": 0.1770833358168602,
"rewards/format_reward": 0.583333358168602,
"step": 108
},
{
"completion_length": 2746.9166870117188,
"entropy": 0.41796875,
"epoch": 0.12457142857142857,
"grad_norm": 0.15881556272506714,
"kl": 0.00360107421875,
"learning_rate": 2.897504487244061e-07,
"loss": 0.0001,
"reward": 0.6458333656191826,
"reward_std": 0.3334706202149391,
"rewards/accuracy_reward": 0.15625,
"rewards/format_reward": 0.4895833507180214,
"step": 109
},
{
"completion_length": 2482.5000610351562,
"entropy": 0.400390625,
"epoch": 0.12571428571428572,
"grad_norm": 0.1606108397245407,
"kl": 0.0030384063720703125,
"learning_rate": 2.812786337337463e-07,
"loss": 0.0001,
"reward": 0.8750000298023224,
"reward_std": 0.560508705675602,
"rewards/accuracy_reward": 0.22916667349636555,
"rewards/format_reward": 0.645833358168602,
"step": 110
},
{
"completion_length": 2674.791748046875,
"entropy": 0.486572265625,
"epoch": 0.12685714285714286,
"grad_norm": 0.14915555715560913,
"kl": 0.00421905517578125,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0002,
"reward": 0.614583358168602,
"reward_std": 0.38519187271595,
"rewards/accuracy_reward": 0.12500000186264515,
"rewards/format_reward": 0.4895833507180214,
"step": 111
},
{
"completion_length": 2968.5834350585938,
"entropy": 0.470703125,
"epoch": 0.128,
"grad_norm": 0.2177121490240097,
"kl": 0.0030193328857421875,
"learning_rate": 2.6477606467058035e-07,
"loss": 0.0001,
"reward": 0.8229166865348816,
"reward_std": 0.5380749329924583,
"rewards/accuracy_reward": 0.2708333386108279,
"rewards/format_reward": 0.5520833507180214,
"step": 112
},
{
"completion_length": 1850.3125457763672,
"entropy": 0.37451171875,
"epoch": 0.12914285714285714,
"grad_norm": 0.18240460753440857,
"kl": 0.004482269287109375,
"learning_rate": 2.567542470303452e-07,
"loss": 0.0002,
"reward": 0.9375000149011612,
"reward_std": 0.39232632517814636,
"rewards/accuracy_reward": 0.1979166716337204,
"rewards/format_reward": 0.7395833432674408,
"step": 113
},
{
"completion_length": 1991.9479675292969,
"entropy": 0.34228515625,
"epoch": 0.13028571428571428,
"grad_norm": 0.12856441736221313,
"kl": 0.00384521484375,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0002,
"reward": 0.9687500298023224,
"reward_std": 0.41063307225704193,
"rewards/accuracy_reward": 0.16666666977107525,
"rewards/format_reward": 0.802083358168602,
"step": 114
},
{
"completion_length": 2642.2813110351562,
"entropy": 0.470703125,
"epoch": 0.13142857142857142,
"grad_norm": 0.1557956039905548,
"kl": 0.005847930908203125,
"learning_rate": 2.411912629590699e-07,
"loss": 0.0002,
"reward": 0.7500000149011612,
"reward_std": 0.3538191542029381,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.5000000149011612,
"step": 115
},
{
"completion_length": 3452.9271240234375,
"entropy": 0.5498046875,
"epoch": 0.13257142857142856,
"grad_norm": 0.12163397669792175,
"kl": 0.00417327880859375,
"learning_rate": 2.336585241584522e-07,
"loss": 0.0002,
"reward": 0.3125000009313226,
"reward_std": 0.3494237996637821,
"rewards/accuracy_reward": 0.09375000279396772,
"rewards/format_reward": 0.21875000558793545,
"step": 116
},
{
"completion_length": 2859.000030517578,
"entropy": 0.5322265625,
"epoch": 0.1337142857142857,
"grad_norm": 0.23236258327960968,
"kl": 0.0061187744140625,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0002,
"reward": 0.48958336375653744,
"reward_std": 0.34661681205034256,
"rewards/accuracy_reward": 0.052083334885537624,
"rewards/format_reward": 0.4375000027939677,
"step": 117
},
{
"completion_length": 2824.354248046875,
"entropy": 0.3935546875,
"epoch": 0.13485714285714287,
"grad_norm": 0.11868440359830856,
"kl": 0.002780914306640625,
"learning_rate": 2.1911094637307714e-07,
"loss": 0.0001,
"reward": 1.0416666865348816,
"reward_std": 0.6117755249142647,
"rewards/accuracy_reward": 0.4270833358168602,
"rewards/format_reward": 0.6145833432674408,
"step": 118
},
{
"completion_length": 2026.5937805175781,
"entropy": 0.446533203125,
"epoch": 0.136,
"grad_norm": 0.1918383240699768,
"kl": 0.00518035888671875,
"learning_rate": 2.1210398515832536e-07,
"loss": 0.0002,
"reward": 0.989583358168602,
"reward_std": 0.3108450919389725,
"rewards/accuracy_reward": 0.2604166716337204,
"rewards/format_reward": 0.7291666865348816,
"step": 119
},
{
"completion_length": 1974.4270935058594,
"entropy": 0.43359375,
"epoch": 0.13714285714285715,
"grad_norm": 0.19848716259002686,
"kl": 0.006320953369140625,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0003,
"reward": 0.864583358168602,
"reward_std": 0.4301687255501747,
"rewards/accuracy_reward": 0.16666666697710752,
"rewards/format_reward": 0.6979166865348816,
"step": 120
},
{
"completion_length": 1066.2396240234375,
"entropy": 0.30810546875,
"epoch": 0.1382857142857143,
"grad_norm": 0.15486152470111847,
"kl": 0.00482940673828125,
"learning_rate": 1.986426879955034e-07,
"loss": 0.0002,
"reward": 1.2500000298023224,
"reward_std": 0.25760992616415024,
"rewards/accuracy_reward": 0.3020833386108279,
"rewards/format_reward": 0.9479166865348816,
"step": 121
},
{
"completion_length": 2534.8958740234375,
"entropy": 0.4521484375,
"epoch": 0.13942857142857143,
"grad_norm": 0.12797077000141144,
"kl": 0.00380706787109375,
"learning_rate": 1.9219564157731844e-07,
"loss": 0.0002,
"reward": 0.8229167014360428,
"reward_std": 0.3391122668981552,
"rewards/accuracy_reward": 0.20833333861082792,
"rewards/format_reward": 0.6145833432674408,
"step": 122
},
{
"completion_length": 2572.791748046875,
"entropy": 0.451416015625,
"epoch": 0.14057142857142857,
"grad_norm": 0.1227995902299881,
"kl": 0.0032405853271484375,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0001,
"reward": 0.7083333544433117,
"reward_std": 0.3739900141954422,
"rewards/accuracy_reward": 0.1354166679084301,
"rewards/format_reward": 0.5729166828095913,
"step": 123
},
{
"completion_length": 2198.6563110351562,
"entropy": 0.33984375,
"epoch": 0.1417142857142857,
"grad_norm": 0.2465568333864212,
"kl": 0.015537261962890625,
"learning_rate": 1.7988620712370195e-07,
"loss": 0.0006,
"reward": 0.9687500298023224,
"reward_std": 0.5658619552850723,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.7187500298023224,
"step": 124
},
{
"completion_length": 2865.250030517578,
"entropy": 0.431640625,
"epoch": 0.14285714285714285,
"grad_norm": 0.10473211109638214,
"kl": 0.00345611572265625,
"learning_rate": 1.7403048486417868e-07,
"loss": 0.0001,
"reward": 0.6979166697710752,
"reward_std": 0.3267679661512375,
"rewards/accuracy_reward": 0.30208333395421505,
"rewards/format_reward": 0.3958333386108279,
"step": 125
},
{
"completion_length": 2886.2188720703125,
"entropy": 0.4560546875,
"epoch": 0.144,
"grad_norm": 0.09086798876523972,
"kl": 0.0029506683349609375,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0001,
"reward": 0.7395833358168602,
"reward_std": 0.3677559196949005,
"rewards/accuracy_reward": 0.2083333432674408,
"rewards/format_reward": 0.5312500149011612,
"step": 126
},
{
"completion_length": 2771.3438415527344,
"entropy": 0.41796875,
"epoch": 0.14514285714285713,
"grad_norm": 0.15809084475040436,
"kl": 0.004436492919921875,
"learning_rate": 1.6293288344708566e-07,
"loss": 0.0002,
"reward": 0.635416679084301,
"reward_std": 0.49130160734057426,
"rewards/accuracy_reward": 0.0937500037252903,
"rewards/format_reward": 0.541666679084301,
"step": 127
},
{
"completion_length": 2852.291748046875,
"entropy": 0.5322265625,
"epoch": 0.1462857142857143,
"grad_norm": 0.17982318997383118,
"kl": 0.004756927490234375,
"learning_rate": 1.5769701383645698e-07,
"loss": 0.0002,
"reward": 0.9166666967794299,
"reward_std": 0.5036755502223969,
"rewards/accuracy_reward": 0.3645833358168602,
"rewards/format_reward": 0.5520833535119891,
"step": 128
},
{
"completion_length": 3367.9583740234375,
"entropy": 0.49658203125,
"epoch": 0.14742857142857144,
"grad_norm": 0.1593668907880783,
"kl": 0.00467681884765625,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0002,
"reward": 0.4583333507180214,
"reward_std": 0.4737073630094528,
"rewards/accuracy_reward": 0.1666666753590107,
"rewards/format_reward": 0.2916666679084301,
"step": 129
},
{
"completion_length": 2817.7709350585938,
"entropy": 0.5,
"epoch": 0.14857142857142858,
"grad_norm": 0.16384749114513397,
"kl": 0.00354766845703125,
"learning_rate": 1.4786531185446452e-07,
"loss": 0.0001,
"reward": 0.479166679084301,
"reward_std": 0.40873220562934875,
"rewards/accuracy_reward": 0.06250000186264515,
"rewards/format_reward": 0.4166666716337204,
"step": 130
},
{
"completion_length": 2720.8021850585938,
"entropy": 0.49658203125,
"epoch": 0.14971428571428572,
"grad_norm": 0.24187295138835907,
"kl": 0.004924774169921875,
"learning_rate": 1.432748035231658e-07,
"loss": 0.0002,
"reward": 0.9062500298023224,
"reward_std": 0.504379153251648,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.5312500223517418,
"step": 131
},
{
"completion_length": 2590.0521545410156,
"entropy": 0.4248046875,
"epoch": 0.15085714285714286,
"grad_norm": 0.13521018624305725,
"kl": 0.003265380859375,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0001,
"reward": 0.833333358168602,
"reward_std": 0.5318443104624748,
"rewards/accuracy_reward": 0.2916666744276881,
"rewards/format_reward": 0.541666679084301,
"step": 132
},
{
"completion_length": 3037.8854370117188,
"entropy": 0.49072265625,
"epoch": 0.152,
"grad_norm": 0.17249611020088196,
"kl": 0.004932403564453125,
"learning_rate": 1.3475690004005097e-07,
"loss": 0.0002,
"reward": 0.5104166865348816,
"reward_std": 0.2723224312067032,
"rewards/accuracy_reward": 0.1041666716337204,
"rewards/format_reward": 0.4062500149011612,
"step": 133
},
{
"completion_length": 2485.322998046875,
"entropy": 0.5224609375,
"epoch": 0.15314285714285714,
"grad_norm": 0.16694706678390503,
"kl": 0.00672149658203125,
"learning_rate": 1.308341174832359e-07,
"loss": 0.0003,
"reward": 0.8750000149011612,
"reward_std": 0.49518734961748123,
"rewards/accuracy_reward": 0.2500000102445483,
"rewards/format_reward": 0.6250000149011612,
"step": 134
},
{
"completion_length": 1746.3958740234375,
"entropy": 0.373046875,
"epoch": 0.15428571428571428,
"grad_norm": 0.1858878880739212,
"kl": 0.007049560546875,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0003,
"reward": 1.1145833432674408,
"reward_std": 0.3352552205324173,
"rewards/accuracy_reward": 0.43750000558793545,
"rewards/format_reward": 0.6770833432674408,
"step": 135
},
{
"completion_length": 2145.4584045410156,
"entropy": 0.35986328125,
"epoch": 0.15542857142857142,
"grad_norm": 0.196150541305542,
"kl": 0.00507354736328125,
"learning_rate": 1.2367151086855187e-07,
"loss": 0.0002,
"reward": 0.9895833358168602,
"reward_std": 0.6333772391080856,
"rewards/accuracy_reward": 0.3125000074505806,
"rewards/format_reward": 0.6770833507180214,
"step": 136
},
{
"completion_length": 2813.510467529297,
"entropy": 0.387939453125,
"epoch": 0.15657142857142858,
"grad_norm": 0.13014180958271027,
"kl": 0.00402069091796875,
"learning_rate": 1.2043556548852063e-07,
"loss": 0.0002,
"reward": 0.677083358168602,
"reward_std": 0.5024930611252785,
"rewards/accuracy_reward": 0.1458333395421505,
"rewards/format_reward": 0.5312500260770321,
"step": 137
},
{
"completion_length": 2061.0000610351562,
"entropy": 0.35107421875,
"epoch": 0.15771428571428572,
"grad_norm": 0.10559725016355515,
"kl": 0.0037689208984375,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0002,
"reward": 0.9270833730697632,
"reward_std": 0.30403000861406326,
"rewards/accuracy_reward": 0.18750000558793545,
"rewards/format_reward": 0.739583358168602,
"step": 138
},
{
"completion_length": 3106.291748046875,
"entropy": 0.55615234375,
"epoch": 0.15885714285714286,
"grad_norm": 0.15882417559623718,
"kl": 0.00519561767578125,
"learning_rate": 1.1466315124171128e-07,
"loss": 0.0002,
"reward": 0.708333358168602,
"reward_std": 0.5456142984330654,
"rewards/accuracy_reward": 0.1666666753590107,
"rewards/format_reward": 0.541666679084301,
"step": 139
},
{
"completion_length": 2395.885498046875,
"entropy": 0.4853515625,
"epoch": 0.16,
"grad_norm": 0.2862064242362976,
"kl": 0.006809234619140625,
"learning_rate": 1.1212980823907929e-07,
"loss": 0.0003,
"reward": 0.7500000298023224,
"reward_std": 0.38956041634082794,
"rewards/accuracy_reward": 0.1666666679084301,
"rewards/format_reward": 0.5833333507180214,
"step": 140
},
{
"completion_length": 1969.2292175292969,
"entropy": 0.33935546875,
"epoch": 0.16114285714285714,
"grad_norm": 0.17285719513893127,
"kl": 0.0047760009765625,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0002,
"reward": 0.9895833730697632,
"reward_std": 0.520443569868803,
"rewards/accuracy_reward": 0.2187500074505806,
"rewards/format_reward": 0.770833358168602,
"step": 141
},
{
"completion_length": 2584.9688110351562,
"entropy": 0.447998046875,
"epoch": 0.16228571428571428,
"grad_norm": 0.13187262415885925,
"kl": 0.0047740936279296875,
"learning_rate": 1.0777570898211405e-07,
"loss": 0.0002,
"reward": 0.9166667014360428,
"reward_std": 0.4435262605547905,
"rewards/accuracy_reward": 0.2187500111758709,
"rewards/format_reward": 0.6979166865348816,
"step": 142
},
{
"completion_length": 2300.7500610351562,
"entropy": 0.4326171875,
"epoch": 0.16342857142857142,
"grad_norm": 0.25766721367836,
"kl": 0.00977325439453125,
"learning_rate": 1.0595731054933934e-07,
"loss": 0.0004,
"reward": 0.6875000074505806,
"reward_std": 0.3443669453263283,
"rewards/accuracy_reward": 0.0833333358168602,
"rewards/format_reward": 0.6041666716337204,
"step": 143
},
{
"completion_length": 2849.4688110351562,
"entropy": 0.45556640625,
"epoch": 0.16457142857142856,
"grad_norm": 0.14796483516693115,
"kl": 0.00493621826171875,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0002,
"reward": 0.677083358168602,
"reward_std": 0.4717573896050453,
"rewards/accuracy_reward": 0.2187500037252903,
"rewards/format_reward": 0.4583333432674408,
"step": 144
},
{
"completion_length": 1885.4479522705078,
"entropy": 0.353515625,
"epoch": 0.1657142857142857,
"grad_norm": 0.1617114096879959,
"kl": 0.005008697509765625,
"learning_rate": 1.0304273901612565e-07,
"loss": 0.0002,
"reward": 1.0208333730697632,
"reward_std": 0.3080247640609741,
"rewards/accuracy_reward": 0.3020833460614085,
"rewards/format_reward": 0.7187500149011612,
"step": 145
},
{
"completion_length": 1947.0312805175781,
"entropy": 0.376953125,
"epoch": 0.16685714285714287,
"grad_norm": 0.11680302768945694,
"kl": 0.0033721923828125,
"learning_rate": 1.0194814420758804e-07,
"loss": 0.0001,
"reward": 0.8750000149011612,
"reward_std": 0.22604453563690186,
"rewards/accuracy_reward": 0.07291666977107525,
"rewards/format_reward": 0.8020833432674408,
"step": 146
},
{
"completion_length": 2215.7084045410156,
"entropy": 0.392578125,
"epoch": 0.168,
"grad_norm": 0.21416479349136353,
"kl": 0.00583648681640625,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0002,
"reward": 0.8437500149011612,
"reward_std": 0.5214347615838051,
"rewards/accuracy_reward": 0.1770833395421505,
"rewards/format_reward": 0.6666666865348816,
"step": 147
},
{
"completion_length": 1631.229248046875,
"entropy": 0.302734375,
"epoch": 0.16914285714285715,
"grad_norm": 0.17106008529663086,
"kl": 0.00476837158203125,
"learning_rate": 1.0048729989766394e-07,
"loss": 0.0002,
"reward": 0.9895833730697632,
"reward_std": 0.29220427572727203,
"rewards/accuracy_reward": 0.13541666977107525,
"rewards/format_reward": 0.8541666865348816,
"step": 148
},
{
"completion_length": 2407.8229370117188,
"entropy": 0.3408203125,
"epoch": 0.1702857142857143,
"grad_norm": 0.15983973443508148,
"kl": 0.008087158203125,
"learning_rate": 1.0012184146924223e-07,
"loss": 0.0003,
"reward": 0.9270833432674408,
"reward_std": 0.48707588016986847,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.6770833432674408,
"step": 149
},
{
"completion_length": 2178.729217529297,
"entropy": 0.36962890625,
"epoch": 0.17142857142857143,
"grad_norm": 0.19081099331378937,
"kl": 0.004444122314453125,
"learning_rate": 1e-07,
"loss": 0.0002,
"reward": 1.0312500298023224,
"reward_std": 0.5531396120786667,
"rewards/accuracy_reward": 0.291666679084301,
"rewards/format_reward": 0.7395833432674408,
"step": 150
},
{
"epoch": 0.17142857142857143,
"step": 150,
"total_flos": 0.0,
"train_loss": 0.00011944215420650531,
"train_runtime": 12092.6435,
"train_samples_per_second": 1.191,
"train_steps_per_second": 0.012
}
],
"logging_steps": 1,
"max_steps": 150,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}