4DThinker-3B / 4drl /trainer_state.json
jankin123's picture
Upload 4DThinker-3B config and tokenizer files
3a7d274 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.043591979075850044,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 6066.5546875,
"epoch": 0.00043591979075850045,
"grad_norm": 22.7120361328125,
"learning_rate": 9.995640802092413e-07,
"loss": -0.00010610813114908524,
"reward": 0.16562499850988388,
"reward_std": 0.2048395685851574,
"rewards/accuracy_reward": 0.1171875,
"rewards/format_reward": 0.2421875,
"step": 1
},
{
"completion_length": 5022.6015625,
"epoch": 0.0008718395815170009,
"grad_norm": 35.190757751464844,
"learning_rate": 9.99128160418483e-07,
"loss": -0.0003467285423539579,
"reward": 0.24531249701976776,
"reward_std": 0.13261918351054192,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.3671875,
"step": 2
},
{
"completion_length": 5074.4765625,
"epoch": 0.0013077593722755014,
"grad_norm": 36.27347946166992,
"learning_rate": 9.986922406277246e-07,
"loss": -0.0009319710079580545,
"reward": 0.31718750298023224,
"reward_std": 0.15016943216323853,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.296875,
"step": 3
},
{
"completion_length": 4919.9921875,
"epoch": 0.0017436791630340018,
"grad_norm": 23.1031494140625,
"learning_rate": 9.98256320836966e-07,
"loss": -0.0011635422706604004,
"reward": 0.2515625134110451,
"reward_std": 0.22550153732299805,
"rewards/accuracy_reward": 0.1796875,
"rewards/format_reward": 0.359375,
"step": 4
},
{
"completion_length": 4392.3203125,
"epoch": 0.002179598953792502,
"grad_norm": 18.30356788635254,
"learning_rate": 9.978204010462075e-07,
"loss": -0.001725408248603344,
"reward": 0.3500000089406967,
"reward_std": 0.2601192742586136,
"rewards/accuracy_reward": 0.2734375,
"rewards/format_reward": 0.3828125,
"step": 5
},
{
"completion_length": 5603.28125,
"epoch": 0.0026155187445510027,
"grad_norm": 21.6483154296875,
"learning_rate": 9.97384481255449e-07,
"loss": -0.004177422029897571,
"reward": 0.07500000298023224,
"reward_std": 0.14329775422811508,
"rewards/accuracy_reward": 0.0390625,
"rewards/format_reward": 0.1796875,
"step": 6
},
{
"completion_length": 3968.328125,
"epoch": 0.003051438535309503,
"grad_norm": 21.388357162475586,
"learning_rate": 9.969485614646903e-07,
"loss": -0.002357690129429102,
"reward": 0.2109375149011612,
"reward_std": 0.19895199686288834,
"rewards/accuracy_reward": 0.1328125,
"rewards/format_reward": 0.390625,
"step": 7
},
{
"completion_length": 4283.15625,
"epoch": 0.0034873583260680036,
"grad_norm": 17.606998443603516,
"learning_rate": 9.96512641673932e-07,
"loss": -0.0032072272151708603,
"reward": 0.3046875149011612,
"reward_std": 0.3027474880218506,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.3515625,
"step": 8
},
{
"completion_length": 2647.453125,
"epoch": 0.003923278116826504,
"grad_norm": 7.005691051483154,
"learning_rate": 9.960767218831735e-07,
"loss": -0.0025840166490525007,
"reward": 0.33125001937150955,
"reward_std": 0.270910307765007,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.484375,
"step": 9
},
{
"completion_length": 2621.7734375,
"epoch": 0.004359197907585004,
"grad_norm": 32.332698822021484,
"learning_rate": 9.95640802092415e-07,
"loss": -0.0029894779436290264,
"reward": 0.29843752086162567,
"reward_std": 0.21532631665468216,
"rewards/accuracy_reward": 0.1640625,
"rewards/format_reward": 0.671875,
"step": 10
},
{
"completion_length": 2444.78125,
"epoch": 0.004795117698343505,
"grad_norm": 15.55601692199707,
"learning_rate": 9.952048823016565e-07,
"loss": -0.003852886729873717,
"reward": 0.24062500894069672,
"reward_std": 0.2612670660018921,
"rewards/accuracy_reward": 0.1484375,
"rewards/format_reward": 0.4609375,
"step": 11
},
{
"completion_length": 2593.9140625,
"epoch": 0.0052310374891020054,
"grad_norm": 29.252334594726562,
"learning_rate": 9.94768962510898e-07,
"loss": -0.0042398301884531975,
"reward": 0.48750001192092896,
"reward_std": 0.3399874120950699,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.6796875,
"step": 12
},
{
"completion_length": 2220.7109375,
"epoch": 0.005666957279860506,
"grad_norm": 13.38364028930664,
"learning_rate": 9.943330427201393e-07,
"loss": -0.0033426693407818675,
"reward": 0.3125000149011612,
"reward_std": 0.19083451479673386,
"rewards/accuracy_reward": 0.171875,
"rewards/format_reward": 0.703125,
"step": 13
},
{
"completion_length": 2529.640625,
"epoch": 0.006102877070619006,
"grad_norm": 17.974361419677734,
"learning_rate": 9.93897122929381e-07,
"loss": -0.004656808450818062,
"reward": 0.3500000089406967,
"reward_std": 0.36670154333114624,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.578125,
"step": 14
},
{
"completion_length": 2840.5625,
"epoch": 0.006538796861377506,
"grad_norm": 11.839619636535645,
"learning_rate": 9.934612031386225e-07,
"loss": -0.005619838368147612,
"reward": 0.29218751937150955,
"reward_std": 0.11481105536222458,
"rewards/accuracy_reward": 0.1640625,
"rewards/format_reward": 0.640625,
"step": 15
},
{
"completion_length": 3081.2421875,
"epoch": 0.006974716652136007,
"grad_norm": 8.316078186035156,
"learning_rate": 9.93025283347864e-07,
"loss": -0.00666549289599061,
"reward": 0.42500003427267075,
"reward_std": 0.30427779257297516,
"rewards/accuracy_reward": 0.2890625,
"rewards/format_reward": 0.6796875,
"step": 16
},
{
"completion_length": 2150.2734375,
"epoch": 0.0074106364428945075,
"grad_norm": 15.924665451049805,
"learning_rate": 9.925893635571055e-07,
"loss": -0.006602097302675247,
"reward": 0.43906252086162567,
"reward_std": 0.35585278272628784,
"rewards/accuracy_reward": 0.2890625,
"rewards/format_reward": 0.75,
"step": 17
},
{
"completion_length": 1136.3359375,
"epoch": 0.007846556233653008,
"grad_norm": 3.312643527984619,
"learning_rate": 9.92153443766347e-07,
"loss": -0.00424616876989603,
"reward": 0.484375,
"reward_std": 0.2594892159104347,
"rewards/accuracy_reward": 0.3125,
"rewards/format_reward": 0.859375,
"step": 18
},
{
"completion_length": 1163.0703125,
"epoch": 0.008282476024411508,
"grad_norm": 4.4617600440979,
"learning_rate": 9.917175239755885e-07,
"loss": -0.0060931057669222355,
"reward": 0.6203125715255737,
"reward_std": 0.3268684893846512,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9140625,
"step": 19
},
{
"completion_length": 1688.5078125,
"epoch": 0.008718395815170008,
"grad_norm": 5.8775506019592285,
"learning_rate": 9.9128160418483e-07,
"loss": -0.009722861228510737,
"reward": 0.38593751192092896,
"reward_std": 0.21004340052604675,
"rewards/accuracy_reward": 0.21875,
"rewards/format_reward": 0.8359375,
"step": 20
},
{
"completion_length": 1012.625,
"epoch": 0.009154315605928508,
"grad_norm": 4.331947326660156,
"learning_rate": 9.908456843940715e-07,
"loss": -0.005864025559276342,
"reward": 0.550000011920929,
"reward_std": 0.336714543402195,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.953125,
"step": 21
},
{
"completion_length": 1052.0234375,
"epoch": 0.00959023539668701,
"grad_norm": 7.702275276184082,
"learning_rate": 9.90409764603313e-07,
"loss": -0.0061883407179266214,
"reward": 0.5375000536441803,
"reward_std": 0.19406893104314804,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.9296875,
"step": 22
},
{
"completion_length": 585.890625,
"epoch": 0.01002615518744551,
"grad_norm": 2.693312883377075,
"learning_rate": 9.899738448125545e-07,
"loss": -0.0050068587297573686,
"reward": 0.4500000476837158,
"reward_std": 0.22148218750953674,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 1.0,
"step": 23
},
{
"completion_length": 952.8984375,
"epoch": 0.010462074978204011,
"grad_norm": 2.040409564971924,
"learning_rate": 9.89537925021796e-07,
"loss": -0.008675348944962025,
"reward": 0.4531250298023224,
"reward_std": 0.2833295091986656,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.9765625,
"step": 24
},
{
"completion_length": 855.203125,
"epoch": 0.010897994768962511,
"grad_norm": 1.750288486480713,
"learning_rate": 9.891020052310375e-07,
"loss": -0.006478779250755906,
"reward": 0.5281250476837158,
"reward_std": 0.2747085839509964,
"rewards/accuracy_reward": 0.3359375,
"rewards/format_reward": 0.9609375,
"step": 25
},
{
"completion_length": 622.625,
"epoch": 0.011333914559721011,
"grad_norm": 1.3210248947143555,
"learning_rate": 9.88666085440279e-07,
"loss": -0.006179739721119404,
"reward": 0.5562500357627869,
"reward_std": 0.2532925382256508,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.984375,
"step": 26
},
{
"completion_length": 609.3984375,
"epoch": 0.011769834350479512,
"grad_norm": 3.869396448135376,
"learning_rate": 9.882301656495205e-07,
"loss": -0.0062519978964701295,
"reward": 0.5000000298023224,
"reward_std": 0.14424315840005875,
"rewards/accuracy_reward": 0.3046875,
"rewards/format_reward": 0.9765625,
"step": 27
},
{
"completion_length": 547.375,
"epoch": 0.012205754141238012,
"grad_norm": 1.0941433906555176,
"learning_rate": 9.877942458587619e-07,
"loss": -0.0032227920601144433,
"reward": 0.5484375357627869,
"reward_std": 0.21108780801296234,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.984375,
"step": 28
},
{
"completion_length": 566.8515625,
"epoch": 0.012641673931996512,
"grad_norm": 1.1244480609893799,
"learning_rate": 9.873583260680035e-07,
"loss": -0.005511581432074308,
"reward": 0.5296875238418579,
"reward_std": 0.25184717029333115,
"rewards/accuracy_reward": 0.3359375,
"rewards/format_reward": 0.96875,
"step": 29
},
{
"completion_length": 629.2109375,
"epoch": 0.013077593722755012,
"grad_norm": 1.0730060338974,
"learning_rate": 9.869224062772449e-07,
"loss": -0.00590163329616189,
"reward": 0.5437500476837158,
"reward_std": 0.30221718549728394,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 30
},
{
"completion_length": 577.2890625,
"epoch": 0.013513513513513514,
"grad_norm": 1.0006390810012817,
"learning_rate": 9.864864864864865e-07,
"loss": -0.004602149594575167,
"reward": 0.48906251788139343,
"reward_std": 0.2546490430831909,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.9609375,
"step": 31
},
{
"completion_length": 652.453125,
"epoch": 0.013949433304272014,
"grad_norm": 2.0136349201202393,
"learning_rate": 9.860505666957279e-07,
"loss": -0.007033544359728694,
"reward": 0.6000000536441803,
"reward_std": 0.34614098072052,
"rewards/accuracy_reward": 0.40625,
"rewards/format_reward": 0.96875,
"step": 32
},
{
"completion_length": 830.296875,
"epoch": 0.014385353095030515,
"grad_norm": 0.6222465634346008,
"learning_rate": 9.856146469049695e-07,
"loss": -0.0058622711803764105,
"reward": 0.6406250596046448,
"reward_std": 0.26844407618045807,
"rewards/accuracy_reward": 0.4453125,
"rewards/format_reward": 0.9765625,
"step": 33
},
{
"completion_length": 924.8984375,
"epoch": 0.014821272885789015,
"grad_norm": 1.6224457025527954,
"learning_rate": 9.851787271142109e-07,
"loss": -0.006918259430676699,
"reward": 0.45781250298023224,
"reward_std": 0.1292574293911457,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.9609375,
"step": 34
},
{
"completion_length": 757.8828125,
"epoch": 0.015257192676547515,
"grad_norm": 0.8691195249557495,
"learning_rate": 9.847428073234525e-07,
"loss": -0.005784029606729746,
"reward": 0.45468753576278687,
"reward_std": 0.20147473365068436,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.984375,
"step": 35
},
{
"completion_length": 1539.546875,
"epoch": 0.015693112467306015,
"grad_norm": 4.3893961906433105,
"learning_rate": 9.843068875326939e-07,
"loss": -0.010595182422548532,
"reward": 0.4765625298023224,
"reward_std": 0.2606821805238724,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.8984375,
"step": 36
},
{
"completion_length": 949.203125,
"epoch": 0.016129032258064516,
"grad_norm": 1.039124608039856,
"learning_rate": 9.838709677419355e-07,
"loss": -0.005853116046637297,
"reward": 0.6062500476837158,
"reward_std": 0.34973812103271484,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 0.9609375,
"step": 37
},
{
"completion_length": 1128.515625,
"epoch": 0.016564952048823016,
"grad_norm": 2.9782750606536865,
"learning_rate": 9.834350479511769e-07,
"loss": -0.00639305729418993,
"reward": 0.3796875327825546,
"reward_std": 0.2201501727104187,
"rewards/accuracy_reward": 0.1953125,
"rewards/format_reward": 0.921875,
"step": 38
},
{
"completion_length": 803.09375,
"epoch": 0.017000871839581516,
"grad_norm": 0.9739387035369873,
"learning_rate": 9.829991281604185e-07,
"loss": -0.003955277847126126,
"reward": 0.5281250327825546,
"reward_std": 0.26489946991205215,
"rewards/accuracy_reward": 0.3359375,
"rewards/format_reward": 0.9609375,
"step": 39
},
{
"completion_length": 624.953125,
"epoch": 0.017436791630340016,
"grad_norm": 0.5311559438705444,
"learning_rate": 9.825632083696599e-07,
"loss": -0.004024791065603495,
"reward": 0.5468750298023224,
"reward_std": 0.2868617922067642,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.9765625,
"step": 40
},
{
"completion_length": 1250.9765625,
"epoch": 0.017872711421098517,
"grad_norm": 0.7332771420478821,
"learning_rate": 9.821272885789015e-07,
"loss": -0.005356588866561651,
"reward": 0.44062504172325134,
"reward_std": 0.20438477396965027,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.9140625,
"step": 41
},
{
"completion_length": 1476.9921875,
"epoch": 0.018308631211857017,
"grad_norm": 2.0835506916046143,
"learning_rate": 9.816913687881429e-07,
"loss": -0.006023196969181299,
"reward": 0.4125000238418579,
"reward_std": 0.23667097091674805,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.890625,
"step": 42
},
{
"completion_length": 1100.25,
"epoch": 0.018744551002615517,
"grad_norm": 0.7210353016853333,
"learning_rate": 9.812554489973845e-07,
"loss": -0.00412205932661891,
"reward": 0.5765625238418579,
"reward_std": 0.35319100320339203,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.9296875,
"step": 43
},
{
"completion_length": 1268.8828125,
"epoch": 0.01918047079337402,
"grad_norm": 1.018958330154419,
"learning_rate": 9.808195292066259e-07,
"loss": -0.005287598352879286,
"reward": 0.5796875357627869,
"reward_std": 0.33388449996709824,
"rewards/accuracy_reward": 0.3984375,
"rewards/format_reward": 0.90625,
"step": 44
},
{
"completion_length": 1529.1484375,
"epoch": 0.01961639058413252,
"grad_norm": 0.9156416058540344,
"learning_rate": 9.803836094158675e-07,
"loss": -0.006656843703240156,
"reward": 0.4531250298023224,
"reward_std": 0.3121063858270645,
"rewards/accuracy_reward": 0.2734375,
"rewards/format_reward": 0.8984375,
"step": 45
},
{
"completion_length": 771.7109375,
"epoch": 0.02005231037489102,
"grad_norm": 1.1348389387130737,
"learning_rate": 9.79947689625109e-07,
"loss": -0.004119608784094453,
"reward": 0.5406250357627869,
"reward_std": 0.16695528104901314,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.984375,
"step": 46
},
{
"completion_length": 1171.2578125,
"epoch": 0.02048823016564952,
"grad_norm": 0.7597943544387817,
"learning_rate": 9.795117698343505e-07,
"loss": -0.004714524140581489,
"reward": 0.6031250357627869,
"reward_std": 0.296435609459877,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 0.9453125,
"step": 47
},
{
"completion_length": 1121.265625,
"epoch": 0.020924149956408022,
"grad_norm": 0.7531502842903137,
"learning_rate": 9.790758500435918e-07,
"loss": -0.004732346162199974,
"reward": 0.4359375089406967,
"reward_std": 0.3109729588031769,
"rewards/accuracy_reward": 0.25,
"rewards/format_reward": 0.9296875,
"step": 48
},
{
"completion_length": 1959.421875,
"epoch": 0.021360069747166522,
"grad_norm": 1.5503896474838257,
"learning_rate": 9.786399302528334e-07,
"loss": -0.0054204994812607765,
"reward": 0.4218750298023224,
"reward_std": 0.28712356090545654,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.8203125,
"step": 49
},
{
"completion_length": 1155.4140625,
"epoch": 0.021795989537925022,
"grad_norm": 0.5289723873138428,
"learning_rate": 9.782040104620748e-07,
"loss": -0.00590874906629324,
"reward": 0.4140625298023224,
"reward_std": 0.2974793165922165,
"rewards/accuracy_reward": 0.2265625,
"rewards/format_reward": 0.9375,
"step": 50
},
{
"completion_length": 1361.078125,
"epoch": 0.022231909328683522,
"grad_norm": 0.7921638488769531,
"learning_rate": 9.777680906713164e-07,
"loss": -0.005040215328335762,
"reward": 0.3031249940395355,
"reward_std": 0.21944554150104523,
"rewards/accuracy_reward": 0.125,
"rewards/format_reward": 0.890625,
"step": 51
},
{
"completion_length": 1471.5859375,
"epoch": 0.022667829119442023,
"grad_norm": 0.6596994996070862,
"learning_rate": 9.77332170880558e-07,
"loss": -0.005814009346067905,
"reward": 0.4859375059604645,
"reward_std": 0.3088204860687256,
"rewards/accuracy_reward": 0.3046875,
"rewards/format_reward": 0.90625,
"step": 52
},
{
"completion_length": 1483.7265625,
"epoch": 0.023103748910200523,
"grad_norm": 0.9196128249168396,
"learning_rate": 9.768962510897994e-07,
"loss": -0.005532125011086464,
"reward": 0.6109375357627869,
"reward_std": 0.32757391035556793,
"rewards/accuracy_reward": 0.4296875,
"rewards/format_reward": 0.90625,
"step": 53
},
{
"completion_length": 1215.15625,
"epoch": 0.023539668700959023,
"grad_norm": 0.7604343891143799,
"learning_rate": 9.764603312990408e-07,
"loss": -0.0066660866141319275,
"reward": 0.6000000536441803,
"reward_std": 0.36297860741615295,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 0.9296875,
"step": 54
},
{
"completion_length": 1351.3125,
"epoch": 0.023975588491717523,
"grad_norm": 0.7223543524742126,
"learning_rate": 9.760244115082824e-07,
"loss": -0.005946665536612272,
"reward": 0.43906253576278687,
"reward_std": 0.2528854086995125,
"rewards/accuracy_reward": 0.2578125,
"rewards/format_reward": 0.90625,
"step": 55
},
{
"completion_length": 1185.84375,
"epoch": 0.024411508282476024,
"grad_norm": 0.5095703601837158,
"learning_rate": 9.755884917175238e-07,
"loss": -0.0064112339168787,
"reward": 0.5093750208616257,
"reward_std": 0.21522878110408783,
"rewards/accuracy_reward": 0.3203125,
"rewards/format_reward": 0.9453125,
"step": 56
},
{
"completion_length": 1281.0546875,
"epoch": 0.024847428073234524,
"grad_norm": 1.2985528707504272,
"learning_rate": 9.751525719267654e-07,
"loss": -0.006073690485209227,
"reward": 0.515625,
"reward_std": 0.26809659600257874,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.9375,
"step": 57
},
{
"completion_length": 1124.7109375,
"epoch": 0.025283347863993024,
"grad_norm": 0.5998630523681641,
"learning_rate": 9.74716652136007e-07,
"loss": -0.006307224277406931,
"reward": 0.42500001192092896,
"reward_std": 0.15122529119253159,
"rewards/accuracy_reward": 0.234375,
"rewards/format_reward": 0.953125,
"step": 58
},
{
"completion_length": 1213.90625,
"epoch": 0.025719267654751524,
"grad_norm": 1.0118658542633057,
"learning_rate": 9.742807323452484e-07,
"loss": -0.007607629988342524,
"reward": 0.7015625238418579,
"reward_std": 0.26937858760356903,
"rewards/accuracy_reward": 0.515625,
"rewards/format_reward": 0.9296875,
"step": 59
},
{
"completion_length": 816.8046875,
"epoch": 0.026155187445510025,
"grad_norm": 0.5398353338241577,
"learning_rate": 9.738448125544898e-07,
"loss": -0.005773038603365421,
"reward": 0.8140625357627869,
"reward_std": 0.2714267522096634,
"rewards/accuracy_reward": 0.6171875,
"rewards/format_reward": 0.984375,
"step": 60
},
{
"completion_length": 802.9453125,
"epoch": 0.02659110723626853,
"grad_norm": 0.40469521284103394,
"learning_rate": 9.734088927637314e-07,
"loss": -0.005671899998560548,
"reward": 0.5234375298023224,
"reward_std": 0.19351572543382645,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.9765625,
"step": 61
},
{
"completion_length": 609.734375,
"epoch": 0.02702702702702703,
"grad_norm": 0.2660030424594879,
"learning_rate": 9.72972972972973e-07,
"loss": -0.003733730292879045,
"reward": 0.4968750327825546,
"reward_std": 0.11230766773223877,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 1.0,
"step": 62
},
{
"completion_length": 606.8125,
"epoch": 0.02746294681778553,
"grad_norm": 0.34162867069244385,
"learning_rate": 9.725370531822144e-07,
"loss": -0.004276728723198175,
"reward": 0.46406252682209015,
"reward_std": 0.17800088226795197,
"rewards/accuracy_reward": 0.265625,
"rewards/format_reward": 0.9921875,
"step": 63
},
{
"completion_length": 675.0234375,
"epoch": 0.02789886660854403,
"grad_norm": 0.3267340660095215,
"learning_rate": 9.72101133391456e-07,
"loss": -0.004803936462849379,
"reward": 0.47968754172325134,
"reward_std": 0.2698579430580139,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.9921875,
"step": 64
},
{
"completion_length": 689.875,
"epoch": 0.02833478639930253,
"grad_norm": 0.5070520639419556,
"learning_rate": 9.716652136006974e-07,
"loss": -0.004681795369833708,
"reward": 0.5890625417232513,
"reward_std": 0.28590644896030426,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 0.9921875,
"step": 65
},
{
"completion_length": 741.4296875,
"epoch": 0.02877070619006103,
"grad_norm": 0.539318859577179,
"learning_rate": 9.712292938099388e-07,
"loss": -0.0042268745601177216,
"reward": 0.7734375298023224,
"reward_std": 0.1940227895975113,
"rewards/accuracy_reward": 0.578125,
"rewards/format_reward": 0.9765625,
"step": 66
},
{
"completion_length": 494.328125,
"epoch": 0.02920662598081953,
"grad_norm": 0.23426468670368195,
"learning_rate": 9.707933740191804e-07,
"loss": -0.0033774186158552766,
"reward": 0.6296875476837158,
"reward_std": 0.12073517590761185,
"rewards/accuracy_reward": 0.4296875,
"rewards/format_reward": 1.0,
"step": 67
},
{
"completion_length": 663.703125,
"epoch": 0.02964254577157803,
"grad_norm": 0.5735094547271729,
"learning_rate": 9.70357454228422e-07,
"loss": -0.004496369976550341,
"reward": 0.47812503576278687,
"reward_std": 0.2521483972668648,
"rewards/accuracy_reward": 0.28125,
"rewards/format_reward": 0.984375,
"step": 68
},
{
"completion_length": 556.1875,
"epoch": 0.03007846556233653,
"grad_norm": 0.6028347611427307,
"learning_rate": 9.699215344376634e-07,
"loss": -0.0033272686414420605,
"reward": 0.5500000417232513,
"reward_std": 0.1772443801164627,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 0.9921875,
"step": 69
},
{
"completion_length": 566.171875,
"epoch": 0.03051438535309503,
"grad_norm": 0.2644914984703064,
"learning_rate": 9.69485614646905e-07,
"loss": -0.0035728231305256486,
"reward": 0.44062504172325134,
"reward_std": 0.1354043260216713,
"rewards/accuracy_reward": 0.2421875,
"rewards/format_reward": 0.9921875,
"step": 70
},
{
"completion_length": 747.1171875,
"epoch": 0.03095030514385353,
"grad_norm": 0.45364266633987427,
"learning_rate": 9.690496948561464e-07,
"loss": -0.004377002594992518,
"reward": 0.5640625357627869,
"reward_std": 0.26171743869781494,
"rewards/accuracy_reward": 0.3671875,
"rewards/format_reward": 0.984375,
"step": 71
},
{
"completion_length": 736.7734375,
"epoch": 0.03138622493461203,
"grad_norm": 0.4566240608692169,
"learning_rate": 9.686137750653878e-07,
"loss": -0.0046136470045894384,
"reward": 0.5250000357627869,
"reward_std": 0.20069601386785507,
"rewards/accuracy_reward": 0.328125,
"rewards/format_reward": 0.984375,
"step": 72
},
{
"completion_length": 459.0234375,
"epoch": 0.03182214472537053,
"grad_norm": 0.4804815948009491,
"learning_rate": 9.681778552746294e-07,
"loss": -0.002390326582826674,
"reward": 0.5593750327825546,
"reward_std": 0.24959056824445724,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 1.0,
"step": 73
},
{
"completion_length": 611.203125,
"epoch": 0.03225806451612903,
"grad_norm": 0.32491230964660645,
"learning_rate": 9.67741935483871e-07,
"loss": -0.002548949094489217,
"reward": 0.6343750357627869,
"reward_std": 0.15103846788406372,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.984375,
"step": 74
},
{
"completion_length": 844.71875,
"epoch": 0.03269398430688753,
"grad_norm": 0.45734190940856934,
"learning_rate": 9.673060156931124e-07,
"loss": -0.0051384728867560625,
"reward": 0.47031253576278687,
"reward_std": 0.21036501228809357,
"rewards/accuracy_reward": 0.2734375,
"rewards/format_reward": 0.984375,
"step": 75
},
{
"completion_length": 507.1640625,
"epoch": 0.03312990409764603,
"grad_norm": 0.33810898661613464,
"learning_rate": 9.66870095902354e-07,
"loss": -0.0030672921566292644,
"reward": 0.7390625476837158,
"reward_std": 0.19674428552389145,
"rewards/accuracy_reward": 0.5390625,
"rewards/format_reward": 1.0,
"step": 76
},
{
"completion_length": 901.546875,
"epoch": 0.03356582388840453,
"grad_norm": 0.4877094626426697,
"learning_rate": 9.664341761115954e-07,
"loss": -0.004802107345312834,
"reward": 0.6296875476837158,
"reward_std": 0.24297793954610825,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.9609375,
"step": 77
},
{
"completion_length": 714.9296875,
"epoch": 0.03400174367916303,
"grad_norm": 0.6446647644042969,
"learning_rate": 9.659982563208368e-07,
"loss": -0.0041290284134447575,
"reward": 0.6953125298023224,
"reward_std": 0.21879743784666061,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.9765625,
"step": 78
},
{
"completion_length": 595.71875,
"epoch": 0.03443766346992153,
"grad_norm": 0.5552946329116821,
"learning_rate": 9.655623365300784e-07,
"loss": -0.00336921657435596,
"reward": 0.6906250417232513,
"reward_std": 0.2824498862028122,
"rewards/accuracy_reward": 0.4921875,
"rewards/format_reward": 0.9921875,
"step": 79
},
{
"completion_length": 559.0859375,
"epoch": 0.03487358326068003,
"grad_norm": 0.3945360481739044,
"learning_rate": 9.6512641673932e-07,
"loss": -0.0029993923380970955,
"reward": 0.5187500417232513,
"reward_std": 0.18508683145046234,
"rewards/accuracy_reward": 0.3203125,
"rewards/format_reward": 0.9921875,
"step": 80
},
{
"completion_length": 702.71875,
"epoch": 0.03530950305143853,
"grad_norm": 0.3644000291824341,
"learning_rate": 9.646904969485614e-07,
"loss": -0.004066583467647433,
"reward": 0.7515625357627869,
"reward_std": 0.17609478533267975,
"rewards/accuracy_reward": 0.5546875,
"rewards/format_reward": 0.984375,
"step": 81
},
{
"completion_length": 594.859375,
"epoch": 0.03574542284219703,
"grad_norm": 0.43995150923728943,
"learning_rate": 9.64254577157803e-07,
"loss": -0.0034514348953962326,
"reward": 0.49531254172325134,
"reward_std": 0.28716301918029785,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.9921875,
"step": 82
},
{
"completion_length": 468.734375,
"epoch": 0.036181342632955533,
"grad_norm": 0.49772775173187256,
"learning_rate": 9.638186573670444e-07,
"loss": -0.0027159389574080706,
"reward": 0.27031251788139343,
"reward_std": 0.15308690071105957,
"rewards/accuracy_reward": 0.0703125,
"rewards/format_reward": 1.0,
"step": 83
},
{
"completion_length": 626.71875,
"epoch": 0.036617262423714034,
"grad_norm": 0.27482131123542786,
"learning_rate": 9.63382737576286e-07,
"loss": -0.0032146567245945334,
"reward": 0.5421875417232513,
"reward_std": 0.14730052649974823,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 0.9921875,
"step": 84
},
{
"completion_length": 496.859375,
"epoch": 0.037053182214472534,
"grad_norm": 0.39663198590278625,
"learning_rate": 9.629468177855274e-07,
"loss": -0.0021742535172961652,
"reward": 0.6218750476837158,
"reward_std": 0.25354722142219543,
"rewards/accuracy_reward": 0.421875,
"rewards/format_reward": 1.0,
"step": 85
},
{
"completion_length": 507.53125,
"epoch": 0.037489102005231034,
"grad_norm": 0.38531285524368286,
"learning_rate": 9.62510897994769e-07,
"loss": -0.003144865622743964,
"reward": 0.6140625327825546,
"reward_std": 0.19332444667816162,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 1.0,
"step": 86
},
{
"completion_length": 522.9609375,
"epoch": 0.03792502179598954,
"grad_norm": 0.4018486738204956,
"learning_rate": 9.620749782040104e-07,
"loss": -0.003351722378283739,
"reward": 0.5750000476837158,
"reward_std": 0.2790592461824417,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 1.0,
"step": 87
},
{
"completion_length": 574.625,
"epoch": 0.03836094158674804,
"grad_norm": 0.29832443594932556,
"learning_rate": 9.61639058413252e-07,
"loss": -0.0031917719170451164,
"reward": 0.49531254172325134,
"reward_std": 0.19090906530618668,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 0.9921875,
"step": 88
},
{
"completion_length": 506.140625,
"epoch": 0.03879686137750654,
"grad_norm": 0.34219008684158325,
"learning_rate": 9.612031386224936e-07,
"loss": -0.002893596771173179,
"reward": 0.5359375476837158,
"reward_std": 0.21778053790330887,
"rewards/accuracy_reward": 0.3359375,
"rewards/format_reward": 1.0,
"step": 89
},
{
"completion_length": 520.8984375,
"epoch": 0.03923278116826504,
"grad_norm": 0.3178415298461914,
"learning_rate": 9.60767218831735e-07,
"loss": -0.0036478497786447406,
"reward": 0.5828125476837158,
"reward_std": 0.16781240701675415,
"rewards/accuracy_reward": 0.3828125,
"rewards/format_reward": 1.0,
"step": 90
},
{
"completion_length": 524.90625,
"epoch": 0.03966870095902354,
"grad_norm": 0.3558061122894287,
"learning_rate": 9.603312990409764e-07,
"loss": -0.003106694668531418,
"reward": 0.5437500476837158,
"reward_std": 0.268809512257576,
"rewards/accuracy_reward": 0.34375,
"rewards/format_reward": 1.0,
"step": 91
},
{
"completion_length": 486.4140625,
"epoch": 0.04010462074978204,
"grad_norm": 0.3201664388179779,
"learning_rate": 9.59895379250218e-07,
"loss": -0.00227005232591182,
"reward": 0.5750000476837158,
"reward_std": 0.17464719712734222,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 1.0,
"step": 92
},
{
"completion_length": 605.6640625,
"epoch": 0.04054054054054054,
"grad_norm": 0.5713381767272949,
"learning_rate": 9.594594594594594e-07,
"loss": -0.003524004598148167,
"reward": 0.6125000417232513,
"reward_std": 0.26134093105793,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 0.9921875,
"step": 93
},
{
"completion_length": 516.140625,
"epoch": 0.04097646033129904,
"grad_norm": 0.41625216603279114,
"learning_rate": 9.59023539668701e-07,
"loss": -0.003131876001134515,
"reward": 0.5906250476837158,
"reward_std": 0.20805486291646957,
"rewards/accuracy_reward": 0.390625,
"rewards/format_reward": 1.0,
"step": 94
},
{
"completion_length": 528.953125,
"epoch": 0.04141238012205754,
"grad_norm": 0.48059654235839844,
"learning_rate": 9.585876198779426e-07,
"loss": -0.003212686162441969,
"reward": 0.5984375476837158,
"reward_std": 0.24329258501529694,
"rewards/accuracy_reward": 0.3984375,
"rewards/format_reward": 1.0,
"step": 95
},
{
"completion_length": 500.1328125,
"epoch": 0.041848299912816043,
"grad_norm": 0.5891656875610352,
"learning_rate": 9.58151700087184e-07,
"loss": -0.0030139287700876594,
"reward": 0.5593750327825546,
"reward_std": 0.18648964911699295,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 1.0,
"step": 96
},
{
"completion_length": 474.71875,
"epoch": 0.042284219703574544,
"grad_norm": 0.8267337083816528,
"learning_rate": 9.577157802964253e-07,
"loss": -0.002822687732987106,
"reward": 0.5515625476837158,
"reward_std": 0.2012200579047203,
"rewards/accuracy_reward": 0.3515625,
"rewards/format_reward": 1.0,
"step": 97
},
{
"completion_length": 791.3203125,
"epoch": 0.042720139494333044,
"grad_norm": 0.38680315017700195,
"learning_rate": 9.57279860505667e-07,
"loss": -0.003116427455097437,
"reward": 0.6093750298023224,
"reward_std": 0.2934764325618744,
"rewards/accuracy_reward": 0.4140625,
"rewards/format_reward": 0.9765625,
"step": 98
},
{
"completion_length": 552.703125,
"epoch": 0.043156059285091544,
"grad_norm": 0.4463382959365845,
"learning_rate": 9.568439407149083e-07,
"loss": -0.0025965895038098097,
"reward": 0.5578125417232513,
"reward_std": 0.21536517888307571,
"rewards/accuracy_reward": 0.359375,
"rewards/format_reward": 0.9921875,
"step": 99
},
{
"completion_length": 462.828125,
"epoch": 0.043591979075850044,
"grad_norm": 0.3906485140323639,
"learning_rate": 9.5640802092415e-07,
"loss": -0.002442999859340489,
"reward": 0.4968750476837158,
"reward_std": 0.2109457552433014,
"rewards/accuracy_reward": 0.296875,
"rewards/format_reward": 1.0,
"step": 100
}
],
"logging_steps": 1.0,
"max_steps": 2294,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}