{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.043591979075850044, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 6066.5546875, "epoch": 0.00043591979075850045, "grad_norm": 22.7120361328125, "learning_rate": 9.995640802092413e-07, "loss": -0.00010610813114908524, "reward": 0.16562499850988388, "reward_std": 0.2048395685851574, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.2421875, "step": 1 }, { "completion_length": 5022.6015625, "epoch": 0.0008718395815170009, "grad_norm": 35.190757751464844, "learning_rate": 9.99128160418483e-07, "loss": -0.0003467285423539579, "reward": 0.24531249701976776, "reward_std": 0.13261918351054192, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.3671875, "step": 2 }, { "completion_length": 5074.4765625, "epoch": 0.0013077593722755014, "grad_norm": 36.27347946166992, "learning_rate": 9.986922406277246e-07, "loss": -0.0009319710079580545, "reward": 0.31718750298023224, "reward_std": 0.15016943216323853, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.296875, "step": 3 }, { "completion_length": 4919.9921875, "epoch": 0.0017436791630340018, "grad_norm": 23.1031494140625, "learning_rate": 9.98256320836966e-07, "loss": -0.0011635422706604004, "reward": 0.2515625134110451, "reward_std": 0.22550153732299805, "rewards/accuracy_reward": 0.1796875, "rewards/format_reward": 0.359375, "step": 4 }, { "completion_length": 4392.3203125, "epoch": 0.002179598953792502, "grad_norm": 18.30356788635254, "learning_rate": 9.978204010462075e-07, "loss": -0.001725408248603344, "reward": 0.3500000089406967, "reward_std": 0.2601192742586136, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.3828125, "step": 5 }, { "completion_length": 5603.28125, "epoch": 0.0026155187445510027, "grad_norm": 21.6483154296875, "learning_rate": 9.97384481255449e-07, "loss": -0.004177422029897571, "reward": 0.07500000298023224, "reward_std": 0.14329775422811508, "rewards/accuracy_reward": 0.0390625, "rewards/format_reward": 0.1796875, "step": 6 }, { "completion_length": 3968.328125, "epoch": 0.003051438535309503, "grad_norm": 21.388357162475586, "learning_rate": 9.969485614646903e-07, "loss": -0.002357690129429102, "reward": 0.2109375149011612, "reward_std": 0.19895199686288834, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.390625, "step": 7 }, { "completion_length": 4283.15625, "epoch": 0.0034873583260680036, "grad_norm": 17.606998443603516, "learning_rate": 9.96512641673932e-07, "loss": -0.0032072272151708603, "reward": 0.3046875149011612, "reward_std": 0.3027474880218506, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.3515625, "step": 8 }, { "completion_length": 2647.453125, "epoch": 0.003923278116826504, "grad_norm": 7.005691051483154, "learning_rate": 9.960767218831735e-07, "loss": -0.0025840166490525007, "reward": 0.33125001937150955, "reward_std": 0.270910307765007, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.484375, "step": 9 }, { "completion_length": 2621.7734375, "epoch": 0.004359197907585004, "grad_norm": 32.332698822021484, "learning_rate": 9.95640802092415e-07, "loss": -0.0029894779436290264, "reward": 0.29843752086162567, "reward_std": 0.21532631665468216, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.671875, "step": 10 }, { "completion_length": 2444.78125, "epoch": 0.004795117698343505, "grad_norm": 15.55601692199707, "learning_rate": 9.952048823016565e-07, "loss": -0.003852886729873717, "reward": 0.24062500894069672, "reward_std": 0.2612670660018921, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.4609375, "step": 11 }, { "completion_length": 2593.9140625, "epoch": 0.0052310374891020054, "grad_norm": 29.252334594726562, "learning_rate": 9.94768962510898e-07, "loss": -0.0042398301884531975, "reward": 0.48750001192092896, "reward_std": 0.3399874120950699, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.6796875, "step": 12 }, { "completion_length": 2220.7109375, "epoch": 0.005666957279860506, "grad_norm": 13.38364028930664, "learning_rate": 9.943330427201393e-07, "loss": -0.0033426693407818675, "reward": 0.3125000149011612, "reward_std": 0.19083451479673386, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.703125, "step": 13 }, { "completion_length": 2529.640625, "epoch": 0.006102877070619006, "grad_norm": 17.974361419677734, "learning_rate": 9.93897122929381e-07, "loss": -0.004656808450818062, "reward": 0.3500000089406967, "reward_std": 0.36670154333114624, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.578125, "step": 14 }, { "completion_length": 2840.5625, "epoch": 0.006538796861377506, "grad_norm": 11.839619636535645, "learning_rate": 9.934612031386225e-07, "loss": -0.005619838368147612, "reward": 0.29218751937150955, "reward_std": 0.11481105536222458, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.640625, "step": 15 }, { "completion_length": 3081.2421875, "epoch": 0.006974716652136007, "grad_norm": 8.316078186035156, "learning_rate": 9.93025283347864e-07, "loss": -0.00666549289599061, "reward": 0.42500003427267075, "reward_std": 0.30427779257297516, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.6796875, "step": 16 }, { "completion_length": 2150.2734375, "epoch": 0.0074106364428945075, "grad_norm": 15.924665451049805, "learning_rate": 9.925893635571055e-07, "loss": -0.006602097302675247, "reward": 0.43906252086162567, "reward_std": 0.35585278272628784, "rewards/accuracy_reward": 0.2890625, "rewards/format_reward": 0.75, "step": 17 }, { "completion_length": 1136.3359375, "epoch": 0.007846556233653008, "grad_norm": 3.312643527984619, "learning_rate": 9.92153443766347e-07, "loss": -0.00424616876989603, "reward": 0.484375, "reward_std": 0.2594892159104347, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.859375, "step": 18 }, { "completion_length": 1163.0703125, "epoch": 0.008282476024411508, "grad_norm": 4.4617600440979, "learning_rate": 9.917175239755885e-07, "loss": -0.0060931057669222355, "reward": 0.6203125715255737, "reward_std": 0.3268684893846512, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.9140625, "step": 19 }, { "completion_length": 1688.5078125, "epoch": 0.008718395815170008, "grad_norm": 5.8775506019592285, "learning_rate": 9.9128160418483e-07, "loss": -0.009722861228510737, "reward": 0.38593751192092896, "reward_std": 0.21004340052604675, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.8359375, "step": 20 }, { "completion_length": 1012.625, "epoch": 0.009154315605928508, "grad_norm": 4.331947326660156, "learning_rate": 9.908456843940715e-07, "loss": -0.005864025559276342, "reward": 0.550000011920929, "reward_std": 0.336714543402195, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.953125, "step": 21 }, { "completion_length": 1052.0234375, "epoch": 0.00959023539668701, "grad_norm": 7.702275276184082, "learning_rate": 9.90409764603313e-07, "loss": -0.0061883407179266214, "reward": 0.5375000536441803, "reward_std": 0.19406893104314804, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.9296875, "step": 22 }, { "completion_length": 585.890625, "epoch": 0.01002615518744551, "grad_norm": 2.693312883377075, "learning_rate": 9.899738448125545e-07, "loss": -0.0050068587297573686, "reward": 0.4500000476837158, "reward_std": 0.22148218750953674, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 1.0, "step": 23 }, { "completion_length": 952.8984375, "epoch": 0.010462074978204011, "grad_norm": 2.040409564971924, "learning_rate": 9.89537925021796e-07, "loss": -0.008675348944962025, "reward": 0.4531250298023224, "reward_std": 0.2833295091986656, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9765625, "step": 24 }, { "completion_length": 855.203125, "epoch": 0.010897994768962511, "grad_norm": 1.750288486480713, "learning_rate": 9.891020052310375e-07, "loss": -0.006478779250755906, "reward": 0.5281250476837158, "reward_std": 0.2747085839509964, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.9609375, "step": 25 }, { "completion_length": 622.625, "epoch": 0.011333914559721011, "grad_norm": 1.3210248947143555, "learning_rate": 9.88666085440279e-07, "loss": -0.006179739721119404, "reward": 0.5562500357627869, "reward_std": 0.2532925382256508, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.984375, "step": 26 }, { "completion_length": 609.3984375, "epoch": 0.011769834350479512, "grad_norm": 3.869396448135376, "learning_rate": 9.882301656495205e-07, "loss": -0.0062519978964701295, "reward": 0.5000000298023224, "reward_std": 0.14424315840005875, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.9765625, "step": 27 }, { "completion_length": 547.375, "epoch": 0.012205754141238012, "grad_norm": 1.0941433906555176, "learning_rate": 9.877942458587619e-07, "loss": -0.0032227920601144433, "reward": 0.5484375357627869, "reward_std": 0.21108780801296234, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.984375, "step": 28 }, { "completion_length": 566.8515625, "epoch": 0.012641673931996512, "grad_norm": 1.1244480609893799, "learning_rate": 9.873583260680035e-07, "loss": -0.005511581432074308, "reward": 0.5296875238418579, "reward_std": 0.25184717029333115, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.96875, "step": 29 }, { "completion_length": 629.2109375, "epoch": 0.013077593722755012, "grad_norm": 1.0730060338974, "learning_rate": 9.869224062772449e-07, "loss": -0.00590163329616189, "reward": 0.5437500476837158, "reward_std": 0.30221718549728394, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 30 }, { "completion_length": 577.2890625, "epoch": 0.013513513513513514, "grad_norm": 1.0006390810012817, "learning_rate": 9.864864864864865e-07, "loss": -0.004602149594575167, "reward": 0.48906251788139343, "reward_std": 0.2546490430831909, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9609375, "step": 31 }, { "completion_length": 652.453125, "epoch": 0.013949433304272014, "grad_norm": 2.0136349201202393, "learning_rate": 9.860505666957279e-07, "loss": -0.007033544359728694, "reward": 0.6000000536441803, "reward_std": 0.34614098072052, "rewards/accuracy_reward": 0.40625, "rewards/format_reward": 0.96875, "step": 32 }, { "completion_length": 830.296875, "epoch": 0.014385353095030515, "grad_norm": 0.6222465634346008, "learning_rate": 9.856146469049695e-07, "loss": -0.0058622711803764105, "reward": 0.6406250596046448, "reward_std": 0.26844407618045807, "rewards/accuracy_reward": 0.4453125, "rewards/format_reward": 0.9765625, "step": 33 }, { "completion_length": 924.8984375, "epoch": 0.014821272885789015, "grad_norm": 1.6224457025527954, "learning_rate": 9.851787271142109e-07, "loss": -0.006918259430676699, "reward": 0.45781250298023224, "reward_std": 0.1292574293911457, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9609375, "step": 34 }, { "completion_length": 757.8828125, "epoch": 0.015257192676547515, "grad_norm": 0.8691195249557495, "learning_rate": 9.847428073234525e-07, "loss": -0.005784029606729746, "reward": 0.45468753576278687, "reward_std": 0.20147473365068436, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.984375, "step": 35 }, { "completion_length": 1539.546875, "epoch": 0.015693112467306015, "grad_norm": 4.3893961906433105, "learning_rate": 9.843068875326939e-07, "loss": -0.010595182422548532, "reward": 0.4765625298023224, "reward_std": 0.2606821805238724, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.8984375, "step": 36 }, { "completion_length": 949.203125, "epoch": 0.016129032258064516, "grad_norm": 1.039124608039856, "learning_rate": 9.838709677419355e-07, "loss": -0.005853116046637297, "reward": 0.6062500476837158, "reward_std": 0.34973812103271484, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 0.9609375, "step": 37 }, { "completion_length": 1128.515625, "epoch": 0.016564952048823016, "grad_norm": 2.9782750606536865, "learning_rate": 9.834350479511769e-07, "loss": -0.00639305729418993, "reward": 0.3796875327825546, "reward_std": 0.2201501727104187, "rewards/accuracy_reward": 0.1953125, "rewards/format_reward": 0.921875, "step": 38 }, { "completion_length": 803.09375, "epoch": 0.017000871839581516, "grad_norm": 0.9739387035369873, "learning_rate": 9.829991281604185e-07, "loss": -0.003955277847126126, "reward": 0.5281250327825546, "reward_std": 0.26489946991205215, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 0.9609375, "step": 39 }, { "completion_length": 624.953125, "epoch": 0.017436791630340016, "grad_norm": 0.5311559438705444, "learning_rate": 9.825632083696599e-07, "loss": -0.004024791065603495, "reward": 0.5468750298023224, "reward_std": 0.2868617922067642, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.9765625, "step": 40 }, { "completion_length": 1250.9765625, "epoch": 0.017872711421098517, "grad_norm": 0.7332771420478821, "learning_rate": 9.821272885789015e-07, "loss": -0.005356588866561651, "reward": 0.44062504172325134, "reward_std": 0.20438477396965027, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.9140625, "step": 41 }, { "completion_length": 1476.9921875, "epoch": 0.018308631211857017, "grad_norm": 2.0835506916046143, "learning_rate": 9.816913687881429e-07, "loss": -0.006023196969181299, "reward": 0.4125000238418579, "reward_std": 0.23667097091674805, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.890625, "step": 42 }, { "completion_length": 1100.25, "epoch": 0.018744551002615517, "grad_norm": 0.7210353016853333, "learning_rate": 9.812554489973845e-07, "loss": -0.00412205932661891, "reward": 0.5765625238418579, "reward_std": 0.35319100320339203, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.9296875, "step": 43 }, { "completion_length": 1268.8828125, "epoch": 0.01918047079337402, "grad_norm": 1.018958330154419, "learning_rate": 9.808195292066259e-07, "loss": -0.005287598352879286, "reward": 0.5796875357627869, "reward_std": 0.33388449996709824, "rewards/accuracy_reward": 0.3984375, "rewards/format_reward": 0.90625, "step": 44 }, { "completion_length": 1529.1484375, "epoch": 0.01961639058413252, "grad_norm": 0.9156416058540344, "learning_rate": 9.803836094158675e-07, "loss": -0.006656843703240156, "reward": 0.4531250298023224, "reward_std": 0.3121063858270645, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.8984375, "step": 45 }, { "completion_length": 771.7109375, "epoch": 0.02005231037489102, "grad_norm": 1.1348389387130737, "learning_rate": 9.79947689625109e-07, "loss": -0.004119608784094453, "reward": 0.5406250357627869, "reward_std": 0.16695528104901314, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.984375, "step": 46 }, { "completion_length": 1171.2578125, "epoch": 0.02048823016564952, "grad_norm": 0.7597943544387817, "learning_rate": 9.795117698343505e-07, "loss": -0.004714524140581489, "reward": 0.6031250357627869, "reward_std": 0.296435609459877, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 0.9453125, "step": 47 }, { "completion_length": 1121.265625, "epoch": 0.020924149956408022, "grad_norm": 0.7531502842903137, "learning_rate": 9.790758500435918e-07, "loss": -0.004732346162199974, "reward": 0.4359375089406967, "reward_std": 0.3109729588031769, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.9296875, "step": 48 }, { "completion_length": 1959.421875, "epoch": 0.021360069747166522, "grad_norm": 1.5503896474838257, "learning_rate": 9.786399302528334e-07, "loss": -0.0054204994812607765, "reward": 0.4218750298023224, "reward_std": 0.28712356090545654, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.8203125, "step": 49 }, { "completion_length": 1155.4140625, "epoch": 0.021795989537925022, "grad_norm": 0.5289723873138428, "learning_rate": 9.782040104620748e-07, "loss": -0.00590874906629324, "reward": 0.4140625298023224, "reward_std": 0.2974793165922165, "rewards/accuracy_reward": 0.2265625, "rewards/format_reward": 0.9375, "step": 50 }, { "completion_length": 1361.078125, "epoch": 0.022231909328683522, "grad_norm": 0.7921638488769531, "learning_rate": 9.777680906713164e-07, "loss": -0.005040215328335762, "reward": 0.3031249940395355, "reward_std": 0.21944554150104523, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.890625, "step": 51 }, { "completion_length": 1471.5859375, "epoch": 0.022667829119442023, "grad_norm": 0.6596994996070862, "learning_rate": 9.77332170880558e-07, "loss": -0.005814009346067905, "reward": 0.4859375059604645, "reward_std": 0.3088204860687256, "rewards/accuracy_reward": 0.3046875, "rewards/format_reward": 0.90625, "step": 52 }, { "completion_length": 1483.7265625, "epoch": 0.023103748910200523, "grad_norm": 0.9196128249168396, "learning_rate": 9.768962510897994e-07, "loss": -0.005532125011086464, "reward": 0.6109375357627869, "reward_std": 0.32757391035556793, "rewards/accuracy_reward": 0.4296875, "rewards/format_reward": 0.90625, "step": 53 }, { "completion_length": 1215.15625, "epoch": 0.023539668700959023, "grad_norm": 0.7604343891143799, "learning_rate": 9.764603312990408e-07, "loss": -0.0066660866141319275, "reward": 0.6000000536441803, "reward_std": 0.36297860741615295, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 0.9296875, "step": 54 }, { "completion_length": 1351.3125, "epoch": 0.023975588491717523, "grad_norm": 0.7223543524742126, "learning_rate": 9.760244115082824e-07, "loss": -0.005946665536612272, "reward": 0.43906253576278687, "reward_std": 0.2528854086995125, "rewards/accuracy_reward": 0.2578125, "rewards/format_reward": 0.90625, "step": 55 }, { "completion_length": 1185.84375, "epoch": 0.024411508282476024, "grad_norm": 0.5095703601837158, "learning_rate": 9.755884917175238e-07, "loss": -0.0064112339168787, "reward": 0.5093750208616257, "reward_std": 0.21522878110408783, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.9453125, "step": 56 }, { "completion_length": 1281.0546875, "epoch": 0.024847428073234524, "grad_norm": 1.2985528707504272, "learning_rate": 9.751525719267654e-07, "loss": -0.006073690485209227, "reward": 0.515625, "reward_std": 0.26809659600257874, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9375, "step": 57 }, { "completion_length": 1124.7109375, "epoch": 0.025283347863993024, "grad_norm": 0.5998630523681641, "learning_rate": 9.74716652136007e-07, "loss": -0.006307224277406931, "reward": 0.42500001192092896, "reward_std": 0.15122529119253159, "rewards/accuracy_reward": 0.234375, "rewards/format_reward": 0.953125, "step": 58 }, { "completion_length": 1213.90625, "epoch": 0.025719267654751524, "grad_norm": 1.0118658542633057, "learning_rate": 9.742807323452484e-07, "loss": -0.007607629988342524, "reward": 0.7015625238418579, "reward_std": 0.26937858760356903, "rewards/accuracy_reward": 0.515625, "rewards/format_reward": 0.9296875, "step": 59 }, { "completion_length": 816.8046875, "epoch": 0.026155187445510025, "grad_norm": 0.5398353338241577, "learning_rate": 9.738448125544898e-07, "loss": -0.005773038603365421, "reward": 0.8140625357627869, "reward_std": 0.2714267522096634, "rewards/accuracy_reward": 0.6171875, "rewards/format_reward": 0.984375, "step": 60 }, { "completion_length": 802.9453125, "epoch": 0.02659110723626853, "grad_norm": 0.40469521284103394, "learning_rate": 9.734088927637314e-07, "loss": -0.005671899998560548, "reward": 0.5234375298023224, "reward_std": 0.19351572543382645, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.9765625, "step": 61 }, { "completion_length": 609.734375, "epoch": 0.02702702702702703, "grad_norm": 0.2660030424594879, "learning_rate": 9.72972972972973e-07, "loss": -0.003733730292879045, "reward": 0.4968750327825546, "reward_std": 0.11230766773223877, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 1.0, "step": 62 }, { "completion_length": 606.8125, "epoch": 0.02746294681778553, "grad_norm": 0.34162867069244385, "learning_rate": 9.725370531822144e-07, "loss": -0.004276728723198175, "reward": 0.46406252682209015, "reward_std": 0.17800088226795197, "rewards/accuracy_reward": 0.265625, "rewards/format_reward": 0.9921875, "step": 63 }, { "completion_length": 675.0234375, "epoch": 0.02789886660854403, "grad_norm": 0.3267340660095215, "learning_rate": 9.72101133391456e-07, "loss": -0.004803936462849379, "reward": 0.47968754172325134, "reward_std": 0.2698579430580139, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.9921875, "step": 64 }, { "completion_length": 689.875, "epoch": 0.02833478639930253, "grad_norm": 0.5070520639419556, "learning_rate": 9.716652136006974e-07, "loss": -0.004681795369833708, "reward": 0.5890625417232513, "reward_std": 0.28590644896030426, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 0.9921875, "step": 65 }, { "completion_length": 741.4296875, "epoch": 0.02877070619006103, "grad_norm": 0.539318859577179, "learning_rate": 9.712292938099388e-07, "loss": -0.0042268745601177216, "reward": 0.7734375298023224, "reward_std": 0.1940227895975113, "rewards/accuracy_reward": 0.578125, "rewards/format_reward": 0.9765625, "step": 66 }, { "completion_length": 494.328125, "epoch": 0.02920662598081953, "grad_norm": 0.23426468670368195, "learning_rate": 9.707933740191804e-07, "loss": -0.0033774186158552766, "reward": 0.6296875476837158, "reward_std": 0.12073517590761185, "rewards/accuracy_reward": 0.4296875, "rewards/format_reward": 1.0, "step": 67 }, { "completion_length": 663.703125, "epoch": 0.02964254577157803, "grad_norm": 0.5735094547271729, "learning_rate": 9.70357454228422e-07, "loss": -0.004496369976550341, "reward": 0.47812503576278687, "reward_std": 0.2521483972668648, "rewards/accuracy_reward": 0.28125, "rewards/format_reward": 0.984375, "step": 68 }, { "completion_length": 556.1875, "epoch": 0.03007846556233653, "grad_norm": 0.6028347611427307, "learning_rate": 9.699215344376634e-07, "loss": -0.0033272686414420605, "reward": 0.5500000417232513, "reward_std": 0.1772443801164627, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 0.9921875, "step": 69 }, { "completion_length": 566.171875, "epoch": 0.03051438535309503, "grad_norm": 0.2644914984703064, "learning_rate": 9.69485614646905e-07, "loss": -0.0035728231305256486, "reward": 0.44062504172325134, "reward_std": 0.1354043260216713, "rewards/accuracy_reward": 0.2421875, "rewards/format_reward": 0.9921875, "step": 70 }, { "completion_length": 747.1171875, "epoch": 0.03095030514385353, "grad_norm": 0.45364266633987427, "learning_rate": 9.690496948561464e-07, "loss": -0.004377002594992518, "reward": 0.5640625357627869, "reward_std": 0.26171743869781494, "rewards/accuracy_reward": 0.3671875, "rewards/format_reward": 0.984375, "step": 71 }, { "completion_length": 736.7734375, "epoch": 0.03138622493461203, "grad_norm": 0.4566240608692169, "learning_rate": 9.686137750653878e-07, "loss": -0.0046136470045894384, "reward": 0.5250000357627869, "reward_std": 0.20069601386785507, "rewards/accuracy_reward": 0.328125, "rewards/format_reward": 0.984375, "step": 72 }, { "completion_length": 459.0234375, "epoch": 0.03182214472537053, "grad_norm": 0.4804815948009491, "learning_rate": 9.681778552746294e-07, "loss": -0.002390326582826674, "reward": 0.5593750327825546, "reward_std": 0.24959056824445724, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 73 }, { "completion_length": 611.203125, "epoch": 0.03225806451612903, "grad_norm": 0.32491230964660645, "learning_rate": 9.67741935483871e-07, "loss": -0.002548949094489217, "reward": 0.6343750357627869, "reward_std": 0.15103846788406372, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.984375, "step": 74 }, { "completion_length": 844.71875, "epoch": 0.03269398430688753, "grad_norm": 0.45734190940856934, "learning_rate": 9.673060156931124e-07, "loss": -0.0051384728867560625, "reward": 0.47031253576278687, "reward_std": 0.21036501228809357, "rewards/accuracy_reward": 0.2734375, "rewards/format_reward": 0.984375, "step": 75 }, { "completion_length": 507.1640625, "epoch": 0.03312990409764603, "grad_norm": 0.33810898661613464, "learning_rate": 9.66870095902354e-07, "loss": -0.0030672921566292644, "reward": 0.7390625476837158, "reward_std": 0.19674428552389145, "rewards/accuracy_reward": 0.5390625, "rewards/format_reward": 1.0, "step": 76 }, { "completion_length": 901.546875, "epoch": 0.03356582388840453, "grad_norm": 0.4877094626426697, "learning_rate": 9.664341761115954e-07, "loss": -0.004802107345312834, "reward": 0.6296875476837158, "reward_std": 0.24297793954610825, "rewards/accuracy_reward": 0.4375, "rewards/format_reward": 0.9609375, "step": 77 }, { "completion_length": 714.9296875, "epoch": 0.03400174367916303, "grad_norm": 0.6446647644042969, "learning_rate": 9.659982563208368e-07, "loss": -0.0041290284134447575, "reward": 0.6953125298023224, "reward_std": 0.21879743784666061, "rewards/accuracy_reward": 0.5, "rewards/format_reward": 0.9765625, "step": 78 }, { "completion_length": 595.71875, "epoch": 0.03443766346992153, "grad_norm": 0.5552946329116821, "learning_rate": 9.655623365300784e-07, "loss": -0.00336921657435596, "reward": 0.6906250417232513, "reward_std": 0.2824498862028122, "rewards/accuracy_reward": 0.4921875, "rewards/format_reward": 0.9921875, "step": 79 }, { "completion_length": 559.0859375, "epoch": 0.03487358326068003, "grad_norm": 0.3945360481739044, "learning_rate": 9.6512641673932e-07, "loss": -0.0029993923380970955, "reward": 0.5187500417232513, "reward_std": 0.18508683145046234, "rewards/accuracy_reward": 0.3203125, "rewards/format_reward": 0.9921875, "step": 80 }, { "completion_length": 702.71875, "epoch": 0.03530950305143853, "grad_norm": 0.3644000291824341, "learning_rate": 9.646904969485614e-07, "loss": -0.004066583467647433, "reward": 0.7515625357627869, "reward_std": 0.17609478533267975, "rewards/accuracy_reward": 0.5546875, "rewards/format_reward": 0.984375, "step": 81 }, { "completion_length": 594.859375, "epoch": 0.03574542284219703, "grad_norm": 0.43995150923728943, "learning_rate": 9.64254577157803e-07, "loss": -0.0034514348953962326, "reward": 0.49531254172325134, "reward_std": 0.28716301918029785, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9921875, "step": 82 }, { "completion_length": 468.734375, "epoch": 0.036181342632955533, "grad_norm": 0.49772775173187256, "learning_rate": 9.638186573670444e-07, "loss": -0.0027159389574080706, "reward": 0.27031251788139343, "reward_std": 0.15308690071105957, "rewards/accuracy_reward": 0.0703125, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 626.71875, "epoch": 0.036617262423714034, "grad_norm": 0.27482131123542786, "learning_rate": 9.63382737576286e-07, "loss": -0.0032146567245945334, "reward": 0.5421875417232513, "reward_std": 0.14730052649974823, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 0.9921875, "step": 84 }, { "completion_length": 496.859375, "epoch": 0.037053182214472534, "grad_norm": 0.39663198590278625, "learning_rate": 9.629468177855274e-07, "loss": -0.0021742535172961652, "reward": 0.6218750476837158, "reward_std": 0.25354722142219543, "rewards/accuracy_reward": 0.421875, "rewards/format_reward": 1.0, "step": 85 }, { "completion_length": 507.53125, "epoch": 0.037489102005231034, "grad_norm": 0.38531285524368286, "learning_rate": 9.62510897994769e-07, "loss": -0.003144865622743964, "reward": 0.6140625327825546, "reward_std": 0.19332444667816162, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 1.0, "step": 86 }, { "completion_length": 522.9609375, "epoch": 0.03792502179598954, "grad_norm": 0.4018486738204956, "learning_rate": 9.620749782040104e-07, "loss": -0.003351722378283739, "reward": 0.5750000476837158, "reward_std": 0.2790592461824417, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 87 }, { "completion_length": 574.625, "epoch": 0.03836094158674804, "grad_norm": 0.29832443594932556, "learning_rate": 9.61639058413252e-07, "loss": -0.0031917719170451164, "reward": 0.49531254172325134, "reward_std": 0.19090906530618668, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 0.9921875, "step": 88 }, { "completion_length": 506.140625, "epoch": 0.03879686137750654, "grad_norm": 0.34219008684158325, "learning_rate": 9.612031386224936e-07, "loss": -0.002893596771173179, "reward": 0.5359375476837158, "reward_std": 0.21778053790330887, "rewards/accuracy_reward": 0.3359375, "rewards/format_reward": 1.0, "step": 89 }, { "completion_length": 520.8984375, "epoch": 0.03923278116826504, "grad_norm": 0.3178415298461914, "learning_rate": 9.60767218831735e-07, "loss": -0.0036478497786447406, "reward": 0.5828125476837158, "reward_std": 0.16781240701675415, "rewards/accuracy_reward": 0.3828125, "rewards/format_reward": 1.0, "step": 90 }, { "completion_length": 524.90625, "epoch": 0.03966870095902354, "grad_norm": 0.3558061122894287, "learning_rate": 9.603312990409764e-07, "loss": -0.003106694668531418, "reward": 0.5437500476837158, "reward_std": 0.268809512257576, "rewards/accuracy_reward": 0.34375, "rewards/format_reward": 1.0, "step": 91 }, { "completion_length": 486.4140625, "epoch": 0.04010462074978204, "grad_norm": 0.3201664388179779, "learning_rate": 9.59895379250218e-07, "loss": -0.00227005232591182, "reward": 0.5750000476837158, "reward_std": 0.17464719712734222, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 1.0, "step": 92 }, { "completion_length": 605.6640625, "epoch": 0.04054054054054054, "grad_norm": 0.5713381767272949, "learning_rate": 9.594594594594594e-07, "loss": -0.003524004598148167, "reward": 0.6125000417232513, "reward_std": 0.26134093105793, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 516.140625, "epoch": 0.04097646033129904, "grad_norm": 0.41625216603279114, "learning_rate": 9.59023539668701e-07, "loss": -0.003131876001134515, "reward": 0.5906250476837158, "reward_std": 0.20805486291646957, "rewards/accuracy_reward": 0.390625, "rewards/format_reward": 1.0, "step": 94 }, { "completion_length": 528.953125, "epoch": 0.04141238012205754, "grad_norm": 0.48059654235839844, "learning_rate": 9.585876198779426e-07, "loss": -0.003212686162441969, "reward": 0.5984375476837158, "reward_std": 0.24329258501529694, "rewards/accuracy_reward": 0.3984375, "rewards/format_reward": 1.0, "step": 95 }, { "completion_length": 500.1328125, "epoch": 0.041848299912816043, "grad_norm": 0.5891656875610352, "learning_rate": 9.58151700087184e-07, "loss": -0.0030139287700876594, "reward": 0.5593750327825546, "reward_std": 0.18648964911699295, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 1.0, "step": 96 }, { "completion_length": 474.71875, "epoch": 0.042284219703574544, "grad_norm": 0.8267337083816528, "learning_rate": 9.577157802964253e-07, "loss": -0.002822687732987106, "reward": 0.5515625476837158, "reward_std": 0.2012200579047203, "rewards/accuracy_reward": 0.3515625, "rewards/format_reward": 1.0, "step": 97 }, { "completion_length": 791.3203125, "epoch": 0.042720139494333044, "grad_norm": 0.38680315017700195, "learning_rate": 9.57279860505667e-07, "loss": -0.003116427455097437, "reward": 0.6093750298023224, "reward_std": 0.2934764325618744, "rewards/accuracy_reward": 0.4140625, "rewards/format_reward": 0.9765625, "step": 98 }, { "completion_length": 552.703125, "epoch": 0.043156059285091544, "grad_norm": 0.4463382959365845, "learning_rate": 9.568439407149083e-07, "loss": -0.0025965895038098097, "reward": 0.5578125417232513, "reward_std": 0.21536517888307571, "rewards/accuracy_reward": 0.359375, "rewards/format_reward": 0.9921875, "step": 99 }, { "completion_length": 462.828125, "epoch": 0.043591979075850044, "grad_norm": 0.3906485140323639, "learning_rate": 9.5640802092415e-07, "loss": -0.002442999859340489, "reward": 0.4968750476837158, "reward_std": 0.2109457552433014, "rewards/accuracy_reward": 0.296875, "rewards/format_reward": 1.0, "step": 100 } ], "logging_steps": 1.0, "max_steps": 2294, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }