| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.043591979075850044, |
| "eval_steps": 500, |
| "global_step": 100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 6066.5546875, |
| "epoch": 0.00043591979075850045, |
| "grad_norm": 22.7120361328125, |
| "learning_rate": 9.995640802092413e-07, |
| "loss": -0.00010610813114908524, |
| "reward": 0.16562499850988388, |
| "reward_std": 0.2048395685851574, |
| "rewards/accuracy_reward": 0.1171875, |
| "rewards/format_reward": 0.2421875, |
| "step": 1 |
| }, |
| { |
| "completion_length": 5022.6015625, |
| "epoch": 0.0008718395815170009, |
| "grad_norm": 35.190757751464844, |
| "learning_rate": 9.99128160418483e-07, |
| "loss": -0.0003467285423539579, |
| "reward": 0.24531249701976776, |
| "reward_std": 0.13261918351054192, |
| "rewards/accuracy_reward": 0.171875, |
| "rewards/format_reward": 0.3671875, |
| "step": 2 |
| }, |
| { |
| "completion_length": 5074.4765625, |
| "epoch": 0.0013077593722755014, |
| "grad_norm": 36.27347946166992, |
| "learning_rate": 9.986922406277246e-07, |
| "loss": -0.0009319710079580545, |
| "reward": 0.31718750298023224, |
| "reward_std": 0.15016943216323853, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.296875, |
| "step": 3 |
| }, |
| { |
| "completion_length": 4919.9921875, |
| "epoch": 0.0017436791630340018, |
| "grad_norm": 23.1031494140625, |
| "learning_rate": 9.98256320836966e-07, |
| "loss": -0.0011635422706604004, |
| "reward": 0.2515625134110451, |
| "reward_std": 0.22550153732299805, |
| "rewards/accuracy_reward": 0.1796875, |
| "rewards/format_reward": 0.359375, |
| "step": 4 |
| }, |
| { |
| "completion_length": 4392.3203125, |
| "epoch": 0.002179598953792502, |
| "grad_norm": 18.30356788635254, |
| "learning_rate": 9.978204010462075e-07, |
| "loss": -0.001725408248603344, |
| "reward": 0.3500000089406967, |
| "reward_std": 0.2601192742586136, |
| "rewards/accuracy_reward": 0.2734375, |
| "rewards/format_reward": 0.3828125, |
| "step": 5 |
| }, |
| { |
| "completion_length": 5603.28125, |
| "epoch": 0.0026155187445510027, |
| "grad_norm": 21.6483154296875, |
| "learning_rate": 9.97384481255449e-07, |
| "loss": -0.004177422029897571, |
| "reward": 0.07500000298023224, |
| "reward_std": 0.14329775422811508, |
| "rewards/accuracy_reward": 0.0390625, |
| "rewards/format_reward": 0.1796875, |
| "step": 6 |
| }, |
| { |
| "completion_length": 3968.328125, |
| "epoch": 0.003051438535309503, |
| "grad_norm": 21.388357162475586, |
| "learning_rate": 9.969485614646903e-07, |
| "loss": -0.002357690129429102, |
| "reward": 0.2109375149011612, |
| "reward_std": 0.19895199686288834, |
| "rewards/accuracy_reward": 0.1328125, |
| "rewards/format_reward": 0.390625, |
| "step": 7 |
| }, |
| { |
| "completion_length": 4283.15625, |
| "epoch": 0.0034873583260680036, |
| "grad_norm": 17.606998443603516, |
| "learning_rate": 9.96512641673932e-07, |
| "loss": -0.0032072272151708603, |
| "reward": 0.3046875149011612, |
| "reward_std": 0.3027474880218506, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.3515625, |
| "step": 8 |
| }, |
| { |
| "completion_length": 2647.453125, |
| "epoch": 0.003923278116826504, |
| "grad_norm": 7.005691051483154, |
| "learning_rate": 9.960767218831735e-07, |
| "loss": -0.0025840166490525007, |
| "reward": 0.33125001937150955, |
| "reward_std": 0.270910307765007, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.484375, |
| "step": 9 |
| }, |
| { |
| "completion_length": 2621.7734375, |
| "epoch": 0.004359197907585004, |
| "grad_norm": 32.332698822021484, |
| "learning_rate": 9.95640802092415e-07, |
| "loss": -0.0029894779436290264, |
| "reward": 0.29843752086162567, |
| "reward_std": 0.21532631665468216, |
| "rewards/accuracy_reward": 0.1640625, |
| "rewards/format_reward": 0.671875, |
| "step": 10 |
| }, |
| { |
| "completion_length": 2444.78125, |
| "epoch": 0.004795117698343505, |
| "grad_norm": 15.55601692199707, |
| "learning_rate": 9.952048823016565e-07, |
| "loss": -0.003852886729873717, |
| "reward": 0.24062500894069672, |
| "reward_std": 0.2612670660018921, |
| "rewards/accuracy_reward": 0.1484375, |
| "rewards/format_reward": 0.4609375, |
| "step": 11 |
| }, |
| { |
| "completion_length": 2593.9140625, |
| "epoch": 0.0052310374891020054, |
| "grad_norm": 29.252334594726562, |
| "learning_rate": 9.94768962510898e-07, |
| "loss": -0.0042398301884531975, |
| "reward": 0.48750001192092896, |
| "reward_std": 0.3399874120950699, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 0.6796875, |
| "step": 12 |
| }, |
| { |
| "completion_length": 2220.7109375, |
| "epoch": 0.005666957279860506, |
| "grad_norm": 13.38364028930664, |
| "learning_rate": 9.943330427201393e-07, |
| "loss": -0.0033426693407818675, |
| "reward": 0.3125000149011612, |
| "reward_std": 0.19083451479673386, |
| "rewards/accuracy_reward": 0.171875, |
| "rewards/format_reward": 0.703125, |
| "step": 13 |
| }, |
| { |
| "completion_length": 2529.640625, |
| "epoch": 0.006102877070619006, |
| "grad_norm": 17.974361419677734, |
| "learning_rate": 9.93897122929381e-07, |
| "loss": -0.004656808450818062, |
| "reward": 0.3500000089406967, |
| "reward_std": 0.36670154333114624, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.578125, |
| "step": 14 |
| }, |
| { |
| "completion_length": 2840.5625, |
| "epoch": 0.006538796861377506, |
| "grad_norm": 11.839619636535645, |
| "learning_rate": 9.934612031386225e-07, |
| "loss": -0.005619838368147612, |
| "reward": 0.29218751937150955, |
| "reward_std": 0.11481105536222458, |
| "rewards/accuracy_reward": 0.1640625, |
| "rewards/format_reward": 0.640625, |
| "step": 15 |
| }, |
| { |
| "completion_length": 3081.2421875, |
| "epoch": 0.006974716652136007, |
| "grad_norm": 8.316078186035156, |
| "learning_rate": 9.93025283347864e-07, |
| "loss": -0.00666549289599061, |
| "reward": 0.42500003427267075, |
| "reward_std": 0.30427779257297516, |
| "rewards/accuracy_reward": 0.2890625, |
| "rewards/format_reward": 0.6796875, |
| "step": 16 |
| }, |
| { |
| "completion_length": 2150.2734375, |
| "epoch": 0.0074106364428945075, |
| "grad_norm": 15.924665451049805, |
| "learning_rate": 9.925893635571055e-07, |
| "loss": -0.006602097302675247, |
| "reward": 0.43906252086162567, |
| "reward_std": 0.35585278272628784, |
| "rewards/accuracy_reward": 0.2890625, |
| "rewards/format_reward": 0.75, |
| "step": 17 |
| }, |
| { |
| "completion_length": 1136.3359375, |
| "epoch": 0.007846556233653008, |
| "grad_norm": 3.312643527984619, |
| "learning_rate": 9.92153443766347e-07, |
| "loss": -0.00424616876989603, |
| "reward": 0.484375, |
| "reward_std": 0.2594892159104347, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/format_reward": 0.859375, |
| "step": 18 |
| }, |
| { |
| "completion_length": 1163.0703125, |
| "epoch": 0.008282476024411508, |
| "grad_norm": 4.4617600440979, |
| "learning_rate": 9.917175239755885e-07, |
| "loss": -0.0060931057669222355, |
| "reward": 0.6203125715255737, |
| "reward_std": 0.3268684893846512, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9140625, |
| "step": 19 |
| }, |
| { |
| "completion_length": 1688.5078125, |
| "epoch": 0.008718395815170008, |
| "grad_norm": 5.8775506019592285, |
| "learning_rate": 9.9128160418483e-07, |
| "loss": -0.009722861228510737, |
| "reward": 0.38593751192092896, |
| "reward_std": 0.21004340052604675, |
| "rewards/accuracy_reward": 0.21875, |
| "rewards/format_reward": 0.8359375, |
| "step": 20 |
| }, |
| { |
| "completion_length": 1012.625, |
| "epoch": 0.009154315605928508, |
| "grad_norm": 4.331947326660156, |
| "learning_rate": 9.908456843940715e-07, |
| "loss": -0.005864025559276342, |
| "reward": 0.550000011920929, |
| "reward_std": 0.336714543402195, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.953125, |
| "step": 21 |
| }, |
| { |
| "completion_length": 1052.0234375, |
| "epoch": 0.00959023539668701, |
| "grad_norm": 7.702275276184082, |
| "learning_rate": 9.90409764603313e-07, |
| "loss": -0.0061883407179266214, |
| "reward": 0.5375000536441803, |
| "reward_std": 0.19406893104314804, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 0.9296875, |
| "step": 22 |
| }, |
| { |
| "completion_length": 585.890625, |
| "epoch": 0.01002615518744551, |
| "grad_norm": 2.693312883377075, |
| "learning_rate": 9.899738448125545e-07, |
| "loss": -0.0050068587297573686, |
| "reward": 0.4500000476837158, |
| "reward_std": 0.22148218750953674, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 1.0, |
| "step": 23 |
| }, |
| { |
| "completion_length": 952.8984375, |
| "epoch": 0.010462074978204011, |
| "grad_norm": 2.040409564971924, |
| "learning_rate": 9.89537925021796e-07, |
| "loss": -0.008675348944962025, |
| "reward": 0.4531250298023224, |
| "reward_std": 0.2833295091986656, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.9765625, |
| "step": 24 |
| }, |
| { |
| "completion_length": 855.203125, |
| "epoch": 0.010897994768962511, |
| "grad_norm": 1.750288486480713, |
| "learning_rate": 9.891020052310375e-07, |
| "loss": -0.006478779250755906, |
| "reward": 0.5281250476837158, |
| "reward_std": 0.2747085839509964, |
| "rewards/accuracy_reward": 0.3359375, |
| "rewards/format_reward": 0.9609375, |
| "step": 25 |
| }, |
| { |
| "completion_length": 622.625, |
| "epoch": 0.011333914559721011, |
| "grad_norm": 1.3210248947143555, |
| "learning_rate": 9.88666085440279e-07, |
| "loss": -0.006179739721119404, |
| "reward": 0.5562500357627869, |
| "reward_std": 0.2532925382256508, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.984375, |
| "step": 26 |
| }, |
| { |
| "completion_length": 609.3984375, |
| "epoch": 0.011769834350479512, |
| "grad_norm": 3.869396448135376, |
| "learning_rate": 9.882301656495205e-07, |
| "loss": -0.0062519978964701295, |
| "reward": 0.5000000298023224, |
| "reward_std": 0.14424315840005875, |
| "rewards/accuracy_reward": 0.3046875, |
| "rewards/format_reward": 0.9765625, |
| "step": 27 |
| }, |
| { |
| "completion_length": 547.375, |
| "epoch": 0.012205754141238012, |
| "grad_norm": 1.0941433906555176, |
| "learning_rate": 9.877942458587619e-07, |
| "loss": -0.0032227920601144433, |
| "reward": 0.5484375357627869, |
| "reward_std": 0.21108780801296234, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 0.984375, |
| "step": 28 |
| }, |
| { |
| "completion_length": 566.8515625, |
| "epoch": 0.012641673931996512, |
| "grad_norm": 1.1244480609893799, |
| "learning_rate": 9.873583260680035e-07, |
| "loss": -0.005511581432074308, |
| "reward": 0.5296875238418579, |
| "reward_std": 0.25184717029333115, |
| "rewards/accuracy_reward": 0.3359375, |
| "rewards/format_reward": 0.96875, |
| "step": 29 |
| }, |
| { |
| "completion_length": 629.2109375, |
| "epoch": 0.013077593722755012, |
| "grad_norm": 1.0730060338974, |
| "learning_rate": 9.869224062772449e-07, |
| "loss": -0.00590163329616189, |
| "reward": 0.5437500476837158, |
| "reward_std": 0.30221718549728394, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 30 |
| }, |
| { |
| "completion_length": 577.2890625, |
| "epoch": 0.013513513513513514, |
| "grad_norm": 1.0006390810012817, |
| "learning_rate": 9.864864864864865e-07, |
| "loss": -0.004602149594575167, |
| "reward": 0.48906251788139343, |
| "reward_std": 0.2546490430831909, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.9609375, |
| "step": 31 |
| }, |
| { |
| "completion_length": 652.453125, |
| "epoch": 0.013949433304272014, |
| "grad_norm": 2.0136349201202393, |
| "learning_rate": 9.860505666957279e-07, |
| "loss": -0.007033544359728694, |
| "reward": 0.6000000536441803, |
| "reward_std": 0.34614098072052, |
| "rewards/accuracy_reward": 0.40625, |
| "rewards/format_reward": 0.96875, |
| "step": 32 |
| }, |
| { |
| "completion_length": 830.296875, |
| "epoch": 0.014385353095030515, |
| "grad_norm": 0.6222465634346008, |
| "learning_rate": 9.856146469049695e-07, |
| "loss": -0.0058622711803764105, |
| "reward": 0.6406250596046448, |
| "reward_std": 0.26844407618045807, |
| "rewards/accuracy_reward": 0.4453125, |
| "rewards/format_reward": 0.9765625, |
| "step": 33 |
| }, |
| { |
| "completion_length": 924.8984375, |
| "epoch": 0.014821272885789015, |
| "grad_norm": 1.6224457025527954, |
| "learning_rate": 9.851787271142109e-07, |
| "loss": -0.006918259430676699, |
| "reward": 0.45781250298023224, |
| "reward_std": 0.1292574293911457, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/format_reward": 0.9609375, |
| "step": 34 |
| }, |
| { |
| "completion_length": 757.8828125, |
| "epoch": 0.015257192676547515, |
| "grad_norm": 0.8691195249557495, |
| "learning_rate": 9.847428073234525e-07, |
| "loss": -0.005784029606729746, |
| "reward": 0.45468753576278687, |
| "reward_std": 0.20147473365068436, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.984375, |
| "step": 35 |
| }, |
| { |
| "completion_length": 1539.546875, |
| "epoch": 0.015693112467306015, |
| "grad_norm": 4.3893961906433105, |
| "learning_rate": 9.843068875326939e-07, |
| "loss": -0.010595182422548532, |
| "reward": 0.4765625298023224, |
| "reward_std": 0.2606821805238724, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.8984375, |
| "step": 36 |
| }, |
| { |
| "completion_length": 949.203125, |
| "epoch": 0.016129032258064516, |
| "grad_norm": 1.039124608039856, |
| "learning_rate": 9.838709677419355e-07, |
| "loss": -0.005853116046637297, |
| "reward": 0.6062500476837158, |
| "reward_std": 0.34973812103271484, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 0.9609375, |
| "step": 37 |
| }, |
| { |
| "completion_length": 1128.515625, |
| "epoch": 0.016564952048823016, |
| "grad_norm": 2.9782750606536865, |
| "learning_rate": 9.834350479511769e-07, |
| "loss": -0.00639305729418993, |
| "reward": 0.3796875327825546, |
| "reward_std": 0.2201501727104187, |
| "rewards/accuracy_reward": 0.1953125, |
| "rewards/format_reward": 0.921875, |
| "step": 38 |
| }, |
| { |
| "completion_length": 803.09375, |
| "epoch": 0.017000871839581516, |
| "grad_norm": 0.9739387035369873, |
| "learning_rate": 9.829991281604185e-07, |
| "loss": -0.003955277847126126, |
| "reward": 0.5281250327825546, |
| "reward_std": 0.26489946991205215, |
| "rewards/accuracy_reward": 0.3359375, |
| "rewards/format_reward": 0.9609375, |
| "step": 39 |
| }, |
| { |
| "completion_length": 624.953125, |
| "epoch": 0.017436791630340016, |
| "grad_norm": 0.5311559438705444, |
| "learning_rate": 9.825632083696599e-07, |
| "loss": -0.004024791065603495, |
| "reward": 0.5468750298023224, |
| "reward_std": 0.2868617922067642, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 0.9765625, |
| "step": 40 |
| }, |
| { |
| "completion_length": 1250.9765625, |
| "epoch": 0.017872711421098517, |
| "grad_norm": 0.7332771420478821, |
| "learning_rate": 9.821272885789015e-07, |
| "loss": -0.005356588866561651, |
| "reward": 0.44062504172325134, |
| "reward_std": 0.20438477396965027, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.9140625, |
| "step": 41 |
| }, |
| { |
| "completion_length": 1476.9921875, |
| "epoch": 0.018308631211857017, |
| "grad_norm": 2.0835506916046143, |
| "learning_rate": 9.816913687881429e-07, |
| "loss": -0.006023196969181299, |
| "reward": 0.4125000238418579, |
| "reward_std": 0.23667097091674805, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.890625, |
| "step": 42 |
| }, |
| { |
| "completion_length": 1100.25, |
| "epoch": 0.018744551002615517, |
| "grad_norm": 0.7210353016853333, |
| "learning_rate": 9.812554489973845e-07, |
| "loss": -0.00412205932661891, |
| "reward": 0.5765625238418579, |
| "reward_std": 0.35319100320339203, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.9296875, |
| "step": 43 |
| }, |
| { |
| "completion_length": 1268.8828125, |
| "epoch": 0.01918047079337402, |
| "grad_norm": 1.018958330154419, |
| "learning_rate": 9.808195292066259e-07, |
| "loss": -0.005287598352879286, |
| "reward": 0.5796875357627869, |
| "reward_std": 0.33388449996709824, |
| "rewards/accuracy_reward": 0.3984375, |
| "rewards/format_reward": 0.90625, |
| "step": 44 |
| }, |
| { |
| "completion_length": 1529.1484375, |
| "epoch": 0.01961639058413252, |
| "grad_norm": 0.9156416058540344, |
| "learning_rate": 9.803836094158675e-07, |
| "loss": -0.006656843703240156, |
| "reward": 0.4531250298023224, |
| "reward_std": 0.3121063858270645, |
| "rewards/accuracy_reward": 0.2734375, |
| "rewards/format_reward": 0.8984375, |
| "step": 45 |
| }, |
| { |
| "completion_length": 771.7109375, |
| "epoch": 0.02005231037489102, |
| "grad_norm": 1.1348389387130737, |
| "learning_rate": 9.79947689625109e-07, |
| "loss": -0.004119608784094453, |
| "reward": 0.5406250357627869, |
| "reward_std": 0.16695528104901314, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.984375, |
| "step": 46 |
| }, |
| { |
| "completion_length": 1171.2578125, |
| "epoch": 0.02048823016564952, |
| "grad_norm": 0.7597943544387817, |
| "learning_rate": 9.795117698343505e-07, |
| "loss": -0.004714524140581489, |
| "reward": 0.6031250357627869, |
| "reward_std": 0.296435609459877, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 0.9453125, |
| "step": 47 |
| }, |
| { |
| "completion_length": 1121.265625, |
| "epoch": 0.020924149956408022, |
| "grad_norm": 0.7531502842903137, |
| "learning_rate": 9.790758500435918e-07, |
| "loss": -0.004732346162199974, |
| "reward": 0.4359375089406967, |
| "reward_std": 0.3109729588031769, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/format_reward": 0.9296875, |
| "step": 48 |
| }, |
| { |
| "completion_length": 1959.421875, |
| "epoch": 0.021360069747166522, |
| "grad_norm": 1.5503896474838257, |
| "learning_rate": 9.786399302528334e-07, |
| "loss": -0.0054204994812607765, |
| "reward": 0.4218750298023224, |
| "reward_std": 0.28712356090545654, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.8203125, |
| "step": 49 |
| }, |
| { |
| "completion_length": 1155.4140625, |
| "epoch": 0.021795989537925022, |
| "grad_norm": 0.5289723873138428, |
| "learning_rate": 9.782040104620748e-07, |
| "loss": -0.00590874906629324, |
| "reward": 0.4140625298023224, |
| "reward_std": 0.2974793165922165, |
| "rewards/accuracy_reward": 0.2265625, |
| "rewards/format_reward": 0.9375, |
| "step": 50 |
| }, |
| { |
| "completion_length": 1361.078125, |
| "epoch": 0.022231909328683522, |
| "grad_norm": 0.7921638488769531, |
| "learning_rate": 9.777680906713164e-07, |
| "loss": -0.005040215328335762, |
| "reward": 0.3031249940395355, |
| "reward_std": 0.21944554150104523, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/format_reward": 0.890625, |
| "step": 51 |
| }, |
| { |
| "completion_length": 1471.5859375, |
| "epoch": 0.022667829119442023, |
| "grad_norm": 0.6596994996070862, |
| "learning_rate": 9.77332170880558e-07, |
| "loss": -0.005814009346067905, |
| "reward": 0.4859375059604645, |
| "reward_std": 0.3088204860687256, |
| "rewards/accuracy_reward": 0.3046875, |
| "rewards/format_reward": 0.90625, |
| "step": 52 |
| }, |
| { |
| "completion_length": 1483.7265625, |
| "epoch": 0.023103748910200523, |
| "grad_norm": 0.9196128249168396, |
| "learning_rate": 9.768962510897994e-07, |
| "loss": -0.005532125011086464, |
| "reward": 0.6109375357627869, |
| "reward_std": 0.32757391035556793, |
| "rewards/accuracy_reward": 0.4296875, |
| "rewards/format_reward": 0.90625, |
| "step": 53 |
| }, |
| { |
| "completion_length": 1215.15625, |
| "epoch": 0.023539668700959023, |
| "grad_norm": 0.7604343891143799, |
| "learning_rate": 9.764603312990408e-07, |
| "loss": -0.0066660866141319275, |
| "reward": 0.6000000536441803, |
| "reward_std": 0.36297860741615295, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 0.9296875, |
| "step": 54 |
| }, |
| { |
| "completion_length": 1351.3125, |
| "epoch": 0.023975588491717523, |
| "grad_norm": 0.7223543524742126, |
| "learning_rate": 9.760244115082824e-07, |
| "loss": -0.005946665536612272, |
| "reward": 0.43906253576278687, |
| "reward_std": 0.2528854086995125, |
| "rewards/accuracy_reward": 0.2578125, |
| "rewards/format_reward": 0.90625, |
| "step": 55 |
| }, |
| { |
| "completion_length": 1185.84375, |
| "epoch": 0.024411508282476024, |
| "grad_norm": 0.5095703601837158, |
| "learning_rate": 9.755884917175238e-07, |
| "loss": -0.0064112339168787, |
| "reward": 0.5093750208616257, |
| "reward_std": 0.21522878110408783, |
| "rewards/accuracy_reward": 0.3203125, |
| "rewards/format_reward": 0.9453125, |
| "step": 56 |
| }, |
| { |
| "completion_length": 1281.0546875, |
| "epoch": 0.024847428073234524, |
| "grad_norm": 1.2985528707504272, |
| "learning_rate": 9.751525719267654e-07, |
| "loss": -0.006073690485209227, |
| "reward": 0.515625, |
| "reward_std": 0.26809659600257874, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.9375, |
| "step": 57 |
| }, |
| { |
| "completion_length": 1124.7109375, |
| "epoch": 0.025283347863993024, |
| "grad_norm": 0.5998630523681641, |
| "learning_rate": 9.74716652136007e-07, |
| "loss": -0.006307224277406931, |
| "reward": 0.42500001192092896, |
| "reward_std": 0.15122529119253159, |
| "rewards/accuracy_reward": 0.234375, |
| "rewards/format_reward": 0.953125, |
| "step": 58 |
| }, |
| { |
| "completion_length": 1213.90625, |
| "epoch": 0.025719267654751524, |
| "grad_norm": 1.0118658542633057, |
| "learning_rate": 9.742807323452484e-07, |
| "loss": -0.007607629988342524, |
| "reward": 0.7015625238418579, |
| "reward_std": 0.26937858760356903, |
| "rewards/accuracy_reward": 0.515625, |
| "rewards/format_reward": 0.9296875, |
| "step": 59 |
| }, |
| { |
| "completion_length": 816.8046875, |
| "epoch": 0.026155187445510025, |
| "grad_norm": 0.5398353338241577, |
| "learning_rate": 9.738448125544898e-07, |
| "loss": -0.005773038603365421, |
| "reward": 0.8140625357627869, |
| "reward_std": 0.2714267522096634, |
| "rewards/accuracy_reward": 0.6171875, |
| "rewards/format_reward": 0.984375, |
| "step": 60 |
| }, |
| { |
| "completion_length": 802.9453125, |
| "epoch": 0.02659110723626853, |
| "grad_norm": 0.40469521284103394, |
| "learning_rate": 9.734088927637314e-07, |
| "loss": -0.005671899998560548, |
| "reward": 0.5234375298023224, |
| "reward_std": 0.19351572543382645, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.9765625, |
| "step": 61 |
| }, |
| { |
| "completion_length": 609.734375, |
| "epoch": 0.02702702702702703, |
| "grad_norm": 0.2660030424594879, |
| "learning_rate": 9.72972972972973e-07, |
| "loss": -0.003733730292879045, |
| "reward": 0.4968750327825546, |
| "reward_std": 0.11230766773223877, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 1.0, |
| "step": 62 |
| }, |
| { |
| "completion_length": 606.8125, |
| "epoch": 0.02746294681778553, |
| "grad_norm": 0.34162867069244385, |
| "learning_rate": 9.725370531822144e-07, |
| "loss": -0.004276728723198175, |
| "reward": 0.46406252682209015, |
| "reward_std": 0.17800088226795197, |
| "rewards/accuracy_reward": 0.265625, |
| "rewards/format_reward": 0.9921875, |
| "step": 63 |
| }, |
| { |
| "completion_length": 675.0234375, |
| "epoch": 0.02789886660854403, |
| "grad_norm": 0.3267340660095215, |
| "learning_rate": 9.72101133391456e-07, |
| "loss": -0.004803936462849379, |
| "reward": 0.47968754172325134, |
| "reward_std": 0.2698579430580139, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.9921875, |
| "step": 64 |
| }, |
| { |
| "completion_length": 689.875, |
| "epoch": 0.02833478639930253, |
| "grad_norm": 0.5070520639419556, |
| "learning_rate": 9.716652136006974e-07, |
| "loss": -0.004681795369833708, |
| "reward": 0.5890625417232513, |
| "reward_std": 0.28590644896030426, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 0.9921875, |
| "step": 65 |
| }, |
| { |
| "completion_length": 741.4296875, |
| "epoch": 0.02877070619006103, |
| "grad_norm": 0.539318859577179, |
| "learning_rate": 9.712292938099388e-07, |
| "loss": -0.0042268745601177216, |
| "reward": 0.7734375298023224, |
| "reward_std": 0.1940227895975113, |
| "rewards/accuracy_reward": 0.578125, |
| "rewards/format_reward": 0.9765625, |
| "step": 66 |
| }, |
| { |
| "completion_length": 494.328125, |
| "epoch": 0.02920662598081953, |
| "grad_norm": 0.23426468670368195, |
| "learning_rate": 9.707933740191804e-07, |
| "loss": -0.0033774186158552766, |
| "reward": 0.6296875476837158, |
| "reward_std": 0.12073517590761185, |
| "rewards/accuracy_reward": 0.4296875, |
| "rewards/format_reward": 1.0, |
| "step": 67 |
| }, |
| { |
| "completion_length": 663.703125, |
| "epoch": 0.02964254577157803, |
| "grad_norm": 0.5735094547271729, |
| "learning_rate": 9.70357454228422e-07, |
| "loss": -0.004496369976550341, |
| "reward": 0.47812503576278687, |
| "reward_std": 0.2521483972668648, |
| "rewards/accuracy_reward": 0.28125, |
| "rewards/format_reward": 0.984375, |
| "step": 68 |
| }, |
| { |
| "completion_length": 556.1875, |
| "epoch": 0.03007846556233653, |
| "grad_norm": 0.6028347611427307, |
| "learning_rate": 9.699215344376634e-07, |
| "loss": -0.0033272686414420605, |
| "reward": 0.5500000417232513, |
| "reward_std": 0.1772443801164627, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 0.9921875, |
| "step": 69 |
| }, |
| { |
| "completion_length": 566.171875, |
| "epoch": 0.03051438535309503, |
| "grad_norm": 0.2644914984703064, |
| "learning_rate": 9.69485614646905e-07, |
| "loss": -0.0035728231305256486, |
| "reward": 0.44062504172325134, |
| "reward_std": 0.1354043260216713, |
| "rewards/accuracy_reward": 0.2421875, |
| "rewards/format_reward": 0.9921875, |
| "step": 70 |
| }, |
| { |
| "completion_length": 747.1171875, |
| "epoch": 0.03095030514385353, |
| "grad_norm": 0.45364266633987427, |
| "learning_rate": 9.690496948561464e-07, |
| "loss": -0.004377002594992518, |
| "reward": 0.5640625357627869, |
| "reward_std": 0.26171743869781494, |
| "rewards/accuracy_reward": 0.3671875, |
| "rewards/format_reward": 0.984375, |
| "step": 71 |
| }, |
| { |
| "completion_length": 736.7734375, |
| "epoch": 0.03138622493461203, |
| "grad_norm": 0.4566240608692169, |
| "learning_rate": 9.686137750653878e-07, |
| "loss": -0.0046136470045894384, |
| "reward": 0.5250000357627869, |
| "reward_std": 0.20069601386785507, |
| "rewards/accuracy_reward": 0.328125, |
| "rewards/format_reward": 0.984375, |
| "step": 72 |
| }, |
| { |
| "completion_length": 459.0234375, |
| "epoch": 0.03182214472537053, |
| "grad_norm": 0.4804815948009491, |
| "learning_rate": 9.681778552746294e-07, |
| "loss": -0.002390326582826674, |
| "reward": 0.5593750327825546, |
| "reward_std": 0.24959056824445724, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 1.0, |
| "step": 73 |
| }, |
| { |
| "completion_length": 611.203125, |
| "epoch": 0.03225806451612903, |
| "grad_norm": 0.32491230964660645, |
| "learning_rate": 9.67741935483871e-07, |
| "loss": -0.002548949094489217, |
| "reward": 0.6343750357627869, |
| "reward_std": 0.15103846788406372, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.984375, |
| "step": 74 |
| }, |
| { |
| "completion_length": 844.71875, |
| "epoch": 0.03269398430688753, |
| "grad_norm": 0.45734190940856934, |
| "learning_rate": 9.673060156931124e-07, |
| "loss": -0.0051384728867560625, |
| "reward": 0.47031253576278687, |
| "reward_std": 0.21036501228809357, |
| "rewards/accuracy_reward": 0.2734375, |
| "rewards/format_reward": 0.984375, |
| "step": 75 |
| }, |
| { |
| "completion_length": 507.1640625, |
| "epoch": 0.03312990409764603, |
| "grad_norm": 0.33810898661613464, |
| "learning_rate": 9.66870095902354e-07, |
| "loss": -0.0030672921566292644, |
| "reward": 0.7390625476837158, |
| "reward_std": 0.19674428552389145, |
| "rewards/accuracy_reward": 0.5390625, |
| "rewards/format_reward": 1.0, |
| "step": 76 |
| }, |
| { |
| "completion_length": 901.546875, |
| "epoch": 0.03356582388840453, |
| "grad_norm": 0.4877094626426697, |
| "learning_rate": 9.664341761115954e-07, |
| "loss": -0.004802107345312834, |
| "reward": 0.6296875476837158, |
| "reward_std": 0.24297793954610825, |
| "rewards/accuracy_reward": 0.4375, |
| "rewards/format_reward": 0.9609375, |
| "step": 77 |
| }, |
| { |
| "completion_length": 714.9296875, |
| "epoch": 0.03400174367916303, |
| "grad_norm": 0.6446647644042969, |
| "learning_rate": 9.659982563208368e-07, |
| "loss": -0.0041290284134447575, |
| "reward": 0.6953125298023224, |
| "reward_std": 0.21879743784666061, |
| "rewards/accuracy_reward": 0.5, |
| "rewards/format_reward": 0.9765625, |
| "step": 78 |
| }, |
| { |
| "completion_length": 595.71875, |
| "epoch": 0.03443766346992153, |
| "grad_norm": 0.5552946329116821, |
| "learning_rate": 9.655623365300784e-07, |
| "loss": -0.00336921657435596, |
| "reward": 0.6906250417232513, |
| "reward_std": 0.2824498862028122, |
| "rewards/accuracy_reward": 0.4921875, |
| "rewards/format_reward": 0.9921875, |
| "step": 79 |
| }, |
| { |
| "completion_length": 559.0859375, |
| "epoch": 0.03487358326068003, |
| "grad_norm": 0.3945360481739044, |
| "learning_rate": 9.6512641673932e-07, |
| "loss": -0.0029993923380970955, |
| "reward": 0.5187500417232513, |
| "reward_std": 0.18508683145046234, |
| "rewards/accuracy_reward": 0.3203125, |
| "rewards/format_reward": 0.9921875, |
| "step": 80 |
| }, |
| { |
| "completion_length": 702.71875, |
| "epoch": 0.03530950305143853, |
| "grad_norm": 0.3644000291824341, |
| "learning_rate": 9.646904969485614e-07, |
| "loss": -0.004066583467647433, |
| "reward": 0.7515625357627869, |
| "reward_std": 0.17609478533267975, |
| "rewards/accuracy_reward": 0.5546875, |
| "rewards/format_reward": 0.984375, |
| "step": 81 |
| }, |
| { |
| "completion_length": 594.859375, |
| "epoch": 0.03574542284219703, |
| "grad_norm": 0.43995150923728943, |
| "learning_rate": 9.64254577157803e-07, |
| "loss": -0.0034514348953962326, |
| "reward": 0.49531254172325134, |
| "reward_std": 0.28716301918029785, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.9921875, |
| "step": 82 |
| }, |
| { |
| "completion_length": 468.734375, |
| "epoch": 0.036181342632955533, |
| "grad_norm": 0.49772775173187256, |
| "learning_rate": 9.638186573670444e-07, |
| "loss": -0.0027159389574080706, |
| "reward": 0.27031251788139343, |
| "reward_std": 0.15308690071105957, |
| "rewards/accuracy_reward": 0.0703125, |
| "rewards/format_reward": 1.0, |
| "step": 83 |
| }, |
| { |
| "completion_length": 626.71875, |
| "epoch": 0.036617262423714034, |
| "grad_norm": 0.27482131123542786, |
| "learning_rate": 9.63382737576286e-07, |
| "loss": -0.0032146567245945334, |
| "reward": 0.5421875417232513, |
| "reward_std": 0.14730052649974823, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 0.9921875, |
| "step": 84 |
| }, |
| { |
| "completion_length": 496.859375, |
| "epoch": 0.037053182214472534, |
| "grad_norm": 0.39663198590278625, |
| "learning_rate": 9.629468177855274e-07, |
| "loss": -0.0021742535172961652, |
| "reward": 0.6218750476837158, |
| "reward_std": 0.25354722142219543, |
| "rewards/accuracy_reward": 0.421875, |
| "rewards/format_reward": 1.0, |
| "step": 85 |
| }, |
| { |
| "completion_length": 507.53125, |
| "epoch": 0.037489102005231034, |
| "grad_norm": 0.38531285524368286, |
| "learning_rate": 9.62510897994769e-07, |
| "loss": -0.003144865622743964, |
| "reward": 0.6140625327825546, |
| "reward_std": 0.19332444667816162, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 1.0, |
| "step": 86 |
| }, |
| { |
| "completion_length": 522.9609375, |
| "epoch": 0.03792502179598954, |
| "grad_norm": 0.4018486738204956, |
| "learning_rate": 9.620749782040104e-07, |
| "loss": -0.003351722378283739, |
| "reward": 0.5750000476837158, |
| "reward_std": 0.2790592461824417, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 1.0, |
| "step": 87 |
| }, |
| { |
| "completion_length": 574.625, |
| "epoch": 0.03836094158674804, |
| "grad_norm": 0.29832443594932556, |
| "learning_rate": 9.61639058413252e-07, |
| "loss": -0.0031917719170451164, |
| "reward": 0.49531254172325134, |
| "reward_std": 0.19090906530618668, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 0.9921875, |
| "step": 88 |
| }, |
| { |
| "completion_length": 506.140625, |
| "epoch": 0.03879686137750654, |
| "grad_norm": 0.34219008684158325, |
| "learning_rate": 9.612031386224936e-07, |
| "loss": -0.002893596771173179, |
| "reward": 0.5359375476837158, |
| "reward_std": 0.21778053790330887, |
| "rewards/accuracy_reward": 0.3359375, |
| "rewards/format_reward": 1.0, |
| "step": 89 |
| }, |
| { |
| "completion_length": 520.8984375, |
| "epoch": 0.03923278116826504, |
| "grad_norm": 0.3178415298461914, |
| "learning_rate": 9.60767218831735e-07, |
| "loss": -0.0036478497786447406, |
| "reward": 0.5828125476837158, |
| "reward_std": 0.16781240701675415, |
| "rewards/accuracy_reward": 0.3828125, |
| "rewards/format_reward": 1.0, |
| "step": 90 |
| }, |
| { |
| "completion_length": 524.90625, |
| "epoch": 0.03966870095902354, |
| "grad_norm": 0.3558061122894287, |
| "learning_rate": 9.603312990409764e-07, |
| "loss": -0.003106694668531418, |
| "reward": 0.5437500476837158, |
| "reward_std": 0.268809512257576, |
| "rewards/accuracy_reward": 0.34375, |
| "rewards/format_reward": 1.0, |
| "step": 91 |
| }, |
| { |
| "completion_length": 486.4140625, |
| "epoch": 0.04010462074978204, |
| "grad_norm": 0.3201664388179779, |
| "learning_rate": 9.59895379250218e-07, |
| "loss": -0.00227005232591182, |
| "reward": 0.5750000476837158, |
| "reward_std": 0.17464719712734222, |
| "rewards/accuracy_reward": 0.375, |
| "rewards/format_reward": 1.0, |
| "step": 92 |
| }, |
| { |
| "completion_length": 605.6640625, |
| "epoch": 0.04054054054054054, |
| "grad_norm": 0.5713381767272949, |
| "learning_rate": 9.594594594594594e-07, |
| "loss": -0.003524004598148167, |
| "reward": 0.6125000417232513, |
| "reward_std": 0.26134093105793, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 0.9921875, |
| "step": 93 |
| }, |
| { |
| "completion_length": 516.140625, |
| "epoch": 0.04097646033129904, |
| "grad_norm": 0.41625216603279114, |
| "learning_rate": 9.59023539668701e-07, |
| "loss": -0.003131876001134515, |
| "reward": 0.5906250476837158, |
| "reward_std": 0.20805486291646957, |
| "rewards/accuracy_reward": 0.390625, |
| "rewards/format_reward": 1.0, |
| "step": 94 |
| }, |
| { |
| "completion_length": 528.953125, |
| "epoch": 0.04141238012205754, |
| "grad_norm": 0.48059654235839844, |
| "learning_rate": 9.585876198779426e-07, |
| "loss": -0.003212686162441969, |
| "reward": 0.5984375476837158, |
| "reward_std": 0.24329258501529694, |
| "rewards/accuracy_reward": 0.3984375, |
| "rewards/format_reward": 1.0, |
| "step": 95 |
| }, |
| { |
| "completion_length": 500.1328125, |
| "epoch": 0.041848299912816043, |
| "grad_norm": 0.5891656875610352, |
| "learning_rate": 9.58151700087184e-07, |
| "loss": -0.0030139287700876594, |
| "reward": 0.5593750327825546, |
| "reward_std": 0.18648964911699295, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 1.0, |
| "step": 96 |
| }, |
| { |
| "completion_length": 474.71875, |
| "epoch": 0.042284219703574544, |
| "grad_norm": 0.8267337083816528, |
| "learning_rate": 9.577157802964253e-07, |
| "loss": -0.002822687732987106, |
| "reward": 0.5515625476837158, |
| "reward_std": 0.2012200579047203, |
| "rewards/accuracy_reward": 0.3515625, |
| "rewards/format_reward": 1.0, |
| "step": 97 |
| }, |
| { |
| "completion_length": 791.3203125, |
| "epoch": 0.042720139494333044, |
| "grad_norm": 0.38680315017700195, |
| "learning_rate": 9.57279860505667e-07, |
| "loss": -0.003116427455097437, |
| "reward": 0.6093750298023224, |
| "reward_std": 0.2934764325618744, |
| "rewards/accuracy_reward": 0.4140625, |
| "rewards/format_reward": 0.9765625, |
| "step": 98 |
| }, |
| { |
| "completion_length": 552.703125, |
| "epoch": 0.043156059285091544, |
| "grad_norm": 0.4463382959365845, |
| "learning_rate": 9.568439407149083e-07, |
| "loss": -0.0025965895038098097, |
| "reward": 0.5578125417232513, |
| "reward_std": 0.21536517888307571, |
| "rewards/accuracy_reward": 0.359375, |
| "rewards/format_reward": 0.9921875, |
| "step": 99 |
| }, |
| { |
| "completion_length": 462.828125, |
| "epoch": 0.043591979075850044, |
| "grad_norm": 0.3906485140323639, |
| "learning_rate": 9.5640802092415e-07, |
| "loss": -0.002442999859340489, |
| "reward": 0.4968750476837158, |
| "reward_std": 0.2109457552433014, |
| "rewards/accuracy_reward": 0.296875, |
| "rewards/format_reward": 1.0, |
| "step": 100 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 2294, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|