diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,35034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4380585246188891, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 5.8125, + "epoch": 0.00017522340984755565, + "grad_norm": 21.7247019198033, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 1.4406715631484985, + "reward_std": 0.22897478938102722, + "rewards/accuracy_reward_stage2": 0.4406715929508209, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1 + }, + { + "completion_length": 7.0, + "epoch": 0.0003504468196951113, + "grad_norm": 21.731991696214283, + "kl": 0.000118255615234375, + "learning_rate": 9.998247765901524e-07, + "loss": 0.0, + "reward": 1.4579381942749023, + "reward_std": 0.21444088220596313, + "rewards/accuracy_reward_stage2": 0.4579381048679352, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2 + }, + { + "completion_length": 10.65625, + "epoch": 0.0005256702295426669, + "grad_norm": 22.526133155999663, + "kl": 0.00025177001953125, + "learning_rate": 9.99649553180305e-07, + "loss": 0.0001, + "reward": 1.4064089059829712, + "reward_std": 0.30950504541397095, + "rewards/accuracy_reward_stage2": 0.4064089059829712, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 3 + }, + { + "completion_length": 12.078125, + "epoch": 0.0007008936393902226, + "grad_norm": 21.484916636473127, + "kl": 0.00188446044921875, + "learning_rate": 9.994743297704572e-07, + "loss": 0.0008, + "reward": 1.4104167222976685, + "reward_std": 0.17264413833618164, + "rewards/accuracy_reward_stage2": 0.5354166626930237, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 4 + }, + { + "completion_length": 15.34375, + "epoch": 0.0008761170492377782, + "grad_norm": 27.629987527288048, + "kl": -2.944469451904297e-05, + "learning_rate": 9.992991063606097e-07, + "loss": -0.0881, + "reward": 1.2981054782867432, + "reward_std": 0.22537116706371307, + "rewards/accuracy_reward_stage2": 0.32935553789138794, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 5 + }, + { + "completion_length": 10.59375, + "epoch": 0.0010513404590853338, + "grad_norm": 25.50093135228471, + "kl": 0.00013637542724609375, + "learning_rate": 9.991238829507622e-07, + "loss": 0.0001, + "reward": 1.583531379699707, + "reward_std": 0.3322303295135498, + "rewards/accuracy_reward_stage2": 0.5835314393043518, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 6 + }, + { + "completion_length": 9.5, + "epoch": 0.0012265638689328894, + "grad_norm": 23.99101673923454, + "kl": 0.0001049041748046875, + "learning_rate": 9.989486595409147e-07, + "loss": 0.0, + "reward": 1.3512048721313477, + "reward_std": 0.24498425424098969, + "rewards/accuracy_reward_stage2": 0.3512047529220581, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 7 + }, + { + "completion_length": 8.28125, + "epoch": 0.0014017872787804452, + "grad_norm": 50.2987810695245, + "kl": 0.140625, + "learning_rate": 9.98773436131067e-07, + "loss": 0.0412, + "reward": 1.3523304462432861, + "reward_std": 0.32968342304229736, + "rewards/accuracy_reward_stage2": 0.47733038663864136, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 8 + }, + { + "completion_length": 9.15625, + "epoch": 0.0015770106886280008, + "grad_norm": 41.286331589381646, + "kl": 0.07080078125, + "learning_rate": 9.985982127212195e-07, + "loss": 0.0218, + "reward": 1.327319622039795, + "reward_std": 0.24577535688877106, + "rewards/accuracy_reward_stage2": 0.4523196518421173, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 9 + }, + { + "completion_length": 13.734375, + "epoch": 0.0017522340984755564, + "grad_norm": 49.5957116925187, + "kl": 0.0341796875, + "learning_rate": 9.98422989311372e-07, + "loss": 0.0137, + "reward": 1.326066493988037, + "reward_std": 0.2599124312400818, + "rewards/accuracy_reward_stage2": 0.45106637477874756, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 10 + }, + { + "completion_length": 11.53125, + "epoch": 0.001927457508323112, + "grad_norm": 21.31206755321316, + "kl": 0.001068115234375, + "learning_rate": 9.982477659015245e-07, + "loss": 0.0004, + "reward": 1.48616623878479, + "reward_std": 0.2009294331073761, + "rewards/accuracy_reward_stage2": 0.4861662685871124, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 11 + }, + { + "completion_length": 11.609375, + "epoch": 0.0021026809181706675, + "grad_norm": 24.101789435137526, + "kl": 0.000728607177734375, + "learning_rate": 9.980725424916767e-07, + "loss": 0.0003, + "reward": 1.528315544128418, + "reward_std": 0.16574808955192566, + "rewards/accuracy_reward_stage2": 0.5283154845237732, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 12 + }, + { + "completion_length": 10.203125, + "epoch": 0.002277904328018223, + "grad_norm": 15598.029114802546, + "kl": 11.0, + "learning_rate": 9.978973190818292e-07, + "loss": 4.4264, + "reward": 1.3424155712127686, + "reward_std": 0.3205416202545166, + "rewards/accuracy_reward_stage2": 0.46741557121276855, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 13 + }, + { + "completion_length": 14.640625, + "epoch": 0.0024531277378657787, + "grad_norm": 22.566706425929812, + "kl": 0.002044677734375, + "learning_rate": 9.977220956719817e-07, + "loss": 0.0008, + "reward": 1.2618801593780518, + "reward_std": 0.19473139941692352, + "rewards/accuracy_reward_stage2": 0.2618802785873413, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 14 + }, + { + "completion_length": 16.75, + "epoch": 0.0026283511477133343, + "grad_norm": 19.548309921488077, + "kl": 0.0021514892578125, + "learning_rate": 9.975468722621342e-07, + "loss": 0.0009, + "reward": 1.6370456218719482, + "reward_std": 0.16209974884986877, + "rewards/accuracy_reward_stage2": 0.6370455622673035, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 15 + }, + { + "completion_length": 10.78125, + "epoch": 0.0028035745575608903, + "grad_norm": 25.48092585549323, + "kl": 0.0034027099609375, + "learning_rate": 9.973716488522867e-07, + "loss": 0.0014, + "reward": 1.4687082767486572, + "reward_std": 0.3171447813510895, + "rewards/accuracy_reward_stage2": 0.4687082767486572, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 16 + }, + { + "completion_length": 7.609375, + "epoch": 0.002978797967408446, + "grad_norm": 19.336096324064826, + "kl": 0.00045013427734375, + "learning_rate": 9.97196425442439e-07, + "loss": 0.0002, + "reward": 1.5834205150604248, + "reward_std": 0.09155820310115814, + "rewards/accuracy_reward_stage2": 0.5834205150604248, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 17 + }, + { + "completion_length": 7.328125, + "epoch": 0.0031540213772560015, + "grad_norm": 23.585621826315027, + "kl": 0.00238037109375, + "learning_rate": 9.970212020325915e-07, + "loss": 0.001, + "reward": 1.4562045335769653, + "reward_std": 0.27990102767944336, + "rewards/accuracy_reward_stage2": 0.45620453357696533, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 18 + }, + { + "completion_length": 14.421875, + "epoch": 0.003329244787103557, + "grad_norm": 20.168254519701865, + "kl": 0.004241943359375, + "learning_rate": 9.96845978622744e-07, + "loss": 0.0017, + "reward": 1.3004417419433594, + "reward_std": 0.14681744575500488, + "rewards/accuracy_reward_stage2": 0.4254416525363922, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 19 + }, + { + "completion_length": 10.015625, + "epoch": 0.0035044681969511127, + "grad_norm": 22.68796589254135, + "kl": 0.0027618408203125, + "learning_rate": 9.966707552128965e-07, + "loss": 0.0011, + "reward": 1.558898687362671, + "reward_std": 0.1517297327518463, + "rewards/accuracy_reward_stage2": 0.5588988065719604, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 20 + }, + { + "completion_length": 8.859375, + "epoch": 0.0036796916067986683, + "grad_norm": 14.419698875883565, + "kl": 0.00433349609375, + "learning_rate": 9.964955318030487e-07, + "loss": 0.0017, + "reward": 1.6647343635559082, + "reward_std": 0.10583889484405518, + "rewards/accuracy_reward_stage2": 0.664734423160553, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 21 + }, + { + "completion_length": 7.765625, + "epoch": 0.003854915016646224, + "grad_norm": 34.43768329839898, + "kl": 0.10400390625, + "learning_rate": 9.963203083932012e-07, + "loss": 0.0415, + "reward": 1.1504526138305664, + "reward_std": 0.24974983930587769, + "rewards/accuracy_reward_stage2": 0.2754526138305664, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 22 + }, + { + "completion_length": 9.546875, + "epoch": 0.0040301384264937795, + "grad_norm": 29.758862471062894, + "kl": 0.0086669921875, + "learning_rate": 9.961450849833537e-07, + "loss": 0.0035, + "reward": 1.3589386940002441, + "reward_std": 0.3868202865123749, + "rewards/accuracy_reward_stage2": 0.3589387536048889, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 23 + }, + { + "completion_length": 10.15625, + "epoch": 0.004205361836341335, + "grad_norm": 24.934275395499128, + "kl": 0.002685546875, + "learning_rate": 9.959698615735062e-07, + "loss": 0.0011, + "reward": 1.517673373222351, + "reward_std": 0.23951569199562073, + "rewards/accuracy_reward_stage2": 0.5176733732223511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 24 + }, + { + "completion_length": 7.6875, + "epoch": 0.004380585246188891, + "grad_norm": 24.13104410678152, + "kl": 0.003143310546875, + "learning_rate": 9.957946381636585e-07, + "loss": 0.0013, + "reward": 1.4975221157073975, + "reward_std": 0.24336925148963928, + "rewards/accuracy_reward_stage2": 0.4975220859050751, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 25 + }, + { + "completion_length": 8.421875, + "epoch": 0.004555808656036446, + "grad_norm": 25.957497353677354, + "kl": 0.00439453125, + "learning_rate": 9.95619414753811e-07, + "loss": 0.0018, + "reward": 1.740341305732727, + "reward_std": 0.16394385695457458, + "rewards/accuracy_reward_stage2": 0.7403413653373718, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 26 + }, + { + "completion_length": 8.421875, + "epoch": 0.004731032065884002, + "grad_norm": 27210.57969180641, + "kl": 26.75, + "learning_rate": 9.954441913439635e-07, + "loss": 10.7099, + "reward": 1.423106074333191, + "reward_std": 0.28896230459213257, + "rewards/accuracy_reward_stage2": 0.5481060743331909, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 27 + }, + { + "completion_length": 10.609375, + "epoch": 0.0049062554757315574, + "grad_norm": 25.254963010996637, + "kl": 0.0011138916015625, + "learning_rate": 9.95268967934116e-07, + "loss": 0.0004, + "reward": 1.488661289215088, + "reward_std": 0.26590585708618164, + "rewards/accuracy_reward_stage2": 0.48866117000579834, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 28 + }, + { + "completion_length": 9.296875, + "epoch": 0.005081478885579113, + "grad_norm": 15.420666005058992, + "kl": 0.002838134765625, + "learning_rate": 9.950937445242685e-07, + "loss": 0.0011, + "reward": 1.4329993724822998, + "reward_std": 0.12809374928474426, + "rewards/accuracy_reward_stage2": 0.4329993724822998, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 29 + }, + { + "completion_length": 16.546875, + "epoch": 0.005256702295426669, + "grad_norm": 100.09886669879327, + "kl": 0.232421875, + "learning_rate": 9.94918521114421e-07, + "loss": 0.0926, + "reward": 1.0829801559448242, + "reward_std": 0.15251481533050537, + "rewards/accuracy_reward_stage2": 0.20798009634017944, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 30 + }, + { + "completion_length": 9.140625, + "epoch": 0.005431925705274224, + "grad_norm": 21.597575604241964, + "kl": 0.0042724609375, + "learning_rate": 9.947432977045732e-07, + "loss": 0.0017, + "reward": 1.712631344795227, + "reward_std": 0.22612644731998444, + "rewards/accuracy_reward_stage2": 0.7126312255859375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 31 + }, + { + "completion_length": 11.84375, + "epoch": 0.005607149115121781, + "grad_norm": 30.567268636442027, + "kl": 0.0128173828125, + "learning_rate": 9.945680742947257e-07, + "loss": -0.0297, + "reward": 1.4765079021453857, + "reward_std": 0.23709246516227722, + "rewards/accuracy_reward_stage2": 0.49213287234306335, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 32 + }, + { + "completion_length": 10.578125, + "epoch": 0.005782372524969336, + "grad_norm": 19.899263114078344, + "kl": 0.0166015625, + "learning_rate": 9.94392850884878e-07, + "loss": 0.0066, + "reward": 1.3554813861846924, + "reward_std": 0.21323856711387634, + "rewards/accuracy_reward_stage2": 0.35548141598701477, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 33 + }, + { + "completion_length": 13.8125, + "epoch": 0.005957595934816892, + "grad_norm": 62.522704996084954, + "kl": 0.1806640625, + "learning_rate": 9.942176274750305e-07, + "loss": 0.0721, + "reward": 1.532733678817749, + "reward_std": 0.2642131447792053, + "rewards/accuracy_reward_stage2": 0.6577336192131042, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 34 + }, + { + "completion_length": 10.203125, + "epoch": 0.0061328193446644474, + "grad_norm": 28.84899146020609, + "kl": 0.0081787109375, + "learning_rate": 9.94042404065183e-07, + "loss": -0.0315, + "reward": 1.599797248840332, + "reward_std": 0.2751314043998718, + "rewards/accuracy_reward_stage2": 0.6154221296310425, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 35 + }, + { + "completion_length": 15.390625, + "epoch": 0.006308042754512003, + "grad_norm": 869.9467292590601, + "kl": 0.96875, + "learning_rate": 9.938671806553355e-07, + "loss": 0.387, + "reward": 1.1933510303497314, + "reward_std": 0.19862015545368195, + "rewards/accuracy_reward_stage2": 0.31835103034973145, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 36 + }, + { + "completion_length": 6.15625, + "epoch": 0.006483266164359559, + "grad_norm": 21.28271109324732, + "kl": 0.016845703125, + "learning_rate": 9.93691957245488e-07, + "loss": -0.0307, + "reward": 1.716348648071289, + "reward_std": 0.13987597823143005, + "rewards/accuracy_reward_stage2": 0.7475985884666443, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 37 + }, + { + "completion_length": 7.859375, + "epoch": 0.006658489574207114, + "grad_norm": 18.801528535727066, + "kl": 0.01611328125, + "learning_rate": 9.935167338356405e-07, + "loss": 0.0064, + "reward": 1.427717924118042, + "reward_std": 0.21595898270606995, + "rewards/accuracy_reward_stage2": 0.5527178049087524, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 38 + }, + { + "completion_length": 13.234375, + "epoch": 0.00683371298405467, + "grad_norm": 29.72698332741626, + "kl": 0.010498046875, + "learning_rate": 9.933415104257928e-07, + "loss": 0.0042, + "reward": 1.4073545932769775, + "reward_std": 0.4115482568740845, + "rewards/accuracy_reward_stage2": 0.5323545336723328, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 39 + }, + { + "completion_length": 18.5625, + "epoch": 0.007008936393902225, + "grad_norm": 190.47366474232058, + "kl": 0.3203125, + "learning_rate": 9.931662870159453e-07, + "loss": 0.1277, + "reward": 1.3717286586761475, + "reward_std": 0.20918840169906616, + "rewards/accuracy_reward_stage2": 0.4967285692691803, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 40 + }, + { + "completion_length": 10.796875, + "epoch": 0.007184159803749781, + "grad_norm": 30.35224097860874, + "kl": 0.0201416015625, + "learning_rate": 9.929910636060978e-07, + "loss": 0.0081, + "reward": 1.571852445602417, + "reward_std": 0.2101229727268219, + "rewards/accuracy_reward_stage2": 0.5718523859977722, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 41 + }, + { + "completion_length": 12.609375, + "epoch": 0.007359383213597337, + "grad_norm": 20.959404085955548, + "kl": 0.00677490234375, + "learning_rate": 9.928158401962502e-07, + "loss": 0.0027, + "reward": 1.6106300354003906, + "reward_std": 0.15248414874076843, + "rewards/accuracy_reward_stage2": 0.6106299757957458, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 42 + }, + { + "completion_length": 12.125, + "epoch": 0.007534606623444892, + "grad_norm": 40.0503689273909, + "kl": 0.376953125, + "learning_rate": 9.926406167864027e-07, + "loss": 0.1509, + "reward": 1.3072917461395264, + "reward_std": 0.17887625098228455, + "rewards/accuracy_reward_stage2": 0.4322916567325592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 43 + }, + { + "completion_length": 8.09375, + "epoch": 0.007709830033292448, + "grad_norm": 25.498158819945512, + "kl": 0.043701171875, + "learning_rate": 9.92465393376555e-07, + "loss": -0.0115, + "reward": 1.400193452835083, + "reward_std": 0.17675068974494934, + "rewards/accuracy_reward_stage2": 0.540818452835083, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 44 + }, + { + "completion_length": 6.765625, + "epoch": 0.007885053443140003, + "grad_norm": 26.292798175814653, + "kl": 0.0184326171875, + "learning_rate": 9.922901699667075e-07, + "loss": 0.0074, + "reward": 1.4580981731414795, + "reward_std": 0.3192833662033081, + "rewards/accuracy_reward_stage2": 0.5830981731414795, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 45 + }, + { + "completion_length": 6.3125, + "epoch": 0.008060276852987559, + "grad_norm": 29.8457885783044, + "kl": 0.01373291015625, + "learning_rate": 9.9211494655686e-07, + "loss": 0.0055, + "reward": 1.3718205690383911, + "reward_std": 0.4017482399940491, + "rewards/accuracy_reward_stage2": 0.4968205690383911, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 46 + }, + { + "completion_length": 6.96875, + "epoch": 0.008235500262835115, + "grad_norm": 23.87042828545471, + "kl": 0.0194091796875, + "learning_rate": 9.919397231470123e-07, + "loss": 0.0078, + "reward": 1.5246155261993408, + "reward_std": 0.2239200323820114, + "rewards/accuracy_reward_stage2": 0.5246155858039856, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 47 + }, + { + "completion_length": 8.515625, + "epoch": 0.00841072367268267, + "grad_norm": 23.48554567269952, + "kl": 0.00982666015625, + "learning_rate": 9.917644997371648e-07, + "loss": 0.0039, + "reward": 1.3785715103149414, + "reward_std": 0.2797955572605133, + "rewards/accuracy_reward_stage2": 0.5035715699195862, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 48 + }, + { + "completion_length": 8.03125, + "epoch": 0.008585947082530226, + "grad_norm": 17.54951377855767, + "kl": 0.01318359375, + "learning_rate": 9.915892763273173e-07, + "loss": 0.0053, + "reward": 1.6302083730697632, + "reward_std": 0.16204530000686646, + "rewards/accuracy_reward_stage2": 0.6302083730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 49 + }, + { + "completion_length": 6.765625, + "epoch": 0.008761170492377781, + "grad_norm": 23.217378435807788, + "kl": 0.007720947265625, + "learning_rate": 9.914140529174698e-07, + "loss": 0.0031, + "reward": 1.4840940237045288, + "reward_std": 0.25985440611839294, + "rewards/accuracy_reward_stage2": 0.4840940237045288, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 50 + }, + { + "completion_length": 13.5, + "epoch": 0.008936393902225337, + "grad_norm": 24.87705970165469, + "kl": 0.046875, + "learning_rate": 9.912388295076223e-07, + "loss": 0.0188, + "reward": 1.477467656135559, + "reward_std": 0.2629283666610718, + "rewards/accuracy_reward_stage2": 0.4774676561355591, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 51 + }, + { + "completion_length": 25.15625, + "epoch": 0.009111617312072893, + "grad_norm": 23.93595159530324, + "kl": 0.51171875, + "learning_rate": 9.910636060977745e-07, + "loss": 0.2041, + "reward": 1.425516963005066, + "reward_std": 0.12461623549461365, + "rewards/accuracy_reward_stage2": 0.5505169630050659, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 52 + }, + { + "completion_length": 18.171875, + "epoch": 0.009286840721920448, + "grad_norm": 22.11918874616949, + "kl": 0.0225830078125, + "learning_rate": 9.90888382687927e-07, + "loss": 0.009, + "reward": 1.2883203029632568, + "reward_std": 0.19033360481262207, + "rewards/accuracy_reward_stage2": 0.4133202135562897, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 53 + }, + { + "completion_length": 17.046875, + "epoch": 0.009462064131768004, + "grad_norm": 22.519091215947434, + "kl": 0.041259765625, + "learning_rate": 9.907131592780795e-07, + "loss": 0.0165, + "reward": 1.5168977975845337, + "reward_std": 0.20637044310569763, + "rewards/accuracy_reward_stage2": 0.5168977975845337, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 54 + }, + { + "completion_length": 11.1875, + "epoch": 0.00963728754161556, + "grad_norm": 19.29424591739688, + "kl": 0.01031494140625, + "learning_rate": 9.90537935868232e-07, + "loss": 0.0041, + "reward": 1.6363108158111572, + "reward_std": 0.20468105375766754, + "rewards/accuracy_reward_stage2": 0.6363107562065125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 55 + }, + { + "completion_length": 13.265625, + "epoch": 0.009812510951463115, + "grad_norm": 23.610180528134936, + "kl": 0.0341796875, + "learning_rate": 9.903627124583845e-07, + "loss": -0.0103, + "reward": 1.6153689622879028, + "reward_std": 0.20442932844161987, + "rewards/accuracy_reward_stage2": 0.6309939622879028, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 56 + }, + { + "completion_length": 11.53125, + "epoch": 0.00998773436131067, + "grad_norm": 25.012855879784862, + "kl": 0.0439453125, + "learning_rate": 9.901874890485368e-07, + "loss": 0.0176, + "reward": 1.570845603942871, + "reward_std": 0.19931158423423767, + "rewards/accuracy_reward_stage2": 0.5708456039428711, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 57 + }, + { + "completion_length": 12.484375, + "epoch": 0.010162957771158226, + "grad_norm": 19.3673100075188, + "kl": 0.046142578125, + "learning_rate": 9.900122656386893e-07, + "loss": 0.0185, + "reward": 1.5261080265045166, + "reward_std": 0.1789964884519577, + "rewards/accuracy_reward_stage2": 0.5261080265045166, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 58 + }, + { + "completion_length": 9.53125, + "epoch": 0.010338181181005782, + "grad_norm": 22.076786740581635, + "kl": 0.041015625, + "learning_rate": 9.898370422288418e-07, + "loss": 0.0164, + "reward": 1.528951644897461, + "reward_std": 0.17195484042167664, + "rewards/accuracy_reward_stage2": 0.5289516448974609, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 59 + }, + { + "completion_length": 9.65625, + "epoch": 0.010513404590853337, + "grad_norm": 21.532828207340156, + "kl": 0.06494140625, + "learning_rate": 9.89661818818994e-07, + "loss": 0.0259, + "reward": 1.2520328760147095, + "reward_std": 0.11828687787055969, + "rewards/accuracy_reward_stage2": 0.2520328760147095, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 60 + }, + { + "completion_length": 7.109375, + "epoch": 0.010688628000700893, + "grad_norm": 18.27714256983686, + "kl": 0.016357421875, + "learning_rate": 9.894865954091465e-07, + "loss": 0.0066, + "reward": 1.7229888439178467, + "reward_std": 0.22104474902153015, + "rewards/accuracy_reward_stage2": 0.7229888439178467, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 61 + }, + { + "completion_length": 21.140625, + "epoch": 0.010863851410548448, + "grad_norm": 26.309629270176462, + "kl": 0.0927734375, + "learning_rate": 9.89311371999299e-07, + "loss": -0.0065, + "reward": 1.72660493850708, + "reward_std": 0.32230430841445923, + "rewards/accuracy_reward_stage2": 0.7422299981117249, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 62 + }, + { + "completion_length": 18.921875, + "epoch": 0.011039074820396006, + "grad_norm": 23.324464380030182, + "kl": 0.042724609375, + "learning_rate": 9.891361485894515e-07, + "loss": 0.0171, + "reward": 1.2932167053222656, + "reward_std": 0.20824560523033142, + "rewards/accuracy_reward_stage2": 0.2932167053222656, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 63 + }, + { + "completion_length": 22.0625, + "epoch": 0.011214298230243561, + "grad_norm": 510.2753190723213, + "kl": 0.94921875, + "learning_rate": 9.88960925179604e-07, + "loss": 0.3816, + "reward": 1.3846971988677979, + "reward_std": 0.15436303615570068, + "rewards/accuracy_reward_stage2": 0.5096973180770874, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 64 + }, + { + "completion_length": 9.78125, + "epoch": 0.011389521640091117, + "grad_norm": 20.624173694033985, + "kl": 0.06640625, + "learning_rate": 9.887857017697563e-07, + "loss": 0.0265, + "reward": 1.6437493562698364, + "reward_std": 0.13226813077926636, + "rewards/accuracy_reward_stage2": 0.6437492966651917, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 65 + }, + { + "completion_length": 23.34375, + "epoch": 0.011564745049938673, + "grad_norm": 7774.154134478514, + "kl": 7.4375, + "learning_rate": 9.886104783599088e-07, + "loss": 2.9733, + "reward": 1.7207577228546143, + "reward_std": 0.13807401061058044, + "rewards/accuracy_reward_stage2": 0.8457577228546143, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 66 + }, + { + "completion_length": 9.78125, + "epoch": 0.011739968459786228, + "grad_norm": 17.504390886758873, + "kl": 0.07275390625, + "learning_rate": 9.884352549500613e-07, + "loss": 0.0291, + "reward": 1.1457009315490723, + "reward_std": 0.1326243132352829, + "rewards/accuracy_reward_stage2": 0.14570099115371704, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 67 + }, + { + "completion_length": 15.34375, + "epoch": 0.011915191869633784, + "grad_norm": 23.114615139412418, + "kl": 0.017822265625, + "learning_rate": 9.882600315402138e-07, + "loss": 0.0071, + "reward": 1.6107048988342285, + "reward_std": 0.16821706295013428, + "rewards/accuracy_reward_stage2": 0.610704779624939, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 68 + }, + { + "completion_length": 11.65625, + "epoch": 0.01209041527948134, + "grad_norm": 25.7631490100802, + "kl": 0.04833984375, + "learning_rate": 9.880848081303663e-07, + "loss": 0.0193, + "reward": 1.669920802116394, + "reward_std": 0.2990760803222656, + "rewards/accuracy_reward_stage2": 0.6699207425117493, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 69 + }, + { + "completion_length": 7.0625, + "epoch": 0.012265638689328895, + "grad_norm": 28.192508330216686, + "kl": 0.2373046875, + "learning_rate": 9.879095847205188e-07, + "loss": 0.0948, + "reward": 1.5916601419448853, + "reward_std": 0.1377098262310028, + "rewards/accuracy_reward_stage2": 0.7166601419448853, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 70 + }, + { + "completion_length": 10.515625, + "epoch": 0.01244086209917645, + "grad_norm": 29.211176981993752, + "kl": 0.022705078125, + "learning_rate": 9.87734361310671e-07, + "loss": 0.0091, + "reward": 1.3620529174804688, + "reward_std": 0.28657591342926025, + "rewards/accuracy_reward_stage2": 0.362052857875824, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 71 + }, + { + "completion_length": 7.03125, + "epoch": 0.012616085509024006, + "grad_norm": 62.034643138243744, + "kl": 0.232421875, + "learning_rate": 9.875591379008235e-07, + "loss": 0.0926, + "reward": 1.5957125425338745, + "reward_std": 0.1589427888393402, + "rewards/accuracy_reward_stage2": 0.7207125425338745, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 72 + }, + { + "completion_length": 15.046875, + "epoch": 0.012791308918871562, + "grad_norm": 49.291115562011576, + "kl": 0.072265625, + "learning_rate": 9.873839144909758e-07, + "loss": 0.0289, + "reward": 1.7115159034729004, + "reward_std": 0.20133166015148163, + "rewards/accuracy_reward_stage2": 0.7115159034729004, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 73 + }, + { + "completion_length": 11.203125, + "epoch": 0.012966532328719117, + "grad_norm": 21.002854737310127, + "kl": 0.107421875, + "learning_rate": 9.872086910811283e-07, + "loss": 0.043, + "reward": 1.5917012691497803, + "reward_std": 0.16557535529136658, + "rewards/accuracy_reward_stage2": 0.5917012691497803, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 74 + }, + { + "completion_length": 9.46875, + "epoch": 0.013141755738566673, + "grad_norm": 20.822893777959763, + "kl": 0.06591796875, + "learning_rate": 9.870334676712808e-07, + "loss": 0.0264, + "reward": 1.4202609062194824, + "reward_std": 0.19799599051475525, + "rewards/accuracy_reward_stage2": 0.42026087641716003, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 75 + }, + { + "completion_length": 11.796875, + "epoch": 0.013316979148414228, + "grad_norm": 24.661401462976446, + "kl": 0.047119140625, + "learning_rate": 9.868582442614333e-07, + "loss": 0.0188, + "reward": 1.6601495742797852, + "reward_std": 0.16851621866226196, + "rewards/accuracy_reward_stage2": 0.6601495146751404, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 76 + }, + { + "completion_length": 8.359375, + "epoch": 0.013492202558261784, + "grad_norm": 17.354177040158966, + "kl": 0.0242919921875, + "learning_rate": 9.866830208515858e-07, + "loss": 0.0097, + "reward": 1.1423872709274292, + "reward_std": 0.08182623237371445, + "rewards/accuracy_reward_stage2": 0.2673872113227844, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 77 + }, + { + "completion_length": 16.0, + "epoch": 0.01366742596810934, + "grad_norm": 19.041542561969962, + "kl": 0.0255126953125, + "learning_rate": 9.86507797441738e-07, + "loss": 0.0102, + "reward": 1.4512853622436523, + "reward_std": 0.13536491990089417, + "rewards/accuracy_reward_stage2": 0.4512854814529419, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 78 + }, + { + "completion_length": 12.9375, + "epoch": 0.013842649377956895, + "grad_norm": 16.90288627870279, + "kl": 0.0242919921875, + "learning_rate": 9.863325740318906e-07, + "loss": 0.0097, + "reward": 1.3414337635040283, + "reward_std": 0.1197996586561203, + "rewards/accuracy_reward_stage2": 0.3414338231086731, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 79 + }, + { + "completion_length": 6.78125, + "epoch": 0.01401787278780445, + "grad_norm": 25.60916886197121, + "kl": 0.24609375, + "learning_rate": 9.86157350622043e-07, + "loss": 0.0984, + "reward": 1.5731947422027588, + "reward_std": 0.2476077377796173, + "rewards/accuracy_reward_stage2": 0.698194682598114, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 80 + }, + { + "completion_length": 9.734375, + "epoch": 0.014193096197652006, + "grad_norm": 21.284203853284396, + "kl": 0.023681640625, + "learning_rate": 9.859821272121955e-07, + "loss": 0.0094, + "reward": 1.851102352142334, + "reward_std": 0.10228273272514343, + "rewards/accuracy_reward_stage2": 0.8511022329330444, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 81 + }, + { + "completion_length": 17.515625, + "epoch": 0.014368319607499562, + "grad_norm": 22.317783264001836, + "kl": 0.04931640625, + "learning_rate": 9.85806903802348e-07, + "loss": 0.0197, + "reward": 1.4761021137237549, + "reward_std": 0.19037950038909912, + "rewards/accuracy_reward_stage2": 0.4761021137237549, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 82 + }, + { + "completion_length": 7.703125, + "epoch": 0.014543543017347118, + "grad_norm": 17.43143667281669, + "kl": 0.0133056640625, + "learning_rate": 9.856316803925005e-07, + "loss": 0.0053, + "reward": 1.3431739807128906, + "reward_std": 0.19102345407009125, + "rewards/accuracy_reward_stage2": 0.3431740403175354, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 83 + }, + { + "completion_length": 15.0, + "epoch": 0.014718766427194673, + "grad_norm": 29.21779230277411, + "kl": 0.373046875, + "learning_rate": 9.854564569826528e-07, + "loss": 0.1492, + "reward": 1.3184058666229248, + "reward_std": 0.2299778163433075, + "rewards/accuracy_reward_stage2": 0.4434059262275696, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 84 + }, + { + "completion_length": 8.328125, + "epoch": 0.014893989837042229, + "grad_norm": 21.284746149411724, + "kl": 0.046142578125, + "learning_rate": 9.852812335728053e-07, + "loss": 0.0184, + "reward": 1.5364583730697632, + "reward_std": 0.18556493520736694, + "rewards/accuracy_reward_stage2": 0.5364583730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 85 + }, + { + "completion_length": 18.109375, + "epoch": 0.015069213246889784, + "grad_norm": 21.554042596269834, + "kl": 0.58203125, + "learning_rate": 9.851060101629576e-07, + "loss": 0.2327, + "reward": 1.343637228012085, + "reward_std": 0.22345028817653656, + "rewards/accuracy_reward_stage2": 0.46863725781440735, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 86 + }, + { + "completion_length": 13.40625, + "epoch": 0.01524443665673734, + "grad_norm": 22.937433205252535, + "kl": 0.044189453125, + "learning_rate": 9.8493078675311e-07, + "loss": 0.0177, + "reward": 1.577603816986084, + "reward_std": 0.11516597867012024, + "rewards/accuracy_reward_stage2": 0.5776037573814392, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 87 + }, + { + "completion_length": 16.046875, + "epoch": 0.015419660066584896, + "grad_norm": 20.89853454660946, + "kl": 0.04248046875, + "learning_rate": 9.847555633432626e-07, + "loss": 0.017, + "reward": 1.4262290000915527, + "reward_std": 0.2058263123035431, + "rewards/accuracy_reward_stage2": 0.4262291193008423, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 88 + }, + { + "completion_length": 9.234375, + "epoch": 0.015594883476432451, + "grad_norm": 13.755470671094079, + "kl": 0.01080322265625, + "learning_rate": 9.84580339933415e-07, + "loss": 0.0043, + "reward": 1.6458332538604736, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.6458333134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 89 + }, + { + "completion_length": 6.765625, + "epoch": 0.015770106886280007, + "grad_norm": 11.623954354278077, + "kl": 0.0299072265625, + "learning_rate": 9.844051165235676e-07, + "loss": 0.012, + "reward": 1.609375, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 90 + }, + { + "completion_length": 10.78125, + "epoch": 0.015945330296127564, + "grad_norm": 18.861997300165676, + "kl": 0.0341796875, + "learning_rate": 9.8422989311372e-07, + "loss": 0.0137, + "reward": 1.506962537765503, + "reward_std": 0.18392382562160492, + "rewards/accuracy_reward_stage2": 0.5069626569747925, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 91 + }, + { + "completion_length": 9.25, + "epoch": 0.016120553705975118, + "grad_norm": 18104.437031192745, + "kl": 31.0, + "learning_rate": 9.840546697038723e-07, + "loss": 12.3281, + "reward": 1.3662315607070923, + "reward_std": 0.11293835937976837, + "rewards/accuracy_reward_stage2": 0.5068565607070923, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 92 + }, + { + "completion_length": 24.9375, + "epoch": 0.016295777115822675, + "grad_norm": 4068.679729400268, + "kl": 81.5, + "learning_rate": 9.838794462940248e-07, + "loss": 32.587, + "reward": 1.2212448120117188, + "reward_std": 0.21262651681900024, + "rewards/accuracy_reward_stage2": 0.4868698716163635, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 93 + }, + { + "completion_length": 10.703125, + "epoch": 0.01647100052567023, + "grad_norm": 21.628345918894205, + "kl": 0.072265625, + "learning_rate": 9.837042228841773e-07, + "loss": 0.0289, + "reward": 1.5775998830795288, + "reward_std": 0.231684148311615, + "rewards/accuracy_reward_stage2": 0.7025998830795288, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 94 + }, + { + "completion_length": 11.5, + "epoch": 0.016646223935517786, + "grad_norm": 26.44437366688261, + "kl": 0.0125732421875, + "learning_rate": 9.835289994743298e-07, + "loss": 0.005, + "reward": 1.5052083730697632, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward_stage2": 0.5052083134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 95 + }, + { + "completion_length": 13.40625, + "epoch": 0.01682144734536534, + "grad_norm": 27.942135859948205, + "kl": 0.05615234375, + "learning_rate": 9.833537760644823e-07, + "loss": -0.0218, + "reward": 1.30165433883667, + "reward_std": 0.2656284272670746, + "rewards/accuracy_reward_stage2": 0.31727930903434753, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 96 + }, + { + "completion_length": 16.5625, + "epoch": 0.016996670755212898, + "grad_norm": 23.460188028730357, + "kl": 0.07177734375, + "learning_rate": 9.831785526546346e-07, + "loss": 0.0288, + "reward": 1.3574440479278564, + "reward_std": 0.17292845249176025, + "rewards/accuracy_reward_stage2": 0.35744398832321167, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 97 + }, + { + "completion_length": 8.453125, + "epoch": 0.01717189416506045, + "grad_norm": 18.965752150922896, + "kl": 0.053466796875, + "learning_rate": 9.83003329244787e-07, + "loss": -0.0116, + "reward": 1.6651774644851685, + "reward_std": 0.19083932042121887, + "rewards/accuracy_reward_stage2": 0.8058024644851685, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 98 + }, + { + "completion_length": 13.875, + "epoch": 0.01734711757490801, + "grad_norm": 18.059570150986527, + "kl": 0.0235595703125, + "learning_rate": 9.828281058349396e-07, + "loss": 0.0094, + "reward": 1.640625, + "reward_std": 0.2472364604473114, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 99 + }, + { + "completion_length": 7.65625, + "epoch": 0.017522340984755563, + "grad_norm": 18.09970933493801, + "kl": 0.0213623046875, + "learning_rate": 9.826528824250918e-07, + "loss": 0.0085, + "reward": 1.4744908809661865, + "reward_std": 0.2173069417476654, + "rewards/accuracy_reward_stage2": 0.5994908809661865, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 100 + }, + { + "completion_length": 7.5625, + "epoch": 0.01769756439460312, + "grad_norm": 27.227153684766957, + "kl": 0.03369140625, + "learning_rate": 9.824776590152443e-07, + "loss": -0.0158, + "reward": 1.4680397510528564, + "reward_std": 0.26545101404190063, + "rewards/accuracy_reward_stage2": 0.48366478085517883, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 101 + }, + { + "completion_length": 12.3125, + "epoch": 0.017872787804450674, + "grad_norm": 21.805155630136777, + "kl": 0.0098876953125, + "learning_rate": 9.823024356053968e-07, + "loss": 0.0039, + "reward": 1.4559073448181152, + "reward_std": 0.13833250105381012, + "rewards/accuracy_reward_stage2": 0.45590728521347046, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 102 + }, + { + "completion_length": 9.703125, + "epoch": 0.01804801121429823, + "grad_norm": 23.892836207416806, + "kl": 0.020263671875, + "learning_rate": 9.821272121955493e-07, + "loss": 0.0081, + "reward": 1.4594056606292725, + "reward_std": 0.19585129618644714, + "rewards/accuracy_reward_stage2": 0.7094056606292725, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 103 + }, + { + "completion_length": 11.40625, + "epoch": 0.018223234624145785, + "grad_norm": 16.16795985343101, + "kl": 0.024169921875, + "learning_rate": 9.819519887857018e-07, + "loss": 0.0097, + "reward": 1.7032923698425293, + "reward_std": 0.10313989222049713, + "rewards/accuracy_reward_stage2": 0.7032923102378845, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 104 + }, + { + "completion_length": 15.15625, + "epoch": 0.018398458033993342, + "grad_norm": 58.162353972663645, + "kl": 0.486328125, + "learning_rate": 9.81776765375854e-07, + "loss": 0.1505, + "reward": 1.5770833492279053, + "reward_std": 0.23222008347511292, + "rewards/accuracy_reward_stage2": 0.7177083492279053, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 105 + }, + { + "completion_length": 10.890625, + "epoch": 0.018573681443840896, + "grad_norm": 20.3955912999849, + "kl": 0.041259765625, + "learning_rate": 9.816015419660066e-07, + "loss": -0.0051, + "reward": 1.7585219144821167, + "reward_std": 0.18110564351081848, + "rewards/accuracy_reward_stage2": 0.7741469144821167, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 106 + }, + { + "completion_length": 7.9375, + "epoch": 0.018748904853688454, + "grad_norm": 33.10455387599469, + "kl": 0.21484375, + "learning_rate": 9.81426318556159e-07, + "loss": 0.0861, + "reward": 1.446201205253601, + "reward_std": 0.26556122303009033, + "rewards/accuracy_reward_stage2": 0.5712012052536011, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 107 + }, + { + "completion_length": 9.453125, + "epoch": 0.018924128263536007, + "grad_norm": 28.010482936540864, + "kl": 0.051513671875, + "learning_rate": 9.812510951463116e-07, + "loss": 0.0206, + "reward": 1.5572054386138916, + "reward_std": 0.18969415128231049, + "rewards/accuracy_reward_stage2": 0.5572054386138916, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 108 + }, + { + "completion_length": 13.609375, + "epoch": 0.019099351673383565, + "grad_norm": 22.80737044173408, + "kl": 0.0888671875, + "learning_rate": 9.81075871736464e-07, + "loss": 0.0356, + "reward": 1.2530102729797363, + "reward_std": 0.27315306663513184, + "rewards/accuracy_reward_stage2": 0.5030102729797363, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 109 + }, + { + "completion_length": 7.265625, + "epoch": 0.01927457508323112, + "grad_norm": 19.569412790924364, + "kl": 0.047119140625, + "learning_rate": 9.809006483266164e-07, + "loss": 0.0188, + "reward": 1.7912015914916992, + "reward_std": 0.22704292833805084, + "rewards/accuracy_reward_stage2": 0.7912015318870544, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 110 + }, + { + "completion_length": 7.5625, + "epoch": 0.019449798493078676, + "grad_norm": 25.806046811486276, + "kl": 0.033935546875, + "learning_rate": 9.807254249167688e-07, + "loss": 0.0136, + "reward": 1.692245364189148, + "reward_std": 0.31582602858543396, + "rewards/accuracy_reward_stage2": 0.6922453045845032, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 111 + }, + { + "completion_length": 13.234375, + "epoch": 0.01962502190292623, + "grad_norm": 15.656314149690356, + "kl": 0.05810546875, + "learning_rate": 9.805502015069213e-07, + "loss": 0.0232, + "reward": 1.7992335557937622, + "reward_std": 0.11169708520174026, + "rewards/accuracy_reward_stage2": 0.7992335557937622, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 112 + }, + { + "completion_length": 7.859375, + "epoch": 0.019800245312773787, + "grad_norm": 20.00385022632407, + "kl": 0.0228271484375, + "learning_rate": 9.803749780970736e-07, + "loss": 0.0091, + "reward": 1.5724248886108398, + "reward_std": 0.23601973056793213, + "rewards/accuracy_reward_stage2": 0.6974248886108398, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 113 + }, + { + "completion_length": 10.046875, + "epoch": 0.01997546872262134, + "grad_norm": 19.010960047122616, + "kl": 0.034423828125, + "learning_rate": 9.801997546872261e-07, + "loss": 0.0137, + "reward": 1.6812996864318848, + "reward_std": 0.17957565188407898, + "rewards/accuracy_reward_stage2": 0.6812995672225952, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 114 + }, + { + "completion_length": 14.515625, + "epoch": 0.020150692132468898, + "grad_norm": 19.989242334690157, + "kl": 0.044677734375, + "learning_rate": 9.800245312773786e-07, + "loss": 0.0179, + "reward": 1.6909170150756836, + "reward_std": 0.1381261646747589, + "rewards/accuracy_reward_stage2": 0.6909170150756836, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 115 + }, + { + "completion_length": 8.96875, + "epoch": 0.020325915542316452, + "grad_norm": 21.574664105458634, + "kl": 0.07275390625, + "learning_rate": 9.79849307867531e-07, + "loss": 0.0291, + "reward": 1.5188727378845215, + "reward_std": 0.21951830387115479, + "rewards/accuracy_reward_stage2": 0.5188726186752319, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 116 + }, + { + "completion_length": 9.0625, + "epoch": 0.02050113895216401, + "grad_norm": 21.074363399702218, + "kl": 0.0296630859375, + "learning_rate": 9.796740844576836e-07, + "loss": -0.0757, + "reward": 1.3273824453353882, + "reward_std": 0.2471882849931717, + "rewards/accuracy_reward_stage2": 0.3586324453353882, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 117 + }, + { + "completion_length": 14.859375, + "epoch": 0.020676362362011563, + "grad_norm": 24.287310771519245, + "kl": 0.0303955078125, + "learning_rate": 9.794988610478359e-07, + "loss": 0.0122, + "reward": 1.3982466459274292, + "reward_std": 0.24839340150356293, + "rewards/accuracy_reward_stage2": 0.3982466161251068, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 118 + }, + { + "completion_length": 12.390625, + "epoch": 0.02085158577185912, + "grad_norm": 23.822191359887427, + "kl": 0.0380859375, + "learning_rate": 9.793236376379884e-07, + "loss": 0.0152, + "reward": 1.2920645475387573, + "reward_std": 0.15751829743385315, + "rewards/accuracy_reward_stage2": 0.41706451773643494, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 119 + }, + { + "completion_length": 9.8125, + "epoch": 0.021026809181706674, + "grad_norm": 20.83707735551321, + "kl": 0.043212890625, + "learning_rate": 9.791484142281409e-07, + "loss": -0.0259, + "reward": 1.4067494869232178, + "reward_std": 0.21363842487335205, + "rewards/accuracy_reward_stage2": 0.42237451672554016, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 120 + }, + { + "completion_length": 11.75, + "epoch": 0.021202032591554232, + "grad_norm": 23.337085102664894, + "kl": 0.0201416015625, + "learning_rate": 9.789731908182933e-07, + "loss": 0.0081, + "reward": 1.697341799736023, + "reward_std": 0.2553454339504242, + "rewards/accuracy_reward_stage2": 0.697341799736023, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 121 + }, + { + "completion_length": 10.0625, + "epoch": 0.021377256001401786, + "grad_norm": 23.09151113763775, + "kl": 0.049072265625, + "learning_rate": 9.787979674084458e-07, + "loss": 0.0196, + "reward": 1.4421296119689941, + "reward_std": 0.34498897194862366, + "rewards/accuracy_reward_stage2": 0.5671296119689941, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 122 + }, + { + "completion_length": 9.21875, + "epoch": 0.021552479411249343, + "grad_norm": 18.3096250105465, + "kl": 0.038818359375, + "learning_rate": 9.786227439985981e-07, + "loss": 0.0156, + "reward": 1.2355936765670776, + "reward_std": 0.211252823472023, + "rewards/accuracy_reward_stage2": 0.36059367656707764, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 123 + }, + { + "completion_length": 13.375, + "epoch": 0.021727702821096897, + "grad_norm": 17.483371856266107, + "kl": 0.035888671875, + "learning_rate": 9.784475205887506e-07, + "loss": 0.0144, + "reward": 1.6861112117767334, + "reward_std": 0.12975779175758362, + "rewards/accuracy_reward_stage2": 0.6861111521720886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 124 + }, + { + "completion_length": 8.53125, + "epoch": 0.021902926230944454, + "grad_norm": 14.352172913149598, + "kl": 0.11572265625, + "learning_rate": 9.78272297178903e-07, + "loss": 0.0463, + "reward": 1.4479167461395264, + "reward_std": 0.06200198456645012, + "rewards/accuracy_reward_stage2": 0.5729166865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 125 + }, + { + "completion_length": 11.484375, + "epoch": 0.02207814964079201, + "grad_norm": 18.73850177803472, + "kl": 0.049560546875, + "learning_rate": 9.780970737690554e-07, + "loss": -0.0115, + "reward": 1.3510587215423584, + "reward_std": 0.2711937725543976, + "rewards/accuracy_reward_stage2": 0.3666836619377136, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 126 + }, + { + "completion_length": 6.65625, + "epoch": 0.022253373050639565, + "grad_norm": 19.53612239258542, + "kl": 0.03173828125, + "learning_rate": 9.779218503592079e-07, + "loss": 0.0127, + "reward": 1.7808170318603516, + "reward_std": 0.14916422963142395, + "rewards/accuracy_reward_stage2": 0.7808170318603516, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 127 + }, + { + "completion_length": 12.125, + "epoch": 0.022428596460487123, + "grad_norm": 24.190435873177584, + "kl": 0.0128173828125, + "learning_rate": 9.777466269493604e-07, + "loss": 0.0051, + "reward": 1.7245370149612427, + "reward_std": 0.22146297991275787, + "rewards/accuracy_reward_stage2": 0.7245370149612427, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 128 + }, + { + "completion_length": 12.3125, + "epoch": 0.022603819870334677, + "grad_norm": 24.571036917758526, + "kl": 0.09912109375, + "learning_rate": 9.775714035395129e-07, + "loss": -0.0045, + "reward": 1.5884959697723389, + "reward_std": 0.20376023650169373, + "rewards/accuracy_reward_stage2": 0.6041209697723389, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 129 + }, + { + "completion_length": 8.75, + "epoch": 0.022779043280182234, + "grad_norm": 17.644400774669826, + "kl": 0.04638671875, + "learning_rate": 9.773961801296654e-07, + "loss": 0.0186, + "reward": 1.5527987480163574, + "reward_std": 0.11296023428440094, + "rewards/accuracy_reward_stage2": 0.6777988076210022, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 130 + }, + { + "completion_length": 8.546875, + "epoch": 0.022954266690029788, + "grad_norm": 970.1329773138383, + "kl": 1.8203125, + "learning_rate": 9.772209567198178e-07, + "loss": 0.728, + "reward": 1.5097854137420654, + "reward_std": 0.06071118637919426, + "rewards/accuracy_reward_stage2": 0.6347853541374207, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 131 + }, + { + "completion_length": 8.140625, + "epoch": 0.023129490099877345, + "grad_norm": 13.697885280020044, + "kl": 0.043212890625, + "learning_rate": 9.770457333099701e-07, + "loss": 0.0172, + "reward": 1.610494613647461, + "reward_std": 0.12853994965553284, + "rewards/accuracy_reward_stage2": 0.6104945540428162, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 132 + }, + { + "completion_length": 9.265625, + "epoch": 0.0233047135097249, + "grad_norm": 14.959578456824348, + "kl": 0.040771484375, + "learning_rate": 9.768705099001226e-07, + "loss": 0.0163, + "reward": 1.4808006286621094, + "reward_std": 0.2097875326871872, + "rewards/accuracy_reward_stage2": 0.48080065846443176, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 133 + }, + { + "completion_length": 7.921875, + "epoch": 0.023479936919572456, + "grad_norm": 17.78254745647574, + "kl": 0.0240478515625, + "learning_rate": 9.766952864902751e-07, + "loss": 0.0096, + "reward": 1.9173030853271484, + "reward_std": 0.1365506649017334, + "rewards/accuracy_reward_stage2": 0.9173030853271484, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 134 + }, + { + "completion_length": 11.65625, + "epoch": 0.02365516032942001, + "grad_norm": 33.701675996088376, + "kl": 0.0198974609375, + "learning_rate": 9.765200630804274e-07, + "loss": 0.008, + "reward": 1.2959372997283936, + "reward_std": 0.22484754025936127, + "rewards/accuracy_reward_stage2": 0.42093732953071594, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 135 + }, + { + "completion_length": 10.296875, + "epoch": 0.023830383739267567, + "grad_norm": 26.21797926317752, + "kl": 0.027099609375, + "learning_rate": 9.763448396705799e-07, + "loss": 0.0109, + "reward": 1.248408555984497, + "reward_std": 0.2923775911331177, + "rewards/accuracy_reward_stage2": 0.24840857088565826, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 136 + }, + { + "completion_length": 11.21875, + "epoch": 0.02400560714911512, + "grad_norm": 18.800406173940722, + "kl": 0.051025390625, + "learning_rate": 9.761696162607324e-07, + "loss": 0.0205, + "reward": 1.3072917461395264, + "reward_std": 0.19727420806884766, + "rewards/accuracy_reward_stage2": 0.4322916567325592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 137 + }, + { + "completion_length": 10.359375, + "epoch": 0.02418083055896268, + "grad_norm": 25.398987769188867, + "kl": 0.052978515625, + "learning_rate": 9.759943928508849e-07, + "loss": 0.0212, + "reward": 1.514993667602539, + "reward_std": 0.3276137113571167, + "rewards/accuracy_reward_stage2": 0.5149936676025391, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 138 + }, + { + "completion_length": 9.8125, + "epoch": 0.024356053968810232, + "grad_norm": 24.531852080551108, + "kl": 0.053466796875, + "learning_rate": 9.758191694410374e-07, + "loss": 0.0214, + "reward": 1.5364623069763184, + "reward_std": 0.19039994478225708, + "rewards/accuracy_reward_stage2": 0.5364623665809631, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 139 + }, + { + "completion_length": 9.125, + "epoch": 0.02453127737865779, + "grad_norm": 40.319697076275226, + "kl": 0.2138671875, + "learning_rate": 9.756439460311896e-07, + "loss": 0.0853, + "reward": 1.2618070840835571, + "reward_std": 0.18684542179107666, + "rewards/accuracy_reward_stage2": 0.38680708408355713, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 140 + }, + { + "completion_length": 11.734375, + "epoch": 0.024706500788505344, + "grad_norm": 39.53407951213018, + "kl": 0.080078125, + "learning_rate": 9.754687226213421e-07, + "loss": 0.0321, + "reward": 1.3220620155334473, + "reward_std": 0.20267724990844727, + "rewards/accuracy_reward_stage2": 0.3220618963241577, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 141 + }, + { + "completion_length": 14.375, + "epoch": 0.0248817241983529, + "grad_norm": 16.356615395919924, + "kl": 0.031005859375, + "learning_rate": 9.752934992114946e-07, + "loss": 0.0124, + "reward": 1.3732510805130005, + "reward_std": 0.09298402070999146, + "rewards/accuracy_reward_stage2": 0.4982510209083557, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 142 + }, + { + "completion_length": 14.5, + "epoch": 0.025056947608200455, + "grad_norm": 20.485881481845645, + "kl": 0.06591796875, + "learning_rate": 9.751182758016471e-07, + "loss": 0.0264, + "reward": 1.42328941822052, + "reward_std": 0.1043790802359581, + "rewards/accuracy_reward_stage2": 0.42328938841819763, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 143 + }, + { + "completion_length": 13.03125, + "epoch": 0.025232171018048012, + "grad_norm": 692.1862884429493, + "kl": 1.8828125, + "learning_rate": 9.749430523917996e-07, + "loss": 0.7519, + "reward": 1.450892686843872, + "reward_std": 0.22691306471824646, + "rewards/accuracy_reward_stage2": 0.5758926868438721, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 144 + }, + { + "completion_length": 6.90625, + "epoch": 0.025407394427895566, + "grad_norm": 18.93751606206503, + "kl": 0.01904296875, + "learning_rate": 9.74767828981952e-07, + "loss": 0.0076, + "reward": 1.556060552597046, + "reward_std": 0.1706404983997345, + "rewards/accuracy_reward_stage2": 0.5560606122016907, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 145 + }, + { + "completion_length": 9.671875, + "epoch": 0.025582617837743123, + "grad_norm": 17.819420207093465, + "kl": 0.0478515625, + "learning_rate": 9.745926055721044e-07, + "loss": 0.0191, + "reward": 1.4579105377197266, + "reward_std": 0.08065672963857651, + "rewards/accuracy_reward_stage2": 0.4579104781150818, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 146 + }, + { + "completion_length": 8.5, + "epoch": 0.025757841247590677, + "grad_norm": 22.182762747653655, + "kl": 0.041259765625, + "learning_rate": 9.744173821622569e-07, + "loss": 0.0165, + "reward": 1.6844103336334229, + "reward_std": 0.13641130924224854, + "rewards/accuracy_reward_stage2": 0.6844102144241333, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 147 + }, + { + "completion_length": 13.921875, + "epoch": 0.025933064657438235, + "grad_norm": 16.81373447634611, + "kl": 0.1357421875, + "learning_rate": 9.742421587524092e-07, + "loss": 0.0544, + "reward": 1.344390869140625, + "reward_std": 0.13242456316947937, + "rewards/accuracy_reward_stage2": 0.46939074993133545, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 148 + }, + { + "completion_length": 15.359375, + "epoch": 0.02610828806728579, + "grad_norm": 60.27386553230432, + "kl": 0.6953125, + "learning_rate": 9.740669353425617e-07, + "loss": 0.2776, + "reward": 1.1980289220809937, + "reward_std": 0.04343012720346451, + "rewards/accuracy_reward_stage2": 0.4480289816856384, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 149 + }, + { + "completion_length": 18.3125, + "epoch": 0.026283511477133346, + "grad_norm": 24.141913410064443, + "kl": 0.06884765625, + "learning_rate": 9.738917119327141e-07, + "loss": 0.0275, + "reward": 1.2728391885757446, + "reward_std": 0.19992250204086304, + "rewards/accuracy_reward_stage2": 0.39783918857574463, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 150 + }, + { + "completion_length": 11.3125, + "epoch": 0.0264587348869809, + "grad_norm": 24.36720305750165, + "kl": 0.10791015625, + "learning_rate": 9.737164885228666e-07, + "loss": 0.0433, + "reward": 1.547379970550537, + "reward_std": 0.1736781895160675, + "rewards/accuracy_reward_stage2": 0.6723799109458923, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 151 + }, + { + "completion_length": 14.453125, + "epoch": 0.026633958296828457, + "grad_norm": 22.95786679226241, + "kl": 0.06494140625, + "learning_rate": 9.735412651130191e-07, + "loss": 0.0259, + "reward": 1.5407392978668213, + "reward_std": 0.15379472076892853, + "rewards/accuracy_reward_stage2": 0.5407392978668213, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 152 + }, + { + "completion_length": 8.1875, + "epoch": 0.02680918170667601, + "grad_norm": 21.91910468296093, + "kl": 0.287109375, + "learning_rate": 9.733660417031714e-07, + "loss": 0.1147, + "reward": 1.4202229976654053, + "reward_std": 0.07398553192615509, + "rewards/accuracy_reward_stage2": 0.5452229976654053, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 153 + }, + { + "completion_length": 11.21875, + "epoch": 0.026984405116523568, + "grad_norm": 22.202942153600965, + "kl": 0.03173828125, + "learning_rate": 9.73190818293324e-07, + "loss": 0.0127, + "reward": 1.4828336238861084, + "reward_std": 0.1846798062324524, + "rewards/accuracy_reward_stage2": 0.4828336834907532, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 154 + }, + { + "completion_length": 14.796875, + "epoch": 0.027159628526371122, + "grad_norm": 16.491804505949233, + "kl": 0.031005859375, + "learning_rate": 9.730155948834764e-07, + "loss": -0.0318, + "reward": 1.6083829402923584, + "reward_std": 0.1673525720834732, + "rewards/accuracy_reward_stage2": 0.6240079402923584, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 155 + }, + { + "completion_length": 7.953125, + "epoch": 0.02733485193621868, + "grad_norm": 19.67446432653083, + "kl": 0.01806640625, + "learning_rate": 9.728403714736289e-07, + "loss": -0.0369, + "reward": 1.6960554122924805, + "reward_std": 0.167361319065094, + "rewards/accuracy_reward_stage2": 0.7116804718971252, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 156 + }, + { + "completion_length": 8.796875, + "epoch": 0.027510075346066233, + "grad_norm": 17.68099975023594, + "kl": 0.1435546875, + "learning_rate": 9.726651480637814e-07, + "loss": 0.0572, + "reward": 1.2521253824234009, + "reward_std": 0.09048113971948624, + "rewards/accuracy_reward_stage2": 0.5021253824234009, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 157 + }, + { + "completion_length": 9.84375, + "epoch": 0.02768529875591379, + "grad_norm": 19.589334117115808, + "kl": 0.02685546875, + "learning_rate": 9.724899246539337e-07, + "loss": 0.0107, + "reward": 1.4374645948410034, + "reward_std": 0.22100204229354858, + "rewards/accuracy_reward_stage2": 0.5624645352363586, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 158 + }, + { + "completion_length": 7.90625, + "epoch": 0.027860522165761344, + "grad_norm": 16.88466818293425, + "kl": 0.1435546875, + "learning_rate": 9.723147012440862e-07, + "loss": 0.0575, + "reward": 1.286747694015503, + "reward_std": 0.07868210971355438, + "rewards/accuracy_reward_stage2": 0.41174769401550293, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 159 + }, + { + "completion_length": 9.859375, + "epoch": 0.0280357455756089, + "grad_norm": 24.05845918121416, + "kl": 0.287109375, + "learning_rate": 9.721394778342387e-07, + "loss": 0.0704, + "reward": 1.385817289352417, + "reward_std": 0.24782794713974, + "rewards/accuracy_reward_stage2": 0.5420673489570618, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 160 + }, + { + "completion_length": 11.796875, + "epoch": 0.028210968985456455, + "grad_norm": 24.903654926382757, + "kl": 0.2236328125, + "learning_rate": 9.71964254424391e-07, + "loss": 0.0894, + "reward": 1.41621732711792, + "reward_std": 0.21123412251472473, + "rewards/accuracy_reward_stage2": 0.5412173271179199, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 161 + }, + { + "completion_length": 5.984375, + "epoch": 0.028386192395304013, + "grad_norm": 18.93121586861874, + "kl": 0.08984375, + "learning_rate": 9.717890310145434e-07, + "loss": 0.0143, + "reward": 1.6027777194976807, + "reward_std": 0.15436765551567078, + "rewards/accuracy_reward_stage2": 0.7434027791023254, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 162 + }, + { + "completion_length": 12.09375, + "epoch": 0.028561415805151567, + "grad_norm": 21.863912604715342, + "kl": 0.035888671875, + "learning_rate": 9.71613807604696e-07, + "loss": 0.0143, + "reward": 1.4885514974594116, + "reward_std": 0.197072833776474, + "rewards/accuracy_reward_stage2": 0.4885514974594116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 163 + }, + { + "completion_length": 9.609375, + "epoch": 0.028736639214999124, + "grad_norm": 12.925116224769734, + "kl": 0.0289306640625, + "learning_rate": 9.714385841948484e-07, + "loss": -0.0258, + "reward": 1.3930555582046509, + "reward_std": 0.1789308786392212, + "rewards/accuracy_reward_stage2": 0.5336805582046509, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 164 + }, + { + "completion_length": 10.921875, + "epoch": 0.028911862624846678, + "grad_norm": 23.33889593451729, + "kl": 0.12255859375, + "learning_rate": 9.71263360785001e-07, + "loss": 0.049, + "reward": 1.521234154701233, + "reward_std": 0.2500312328338623, + "rewards/accuracy_reward_stage2": 0.5212341547012329, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 165 + }, + { + "completion_length": 17.75, + "epoch": 0.029087086034694235, + "grad_norm": 27.358894272466987, + "kl": 0.038818359375, + "learning_rate": 9.710881373751532e-07, + "loss": 0.0156, + "reward": 1.204958200454712, + "reward_std": 0.1550707370042801, + "rewards/accuracy_reward_stage2": 0.7049582004547119, + "rewards/format_reward_stage1_pointerpad": 0.5, + "scores/accuracy_reward_stage2": 0.5, + "step": 166 + }, + { + "completion_length": 11.21875, + "epoch": 0.029262309444541793, + "grad_norm": 469.49636178742867, + "kl": 1.5078125, + "learning_rate": 9.709129139653057e-07, + "loss": 0.569, + "reward": 1.3361520767211914, + "reward_std": 0.29897385835647583, + "rewards/accuracy_reward_stage2": 0.6017770767211914, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 167 + }, + { + "completion_length": 12.328125, + "epoch": 0.029437532854389346, + "grad_norm": 239.23585360135849, + "kl": 0.828125, + "learning_rate": 9.707376905554582e-07, + "loss": 0.2873, + "reward": 1.2365100383758545, + "reward_std": 0.15128737688064575, + "rewards/accuracy_reward_stage2": 0.5021350383758545, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 168 + }, + { + "completion_length": 8.453125, + "epoch": 0.029612756264236904, + "grad_norm": 24.00686702492152, + "kl": 0.07861328125, + "learning_rate": 9.705624671456107e-07, + "loss": 0.0315, + "reward": 1.5308879613876343, + "reward_std": 0.12414561212062836, + "rewards/accuracy_reward_stage2": 0.6558879613876343, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 169 + }, + { + "completion_length": 10.9375, + "epoch": 0.029787979674084458, + "grad_norm": 16.30119126944678, + "kl": 0.1953125, + "learning_rate": 9.703872437357632e-07, + "loss": 0.0782, + "reward": 1.1700856685638428, + "reward_std": 0.08398524671792984, + "rewards/accuracy_reward_stage2": 0.42008569836616516, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 170 + }, + { + "completion_length": 11.578125, + "epoch": 0.029963203083932015, + "grad_norm": 19.33276262182998, + "kl": 0.0791015625, + "learning_rate": 9.702120203259154e-07, + "loss": 0.0316, + "reward": 1.5743356943130493, + "reward_std": 0.08988235145807266, + "rewards/accuracy_reward_stage2": 0.6993356943130493, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 171 + }, + { + "completion_length": 7.609375, + "epoch": 0.03013842649377957, + "grad_norm": 17.819788413421964, + "kl": 0.024169921875, + "learning_rate": 9.70036796916068e-07, + "loss": 0.0097, + "reward": 1.546875, + "reward_std": 0.19044628739356995, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 172 + }, + { + "completion_length": 8.59375, + "epoch": 0.030313649903627126, + "grad_norm": 17.311509596973274, + "kl": 0.017578125, + "learning_rate": 9.698615735062204e-07, + "loss": 0.007, + "reward": 1.609375, + "reward_std": 0.23144522309303284, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 173 + }, + { + "completion_length": 15.359375, + "epoch": 0.03048887331347468, + "grad_norm": 18.7946667032825, + "kl": 0.2890625, + "learning_rate": 9.696863500963727e-07, + "loss": 0.1151, + "reward": 1.5292927026748657, + "reward_std": 0.15632925927639008, + "rewards/accuracy_reward_stage2": 0.6542927026748657, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 174 + }, + { + "completion_length": 9.875, + "epoch": 0.030664096723322237, + "grad_norm": 23.606318369291483, + "kl": 0.0322265625, + "learning_rate": 9.695111266865252e-07, + "loss": 0.0129, + "reward": 1.5562366247177124, + "reward_std": 0.22413820028305054, + "rewards/accuracy_reward_stage2": 0.5562366247177124, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 175 + }, + { + "completion_length": 6.15625, + "epoch": 0.03083932013316979, + "grad_norm": 20.00259671512067, + "kl": 0.051025390625, + "learning_rate": 9.693359032766777e-07, + "loss": 0.0204, + "reward": 1.7697513103485107, + "reward_std": 0.18249884247779846, + "rewards/accuracy_reward_stage2": 0.7697513103485107, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 176 + }, + { + "completion_length": 13.484375, + "epoch": 0.03101454354301735, + "grad_norm": 20.382631126127297, + "kl": 0.08544921875, + "learning_rate": 9.691606798668302e-07, + "loss": 0.0341, + "reward": 1.5370434522628784, + "reward_std": 0.2536548376083374, + "rewards/accuracy_reward_stage2": 0.5370435118675232, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 177 + }, + { + "completion_length": 7.84375, + "epoch": 0.031189766952864902, + "grad_norm": 15.427236203193937, + "kl": 0.03759765625, + "learning_rate": 9.689854564569827e-07, + "loss": 0.0151, + "reward": 1.4042786359786987, + "reward_std": 0.19416913390159607, + "rewards/accuracy_reward_stage2": 0.5292786359786987, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 178 + }, + { + "completion_length": 5.234375, + "epoch": 0.03136499036271246, + "grad_norm": 16.107007209106428, + "kl": 0.0294189453125, + "learning_rate": 9.68810233047135e-07, + "loss": 0.0118, + "reward": 1.453125, + "reward_std": 0.12255740165710449, + "rewards/accuracy_reward_stage2": 0.453125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 179 + }, + { + "completion_length": 8.0, + "epoch": 0.03154021377256001, + "grad_norm": 29.971484972401523, + "kl": 0.146484375, + "learning_rate": 9.686350096372874e-07, + "loss": 0.0584, + "reward": 1.516639232635498, + "reward_std": 0.3135683536529541, + "rewards/accuracy_reward_stage2": 0.5166392922401428, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 180 + }, + { + "completion_length": 18.203125, + "epoch": 0.03171543718240757, + "grad_norm": 20.36688864200073, + "kl": 0.56640625, + "learning_rate": 9.6845978622744e-07, + "loss": 0.227, + "reward": 1.384101390838623, + "reward_std": 0.19463613629341125, + "rewards/accuracy_reward_stage2": 0.5091014504432678, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 181 + }, + { + "completion_length": 9.65625, + "epoch": 0.03189066059225513, + "grad_norm": 24.92217917167458, + "kl": 0.314453125, + "learning_rate": 9.682845628175924e-07, + "loss": 0.0966, + "reward": 1.561603307723999, + "reward_std": 0.21581391990184784, + "rewards/accuracy_reward_stage2": 0.702228307723999, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 182 + }, + { + "completion_length": 7.84375, + "epoch": 0.03206588400210268, + "grad_norm": 17.739746197602084, + "kl": 0.2294921875, + "learning_rate": 9.68109339407745e-07, + "loss": 0.0914, + "reward": 1.410539150238037, + "reward_std": 0.17888331413269043, + "rewards/accuracy_reward_stage2": 0.5355392098426819, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 183 + }, + { + "completion_length": 6.40625, + "epoch": 0.032241107411950236, + "grad_norm": 21.32780493688504, + "kl": 0.017333984375, + "learning_rate": 9.679341159978974e-07, + "loss": 0.0069, + "reward": 1.5488324165344238, + "reward_std": 0.17159403860569, + "rewards/accuracy_reward_stage2": 0.5488324165344238, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 184 + }, + { + "completion_length": 8.65625, + "epoch": 0.03241633082179779, + "grad_norm": 10.965259085064249, + "kl": 0.0186767578125, + "learning_rate": 9.677588925880497e-07, + "loss": 0.0075, + "reward": 1.53125, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 185 + }, + { + "completion_length": 6.46875, + "epoch": 0.03259155423164535, + "grad_norm": 31.56035135634499, + "kl": 0.025146484375, + "learning_rate": 9.675836691782022e-07, + "loss": 0.01, + "reward": 1.390625, + "reward_std": 0.2688094973564148, + "rewards/accuracy_reward_stage2": 0.390625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 186 + }, + { + "completion_length": 9.453125, + "epoch": 0.032766777641492904, + "grad_norm": 21.307798882002583, + "kl": 0.0859375, + "learning_rate": 9.674084457683545e-07, + "loss": 0.0344, + "reward": 1.7048611640930176, + "reward_std": 0.1740472912788391, + "rewards/accuracy_reward_stage2": 0.7048612236976624, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 187 + }, + { + "completion_length": 12.828125, + "epoch": 0.03294200105134046, + "grad_norm": 27.47034209459053, + "kl": 0.5703125, + "learning_rate": 9.67233222358507e-07, + "loss": 0.2284, + "reward": 1.5460162162780762, + "reward_std": 0.13096138834953308, + "rewards/accuracy_reward_stage2": 0.671016275882721, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 188 + }, + { + "completion_length": 11.078125, + "epoch": 0.03311722446118801, + "grad_norm": 613.5396132456912, + "kl": 0.87109375, + "learning_rate": 9.670579989486595e-07, + "loss": 0.3496, + "reward": 1.3907642364501953, + "reward_std": 0.11250603199005127, + "rewards/accuracy_reward_stage2": 0.5157641768455505, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 189 + }, + { + "completion_length": 11.125, + "epoch": 0.03329244787103557, + "grad_norm": 22.822592319874765, + "kl": 0.06884765625, + "learning_rate": 9.66882775538812e-07, + "loss": 0.0274, + "reward": 1.5781188011169434, + "reward_std": 0.17464013397693634, + "rewards/accuracy_reward_stage2": 0.5781188011169434, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 190 + }, + { + "completion_length": 12.0, + "epoch": 0.03346767128088313, + "grad_norm": 23.259903802071136, + "kl": 0.07763671875, + "learning_rate": 9.667075521289644e-07, + "loss": 0.031, + "reward": 1.4189984798431396, + "reward_std": 0.2558039128780365, + "rewards/accuracy_reward_stage2": 0.4189985394477844, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 191 + }, + { + "completion_length": 9.234375, + "epoch": 0.03364289469073068, + "grad_norm": 28.16087420682577, + "kl": 0.04296875, + "learning_rate": 9.66532328719117e-07, + "loss": 0.0172, + "reward": 1.5628974437713623, + "reward_std": 0.2809803783893585, + "rewards/accuracy_reward_stage2": 0.5628974437713623, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 192 + }, + { + "completion_length": 8.625, + "epoch": 0.033818118100578234, + "grad_norm": 18.956653677151603, + "kl": 0.099609375, + "learning_rate": 9.663571053092692e-07, + "loss": 0.0399, + "reward": 1.7109836339950562, + "reward_std": 0.09243927150964737, + "rewards/accuracy_reward_stage2": 0.7109836339950562, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 193 + }, + { + "completion_length": 7.90625, + "epoch": 0.033993341510425795, + "grad_norm": 20.13826896551377, + "kl": 0.04833984375, + "learning_rate": 9.661818818994217e-07, + "loss": 0.0194, + "reward": 1.592308759689331, + "reward_std": 0.09325343370437622, + "rewards/accuracy_reward_stage2": 0.592308759689331, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 194 + }, + { + "completion_length": 13.1875, + "epoch": 0.03416856492027335, + "grad_norm": 29.30599734850387, + "kl": 0.03173828125, + "learning_rate": 9.660066584895742e-07, + "loss": 0.0127, + "reward": 1.53125, + "reward_std": 0.28566449880599976, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 195 + }, + { + "completion_length": 14.375, + "epoch": 0.0343437883301209, + "grad_norm": 47.13706959580855, + "kl": 0.2236328125, + "learning_rate": 9.658314350797267e-07, + "loss": 0.0496, + "reward": 1.4006702899932861, + "reward_std": 0.13923153281211853, + "rewards/accuracy_reward_stage2": 0.5412952303886414, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 196 + }, + { + "completion_length": 8.3125, + "epoch": 0.03451901173996846, + "grad_norm": 22.833902373519475, + "kl": 0.1318359375, + "learning_rate": 9.656562116698792e-07, + "loss": 0.0524, + "reward": 1.2781250476837158, + "reward_std": 0.12562815845012665, + "rewards/accuracy_reward_stage2": 0.528124988079071, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 197 + }, + { + "completion_length": 16.59375, + "epoch": 0.03469423514981602, + "grad_norm": 665.8480304252839, + "kl": 3.71875, + "learning_rate": 9.654809882600315e-07, + "loss": 1.4827, + "reward": 1.6354167461395264, + "reward_std": 0.1997472047805786, + "rewards/accuracy_reward_stage2": 0.8854166865348816, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 198 + }, + { + "completion_length": 10.0, + "epoch": 0.03486945855966357, + "grad_norm": 24.530226882177114, + "kl": 0.115234375, + "learning_rate": 9.65305764850184e-07, + "loss": 0.0461, + "reward": 1.4895833730697632, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.6145833730697632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 199 + }, + { + "completion_length": 7.5, + "epoch": 0.035044681969511125, + "grad_norm": 16.836862354365994, + "kl": 0.0072021484375, + "learning_rate": 9.651305414403364e-07, + "loss": 0.0029, + "reward": 1.490378499031067, + "reward_std": 0.11636392772197723, + "rewards/accuracy_reward_stage2": 0.6153784990310669, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 200 + }, + { + "completion_length": 11.125, + "epoch": 0.03521990537935868, + "grad_norm": 22.060189107320923, + "kl": 0.054931640625, + "learning_rate": 9.649553180304887e-07, + "loss": 0.022, + "reward": 1.3880213499069214, + "reward_std": 0.16327911615371704, + "rewards/accuracy_reward_stage2": 0.3880213499069214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 201 + }, + { + "completion_length": 8.25, + "epoch": 0.03539512878920624, + "grad_norm": 19.505067645334403, + "kl": 0.039306640625, + "learning_rate": 9.647800946206412e-07, + "loss": -0.0284, + "reward": 1.773182988166809, + "reward_std": 0.1636386662721634, + "rewards/accuracy_reward_stage2": 0.7888079881668091, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 202 + }, + { + "completion_length": 11.125, + "epoch": 0.035570352199053794, + "grad_norm": 26.28569376820353, + "kl": 0.0673828125, + "learning_rate": 9.646048712107937e-07, + "loss": 0.0269, + "reward": 1.1831471920013428, + "reward_std": 0.20850315690040588, + "rewards/accuracy_reward_stage2": 0.43314728140830994, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 203 + }, + { + "completion_length": 10.1875, + "epoch": 0.03574557560890135, + "grad_norm": 22.462333146273004, + "kl": 0.06640625, + "learning_rate": 9.644296478009462e-07, + "loss": 0.0265, + "reward": 1.3284682035446167, + "reward_std": 0.23116865754127502, + "rewards/accuracy_reward_stage2": 0.3284682035446167, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 204 + }, + { + "completion_length": 23.359375, + "epoch": 0.0359207990187489, + "grad_norm": 43.353709695881726, + "kl": 0.040283203125, + "learning_rate": 9.642544243910987e-07, + "loss": 0.0162, + "reward": 1.2655521631240845, + "reward_std": 0.29400354623794556, + "rewards/accuracy_reward_stage2": 0.26555219292640686, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 205 + }, + { + "completion_length": 8.25, + "epoch": 0.03609602242859646, + "grad_norm": 17.987108141910905, + "kl": 0.04248046875, + "learning_rate": 9.64079200981251e-07, + "loss": 0.0171, + "reward": 1.8145318031311035, + "reward_std": 0.21234536170959473, + "rewards/accuracy_reward_stage2": 0.8145317435264587, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 206 + }, + { + "completion_length": 10.703125, + "epoch": 0.036271245838444016, + "grad_norm": 27.1027412842586, + "kl": 0.412109375, + "learning_rate": 9.639039775714035e-07, + "loss": 0.1648, + "reward": 1.2671058177947998, + "reward_std": 0.24343198537826538, + "rewards/accuracy_reward_stage2": 0.5171056985855103, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 207 + }, + { + "completion_length": 11.21875, + "epoch": 0.03644646924829157, + "grad_norm": 11.33940008261332, + "kl": 0.02978515625, + "learning_rate": 9.63728754161556e-07, + "loss": 0.0119, + "reward": 1.3489539623260498, + "reward_std": 0.0795942097902298, + "rewards/accuracy_reward_stage2": 0.3489539623260498, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 208 + }, + { + "completion_length": 7.265625, + "epoch": 0.03662169265813913, + "grad_norm": 18.64581413706451, + "kl": 0.00799560546875, + "learning_rate": 9.635535307517085e-07, + "loss": -0.041, + "reward": 1.5224037170410156, + "reward_std": 0.18349644541740417, + "rewards/accuracy_reward_stage2": 0.5380287170410156, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 209 + }, + { + "completion_length": 10.203125, + "epoch": 0.036796916067986685, + "grad_norm": 26.531838880659922, + "kl": 0.06494140625, + "learning_rate": 9.63378307341861e-07, + "loss": 0.026, + "reward": 1.6403162479400635, + "reward_std": 0.36213648319244385, + "rewards/accuracy_reward_stage2": 0.640316367149353, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 210 + }, + { + "completion_length": 11.421875, + "epoch": 0.03697213947783424, + "grad_norm": 19.67305372025666, + "kl": 0.142578125, + "learning_rate": 9.632030839320132e-07, + "loss": 0.0571, + "reward": 1.446283221244812, + "reward_std": 0.11086312681436539, + "rewards/accuracy_reward_stage2": 0.5712832808494568, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 211 + }, + { + "completion_length": 7.921875, + "epoch": 0.03714736288768179, + "grad_norm": 18.73395276268249, + "kl": 0.0216064453125, + "learning_rate": 9.630278605221657e-07, + "loss": 0.0087, + "reward": 1.742701530456543, + "reward_std": 0.164242684841156, + "rewards/accuracy_reward_stage2": 0.7427014708518982, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 212 + }, + { + "completion_length": 15.359375, + "epoch": 0.03732258629752935, + "grad_norm": 18.465353708149525, + "kl": 0.045654296875, + "learning_rate": 9.628526371123182e-07, + "loss": 0.0183, + "reward": 1.1487271785736084, + "reward_std": 0.11971971392631531, + "rewards/accuracy_reward_stage2": 0.2737271785736084, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 213 + }, + { + "completion_length": 10.8125, + "epoch": 0.03749780970737691, + "grad_norm": 17.321449052890124, + "kl": 0.02392578125, + "learning_rate": 9.626774137024705e-07, + "loss": 0.0096, + "reward": 1.5291376113891602, + "reward_std": 0.15100376307964325, + "rewards/accuracy_reward_stage2": 0.5291374921798706, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 214 + }, + { + "completion_length": 7.75, + "epoch": 0.03767303311722446, + "grad_norm": 21.689215854097338, + "kl": 0.12109375, + "learning_rate": 9.62502190292623e-07, + "loss": 0.0485, + "reward": 1.4410247802734375, + "reward_std": 0.3011726140975952, + "rewards/accuracy_reward_stage2": 0.5660248398780823, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 215 + }, + { + "completion_length": 8.359375, + "epoch": 0.037848256527072015, + "grad_norm": 23.09518268911407, + "kl": 0.0869140625, + "learning_rate": 9.623269668827755e-07, + "loss": -0.0094, + "reward": 1.4675501585006714, + "reward_std": 0.1814957708120346, + "rewards/accuracy_reward_stage2": 0.483175128698349, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 216 + }, + { + "completion_length": 12.734375, + "epoch": 0.038023479936919576, + "grad_norm": 19.483904450481027, + "kl": 0.11962890625, + "learning_rate": 9.62151743472928e-07, + "loss": 0.0477, + "reward": 1.4069256782531738, + "reward_std": 0.17002731561660767, + "rewards/accuracy_reward_stage2": 0.531925618648529, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 217 + }, + { + "completion_length": 6.53125, + "epoch": 0.03819870334676713, + "grad_norm": 38.87952022905316, + "kl": 0.09521484375, + "learning_rate": 9.619765200630805e-07, + "loss": -0.006, + "reward": 1.8126009702682495, + "reward_std": 0.19280743598937988, + "rewards/accuracy_reward_stage2": 0.8282259702682495, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 218 + }, + { + "completion_length": 9.546875, + "epoch": 0.03837392675661468, + "grad_norm": 18.565108749119865, + "kl": 0.1416015625, + "learning_rate": 9.618012966532327e-07, + "loss": 0.0565, + "reward": 1.5609869956970215, + "reward_std": 0.1407136768102646, + "rewards/accuracy_reward_stage2": 0.8109869956970215, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 219 + }, + { + "completion_length": 8.65625, + "epoch": 0.03854915016646224, + "grad_norm": 15.210793388574377, + "kl": 0.01385498046875, + "learning_rate": 9.616260732433852e-07, + "loss": 0.0055, + "reward": 1.5824074745178223, + "reward_std": 0.16188913583755493, + "rewards/accuracy_reward_stage2": 0.5824074149131775, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 220 + }, + { + "completion_length": 8.984375, + "epoch": 0.0387243735763098, + "grad_norm": 21.186984746548617, + "kl": 0.056396484375, + "learning_rate": 9.614508498335377e-07, + "loss": 0.0225, + "reward": 1.4985287189483643, + "reward_std": 0.12607437372207642, + "rewards/accuracy_reward_stage2": 0.49852871894836426, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 221 + }, + { + "completion_length": 11.15625, + "epoch": 0.03889959698615735, + "grad_norm": 16.93524040273681, + "kl": 0.060546875, + "learning_rate": 9.612756264236902e-07, + "loss": 0.0243, + "reward": 1.6816024780273438, + "reward_std": 0.13517498970031738, + "rewards/accuracy_reward_stage2": 0.6816024780273438, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 222 + }, + { + "completion_length": 10.109375, + "epoch": 0.039074820396004906, + "grad_norm": 23.577441458972622, + "kl": 0.07275390625, + "learning_rate": 9.611004030138427e-07, + "loss": 0.0292, + "reward": 1.5530338287353516, + "reward_std": 0.18569764494895935, + "rewards/accuracy_reward_stage2": 0.5530339479446411, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 223 + }, + { + "completion_length": 15.015625, + "epoch": 0.03925004380585246, + "grad_norm": 6141.6201343970815, + "kl": 15.6875, + "learning_rate": 9.60925179603995e-07, + "loss": 6.2679, + "reward": 1.1519737243652344, + "reward_std": 0.23728567361831665, + "rewards/accuracy_reward_stage2": 0.417598694562912, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 224 + }, + { + "completion_length": 10.21875, + "epoch": 0.03942526721570002, + "grad_norm": 21.489482009145725, + "kl": 0.0240478515625, + "learning_rate": 9.607499561941475e-07, + "loss": 0.0096, + "reward": 1.7004191875457764, + "reward_std": 0.15592418611049652, + "rewards/accuracy_reward_stage2": 0.7004191875457764, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 225 + }, + { + "completion_length": 9.953125, + "epoch": 0.039600490625547574, + "grad_norm": 16.51799687820737, + "kl": 0.0250244140625, + "learning_rate": 9.605747327843e-07, + "loss": 0.01, + "reward": 1.6579914093017578, + "reward_std": 0.16353479027748108, + "rewards/accuracy_reward_stage2": 0.657991349697113, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 226 + }, + { + "completion_length": 36.53125, + "epoch": 0.03977571403539513, + "grad_norm": 20.3875835203367, + "kl": 0.018310546875, + "learning_rate": 9.603995093744523e-07, + "loss": 0.0073, + "reward": 1.589333176612854, + "reward_std": 0.1167701929807663, + "rewards/accuracy_reward_stage2": 0.589333176612854, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 227 + }, + { + "completion_length": 9.78125, + "epoch": 0.03995093744524268, + "grad_norm": 20.26695770586334, + "kl": 0.061279296875, + "learning_rate": 9.602242859646048e-07, + "loss": -0.0197, + "reward": 1.712099552154541, + "reward_std": 0.15879106521606445, + "rewards/accuracy_reward_stage2": 0.727724552154541, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 228 + }, + { + "completion_length": 10.984375, + "epoch": 0.04012616085509024, + "grad_norm": 21.453556659540432, + "kl": 0.0291748046875, + "learning_rate": 9.600490625547573e-07, + "loss": -0.0278, + "reward": 1.6276353597640991, + "reward_std": 0.25666865706443787, + "rewards/accuracy_reward_stage2": 0.6432603001594543, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 229 + }, + { + "completion_length": 8.453125, + "epoch": 0.040301384264937797, + "grad_norm": 24.431506061785814, + "kl": 0.193359375, + "learning_rate": 9.598738391449097e-07, + "loss": 0.0777, + "reward": 1.531674861907959, + "reward_std": 0.14275917410850525, + "rewards/accuracy_reward_stage2": 0.6566749811172485, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 230 + }, + { + "completion_length": 10.921875, + "epoch": 0.04047660767478535, + "grad_norm": 17.59455777114902, + "kl": 0.12353515625, + "learning_rate": 9.596986157350622e-07, + "loss": 0.0052, + "reward": 1.4566197395324707, + "reward_std": 0.21315625309944153, + "rewards/accuracy_reward_stage2": 0.47224482893943787, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 231 + }, + { + "completion_length": 11.5625, + "epoch": 0.040651831084632904, + "grad_norm": 16.538614976828548, + "kl": 0.0478515625, + "learning_rate": 9.595233923252145e-07, + "loss": 0.0192, + "reward": 1.4099462032318115, + "reward_std": 0.165542870759964, + "rewards/accuracy_reward_stage2": 0.4099462628364563, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 232 + }, + { + "completion_length": 19.921875, + "epoch": 0.040827054494480465, + "grad_norm": 15.825866121650826, + "kl": 0.035888671875, + "learning_rate": 9.59348168915367e-07, + "loss": 0.0144, + "reward": 1.3888517618179321, + "reward_std": 0.10451022535562515, + "rewards/accuracy_reward_stage2": 0.38885173201560974, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 233 + }, + { + "completion_length": 8.59375, + "epoch": 0.04100227790432802, + "grad_norm": 29.579691766926054, + "kl": 0.0294189453125, + "learning_rate": 9.591729455055195e-07, + "loss": 0.0118, + "reward": 1.328125, + "reward_std": 0.22673699259757996, + "rewards/accuracy_reward_stage2": 0.453125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 234 + }, + { + "completion_length": 10.96875, + "epoch": 0.04117750131417557, + "grad_norm": 20.924805494763422, + "kl": 0.0791015625, + "learning_rate": 9.58997722095672e-07, + "loss": 0.0316, + "reward": 1.5717337131500244, + "reward_std": 0.08275075256824493, + "rewards/accuracy_reward_stage2": 0.5717335939407349, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 235 + }, + { + "completion_length": 9.875, + "epoch": 0.04135272472402313, + "grad_norm": 23.14742741754885, + "kl": 0.03076171875, + "learning_rate": 9.588224986858245e-07, + "loss": 0.0123, + "reward": 1.7143429517745972, + "reward_std": 0.18178583681583405, + "rewards/accuracy_reward_stage2": 0.7143428921699524, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 236 + }, + { + "completion_length": 10.59375, + "epoch": 0.04152794813387069, + "grad_norm": 16.969631438431694, + "kl": 0.07666015625, + "learning_rate": 9.586472752759768e-07, + "loss": 0.0306, + "reward": 1.559175729751587, + "reward_std": 0.13105592131614685, + "rewards/accuracy_reward_stage2": 0.5591757297515869, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 237 + }, + { + "completion_length": 11.328125, + "epoch": 0.04170317154371824, + "grad_norm": 21.50047187718676, + "kl": 0.0458984375, + "learning_rate": 9.584720518661293e-07, + "loss": 0.0183, + "reward": 1.4090485572814941, + "reward_std": 0.19534313678741455, + "rewards/accuracy_reward_stage2": 0.4090486168861389, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 238 + }, + { + "completion_length": 12.5625, + "epoch": 0.041878394953565795, + "grad_norm": 13.322354332909555, + "kl": 0.023681640625, + "learning_rate": 9.582968284562818e-07, + "loss": 0.0095, + "reward": 1.4014296531677246, + "reward_std": 0.14201299846172333, + "rewards/accuracy_reward_stage2": 0.4014296531677246, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 239 + }, + { + "completion_length": 8.84375, + "epoch": 0.04205361836341335, + "grad_norm": 105.06945037302584, + "kl": 0.14453125, + "learning_rate": 9.58121605046434e-07, + "loss": 0.0242, + "reward": 1.6751351356506348, + "reward_std": 0.24619035422801971, + "rewards/accuracy_reward_stage2": 0.6907602548599243, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 240 + }, + { + "completion_length": 8.578125, + "epoch": 0.04222884177326091, + "grad_norm": 25.160030176481186, + "kl": 0.035888671875, + "learning_rate": 9.579463816365865e-07, + "loss": 0.0144, + "reward": 1.5575356483459473, + "reward_std": 0.26918134093284607, + "rewards/accuracy_reward_stage2": 0.6825356483459473, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 241 + }, + { + "completion_length": 10.609375, + "epoch": 0.042404065183108464, + "grad_norm": 20.556650961134515, + "kl": 0.099609375, + "learning_rate": 9.57771158226739e-07, + "loss": 0.0398, + "reward": 1.4830882549285889, + "reward_std": 0.19809089601039886, + "rewards/accuracy_reward_stage2": 0.48308834433555603, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 242 + }, + { + "completion_length": 8.453125, + "epoch": 0.04257928859295602, + "grad_norm": 30.14382376308136, + "kl": 0.201171875, + "learning_rate": 9.575959348168915e-07, + "loss": 0.0806, + "reward": 1.6129651069641113, + "reward_std": 0.27128833532333374, + "rewards/accuracy_reward_stage2": 0.6129651665687561, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 243 + }, + { + "completion_length": 11.328125, + "epoch": 0.04275451200280357, + "grad_norm": 324.7190701022341, + "kl": 1.8359375, + "learning_rate": 9.57420711407044e-07, + "loss": 0.6912, + "reward": 1.7415674924850464, + "reward_std": 0.18711799383163452, + "rewards/accuracy_reward_stage2": 0.8821924328804016, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 244 + }, + { + "completion_length": 9.265625, + "epoch": 0.04292973541265113, + "grad_norm": 24.608966123142295, + "kl": 0.2734375, + "learning_rate": 9.572454879971965e-07, + "loss": 0.1092, + "reward": 1.4464468955993652, + "reward_std": 0.3318367302417755, + "rewards/accuracy_reward_stage2": 0.5714468955993652, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 245 + }, + { + "completion_length": 8.546875, + "epoch": 0.043104958822498686, + "grad_norm": 19.36548616027605, + "kl": 0.0284423828125, + "learning_rate": 9.570702645873488e-07, + "loss": 0.0114, + "reward": 1.5153286457061768, + "reward_std": 0.23770207166671753, + "rewards/accuracy_reward_stage2": 0.515328586101532, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 246 + }, + { + "completion_length": 14.15625, + "epoch": 0.04328018223234624, + "grad_norm": 22.541003255182787, + "kl": 0.0223388671875, + "learning_rate": 9.568950411775013e-07, + "loss": 0.009, + "reward": 1.5925946235656738, + "reward_std": 0.2566668391227722, + "rewards/accuracy_reward_stage2": 0.5925946235656738, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 247 + }, + { + "completion_length": 10.71875, + "epoch": 0.043455405642193794, + "grad_norm": 22.651546161743862, + "kl": 0.052734375, + "learning_rate": 9.567198177676538e-07, + "loss": 0.021, + "reward": 1.6145833730697632, + "reward_std": 0.21749193966388702, + "rewards/accuracy_reward_stage2": 0.6145833730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 248 + }, + { + "completion_length": 5.984375, + "epoch": 0.043630629052041354, + "grad_norm": 25.60105051795061, + "kl": 0.042724609375, + "learning_rate": 9.565445943578063e-07, + "loss": 0.0171, + "reward": 1.5998629331588745, + "reward_std": 0.181601881980896, + "rewards/accuracy_reward_stage2": 0.5998629331588745, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 249 + }, + { + "completion_length": 12.25, + "epoch": 0.04380585246188891, + "grad_norm": 26.37616634616468, + "kl": 0.23046875, + "learning_rate": 9.563693709479585e-07, + "loss": 0.0926, + "reward": 1.4369782209396362, + "reward_std": 0.2544781565666199, + "rewards/accuracy_reward_stage2": 0.5619782209396362, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 250 + }, + { + "completion_length": 11.46875, + "epoch": 0.04398107587173646, + "grad_norm": 19.591330163004343, + "kl": 0.05517578125, + "learning_rate": 9.56194147538111e-07, + "loss": 0.0221, + "reward": 1.517921805381775, + "reward_std": 0.14252689480781555, + "rewards/accuracy_reward_stage2": 0.5179218053817749, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 251 + }, + { + "completion_length": 13.09375, + "epoch": 0.04415629928158402, + "grad_norm": 15.21166450844923, + "kl": 0.059814453125, + "learning_rate": 9.560189241282635e-07, + "loss": 0.0239, + "reward": 1.4374809265136719, + "reward_std": 0.12924231588840485, + "rewards/accuracy_reward_stage2": 0.4374809265136719, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 252 + }, + { + "completion_length": 9.578125, + "epoch": 0.04433152269143158, + "grad_norm": 15.65910478320339, + "kl": 0.037841796875, + "learning_rate": 9.55843700718416e-07, + "loss": -0.0266, + "reward": 1.6806310415267944, + "reward_std": 0.13534963130950928, + "rewards/accuracy_reward_stage2": 0.6962560415267944, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 253 + }, + { + "completion_length": 12.265625, + "epoch": 0.04450674610127913, + "grad_norm": 13.699123733968174, + "kl": 0.060791015625, + "learning_rate": 9.556684773085683e-07, + "loss": 0.0243, + "reward": 1.563589096069336, + "reward_std": 0.07810796797275543, + "rewards/accuracy_reward_stage2": 0.5635892152786255, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 254 + }, + { + "completion_length": 8.0625, + "epoch": 0.044681969511126685, + "grad_norm": 18.550385686776043, + "kl": 0.0732421875, + "learning_rate": 9.554932538987208e-07, + "loss": 0.0293, + "reward": 1.4762643575668335, + "reward_std": 0.1577366590499878, + "rewards/accuracy_reward_stage2": 0.4762643277645111, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 255 + }, + { + "completion_length": 8.3125, + "epoch": 0.044857192920974245, + "grad_norm": 26.99970659587625, + "kl": 0.0869140625, + "learning_rate": 9.553180304888733e-07, + "loss": -0.1223, + "reward": 1.3463577032089233, + "reward_std": 0.3971595764160156, + "rewards/accuracy_reward_stage2": 0.40885767340660095, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 256 + }, + { + "completion_length": 11.203125, + "epoch": 0.0450324163308218, + "grad_norm": 22.516253290469027, + "kl": 0.039794921875, + "learning_rate": 9.551428070790258e-07, + "loss": 0.0159, + "reward": 1.410148024559021, + "reward_std": 0.1681969165802002, + "rewards/accuracy_reward_stage2": 0.4101479947566986, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 257 + }, + { + "completion_length": 7.203125, + "epoch": 0.04520763974066935, + "grad_norm": 19.288619552831214, + "kl": 0.1025390625, + "learning_rate": 9.549675836691783e-07, + "loss": -0.0266, + "reward": 1.5533853769302368, + "reward_std": 0.14150744676589966, + "rewards/accuracy_reward_stage2": 0.7096354365348816, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 258 + }, + { + "completion_length": 6.078125, + "epoch": 0.04538286315051691, + "grad_norm": 15.499767988058947, + "kl": 0.09619140625, + "learning_rate": 9.547923602593305e-07, + "loss": 0.002, + "reward": 1.5709052085876465, + "reward_std": 0.1841139942407608, + "rewards/accuracy_reward_stage2": 0.5865301489830017, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 259 + }, + { + "completion_length": 11.859375, + "epoch": 0.04555808656036447, + "grad_norm": 341.9481715113229, + "kl": 0.58984375, + "learning_rate": 9.54617136849483e-07, + "loss": 0.1918, + "reward": 1.475005865097046, + "reward_std": 0.13841910660266876, + "rewards/accuracy_reward_stage2": 0.4906309247016907, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 260 + }, + { + "completion_length": 15.5, + "epoch": 0.04573330997021202, + "grad_norm": 18.643239096535567, + "kl": 0.1357421875, + "learning_rate": 9.544419134396355e-07, + "loss": 0.0539, + "reward": 1.3988699913024902, + "reward_std": 0.15246807038784027, + "rewards/accuracy_reward_stage2": 0.523870050907135, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 261 + }, + { + "completion_length": 11.671875, + "epoch": 0.045908533380059575, + "grad_norm": 23.82966694525005, + "kl": 0.0654296875, + "learning_rate": 9.54266690029788e-07, + "loss": 0.0262, + "reward": 1.5144448280334473, + "reward_std": 0.19817392528057098, + "rewards/accuracy_reward_stage2": 0.5144447684288025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 262 + }, + { + "completion_length": 8.046875, + "epoch": 0.04608375678990713, + "grad_norm": 13.796449286878177, + "kl": 0.08203125, + "learning_rate": 9.540914666199403e-07, + "loss": 0.0329, + "reward": 1.663055419921875, + "reward_std": 0.05650103837251663, + "rewards/accuracy_reward_stage2": 0.663055419921875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 263 + }, + { + "completion_length": 10.6875, + "epoch": 0.04625898019975469, + "grad_norm": 21.257988314253687, + "kl": 0.0673828125, + "learning_rate": 9.539162432100928e-07, + "loss": 0.0269, + "reward": 1.5275135040283203, + "reward_std": 0.2669033110141754, + "rewards/accuracy_reward_stage2": 0.6525135040283203, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 264 + }, + { + "completion_length": 8.796875, + "epoch": 0.046434203609602244, + "grad_norm": 26.189523445618033, + "kl": 0.0301513671875, + "learning_rate": 9.537410198002453e-07, + "loss": -0.0308, + "reward": 1.8123893737792969, + "reward_std": 0.16830343008041382, + "rewards/accuracy_reward_stage2": 0.8280143737792969, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 265 + }, + { + "completion_length": 6.578125, + "epoch": 0.0466094270194498, + "grad_norm": 17.083602252194122, + "kl": 0.0172119140625, + "learning_rate": 9.535657963903977e-07, + "loss": 0.0069, + "reward": 1.4092938899993896, + "reward_std": 0.16730165481567383, + "rewards/accuracy_reward_stage2": 0.4092938303947449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 266 + }, + { + "completion_length": 10.3125, + "epoch": 0.04678465042929735, + "grad_norm": 37.184142614310694, + "kl": 0.028564453125, + "learning_rate": 9.533905729805502e-07, + "loss": -0.0328, + "reward": 1.5248304605484009, + "reward_std": 0.3038603663444519, + "rewards/accuracy_reward_stage2": 0.5404554605484009, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 267 + }, + { + "completion_length": 10.609375, + "epoch": 0.04695987383914491, + "grad_norm": 28.95806658900415, + "kl": 0.25390625, + "learning_rate": 9.532153495707026e-07, + "loss": 0.0159, + "reward": 1.2882633209228516, + "reward_std": 0.23559540510177612, + "rewards/accuracy_reward_stage2": 0.44451335072517395, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 268 + }, + { + "completion_length": 11.484375, + "epoch": 0.047135097248992466, + "grad_norm": 26.488973390928386, + "kl": 0.10888671875, + "learning_rate": 9.53040126160855e-07, + "loss": -0.0298, + "reward": 1.598239541053772, + "reward_std": 0.2586630582809448, + "rewards/accuracy_reward_stage2": 0.6294894814491272, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 269 + }, + { + "completion_length": 8.484375, + "epoch": 0.04731032065884002, + "grad_norm": 20.04576924598462, + "kl": 0.072265625, + "learning_rate": 9.528649027510075e-07, + "loss": 0.029, + "reward": 1.3794233798980713, + "reward_std": 0.20582807064056396, + "rewards/accuracy_reward_stage2": 0.5044234395027161, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 270 + }, + { + "completion_length": 11.609375, + "epoch": 0.047485544068687574, + "grad_norm": 20.371874049564635, + "kl": 0.08740234375, + "learning_rate": 9.526896793411599e-07, + "loss": 0.035, + "reward": 1.446256399154663, + "reward_std": 0.20240430533885956, + "rewards/accuracy_reward_stage2": 0.5712563395500183, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 271 + }, + { + "completion_length": 5.703125, + "epoch": 0.047660767478535135, + "grad_norm": 21.311882099524407, + "kl": 0.0888671875, + "learning_rate": 9.525144559313124e-07, + "loss": 0.0356, + "reward": 1.736009120941162, + "reward_std": 0.17594566941261292, + "rewards/accuracy_reward_stage2": 0.7360091209411621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 272 + }, + { + "completion_length": 9.15625, + "epoch": 0.04783599088838269, + "grad_norm": 29.67754488993763, + "kl": 0.0179443359375, + "learning_rate": 9.523392325214649e-07, + "loss": 0.0072, + "reward": 1.641929268836975, + "reward_std": 0.17333316802978516, + "rewards/accuracy_reward_stage2": 0.6419292688369751, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 273 + }, + { + "completion_length": 9.21875, + "epoch": 0.04801121429823024, + "grad_norm": 16.305746612398746, + "kl": 0.09521484375, + "learning_rate": 9.521640091116173e-07, + "loss": 0.0382, + "reward": 1.406163215637207, + "reward_std": 0.15640440583229065, + "rewards/accuracy_reward_stage2": 0.531163215637207, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 274 + }, + { + "completion_length": 18.671875, + "epoch": 0.048186437708077796, + "grad_norm": 19.104494178824076, + "kl": 0.361328125, + "learning_rate": 9.519887857017697e-07, + "loss": 0.1008, + "reward": 1.6140856742858887, + "reward_std": 0.1861516237258911, + "rewards/accuracy_reward_stage2": 0.7547106742858887, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 275 + }, + { + "completion_length": 6.765625, + "epoch": 0.04836166111792536, + "grad_norm": 19.587912810004013, + "kl": 0.061767578125, + "learning_rate": 9.518135622919221e-07, + "loss": 0.0247, + "reward": 1.8026357889175415, + "reward_std": 0.24802812933921814, + "rewards/accuracy_reward_stage2": 0.8026357293128967, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 276 + }, + { + "completion_length": 6.953125, + "epoch": 0.04853688452777291, + "grad_norm": 16.27617765583656, + "kl": 0.055908203125, + "learning_rate": 9.516383388820746e-07, + "loss": -0.01, + "reward": 1.5396757125854492, + "reward_std": 0.21801412105560303, + "rewards/accuracy_reward_stage2": 0.5553005933761597, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 277 + }, + { + "completion_length": 12.9375, + "epoch": 0.048712107937620465, + "grad_norm": 15.60792995659862, + "kl": 0.11328125, + "learning_rate": 9.514631154722271e-07, + "loss": 0.0011, + "reward": 1.3102458715438843, + "reward_std": 0.157129168510437, + "rewards/accuracy_reward_stage2": 0.5758708715438843, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 278 + }, + { + "completion_length": 5.65625, + "epoch": 0.04888733134746802, + "grad_norm": 21.31301576597148, + "kl": 0.04931640625, + "learning_rate": 9.512878920623794e-07, + "loss": 0.0197, + "reward": 1.7604167461395264, + "reward_std": 0.257610946893692, + "rewards/accuracy_reward_stage2": 0.7604166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 279 + }, + { + "completion_length": 8.125, + "epoch": 0.04906255475731558, + "grad_norm": 17.03041073737609, + "kl": 0.048095703125, + "learning_rate": 9.511126686525319e-07, + "loss": 0.0192, + "reward": 1.6696319580078125, + "reward_std": 0.06764136254787445, + "rewards/accuracy_reward_stage2": 0.669631838798523, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 280 + }, + { + "completion_length": 9.75, + "epoch": 0.04923777816716313, + "grad_norm": 22.55785616992177, + "kl": 0.076171875, + "learning_rate": 9.509374452426844e-07, + "loss": 0.0304, + "reward": 1.5457661151885986, + "reward_std": 0.2752734422683716, + "rewards/accuracy_reward_stage2": 0.6707661747932434, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 281 + }, + { + "completion_length": 9.234375, + "epoch": 0.04941300157701069, + "grad_norm": 21.99612719765442, + "kl": 0.07421875, + "learning_rate": 9.507622218328368e-07, + "loss": 0.0297, + "reward": 1.654970407485962, + "reward_std": 0.24402377009391785, + "rewards/accuracy_reward_stage2": 0.6549703478813171, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 282 + }, + { + "completion_length": 11.203125, + "epoch": 0.04958822498685824, + "grad_norm": 20.447129133637134, + "kl": 0.0947265625, + "learning_rate": 9.505869984229893e-07, + "loss": 0.0379, + "reward": 1.3550448417663574, + "reward_std": 0.2543000876903534, + "rewards/accuracy_reward_stage2": 0.4800449013710022, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 283 + }, + { + "completion_length": 7.03125, + "epoch": 0.0497634483967058, + "grad_norm": 15.994507580460478, + "kl": 0.0341796875, + "learning_rate": 9.504117750131417e-07, + "loss": 0.0137, + "reward": 1.5851523876190186, + "reward_std": 0.13426676392555237, + "rewards/accuracy_reward_stage2": 0.5851523876190186, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 284 + }, + { + "completion_length": 10.75, + "epoch": 0.049938671806553356, + "grad_norm": 16.657668769693082, + "kl": 0.0703125, + "learning_rate": 9.502365516032942e-07, + "loss": 0.0281, + "reward": 1.5128419399261475, + "reward_std": 0.18941733241081238, + "rewards/accuracy_reward_stage2": 0.5128419399261475, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 285 + }, + { + "completion_length": 7.71875, + "epoch": 0.05011389521640091, + "grad_norm": 11.758117565473336, + "kl": 0.044189453125, + "learning_rate": 9.500613281934467e-07, + "loss": 0.0176, + "reward": 1.5194578170776367, + "reward_std": 0.08947868645191193, + "rewards/accuracy_reward_stage2": 0.5194578170776367, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 286 + }, + { + "completion_length": 9.109375, + "epoch": 0.050289118626248464, + "grad_norm": 20.8562775866761, + "kl": 0.0498046875, + "learning_rate": 9.498861047835991e-07, + "loss": 0.0199, + "reward": 1.7933104038238525, + "reward_std": 0.12623247504234314, + "rewards/accuracy_reward_stage2": 0.793310284614563, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 287 + }, + { + "completion_length": 6.546875, + "epoch": 0.050464342036096024, + "grad_norm": 18.39024977557091, + "kl": 0.051025390625, + "learning_rate": 9.497108813737515e-07, + "loss": 0.0204, + "reward": 1.5014456510543823, + "reward_std": 0.20475083589553833, + "rewards/accuracy_reward_stage2": 0.5014456510543823, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 288 + }, + { + "completion_length": 15.171875, + "epoch": 0.05063956544594358, + "grad_norm": 18.14671972942763, + "kl": 0.080078125, + "learning_rate": 9.495356579639038e-07, + "loss": 0.032, + "reward": 1.4318616390228271, + "reward_std": 0.1176854595541954, + "rewards/accuracy_reward_stage2": 0.5568615794181824, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 289 + }, + { + "completion_length": 13.765625, + "epoch": 0.05081478885579113, + "grad_norm": 20.710006975987632, + "kl": 0.0537109375, + "learning_rate": 9.493604345540563e-07, + "loss": 0.0215, + "reward": 1.6023633480072021, + "reward_std": 0.2683194875717163, + "rewards/accuracy_reward_stage2": 0.6023632884025574, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 290 + }, + { + "completion_length": 8.984375, + "epoch": 0.050990012265638686, + "grad_norm": 49.22708752391607, + "kl": 0.2353515625, + "learning_rate": 9.491852111442088e-07, + "loss": 0.0759, + "reward": 1.523097038269043, + "reward_std": 0.23898158967494965, + "rewards/accuracy_reward_stage2": 0.5387219190597534, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 291 + }, + { + "completion_length": 8.109375, + "epoch": 0.05116523567548625, + "grad_norm": 20.121543362765728, + "kl": 0.057861328125, + "learning_rate": 9.490099877343612e-07, + "loss": 0.0232, + "reward": 1.5405619144439697, + "reward_std": 0.1676030457019806, + "rewards/accuracy_reward_stage2": 0.6655619144439697, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 292 + }, + { + "completion_length": 13.5625, + "epoch": 0.0513404590853338, + "grad_norm": 24.313452223867653, + "kl": 0.054443359375, + "learning_rate": 9.488347643245137e-07, + "loss": -0.0331, + "reward": 1.5306828022003174, + "reward_std": 0.23595470190048218, + "rewards/accuracy_reward_stage2": 0.6869328022003174, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 293 + }, + { + "completion_length": 26.109375, + "epoch": 0.051515682495181354, + "grad_norm": 23.43936822314985, + "kl": 0.0164794921875, + "learning_rate": 9.486595409146662e-07, + "loss": 0.0066, + "reward": 1.4214849472045898, + "reward_std": 0.15756914019584656, + "rewards/accuracy_reward_stage2": 0.42148491740226746, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 294 + }, + { + "completion_length": 12.265625, + "epoch": 0.051690905905028915, + "grad_norm": 22.691844775763478, + "kl": 0.146484375, + "learning_rate": 9.484843175048186e-07, + "loss": 0.0378, + "reward": 1.3974359035491943, + "reward_std": 0.29218339920043945, + "rewards/accuracy_reward_stage2": 0.5380609035491943, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 295 + }, + { + "completion_length": 8.65625, + "epoch": 0.05186612931487647, + "grad_norm": 20.479831261545776, + "kl": 0.031982421875, + "learning_rate": 9.483090940949711e-07, + "loss": -0.0613, + "reward": 1.5576322078704834, + "reward_std": 0.31971651315689087, + "rewards/accuracy_reward_stage2": 0.5888821482658386, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 296 + }, + { + "completion_length": 23.4375, + "epoch": 0.05204135272472402, + "grad_norm": 22.74094957435536, + "kl": 0.0230712890625, + "learning_rate": 9.481338706851235e-07, + "loss": -0.035, + "reward": 1.1994527578353882, + "reward_std": 0.17752505838871002, + "rewards/accuracy_reward_stage2": 0.4650777578353882, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 297 + }, + { + "completion_length": 12.359375, + "epoch": 0.05221657613457158, + "grad_norm": 21.441989920997887, + "kl": 0.04248046875, + "learning_rate": 9.47958647275276e-07, + "loss": 0.017, + "reward": 1.5468885898590088, + "reward_std": 0.1514553427696228, + "rewards/accuracy_reward_stage2": 0.6718886494636536, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 298 + }, + { + "completion_length": 12.609375, + "epoch": 0.05239179954441914, + "grad_norm": 4830.82411347626, + "kl": 24.5, + "learning_rate": 9.477834238654284e-07, + "loss": 9.7416, + "reward": 1.2803688049316406, + "reward_std": 0.22106263041496277, + "rewards/accuracy_reward_stage2": 0.5303688049316406, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 299 + }, + { + "completion_length": 15.171875, + "epoch": 0.05256702295426669, + "grad_norm": 66.7580675171776, + "kl": 0.609375, + "learning_rate": 9.476082004555808e-07, + "loss": 0.1995, + "reward": 1.222217321395874, + "reward_std": 0.054408349096775055, + "rewards/accuracy_reward_stage2": 0.487842321395874, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 300 + }, + { + "completion_length": 14.90625, + "epoch": 0.052742246364114245, + "grad_norm": 13.78642449839868, + "kl": 0.0250244140625, + "learning_rate": 9.474329770457332e-07, + "loss": -0.0342, + "reward": 1.2080720663070679, + "reward_std": 0.16576477885246277, + "rewards/accuracy_reward_stage2": 0.34869706630706787, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 301 + }, + { + "completion_length": 15.796875, + "epoch": 0.0529174697739618, + "grad_norm": 17.396925885377456, + "kl": 0.049072265625, + "learning_rate": 9.472577536358857e-07, + "loss": 0.0196, + "reward": 1.5201051235198975, + "reward_std": 0.056752100586891174, + "rewards/accuracy_reward_stage2": 0.5201051235198975, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 302 + }, + { + "completion_length": 12.078125, + "epoch": 0.05309269318380936, + "grad_norm": 24.39287992746118, + "kl": 0.1259765625, + "learning_rate": 9.470825302260381e-07, + "loss": 0.0503, + "reward": 1.6268336772918701, + "reward_std": 0.25979822874069214, + "rewards/accuracy_reward_stage2": 0.6268336772918701, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 303 + }, + { + "completion_length": 8.984375, + "epoch": 0.053267916593656914, + "grad_norm": 26.97048476925272, + "kl": 0.1552734375, + "learning_rate": 9.469073068161906e-07, + "loss": 0.062, + "reward": 1.4804831743240356, + "reward_std": 0.30746644735336304, + "rewards/accuracy_reward_stage2": 0.6054832339286804, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 304 + }, + { + "completion_length": 8.828125, + "epoch": 0.05344314000350447, + "grad_norm": 30.65849433100143, + "kl": 0.1064453125, + "learning_rate": 9.46732083406343e-07, + "loss": 0.0426, + "reward": 1.2948065996170044, + "reward_std": 0.18823125958442688, + "rewards/accuracy_reward_stage2": 0.5448065996170044, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 305 + }, + { + "completion_length": 10.09375, + "epoch": 0.05361836341335202, + "grad_norm": 17.884322716284345, + "kl": 0.026123046875, + "learning_rate": 9.465568599964955e-07, + "loss": -0.0088, + "reward": 1.3229167461395264, + "reward_std": 0.16480545699596405, + "rewards/accuracy_reward_stage2": 0.3385416865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 306 + }, + { + "completion_length": 10.921875, + "epoch": 0.05379358682319958, + "grad_norm": 17.82060901728658, + "kl": 0.3359375, + "learning_rate": 9.46381636586648e-07, + "loss": 0.1344, + "reward": 1.2772254943847656, + "reward_std": 0.1089072972536087, + "rewards/accuracy_reward_stage2": 0.4022255539894104, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 307 + }, + { + "completion_length": 8.46875, + "epoch": 0.053968810233047136, + "grad_norm": 16.952050026411197, + "kl": 0.05029296875, + "learning_rate": 9.462064131768004e-07, + "loss": 0.02, + "reward": 1.5841929912567139, + "reward_std": 0.1349220871925354, + "rewards/accuracy_reward_stage2": 0.5841929316520691, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 308 + }, + { + "completion_length": 14.390625, + "epoch": 0.05414403364289469, + "grad_norm": 19.823545204054554, + "kl": 0.67578125, + "learning_rate": 9.460311897669528e-07, + "loss": 0.2257, + "reward": 1.2973132133483887, + "reward_std": 0.160364031791687, + "rewards/accuracy_reward_stage2": 0.5629382729530334, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 309 + }, + { + "completion_length": 12.109375, + "epoch": 0.054319257052742244, + "grad_norm": 19.287387696238504, + "kl": 0.0361328125, + "learning_rate": 9.458559663571053e-07, + "loss": -0.0218, + "reward": 1.4892075061798096, + "reward_std": 0.202066108584404, + "rewards/accuracy_reward_stage2": 0.5048325061798096, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 310 + }, + { + "completion_length": 19.59375, + "epoch": 0.054494480462589805, + "grad_norm": 23.88832240645103, + "kl": 0.0458984375, + "learning_rate": 9.456807429472577e-07, + "loss": 0.0183, + "reward": 1.44734525680542, + "reward_std": 0.27426010370254517, + "rewards/accuracy_reward_stage2": 0.4473453164100647, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 311 + }, + { + "completion_length": 16.0625, + "epoch": 0.05466970387243736, + "grad_norm": 12.613548315395109, + "kl": 0.039794921875, + "learning_rate": 9.455055195374102e-07, + "loss": 0.0159, + "reward": 1.5471065044403076, + "reward_std": 0.14003218710422516, + "rewards/accuracy_reward_stage2": 0.5471064448356628, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 312 + }, + { + "completion_length": 8.640625, + "epoch": 0.05484492728228491, + "grad_norm": 30.149557832680912, + "kl": 0.07470703125, + "learning_rate": 9.453302961275626e-07, + "loss": 0.0297, + "reward": 1.7912230491638184, + "reward_std": 0.20346251130104065, + "rewards/accuracy_reward_stage2": 0.7912230491638184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 313 + }, + { + "completion_length": 13.71875, + "epoch": 0.055020150692132466, + "grad_norm": 22.805893325483133, + "kl": 0.72265625, + "learning_rate": 9.45155072717715e-07, + "loss": 0.2892, + "reward": 1.4171767234802246, + "reward_std": 0.16180284321308136, + "rewards/accuracy_reward_stage2": 0.5421766638755798, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 314 + }, + { + "completion_length": 11.3125, + "epoch": 0.05519537410198003, + "grad_norm": 31.939022622697205, + "kl": 0.059814453125, + "learning_rate": 9.449798493078675e-07, + "loss": 0.024, + "reward": 1.6428941488265991, + "reward_std": 0.279352605342865, + "rewards/accuracy_reward_stage2": 0.6428941488265991, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 315 + }, + { + "completion_length": 15.921875, + "epoch": 0.05537059751182758, + "grad_norm": 31.633316769847468, + "kl": 0.1171875, + "learning_rate": 9.448046258980199e-07, + "loss": 0.0469, + "reward": 1.643958568572998, + "reward_std": 0.3066912591457367, + "rewards/accuracy_reward_stage2": 0.643958568572998, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 316 + }, + { + "completion_length": 10.5625, + "epoch": 0.055545820921675135, + "grad_norm": 22.638846768381686, + "kl": 0.030517578125, + "learning_rate": 9.446294024881724e-07, + "loss": 0.0122, + "reward": 1.6205029487609863, + "reward_std": 0.18559257686138153, + "rewards/accuracy_reward_stage2": 0.6205028295516968, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 317 + }, + { + "completion_length": 10.1875, + "epoch": 0.05572104433152269, + "grad_norm": 16.301947212185045, + "kl": 0.0123291015625, + "learning_rate": 9.444541790783249e-07, + "loss": 0.0049, + "reward": 1.2974097728729248, + "reward_std": 0.1253841519355774, + "rewards/accuracy_reward_stage2": 0.29740971326828003, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 318 + }, + { + "completion_length": 8.4375, + "epoch": 0.05589626774137025, + "grad_norm": 16.27137921980481, + "kl": 0.0771484375, + "learning_rate": 9.442789556684772e-07, + "loss": 0.0308, + "reward": 1.131199836730957, + "reward_std": 0.13367994129657745, + "rewards/accuracy_reward_stage2": 0.13119982182979584, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 319 + }, + { + "completion_length": 17.5625, + "epoch": 0.0560714911512178, + "grad_norm": 23.532132630398923, + "kl": 0.064453125, + "learning_rate": 9.441037322586297e-07, + "loss": 0.0259, + "reward": 1.3425979614257812, + "reward_std": 0.18130367994308472, + "rewards/accuracy_reward_stage2": 0.342598021030426, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 320 + }, + { + "completion_length": 7.59375, + "epoch": 0.05624671456106536, + "grad_norm": 19.109109165832518, + "kl": 0.0546875, + "learning_rate": 9.439285088487821e-07, + "loss": -0.0223, + "reward": 1.7482370138168335, + "reward_std": 0.240419402718544, + "rewards/accuracy_reward_stage2": 0.7638620138168335, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 321 + }, + { + "completion_length": 13.734375, + "epoch": 0.05642193797091291, + "grad_norm": 14.806289223451508, + "kl": 0.04541015625, + "learning_rate": 9.437532854389346e-07, + "loss": 0.0182, + "reward": 1.4349133968353271, + "reward_std": 0.15795834362506866, + "rewards/accuracy_reward_stage2": 0.5599132776260376, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 322 + }, + { + "completion_length": 9.90625, + "epoch": 0.05659716138076047, + "grad_norm": 17.926016568620387, + "kl": 0.059326171875, + "learning_rate": 9.435780620290871e-07, + "loss": 0.0238, + "reward": 1.3682993650436401, + "reward_std": 0.14938199520111084, + "rewards/accuracy_reward_stage2": 0.36829936504364014, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 323 + }, + { + "completion_length": 6.109375, + "epoch": 0.056772384790608026, + "grad_norm": 15.622725365840726, + "kl": 0.0301513671875, + "learning_rate": 9.434028386192395e-07, + "loss": 0.0121, + "reward": 1.6096426248550415, + "reward_std": 0.09859603643417358, + "rewards/accuracy_reward_stage2": 0.6096425652503967, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 324 + }, + { + "completion_length": 22.296875, + "epoch": 0.05694760820045558, + "grad_norm": 19.94432851352937, + "kl": 0.515625, + "learning_rate": 9.43227615209392e-07, + "loss": 0.207, + "reward": 1.4566096067428589, + "reward_std": 0.2075975239276886, + "rewards/accuracy_reward_stage2": 0.5816096067428589, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 325 + }, + { + "completion_length": 10.359375, + "epoch": 0.05712283161030313, + "grad_norm": 17.786476616869685, + "kl": 0.0927734375, + "learning_rate": 9.430523917995444e-07, + "loss": -0.0072, + "reward": 1.559728741645813, + "reward_std": 0.19865679740905762, + "rewards/accuracy_reward_stage2": 0.575353741645813, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 326 + }, + { + "completion_length": 13.203125, + "epoch": 0.057298055020150694, + "grad_norm": 22.973658958195003, + "kl": 0.06298828125, + "learning_rate": 9.428771683896968e-07, + "loss": -0.0036, + "reward": 1.4841365814208984, + "reward_std": 0.25597521662712097, + "rewards/accuracy_reward_stage2": 0.4997614920139313, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 327 + }, + { + "completion_length": 26.09375, + "epoch": 0.05747327842999825, + "grad_norm": 17.487533600335915, + "kl": 0.61328125, + "learning_rate": 9.427019449798493e-07, + "loss": 0.2008, + "reward": 1.189457654953003, + "reward_std": 0.17859560251235962, + "rewards/accuracy_reward_stage2": 0.3300827145576477, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 328 + }, + { + "completion_length": 14.28125, + "epoch": 0.0576485018398458, + "grad_norm": 21.218654010279625, + "kl": 0.609375, + "learning_rate": 9.425267215700016e-07, + "loss": 0.1987, + "reward": 1.3703992366790771, + "reward_std": 0.24765318632125854, + "rewards/accuracy_reward_stage2": 0.6360243558883667, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 329 + }, + { + "completion_length": 11.234375, + "epoch": 0.057823725249693356, + "grad_norm": 21.651742707988532, + "kl": 0.031005859375, + "learning_rate": 9.423514981601541e-07, + "loss": 0.0124, + "reward": 1.5708177089691162, + "reward_std": 0.18964162468910217, + "rewards/accuracy_reward_stage2": 0.5708176493644714, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 330 + }, + { + "completion_length": 9.140625, + "epoch": 0.057998948659540916, + "grad_norm": 35.78706914854555, + "kl": 0.050048828125, + "learning_rate": 9.421762747503066e-07, + "loss": 0.02, + "reward": 1.3698612451553345, + "reward_std": 0.20854762196540833, + "rewards/accuracy_reward_stage2": 0.49486127495765686, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 331 + }, + { + "completion_length": 10.171875, + "epoch": 0.05817417206938847, + "grad_norm": 17.065846478050958, + "kl": 0.0625, + "learning_rate": 9.42001051340459e-07, + "loss": 0.025, + "reward": 1.7728002071380615, + "reward_std": 0.1718043088912964, + "rewards/accuracy_reward_stage2": 0.772800087928772, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 332 + }, + { + "completion_length": 7.828125, + "epoch": 0.058349395479236024, + "grad_norm": 21.44285617956672, + "kl": 0.055908203125, + "learning_rate": 9.418258279306115e-07, + "loss": 0.0007, + "reward": 1.5830440521240234, + "reward_std": 0.10806939750909805, + "rewards/accuracy_reward_stage2": 0.5986689925193787, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 333 + }, + { + "completion_length": 9.5, + "epoch": 0.058524618889083585, + "grad_norm": 19.21932855206352, + "kl": 0.09326171875, + "learning_rate": 9.41650604520764e-07, + "loss": -0.0019, + "reward": 1.4759433269500732, + "reward_std": 0.20894023776054382, + "rewards/accuracy_reward_stage2": 0.49156832695007324, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 334 + }, + { + "completion_length": 6.234375, + "epoch": 0.05869984229893114, + "grad_norm": 17.213037529404584, + "kl": 0.0179443359375, + "learning_rate": 9.414753811109164e-07, + "loss": 0.0072, + "reward": 1.8309895992279053, + "reward_std": 0.12448026239871979, + "rewards/accuracy_reward_stage2": 0.8309895992279053, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 335 + }, + { + "completion_length": 10.0, + "epoch": 0.05887506570877869, + "grad_norm": 27.770533441967974, + "kl": 0.072265625, + "learning_rate": 9.413001577010689e-07, + "loss": 0.0289, + "reward": 1.7088299989700317, + "reward_std": 0.23482096195220947, + "rewards/accuracy_reward_stage2": 0.7088299989700317, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 336 + }, + { + "completion_length": 12.90625, + "epoch": 0.05905028911862625, + "grad_norm": 18.69296153324102, + "kl": 0.00616455078125, + "learning_rate": 9.411249342912213e-07, + "loss": 0.0025, + "reward": 1.7130486965179443, + "reward_std": 0.1294686496257782, + "rewards/accuracy_reward_stage2": 0.7130487561225891, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 337 + }, + { + "completion_length": 11.625, + "epoch": 0.05922551252847381, + "grad_norm": 21.091997815143415, + "kl": 0.08642578125, + "learning_rate": 9.409497108813738e-07, + "loss": 0.0346, + "reward": 1.5273933410644531, + "reward_std": 0.21970880031585693, + "rewards/accuracy_reward_stage2": 0.5273933410644531, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 338 + }, + { + "completion_length": 13.03125, + "epoch": 0.05940073593832136, + "grad_norm": 22.056625658238307, + "kl": 0.083984375, + "learning_rate": 9.407744874715261e-07, + "loss": 0.0335, + "reward": 1.563867449760437, + "reward_std": 0.1931018829345703, + "rewards/accuracy_reward_stage2": 0.563867449760437, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 339 + }, + { + "completion_length": 19.734375, + "epoch": 0.059575959348168915, + "grad_norm": 25.210516531222016, + "kl": 0.283203125, + "learning_rate": 9.405992640616785e-07, + "loss": 0.1131, + "reward": 1.47365403175354, + "reward_std": 0.2418113648891449, + "rewards/accuracy_reward_stage2": 0.59865403175354, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 340 + }, + { + "completion_length": 7.671875, + "epoch": 0.05975118275801647, + "grad_norm": 17.408056064153456, + "kl": 0.205078125, + "learning_rate": 9.40424040651831e-07, + "loss": 0.0379, + "reward": 1.2369791269302368, + "reward_std": 0.19781196117401123, + "rewards/accuracy_reward_stage2": 0.3776041567325592, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 341 + }, + { + "completion_length": 11.375, + "epoch": 0.05992640616786403, + "grad_norm": 26.883149428221756, + "kl": 0.04345703125, + "learning_rate": 9.402488172419835e-07, + "loss": -0.0268, + "reward": 1.6457767486572266, + "reward_std": 0.25722378492355347, + "rewards/accuracy_reward_stage2": 0.6614017486572266, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 342 + }, + { + "completion_length": 7.5625, + "epoch": 0.060101629577711584, + "grad_norm": 11.1916374569426, + "kl": 0.00958251953125, + "learning_rate": 9.400735938321359e-07, + "loss": 0.0038, + "reward": 1.6783901453018188, + "reward_std": 0.01817590743303299, + "rewards/accuracy_reward_stage2": 0.6783901453018188, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 343 + }, + { + "completion_length": 9.625, + "epoch": 0.06027685298755914, + "grad_norm": 26.840387014062035, + "kl": 0.028076171875, + "learning_rate": 9.398983704222884e-07, + "loss": 0.0112, + "reward": 1.472599983215332, + "reward_std": 0.14326484501361847, + "rewards/accuracy_reward_stage2": 0.47259995341300964, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 344 + }, + { + "completion_length": 8.78125, + "epoch": 0.06045207639740669, + "grad_norm": 30.13359702730389, + "kl": 0.0167236328125, + "learning_rate": 9.397231470124408e-07, + "loss": -0.0375, + "reward": 1.7736797332763672, + "reward_std": 0.2013120949268341, + "rewards/accuracy_reward_stage2": 0.7893046736717224, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 345 + }, + { + "completion_length": 11.734375, + "epoch": 0.06062729980725425, + "grad_norm": 30.760314151166217, + "kl": 0.126953125, + "learning_rate": 9.395479236025933e-07, + "loss": 0.0507, + "reward": 1.2022664546966553, + "reward_std": 0.29217326641082764, + "rewards/accuracy_reward_stage2": 0.4522664546966553, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 346 + }, + { + "completion_length": 23.3125, + "epoch": 0.060802523217101806, + "grad_norm": 17.509360599599663, + "kl": 0.04345703125, + "learning_rate": 9.393727001927458e-07, + "loss": 0.0174, + "reward": 1.3837076425552368, + "reward_std": 0.17910131812095642, + "rewards/accuracy_reward_stage2": 0.3837076425552368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 347 + }, + { + "completion_length": 12.0, + "epoch": 0.06097774662694936, + "grad_norm": 24.40584268231279, + "kl": 0.06298828125, + "learning_rate": 9.391974767828981e-07, + "loss": 0.0252, + "reward": 1.7270774841308594, + "reward_std": 0.21028944849967957, + "rewards/accuracy_reward_stage2": 0.7270774841308594, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 348 + }, + { + "completion_length": 14.609375, + "epoch": 0.061152970036796914, + "grad_norm": 22.108695434224188, + "kl": 0.06103515625, + "learning_rate": 9.390222533730506e-07, + "loss": 0.0245, + "reward": 1.4529306888580322, + "reward_std": 0.16741645336151123, + "rewards/accuracy_reward_stage2": 0.4529306888580322, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 349 + }, + { + "completion_length": 9.171875, + "epoch": 0.061328193446644474, + "grad_norm": 17.749474904467053, + "kl": 0.0181884765625, + "learning_rate": 9.388470299632031e-07, + "loss": -0.0369, + "reward": 1.59375, + "reward_std": 0.19727617502212524, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 350 + }, + { + "completion_length": 17.65625, + "epoch": 0.06150341685649203, + "grad_norm": 23.483625219058727, + "kl": 0.04443359375, + "learning_rate": 9.386718065533555e-07, + "loss": -0.0655, + "reward": 1.537459373474121, + "reward_std": 0.2557547390460968, + "rewards/accuracy_reward_stage2": 0.5843343734741211, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 351 + }, + { + "completion_length": 15.296875, + "epoch": 0.06167864026633958, + "grad_norm": 159.74195424359493, + "kl": 0.2001953125, + "learning_rate": 9.384965831435079e-07, + "loss": 0.0358, + "reward": 1.5681722164154053, + "reward_std": 0.21072784066200256, + "rewards/accuracy_reward_stage2": 0.58379727602005, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 352 + }, + { + "completion_length": 10.3125, + "epoch": 0.061853863676187136, + "grad_norm": 18.886814832728522, + "kl": 0.07958984375, + "learning_rate": 9.383213597336603e-07, + "loss": 0.0103, + "reward": 1.270545482635498, + "reward_std": 0.19417250156402588, + "rewards/accuracy_reward_stage2": 0.41117042303085327, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 353 + }, + { + "completion_length": 9.359375, + "epoch": 0.0620290870860347, + "grad_norm": 18.37524567034427, + "kl": 0.0299072265625, + "learning_rate": 9.381461363238128e-07, + "loss": 0.0119, + "reward": 1.5955801010131836, + "reward_std": 0.19248944520950317, + "rewards/accuracy_reward_stage2": 0.7205801010131836, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 354 + }, + { + "completion_length": 14.125, + "epoch": 0.06220431049588225, + "grad_norm": 20.849395826639736, + "kl": 0.046630859375, + "learning_rate": 9.379709129139653e-07, + "loss": 0.0187, + "reward": 1.4106394052505493, + "reward_std": 0.1208919808268547, + "rewards/accuracy_reward_stage2": 0.5356393456459045, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 355 + }, + { + "completion_length": 10.203125, + "epoch": 0.062379533905729805, + "grad_norm": 19.576848945135378, + "kl": 0.0257568359375, + "learning_rate": 9.377956895041177e-07, + "loss": 0.0103, + "reward": 1.6396290063858032, + "reward_std": 0.12200456112623215, + "rewards/accuracy_reward_stage2": 0.6396290063858032, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 356 + }, + { + "completion_length": 10.21875, + "epoch": 0.06255475731557736, + "grad_norm": 25.56816645699675, + "kl": 0.041748046875, + "learning_rate": 9.376204660942702e-07, + "loss": 0.0167, + "reward": 1.7575688362121582, + "reward_std": 0.1697702407836914, + "rewards/accuracy_reward_stage2": 0.757568895816803, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 357 + }, + { + "completion_length": 9.375, + "epoch": 0.06272998072542492, + "grad_norm": 15.342675032171575, + "kl": 0.0167236328125, + "learning_rate": 9.374452426844227e-07, + "loss": 0.0067, + "reward": 1.46875, + "reward_std": 0.24511480331420898, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 358 + }, + { + "completion_length": 9.046875, + "epoch": 0.06290520413527247, + "grad_norm": 17.824968671605863, + "kl": 0.0791015625, + "learning_rate": 9.37270019274575e-07, + "loss": 0.006, + "reward": 1.485837697982788, + "reward_std": 0.20334991812705994, + "rewards/accuracy_reward_stage2": 0.5014628171920776, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 359 + }, + { + "completion_length": 17.453125, + "epoch": 0.06308042754512003, + "grad_norm": 21233.154628289303, + "kl": 744.0, + "learning_rate": 9.370947958647275e-07, + "loss": 298.3744, + "reward": 1.3596529960632324, + "reward_std": 0.16530917584896088, + "rewards/accuracy_reward_stage2": 0.6252779364585876, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 360 + }, + { + "completion_length": 16.34375, + "epoch": 0.06325565095496759, + "grad_norm": 23.684309225602988, + "kl": 0.09716796875, + "learning_rate": 9.369195724548799e-07, + "loss": 0.0389, + "reward": 1.4965417385101318, + "reward_std": 0.23690475523471832, + "rewards/accuracy_reward_stage2": 0.6215417981147766, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 361 + }, + { + "completion_length": 8.265625, + "epoch": 0.06343087436481513, + "grad_norm": 19.810862008446907, + "kl": 0.0830078125, + "learning_rate": 9.367443490450324e-07, + "loss": -0.0109, + "reward": 1.4895833730697632, + "reward_std": 0.3171301484107971, + "rewards/accuracy_reward_stage2": 0.6302083134651184, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 362 + }, + { + "completion_length": 10.140625, + "epoch": 0.0636060977746627, + "grad_norm": 18.18713950785703, + "kl": 0.07275390625, + "learning_rate": 9.365691256351849e-07, + "loss": 0.029, + "reward": 1.6478146314620972, + "reward_std": 0.14585444331169128, + "rewards/accuracy_reward_stage2": 0.6478146910667419, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 363 + }, + { + "completion_length": 9.140625, + "epoch": 0.06378132118451026, + "grad_norm": 20.067637774027975, + "kl": 0.049072265625, + "learning_rate": 9.363939022253373e-07, + "loss": -0.0214, + "reward": 1.633749008178711, + "reward_std": 0.17472119629383087, + "rewards/accuracy_reward_stage2": 0.6493740081787109, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 364 + }, + { + "completion_length": 5.875, + "epoch": 0.0639565445943578, + "grad_norm": 19.42945311409606, + "kl": 0.052001953125, + "learning_rate": 9.362186788154897e-07, + "loss": 0.0209, + "reward": 1.6205922365188599, + "reward_std": 0.23425078392028809, + "rewards/accuracy_reward_stage2": 0.6205922365188599, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 365 + }, + { + "completion_length": 8.4375, + "epoch": 0.06413176800420536, + "grad_norm": 19.86294029189364, + "kl": 0.0830078125, + "learning_rate": 9.360434554056421e-07, + "loss": -0.0057, + "reward": 1.3854167461395264, + "reward_std": 0.22538167238235474, + "rewards/accuracy_reward_stage2": 0.4166666567325592, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 366 + }, + { + "completion_length": 10.796875, + "epoch": 0.06430699141405291, + "grad_norm": 17.291194693608265, + "kl": 0.06591796875, + "learning_rate": 9.358682319957946e-07, + "loss": -0.0179, + "reward": 1.5046108961105347, + "reward_std": 0.16259142756462097, + "rewards/accuracy_reward_stage2": 0.5202358961105347, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 367 + }, + { + "completion_length": 11.625, + "epoch": 0.06448221482390047, + "grad_norm": 19.514220261583535, + "kl": 0.0230712890625, + "learning_rate": 9.35693008585947e-07, + "loss": 0.0092, + "reward": 1.6788980960845947, + "reward_std": 0.18751531839370728, + "rewards/accuracy_reward_stage2": 0.67889803647995, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 368 + }, + { + "completion_length": 10.71875, + "epoch": 0.06465743823374803, + "grad_norm": 18.037938987127585, + "kl": 0.10791015625, + "learning_rate": 9.355177851760994e-07, + "loss": -0.0012, + "reward": 1.3620235919952393, + "reward_std": 0.16237865388393402, + "rewards/accuracy_reward_stage2": 0.37764859199523926, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 369 + }, + { + "completion_length": 9.265625, + "epoch": 0.06483266164359558, + "grad_norm": 19.647145326309186, + "kl": 0.046875, + "learning_rate": 9.353425617662519e-07, + "loss": -0.0101, + "reward": 1.6776671409606934, + "reward_std": 0.31430885195732117, + "rewards/accuracy_reward_stage2": 0.6932921409606934, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 370 + }, + { + "completion_length": 7.234375, + "epoch": 0.06500788505344314, + "grad_norm": 19.609023289214672, + "kl": 0.0194091796875, + "learning_rate": 9.351673383564044e-07, + "loss": 0.0078, + "reward": 1.6241884231567383, + "reward_std": 0.24334561824798584, + "rewards/accuracy_reward_stage2": 0.6241884827613831, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 371 + }, + { + "completion_length": 9.203125, + "epoch": 0.0651831084632907, + "grad_norm": 21.693746313134152, + "kl": 0.0830078125, + "learning_rate": 9.349921149465568e-07, + "loss": 0.0332, + "reward": 1.557944893836975, + "reward_std": 0.14944539964199066, + "rewards/accuracy_reward_stage2": 0.6829449534416199, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 372 + }, + { + "completion_length": 7.5, + "epoch": 0.06535833187313825, + "grad_norm": 20.281466370040626, + "kl": 0.08984375, + "learning_rate": 9.348168915367093e-07, + "loss": 0.036, + "reward": 1.3900837898254395, + "reward_std": 0.13699333369731903, + "rewards/accuracy_reward_stage2": 0.39008378982543945, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 373 + }, + { + "completion_length": 12.078125, + "epoch": 0.06553355528298581, + "grad_norm": 21.899711129212353, + "kl": 0.326171875, + "learning_rate": 9.346416681268617e-07, + "loss": 0.1302, + "reward": 1.506643533706665, + "reward_std": 0.21940842270851135, + "rewards/accuracy_reward_stage2": 0.6316434741020203, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 374 + }, + { + "completion_length": 10.171875, + "epoch": 0.06570877869283337, + "grad_norm": 18.654545463446304, + "kl": 0.078125, + "learning_rate": 9.344664447170142e-07, + "loss": 0.0312, + "reward": 1.4036774635314941, + "reward_std": 0.1636413335800171, + "rewards/accuracy_reward_stage2": 0.40367743372917175, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 375 + }, + { + "completion_length": 8.109375, + "epoch": 0.06588400210268092, + "grad_norm": 17.667479564188806, + "kl": 0.03662109375, + "learning_rate": 9.342912213071667e-07, + "loss": 0.0146, + "reward": 1.6772925853729248, + "reward_std": 0.11799340695142746, + "rewards/accuracy_reward_stage2": 0.6772925853729248, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 376 + }, + { + "completion_length": 7.0, + "epoch": 0.06605922551252848, + "grad_norm": 22.152832316564922, + "kl": 0.01324462890625, + "learning_rate": 9.34115997897319e-07, + "loss": 0.0053, + "reward": 1.5811469554901123, + "reward_std": 0.16719821095466614, + "rewards/accuracy_reward_stage2": 0.5811468958854675, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 377 + }, + { + "completion_length": 12.15625, + "epoch": 0.06623444892237602, + "grad_norm": 19.79514474324505, + "kl": 0.05126953125, + "learning_rate": 9.339407744874714e-07, + "loss": -0.0115, + "reward": 1.5288242101669312, + "reward_std": 0.2464839220046997, + "rewards/accuracy_reward_stage2": 0.5444492101669312, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 378 + }, + { + "completion_length": 10.625, + "epoch": 0.06640967233222358, + "grad_norm": 15.80397044871378, + "kl": 0.03076171875, + "learning_rate": 9.337655510776239e-07, + "loss": 0.0123, + "reward": 1.7461693286895752, + "reward_std": 0.1648191660642624, + "rewards/accuracy_reward_stage2": 0.7461693286895752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 379 + }, + { + "completion_length": 15.578125, + "epoch": 0.06658489574207115, + "grad_norm": 39.80712653417323, + "kl": 0.50390625, + "learning_rate": 9.335903276677763e-07, + "loss": 0.1679, + "reward": 1.3081011772155762, + "reward_std": 0.25190237164497375, + "rewards/accuracy_reward_stage2": 0.46435117721557617, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 380 + }, + { + "completion_length": 11.78125, + "epoch": 0.06676011915191869, + "grad_norm": 16.030393941102684, + "kl": 0.5703125, + "learning_rate": 9.334151042579288e-07, + "loss": 0.227, + "reward": 1.43631911277771, + "reward_std": 0.10598289966583252, + "rewards/accuracy_reward_stage2": 0.6863189935684204, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 381 + }, + { + "completion_length": 13.875, + "epoch": 0.06693534256176625, + "grad_norm": 21.087588784066092, + "kl": 0.099609375, + "learning_rate": 9.332398808480812e-07, + "loss": 0.04, + "reward": 1.2481482028961182, + "reward_std": 0.18241316080093384, + "rewards/accuracy_reward_stage2": 0.24814806878566742, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 382 + }, + { + "completion_length": 10.734375, + "epoch": 0.06711056597161381, + "grad_norm": 20.186397153368166, + "kl": 0.037353515625, + "learning_rate": 9.330646574382337e-07, + "loss": 0.015, + "reward": 1.5044660568237305, + "reward_std": 0.1688692569732666, + "rewards/accuracy_reward_stage2": 0.6294660568237305, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 383 + }, + { + "completion_length": 9.90625, + "epoch": 0.06728578938146136, + "grad_norm": 25.32946244457715, + "kl": 0.0341796875, + "learning_rate": 9.328894340283862e-07, + "loss": 0.0137, + "reward": 1.4899933338165283, + "reward_std": 0.23859579861164093, + "rewards/accuracy_reward_stage2": 0.4899933338165283, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 384 + }, + { + "completion_length": 13.328125, + "epoch": 0.06746101279130892, + "grad_norm": 26.81316686056031, + "kl": 0.57421875, + "learning_rate": 9.327142106185386e-07, + "loss": 0.2292, + "reward": 1.4508566856384277, + "reward_std": 0.2757464051246643, + "rewards/accuracy_reward_stage2": 0.5758566856384277, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 385 + }, + { + "completion_length": 9.28125, + "epoch": 0.06763623620115647, + "grad_norm": 19.521819857840825, + "kl": 0.048583984375, + "learning_rate": 9.325389872086911e-07, + "loss": -0.0711, + "reward": 1.7960493564605713, + "reward_std": 0.20955920219421387, + "rewards/accuracy_reward_stage2": 0.8429244160652161, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 386 + }, + { + "completion_length": 10.046875, + "epoch": 0.06781145961100403, + "grad_norm": 19.68601688056184, + "kl": 0.03759765625, + "learning_rate": 9.323637637988436e-07, + "loss": 0.0151, + "reward": 1.511404037475586, + "reward_std": 0.18920361995697021, + "rewards/accuracy_reward_stage2": 0.5114039778709412, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 387 + }, + { + "completion_length": 10.078125, + "epoch": 0.06798668302085159, + "grad_norm": 27.240554115165228, + "kl": 0.031982421875, + "learning_rate": 9.321885403889959e-07, + "loss": 0.0127, + "reward": 1.5362706184387207, + "reward_std": 0.20674368739128113, + "rewards/accuracy_reward_stage2": 0.6612705588340759, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 388 + }, + { + "completion_length": 15.53125, + "epoch": 0.06816190643069914, + "grad_norm": 31.545316926959348, + "kl": 0.2109375, + "learning_rate": 9.320133169791484e-07, + "loss": 0.0843, + "reward": 1.3991228342056274, + "reward_std": 0.323274701833725, + "rewards/accuracy_reward_stage2": 0.5241228342056274, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 389 + }, + { + "completion_length": 8.34375, + "epoch": 0.0683371298405467, + "grad_norm": 16.876944193659174, + "kl": 0.08642578125, + "learning_rate": 9.318380935693007e-07, + "loss": -0.0036, + "reward": 1.538655400276184, + "reward_std": 0.16559931635856628, + "rewards/accuracy_reward_stage2": 0.5542804598808289, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 390 + }, + { + "completion_length": 9.5625, + "epoch": 0.06851235325039426, + "grad_norm": 19.364008138324078, + "kl": 0.052734375, + "learning_rate": 9.316628701594532e-07, + "loss": 0.0211, + "reward": 1.417523741722107, + "reward_std": 0.19266514480113983, + "rewards/accuracy_reward_stage2": 0.41752374172210693, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 391 + }, + { + "completion_length": 11.015625, + "epoch": 0.0686875766602418, + "grad_norm": 20.188722390380764, + "kl": 0.0198974609375, + "learning_rate": 9.314876467496057e-07, + "loss": 0.008, + "reward": 1.3770326375961304, + "reward_std": 0.15552134811878204, + "rewards/accuracy_reward_stage2": 0.37703263759613037, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 392 + }, + { + "completion_length": 9.34375, + "epoch": 0.06886280007008937, + "grad_norm": 20.005146906885646, + "kl": 0.044921875, + "learning_rate": 9.313124233397581e-07, + "loss": 0.018, + "reward": 1.3810763359069824, + "reward_std": 0.21949338912963867, + "rewards/accuracy_reward_stage2": 0.3810763955116272, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 393 + }, + { + "completion_length": 10.21875, + "epoch": 0.06903802347993691, + "grad_norm": 15.891615345327356, + "kl": 0.051025390625, + "learning_rate": 9.311371999299106e-07, + "loss": 0.0204, + "reward": 1.5088541507720947, + "reward_std": 0.17497307062149048, + "rewards/accuracy_reward_stage2": 0.5088541507720947, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 394 + }, + { + "completion_length": 11.125, + "epoch": 0.06921324688978447, + "grad_norm": 23.512648748907708, + "kl": 0.06640625, + "learning_rate": 9.309619765200631e-07, + "loss": 0.0266, + "reward": 1.5210347175598145, + "reward_std": 0.2370174527168274, + "rewards/accuracy_reward_stage2": 0.6460347175598145, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 395 + }, + { + "completion_length": 10.46875, + "epoch": 0.06938847029963204, + "grad_norm": 21.42132475023691, + "kl": 0.04052734375, + "learning_rate": 9.307867531102155e-07, + "loss": -0.052, + "reward": 1.5896108150482178, + "reward_std": 0.26329123973846436, + "rewards/accuracy_reward_stage2": 0.6208608150482178, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 396 + }, + { + "completion_length": 17.03125, + "epoch": 0.06956369370947958, + "grad_norm": 14.95198588883508, + "kl": 0.031494140625, + "learning_rate": 9.30611529700368e-07, + "loss": -0.0316, + "reward": 1.5074900388717651, + "reward_std": 0.06422868371009827, + "rewards/accuracy_reward_stage2": 0.6481150984764099, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 397 + }, + { + "completion_length": 9.703125, + "epoch": 0.06973891711932714, + "grad_norm": 20.44367679889482, + "kl": 0.07958984375, + "learning_rate": 9.304363062905203e-07, + "loss": 0.0318, + "reward": 1.5729174613952637, + "reward_std": 0.09998870640993118, + "rewards/accuracy_reward_stage2": 0.5729174613952637, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 398 + }, + { + "completion_length": 24.828125, + "epoch": 0.0699141405291747, + "grad_norm": 19.893244845989614, + "kl": 0.10009765625, + "learning_rate": 9.302610828806728e-07, + "loss": 0.0401, + "reward": 1.135704517364502, + "reward_std": 0.16893689334392548, + "rewards/accuracy_reward_stage2": 0.26070448756217957, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 399 + }, + { + "completion_length": 13.765625, + "epoch": 0.07008936393902225, + "grad_norm": 4169.748724851853, + "kl": 21.125, + "learning_rate": 9.300858594708253e-07, + "loss": 8.4703, + "reward": 1.3594677448272705, + "reward_std": 0.1398237645626068, + "rewards/accuracy_reward_stage2": 0.4844678044319153, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 400 + }, + { + "completion_length": 9.5, + "epoch": 0.07026458734886981, + "grad_norm": 21.00576080891472, + "kl": 0.08984375, + "learning_rate": 9.299106360609777e-07, + "loss": 0.0069, + "reward": 1.8014570474624634, + "reward_std": 0.22386255860328674, + "rewards/accuracy_reward_stage2": 0.8170820474624634, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 401 + }, + { + "completion_length": 10.234375, + "epoch": 0.07043981075871736, + "grad_norm": 14.49412707044264, + "kl": 0.0576171875, + "learning_rate": 9.297354126511302e-07, + "loss": 0.0231, + "reward": 1.4158527851104736, + "reward_std": 0.09953659772872925, + "rewards/accuracy_reward_stage2": 0.4158529043197632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 402 + }, + { + "completion_length": 10.25, + "epoch": 0.07061503416856492, + "grad_norm": 32.36561156113094, + "kl": 0.10693359375, + "learning_rate": 9.295601892412826e-07, + "loss": 0.0139, + "reward": 1.72810697555542, + "reward_std": 0.2724772095680237, + "rewards/accuracy_reward_stage2": 0.7437319159507751, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 403 + }, + { + "completion_length": 18.609375, + "epoch": 0.07079025757841248, + "grad_norm": 24.080748226483227, + "kl": 0.345703125, + "learning_rate": 9.29384965831435e-07, + "loss": 0.1378, + "reward": 1.2732133865356445, + "reward_std": 0.15048734843730927, + "rewards/accuracy_reward_stage2": 0.39821332693099976, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 404 + }, + { + "completion_length": 20.515625, + "epoch": 0.07096548098826003, + "grad_norm": 264.3265949278815, + "kl": 1.984375, + "learning_rate": 9.292097424215875e-07, + "loss": 0.7961, + "reward": 1.4931068420410156, + "reward_std": 0.14453980326652527, + "rewards/accuracy_reward_stage2": 0.6181067824363708, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 405 + }, + { + "completion_length": 10.375, + "epoch": 0.07114070439810759, + "grad_norm": 16.95554663885095, + "kl": 0.0277099609375, + "learning_rate": 9.290345190117399e-07, + "loss": 0.0111, + "reward": 1.7291667461395264, + "reward_std": 0.1907956451177597, + "rewards/accuracy_reward_stage2": 0.7291666269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 406 + }, + { + "completion_length": 10.703125, + "epoch": 0.07131592780795515, + "grad_norm": 20.95046609185987, + "kl": 0.0272216796875, + "learning_rate": 9.288592956018924e-07, + "loss": 0.0109, + "reward": 1.4488990306854248, + "reward_std": 0.25809937715530396, + "rewards/accuracy_reward_stage2": 0.4488990306854248, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 407 + }, + { + "completion_length": 10.15625, + "epoch": 0.0714911512178027, + "grad_norm": 18.718637839739095, + "kl": 0.036376953125, + "learning_rate": 9.286840721920448e-07, + "loss": 0.0146, + "reward": 1.5787631273269653, + "reward_std": 0.16580891609191895, + "rewards/accuracy_reward_stage2": 0.5787630677223206, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 408 + }, + { + "completion_length": 7.21875, + "epoch": 0.07166637462765026, + "grad_norm": 21.22667824518068, + "kl": 0.1162109375, + "learning_rate": 9.285088487821972e-07, + "loss": 0.0464, + "reward": 1.5071234703063965, + "reward_std": 0.18095630407333374, + "rewards/accuracy_reward_stage2": 0.5071234703063965, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 409 + }, + { + "completion_length": 10.390625, + "epoch": 0.0718415980374978, + "grad_norm": 23.411944773725754, + "kl": 0.06787109375, + "learning_rate": 9.283336253723497e-07, + "loss": -0.0387, + "reward": 1.7464570999145508, + "reward_std": 0.20300233364105225, + "rewards/accuracy_reward_stage2": 0.7777070999145508, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 410 + }, + { + "completion_length": 12.8125, + "epoch": 0.07201682144734536, + "grad_norm": 26.486173925924497, + "kl": 0.06103515625, + "learning_rate": 9.281584019625022e-07, + "loss": 0.0244, + "reward": 1.3693530559539795, + "reward_std": 0.24731206893920898, + "rewards/accuracy_reward_stage2": 0.49435311555862427, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 411 + }, + { + "completion_length": 14.84375, + "epoch": 0.07219204485719292, + "grad_norm": 25.114732676456597, + "kl": 0.1298828125, + "learning_rate": 9.279831785526546e-07, + "loss": 0.0518, + "reward": 1.5222240686416626, + "reward_std": 0.22694925963878632, + "rewards/accuracy_reward_stage2": 0.5222241282463074, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 412 + }, + { + "completion_length": 13.5, + "epoch": 0.07236726826704047, + "grad_norm": 20.31226399381922, + "kl": 0.01409912109375, + "learning_rate": 9.278079551428071e-07, + "loss": 0.0056, + "reward": 1.2919033765792847, + "reward_std": 0.1674969494342804, + "rewards/accuracy_reward_stage2": 0.29190340638160706, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 413 + }, + { + "completion_length": 8.125, + "epoch": 0.07254249167688803, + "grad_norm": 11.704296994412068, + "kl": 0.04150390625, + "learning_rate": 9.276327317329595e-07, + "loss": 0.0166, + "reward": 1.6002380847930908, + "reward_std": 0.10007701814174652, + "rewards/accuracy_reward_stage2": 0.6002380847930908, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 414 + }, + { + "completion_length": 8.859375, + "epoch": 0.0727177150867356, + "grad_norm": 17.94825639927456, + "kl": 0.10205078125, + "learning_rate": 9.27457508323112e-07, + "loss": 0.0408, + "reward": 1.4635004997253418, + "reward_std": 0.22981159389019012, + "rewards/accuracy_reward_stage2": 0.4635005295276642, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 415 + }, + { + "completion_length": 8.40625, + "epoch": 0.07289293849658314, + "grad_norm": 22.852074729973108, + "kl": 0.03271484375, + "learning_rate": 9.272822849132644e-07, + "loss": -0.0311, + "reward": 1.9208898544311523, + "reward_std": 0.15941961109638214, + "rewards/accuracy_reward_stage2": 0.9365148544311523, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 416 + }, + { + "completion_length": 20.765625, + "epoch": 0.0730681619064307, + "grad_norm": 15.219734361543404, + "kl": 0.0341796875, + "learning_rate": 9.271070615034167e-07, + "loss": 0.0137, + "reward": 1.288794994354248, + "reward_std": 0.07320894300937653, + "rewards/accuracy_reward_stage2": 0.41379502415657043, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 417 + }, + { + "completion_length": 9.953125, + "epoch": 0.07324338531627826, + "grad_norm": 18.93618282416882, + "kl": 0.10009765625, + "learning_rate": 9.269318380935692e-07, + "loss": -0.0041, + "reward": 1.4674339294433594, + "reward_std": 0.21226957440376282, + "rewards/accuracy_reward_stage2": 0.483058899641037, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 418 + }, + { + "completion_length": 12.03125, + "epoch": 0.07341860872612581, + "grad_norm": 16.897489413117984, + "kl": 0.0244140625, + "learning_rate": 9.267566146837217e-07, + "loss": 0.0098, + "reward": 1.4678539037704468, + "reward_std": 0.16594135761260986, + "rewards/accuracy_reward_stage2": 0.4678539037704468, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 419 + }, + { + "completion_length": 15.53125, + "epoch": 0.07359383213597337, + "grad_norm": 24.996218069156303, + "kl": 0.37890625, + "learning_rate": 9.265813912738741e-07, + "loss": 0.1515, + "reward": 1.382279634475708, + "reward_std": 0.15624842047691345, + "rewards/accuracy_reward_stage2": 0.5072795748710632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 420 + }, + { + "completion_length": 9.75, + "epoch": 0.07376905554582092, + "grad_norm": 20.6302936890133, + "kl": 0.057373046875, + "learning_rate": 9.264061678640266e-07, + "loss": 0.023, + "reward": 1.6946684122085571, + "reward_std": 0.241998553276062, + "rewards/accuracy_reward_stage2": 0.6946684718132019, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 421 + }, + { + "completion_length": 13.046875, + "epoch": 0.07394427895566848, + "grad_norm": 18.246674428454657, + "kl": 0.09765625, + "learning_rate": 9.26230944454179e-07, + "loss": 0.01, + "reward": 1.5222277641296387, + "reward_std": 0.2169165462255478, + "rewards/accuracy_reward_stage2": 0.5378527641296387, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 422 + }, + { + "completion_length": 10.734375, + "epoch": 0.07411950236551604, + "grad_norm": 19.198742373526642, + "kl": 0.65625, + "learning_rate": 9.260557210443315e-07, + "loss": 0.2619, + "reward": 1.5240931510925293, + "reward_std": 0.22133302688598633, + "rewards/accuracy_reward_stage2": 0.7740931510925293, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 423 + }, + { + "completion_length": 12.75, + "epoch": 0.07429472577536358, + "grad_norm": 29.79110571390762, + "kl": 0.10302734375, + "learning_rate": 9.25880497634484e-07, + "loss": -0.003, + "reward": 1.3967804908752441, + "reward_std": 0.30737680196762085, + "rewards/accuracy_reward_stage2": 0.4124056100845337, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 424 + }, + { + "completion_length": 14.046875, + "epoch": 0.07446994918521115, + "grad_norm": 18.602900239899586, + "kl": 0.59765625, + "learning_rate": 9.257052742246364e-07, + "loss": 0.1944, + "reward": 1.226088285446167, + "reward_std": 0.21388447284698486, + "rewards/accuracy_reward_stage2": 0.36671334505081177, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 425 + }, + { + "completion_length": 5.5625, + "epoch": 0.0746451725950587, + "grad_norm": 23.651939037577094, + "kl": 0.03759765625, + "learning_rate": 9.255300508147889e-07, + "loss": 0.015, + "reward": 1.5, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward_stage2": 0.5, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 426 + }, + { + "completion_length": 11.671875, + "epoch": 0.07482039600490625, + "grad_norm": 24.861135751373336, + "kl": 0.087890625, + "learning_rate": 9.253548274049414e-07, + "loss": 0.0036, + "reward": 1.570847511291504, + "reward_std": 0.3195498585700989, + "rewards/accuracy_reward_stage2": 0.5864725112915039, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 427 + }, + { + "completion_length": 9.78125, + "epoch": 0.07499561941475381, + "grad_norm": 20.03380077267981, + "kl": 0.115234375, + "learning_rate": 9.251796039950936e-07, + "loss": 0.0461, + "reward": 1.5266039371490479, + "reward_std": 0.1536373347043991, + "rewards/accuracy_reward_stage2": 0.6516038179397583, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 428 + }, + { + "completion_length": 12.78125, + "epoch": 0.07517084282460136, + "grad_norm": 65.42975776154991, + "kl": 0.6875, + "learning_rate": 9.250043805852461e-07, + "loss": 0.2751, + "reward": 1.4417392015457153, + "reward_std": 0.29339122772216797, + "rewards/accuracy_reward_stage2": 0.6917392015457153, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 429 + }, + { + "completion_length": 9.421875, + "epoch": 0.07534606623444892, + "grad_norm": 20.85029400969406, + "kl": 0.0250244140625, + "learning_rate": 9.248291571753985e-07, + "loss": 0.01, + "reward": 1.3675525188446045, + "reward_std": 0.1472133994102478, + "rewards/accuracy_reward_stage2": 0.3675525486469269, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 430 + }, + { + "completion_length": 9.3125, + "epoch": 0.07552128964429648, + "grad_norm": 17.12020230180985, + "kl": 0.0150146484375, + "learning_rate": 9.24653933765551e-07, + "loss": 0.006, + "reward": 1.7181713581085205, + "reward_std": 0.13219161331653595, + "rewards/accuracy_reward_stage2": 0.7181712985038757, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 431 + }, + { + "completion_length": 9.453125, + "epoch": 0.07569651305414403, + "grad_norm": 19.27576364331227, + "kl": 0.0634765625, + "learning_rate": 9.244787103557035e-07, + "loss": 0.0254, + "reward": 1.5303231477737427, + "reward_std": 0.11499994993209839, + "rewards/accuracy_reward_stage2": 0.5303231477737427, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 432 + }, + { + "completion_length": 12.71875, + "epoch": 0.07587173646399159, + "grad_norm": 12.586077511720019, + "kl": 0.0157470703125, + "learning_rate": 9.243034869458559e-07, + "loss": 0.0063, + "reward": 1.4600911140441895, + "reward_std": 0.0769738256931305, + "rewards/accuracy_reward_stage2": 0.4600910544395447, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 433 + }, + { + "completion_length": 14.078125, + "epoch": 0.07604695987383915, + "grad_norm": 18.27881518277824, + "kl": 0.126953125, + "learning_rate": 9.241282635360084e-07, + "loss": 0.0507, + "reward": 1.414088487625122, + "reward_std": 0.14201867580413818, + "rewards/accuracy_reward_stage2": 0.5390884280204773, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 434 + }, + { + "completion_length": 14.734375, + "epoch": 0.0762221832836867, + "grad_norm": 22.462968714576906, + "kl": 0.1640625, + "learning_rate": 9.239530401261609e-07, + "loss": 0.0218, + "reward": 1.7298152446746826, + "reward_std": 0.18984611332416534, + "rewards/accuracy_reward_stage2": 0.7454402446746826, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 435 + }, + { + "completion_length": 26.984375, + "epoch": 0.07639740669353426, + "grad_norm": 37.120742747022334, + "kl": 0.5625, + "learning_rate": 9.237778167163133e-07, + "loss": 0.2252, + "reward": 1.5777366161346436, + "reward_std": 0.23983044922351837, + "rewards/accuracy_reward_stage2": 0.7027365565299988, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 436 + }, + { + "completion_length": 13.609375, + "epoch": 0.0765726301033818, + "grad_norm": 21.32126027829951, + "kl": 0.291015625, + "learning_rate": 9.236025933064658e-07, + "loss": 0.0866, + "reward": 1.5508217811584473, + "reward_std": 0.11196690797805786, + "rewards/accuracy_reward_stage2": 0.691446840763092, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 437 + }, + { + "completion_length": 13.125, + "epoch": 0.07674785351322937, + "grad_norm": 21.206277419580225, + "kl": 0.0830078125, + "learning_rate": 9.234273698966181e-07, + "loss": -0.0022, + "reward": 1.3115644454956055, + "reward_std": 0.2376328855752945, + "rewards/accuracy_reward_stage2": 0.45218944549560547, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 438 + }, + { + "completion_length": 15.359375, + "epoch": 0.07692307692307693, + "grad_norm": 26.441305499162887, + "kl": 0.69921875, + "learning_rate": 9.232521464867706e-07, + "loss": 0.2789, + "reward": 1.5329861640930176, + "reward_std": 0.2549724876880646, + "rewards/accuracy_reward_stage2": 0.6579861044883728, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 439 + }, + { + "completion_length": 7.375, + "epoch": 0.07709830033292447, + "grad_norm": 14.831592080269788, + "kl": 0.0184326171875, + "learning_rate": 9.230769230769231e-07, + "loss": 0.0074, + "reward": 1.5251660346984863, + "reward_std": 0.14086659252643585, + "rewards/accuracy_reward_stage2": 0.5251659154891968, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 440 + }, + { + "completion_length": 10.125, + "epoch": 0.07727352374277204, + "grad_norm": 18.685402304111808, + "kl": 0.078125, + "learning_rate": 9.229016996670754e-07, + "loss": 0.0314, + "reward": 1.652919054031372, + "reward_std": 0.17189809679985046, + "rewards/accuracy_reward_stage2": 0.6529191136360168, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 441 + }, + { + "completion_length": 9.203125, + "epoch": 0.0774487471526196, + "grad_norm": 26.496965759362393, + "kl": 0.01141357421875, + "learning_rate": 9.227264762572279e-07, + "loss": 0.0046, + "reward": 1.6230113506317139, + "reward_std": 0.20974057912826538, + "rewards/accuracy_reward_stage2": 0.6230113506317139, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 442 + }, + { + "completion_length": 7.078125, + "epoch": 0.07762397056246714, + "grad_norm": 17.947336239229962, + "kl": 0.01226806640625, + "learning_rate": 9.225512528473803e-07, + "loss": 0.0049, + "reward": 1.6121759414672852, + "reward_std": 0.21027937531471252, + "rewards/accuracy_reward_stage2": 0.6121759414672852, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 443 + }, + { + "completion_length": 10.046875, + "epoch": 0.0777991939723147, + "grad_norm": 20.823151026736113, + "kl": 0.0206298828125, + "learning_rate": 9.223760294375328e-07, + "loss": -0.0359, + "reward": 1.5672154426574707, + "reward_std": 0.18561364710330963, + "rewards/accuracy_reward_stage2": 0.5828403234481812, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 444 + }, + { + "completion_length": 18.609375, + "epoch": 0.07797441738216225, + "grad_norm": 20.451095462177015, + "kl": 0.08203125, + "learning_rate": 9.222008060276853e-07, + "loss": 0.0329, + "reward": 1.4407696723937988, + "reward_std": 0.23721659183502197, + "rewards/accuracy_reward_stage2": 0.44076964259147644, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 445 + }, + { + "completion_length": 9.75, + "epoch": 0.07814964079200981, + "grad_norm": 16.20511513569212, + "kl": 0.0439453125, + "learning_rate": 9.220255826178377e-07, + "loss": 0.0176, + "reward": 1.6019539833068848, + "reward_std": 0.21476775407791138, + "rewards/accuracy_reward_stage2": 0.6019538640975952, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 446 + }, + { + "completion_length": 8.578125, + "epoch": 0.07832486420185737, + "grad_norm": 21.53926182512, + "kl": 0.061279296875, + "learning_rate": 9.218503592079901e-07, + "loss": -0.0197, + "reward": 1.5372408628463745, + "reward_std": 0.29406264424324036, + "rewards/accuracy_reward_stage2": 0.5528658628463745, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 447 + }, + { + "completion_length": 8.3125, + "epoch": 0.07850008761170492, + "grad_norm": 20.87221627271646, + "kl": 0.041259765625, + "learning_rate": 9.216751357981426e-07, + "loss": -0.0151, + "reward": 1.622206449508667, + "reward_std": 0.32090505957603455, + "rewards/accuracy_reward_stage2": 0.637831449508667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 448 + }, + { + "completion_length": 9.15625, + "epoch": 0.07867531102155248, + "grad_norm": 16.58816774301801, + "kl": 0.041748046875, + "learning_rate": 9.21499912388295e-07, + "loss": 0.0166, + "reward": 1.6170015335083008, + "reward_std": 0.19486872851848602, + "rewards/accuracy_reward_stage2": 0.617001473903656, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 449 + }, + { + "completion_length": 12.109375, + "epoch": 0.07885053443140004, + "grad_norm": 17.47196711067821, + "kl": 0.5703125, + "learning_rate": 9.213246889784475e-07, + "loss": 0.2271, + "reward": 1.5230088233947754, + "reward_std": 0.13385896384716034, + "rewards/accuracy_reward_stage2": 0.6480089426040649, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 450 + }, + { + "completion_length": 8.046875, + "epoch": 0.07902575784124759, + "grad_norm": 14.006565039229681, + "kl": 0.041748046875, + "learning_rate": 9.211494655685999e-07, + "loss": -0.0167, + "reward": 1.8111279010772705, + "reward_std": 0.2181154489517212, + "rewards/accuracy_reward_stage2": 0.8267529010772705, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 451 + }, + { + "completion_length": 10.4375, + "epoch": 0.07920098125109515, + "grad_norm": 13.338040288753435, + "kl": 0.04833984375, + "learning_rate": 9.209742421587524e-07, + "loss": 0.0194, + "reward": 1.522031307220459, + "reward_std": 0.11794281750917435, + "rewards/accuracy_reward_stage2": 0.5220313668251038, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 452 + }, + { + "completion_length": 14.875, + "epoch": 0.0793762046609427, + "grad_norm": 18.92124589768796, + "kl": 0.4453125, + "learning_rate": 9.207990187489049e-07, + "loss": 0.1449, + "reward": 1.508528232574463, + "reward_std": 0.17721973359584808, + "rewards/accuracy_reward_stage2": 0.6491532325744629, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 453 + }, + { + "completion_length": 7.0625, + "epoch": 0.07955142807079026, + "grad_norm": 11.358183463925144, + "kl": 0.028564453125, + "learning_rate": 9.206237953390572e-07, + "loss": 0.0114, + "reward": 1.5925219058990479, + "reward_std": 0.06521537899971008, + "rewards/accuracy_reward_stage2": 0.5925219058990479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 454 + }, + { + "completion_length": 13.53125, + "epoch": 0.07972665148063782, + "grad_norm": 19.019904562195457, + "kl": 0.08203125, + "learning_rate": 9.204485719292097e-07, + "loss": -0.011, + "reward": 1.2978602647781372, + "reward_std": 0.13462838530540466, + "rewards/accuracy_reward_stage2": 0.3134852647781372, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 455 + }, + { + "completion_length": 11.421875, + "epoch": 0.07990187489048536, + "grad_norm": 23.626297967066854, + "kl": 0.1171875, + "learning_rate": 9.202733485193622e-07, + "loss": -0.0206, + "reward": 1.5690983533859253, + "reward_std": 0.20688967406749725, + "rewards/accuracy_reward_stage2": 0.6003483533859253, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 456 + }, + { + "completion_length": 12.046875, + "epoch": 0.08007709830033292, + "grad_norm": 22.91529648896093, + "kl": 0.47265625, + "learning_rate": 9.200981251095145e-07, + "loss": 0.1892, + "reward": 1.5967607498168945, + "reward_std": 0.2147434800863266, + "rewards/accuracy_reward_stage2": 0.7217606902122498, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 457 + }, + { + "completion_length": 13.40625, + "epoch": 0.08025232171018049, + "grad_norm": 23.42855017991445, + "kl": 0.09228515625, + "learning_rate": 9.19922901699667e-07, + "loss": 0.0368, + "reward": 1.515872836112976, + "reward_std": 0.1746286153793335, + "rewards/accuracy_reward_stage2": 0.5158728361129761, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 458 + }, + { + "completion_length": 8.53125, + "epoch": 0.08042754512002803, + "grad_norm": 19.36224437018435, + "kl": 0.07177734375, + "learning_rate": 9.197476782898194e-07, + "loss": 0.0287, + "reward": 1.6671662330627441, + "reward_std": 0.20749810338020325, + "rewards/accuracy_reward_stage2": 0.6671661734580994, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 459 + }, + { + "completion_length": 9.484375, + "epoch": 0.08060276852987559, + "grad_norm": 18.931505006258217, + "kl": 0.037109375, + "learning_rate": 9.195724548799719e-07, + "loss": 0.0148, + "reward": 1.524126648902893, + "reward_std": 0.16327914595603943, + "rewards/accuracy_reward_stage2": 0.5241267085075378, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 460 + }, + { + "completion_length": 8.859375, + "epoch": 0.08077799193972315, + "grad_norm": 21.76679733035785, + "kl": 0.072265625, + "learning_rate": 9.193972314701244e-07, + "loss": 0.0289, + "reward": 1.5988078117370605, + "reward_std": 0.2698323130607605, + "rewards/accuracy_reward_stage2": 0.5988078713417053, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 461 + }, + { + "completion_length": 12.75, + "epoch": 0.0809532153495707, + "grad_norm": 259.33214865484473, + "kl": 0.5390625, + "learning_rate": 9.192220080602768e-07, + "loss": 0.1409, + "reward": 1.3012222051620483, + "reward_std": 0.2742873728275299, + "rewards/accuracy_reward_stage2": 0.4574722647666931, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 462 + }, + { + "completion_length": 10.4375, + "epoch": 0.08112843875941826, + "grad_norm": 15.7184925250878, + "kl": 0.052978515625, + "learning_rate": 9.190467846504293e-07, + "loss": 0.0212, + "reward": 1.3793818950653076, + "reward_std": 0.14053119719028473, + "rewards/accuracy_reward_stage2": 0.5043818950653076, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 463 + }, + { + "completion_length": 8.359375, + "epoch": 0.08130366216926581, + "grad_norm": 18.581197943613866, + "kl": 0.032470703125, + "learning_rate": 9.188715612405818e-07, + "loss": 0.013, + "reward": 1.5649161338806152, + "reward_std": 0.12751111388206482, + "rewards/accuracy_reward_stage2": 0.5649161338806152, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 464 + }, + { + "completion_length": 9.296875, + "epoch": 0.08147888557911337, + "grad_norm": 13.875201754235798, + "kl": 0.09765625, + "learning_rate": 9.186963378307342e-07, + "loss": 0.039, + "reward": 1.5525639057159424, + "reward_std": 0.12691347301006317, + "rewards/accuracy_reward_stage2": 0.5525639653205872, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 465 + }, + { + "completion_length": 14.25, + "epoch": 0.08165410898896093, + "grad_norm": 18.189817374897615, + "kl": 0.0244140625, + "learning_rate": 9.185211144208866e-07, + "loss": 0.0098, + "reward": 1.468153476715088, + "reward_std": 0.0900755375623703, + "rewards/accuracy_reward_stage2": 0.5931534767150879, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 466 + }, + { + "completion_length": 23.375, + "epoch": 0.08182933239880848, + "grad_norm": 26.444868554171173, + "kl": 0.05419921875, + "learning_rate": 9.183458910110389e-07, + "loss": 0.0217, + "reward": 1.593287467956543, + "reward_std": 0.23012332618236542, + "rewards/accuracy_reward_stage2": 0.5932875871658325, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 467 + }, + { + "completion_length": 27.75, + "epoch": 0.08200455580865604, + "grad_norm": 8966.745368887247, + "kl": 55.75, + "learning_rate": 9.181706676011914e-07, + "loss": 22.3474, + "reward": 1.203883171081543, + "reward_std": 0.20250242948532104, + "rewards/accuracy_reward_stage2": 0.32888320088386536, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 468 + }, + { + "completion_length": 9.78125, + "epoch": 0.0821797792185036, + "grad_norm": 11.436562594290804, + "kl": 0.0133056640625, + "learning_rate": 9.179954441913439e-07, + "loss": 0.0053, + "reward": 1.7812447547912598, + "reward_std": 0.04915858805179596, + "rewards/accuracy_reward_stage2": 0.9062446355819702, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 469 + }, + { + "completion_length": 10.078125, + "epoch": 0.08235500262835115, + "grad_norm": 21.126468460943666, + "kl": 0.06103515625, + "learning_rate": 9.178202207814963e-07, + "loss": -0.0045, + "reward": 1.70155930519104, + "reward_std": 0.2857385575771332, + "rewards/accuracy_reward_stage2": 0.7171843647956848, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 470 + }, + { + "completion_length": 14.890625, + "epoch": 0.0825302260381987, + "grad_norm": 19.202675304680323, + "kl": 0.07958984375, + "learning_rate": 9.176449973716488e-07, + "loss": 0.0027, + "reward": 1.693793773651123, + "reward_std": 0.1584872305393219, + "rewards/accuracy_reward_stage2": 0.709418773651123, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 471 + }, + { + "completion_length": 9.6875, + "epoch": 0.08270544944804625, + "grad_norm": 16.471679971821743, + "kl": 0.057861328125, + "learning_rate": 9.174697739618013e-07, + "loss": 0.0231, + "reward": 1.8260822296142578, + "reward_std": 0.0860002413392067, + "rewards/accuracy_reward_stage2": 0.8260822892189026, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 472 + }, + { + "completion_length": 11.15625, + "epoch": 0.08288067285789381, + "grad_norm": 23.791810653654068, + "kl": 0.287109375, + "learning_rate": 9.172945505519537e-07, + "loss": 0.1152, + "reward": 1.402888536453247, + "reward_std": 0.2622658908367157, + "rewards/accuracy_reward_stage2": 0.5278885364532471, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 473 + }, + { + "completion_length": 17.8125, + "epoch": 0.08305589626774137, + "grad_norm": 25.791584456588865, + "kl": 0.115234375, + "learning_rate": 9.171193271421062e-07, + "loss": 0.046, + "reward": 1.5935009717941284, + "reward_std": 0.1915404051542282, + "rewards/accuracy_reward_stage2": 0.5935010313987732, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 474 + }, + { + "completion_length": 10.90625, + "epoch": 0.08323111967758892, + "grad_norm": 24.400165974590752, + "kl": 0.09716796875, + "learning_rate": 9.169441037322586e-07, + "loss": 0.0389, + "reward": 1.6456031799316406, + "reward_std": 0.29340386390686035, + "rewards/accuracy_reward_stage2": 0.6456031799316406, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 475 + }, + { + "completion_length": 11.03125, + "epoch": 0.08340634308743648, + "grad_norm": 22.10942774313132, + "kl": 0.08544921875, + "learning_rate": 9.167688803224111e-07, + "loss": 0.0342, + "reward": 1.499727487564087, + "reward_std": 0.16826727986335754, + "rewards/accuracy_reward_stage2": 0.49972739815711975, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 476 + }, + { + "completion_length": 16.078125, + "epoch": 0.08358156649728404, + "grad_norm": 25.856498433525953, + "kl": 0.322265625, + "learning_rate": 9.165936569125636e-07, + "loss": 0.129, + "reward": 1.40625, + "reward_std": 0.1293872892856598, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 477 + }, + { + "completion_length": 11.671875, + "epoch": 0.08375678990713159, + "grad_norm": 22.729798625639337, + "kl": 0.07177734375, + "learning_rate": 9.164184335027159e-07, + "loss": -0.0026, + "reward": 1.5009512901306152, + "reward_std": 0.3437076210975647, + "rewards/accuracy_reward_stage2": 0.51657634973526, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 478 + }, + { + "completion_length": 21.171875, + "epoch": 0.08393201331697915, + "grad_norm": 118.09404479510592, + "kl": 1.09375, + "learning_rate": 9.162432100928683e-07, + "loss": 0.4047, + "reward": 1.2600040435791016, + "reward_std": 0.1994476616382599, + "rewards/accuracy_reward_stage2": 0.525628924369812, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 479 + }, + { + "completion_length": 13.625, + "epoch": 0.0841072367268267, + "grad_norm": 22.19379717741815, + "kl": 0.06103515625, + "learning_rate": 9.160679866830208e-07, + "loss": -0.0089, + "reward": 1.350834846496582, + "reward_std": 0.29887160658836365, + "rewards/accuracy_reward_stage2": 0.36645978689193726, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 480 + }, + { + "completion_length": 12.46875, + "epoch": 0.08428246013667426, + "grad_norm": 17.494509829942675, + "kl": 0.0361328125, + "learning_rate": 9.158927632731732e-07, + "loss": -0.0732, + "reward": 1.7071616649627686, + "reward_std": 0.26007020473480225, + "rewards/accuracy_reward_stage2": 0.7384116053581238, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 481 + }, + { + "completion_length": 9.5625, + "epoch": 0.08445768354652182, + "grad_norm": 26.620563297769827, + "kl": 0.1201171875, + "learning_rate": 9.157175398633257e-07, + "loss": -0.0193, + "reward": 1.4905469417572021, + "reward_std": 0.2239820957183838, + "rewards/accuracy_reward_stage2": 0.5217969417572021, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 482 + }, + { + "completion_length": 8.921875, + "epoch": 0.08463290695636937, + "grad_norm": 20.51588760061051, + "kl": 0.05322265625, + "learning_rate": 9.155423164534781e-07, + "loss": 0.0213, + "reward": 1.6132653951644897, + "reward_std": 0.21259036660194397, + "rewards/accuracy_reward_stage2": 0.6132654547691345, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 483 + }, + { + "completion_length": 15.375, + "epoch": 0.08480813036621693, + "grad_norm": 20.733659007741288, + "kl": 0.57421875, + "learning_rate": 9.153670930436306e-07, + "loss": 0.2298, + "reward": 1.191416621208191, + "reward_std": 0.15625491738319397, + "rewards/accuracy_reward_stage2": 0.44141659140586853, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 484 + }, + { + "completion_length": 9.234375, + "epoch": 0.08498335377606449, + "grad_norm": 17.187443879963023, + "kl": 0.06884765625, + "learning_rate": 9.151918696337831e-07, + "loss": 0.0276, + "reward": 1.4177746772766113, + "reward_std": 0.16705992817878723, + "rewards/accuracy_reward_stage2": 0.5427746772766113, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 485 + }, + { + "completion_length": 12.234375, + "epoch": 0.08515857718591203, + "grad_norm": 25.971877155932173, + "kl": 0.11669921875, + "learning_rate": 9.150166462239355e-07, + "loss": 0.0468, + "reward": 1.5548030138015747, + "reward_std": 0.32056811451911926, + "rewards/accuracy_reward_stage2": 0.6798031330108643, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 486 + }, + { + "completion_length": 10.265625, + "epoch": 0.0853338005957596, + "grad_norm": 21.44425590514259, + "kl": 0.023681640625, + "learning_rate": 9.148414228140879e-07, + "loss": 0.0095, + "reward": 1.6888264417648315, + "reward_std": 0.18528994917869568, + "rewards/accuracy_reward_stage2": 0.6888264417648315, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 487 + }, + { + "completion_length": 11.0625, + "epoch": 0.08550902400560714, + "grad_norm": 15.23099896578219, + "kl": 0.6015625, + "learning_rate": 9.146661994042404e-07, + "loss": 0.2397, + "reward": 1.6460518836975098, + "reward_std": 0.07517996430397034, + "rewards/accuracy_reward_stage2": 0.7710518836975098, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 488 + }, + { + "completion_length": 12.421875, + "epoch": 0.0856842474154547, + "grad_norm": 18.419604037862406, + "kl": 0.06494140625, + "learning_rate": 9.144909759943928e-07, + "loss": 0.026, + "reward": 1.7371560335159302, + "reward_std": 0.2151593118906021, + "rewards/accuracy_reward_stage2": 0.7371560335159302, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 489 + }, + { + "completion_length": 12.578125, + "epoch": 0.08585947082530226, + "grad_norm": 18.257266241360327, + "kl": 0.0213623046875, + "learning_rate": 9.143157525845453e-07, + "loss": 0.0085, + "reward": 1.5157694816589355, + "reward_std": 0.23306876420974731, + "rewards/accuracy_reward_stage2": 0.5157694220542908, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 490 + }, + { + "completion_length": 15.25, + "epoch": 0.08603469423514981, + "grad_norm": 13.805727437190503, + "kl": 0.01123046875, + "learning_rate": 9.141405291746977e-07, + "loss": 0.0045, + "reward": 1.2439332008361816, + "reward_std": 0.136602520942688, + "rewards/accuracy_reward_stage2": 0.36893314123153687, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 491 + }, + { + "completion_length": 11.859375, + "epoch": 0.08620991764499737, + "grad_norm": 17.94928563968683, + "kl": 0.032958984375, + "learning_rate": 9.139653057648501e-07, + "loss": 0.0131, + "reward": 1.6556003093719482, + "reward_std": 0.11049705743789673, + "rewards/accuracy_reward_stage2": 0.655600368976593, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 492 + }, + { + "completion_length": 11.578125, + "epoch": 0.08638514105484493, + "grad_norm": 16.562608575277626, + "kl": 0.09228515625, + "learning_rate": 9.137900823550026e-07, + "loss": 0.037, + "reward": 1.4523134231567383, + "reward_std": 0.13723313808441162, + "rewards/accuracy_reward_stage2": 0.5773133635520935, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 493 + }, + { + "completion_length": 7.03125, + "epoch": 0.08656036446469248, + "grad_norm": 16.621714925514897, + "kl": 0.044189453125, + "learning_rate": 9.13614858945155e-07, + "loss": -0.0917, + "reward": 1.780239224433899, + "reward_std": 0.18271209299564362, + "rewards/accuracy_reward_stage2": 0.8271142244338989, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 494 + }, + { + "completion_length": 11.453125, + "epoch": 0.08673558787454004, + "grad_norm": 26.540387912232912, + "kl": 0.05908203125, + "learning_rate": 9.134396355353075e-07, + "loss": 0.0236, + "reward": 1.4195480346679688, + "reward_std": 0.12336726486682892, + "rewards/accuracy_reward_stage2": 0.544547975063324, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 495 + }, + { + "completion_length": 9.90625, + "epoch": 0.08691081128438759, + "grad_norm": 21.874119279086194, + "kl": 0.0732421875, + "learning_rate": 9.1326441212546e-07, + "loss": 0.0293, + "reward": 1.704089641571045, + "reward_std": 0.1853123903274536, + "rewards/accuracy_reward_stage2": 0.7040896415710449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 496 + }, + { + "completion_length": 17.984375, + "epoch": 0.08708603469423515, + "grad_norm": 16.270959865875156, + "kl": 0.0712890625, + "learning_rate": 9.130891887156123e-07, + "loss": -0.0157, + "reward": 1.3348397016525269, + "reward_std": 0.14666268229484558, + "rewards/accuracy_reward_stage2": 0.47546470165252686, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 497 + }, + { + "completion_length": 8.953125, + "epoch": 0.08726125810408271, + "grad_norm": 20.903866923336594, + "kl": 0.09326171875, + "learning_rate": 9.129139653057648e-07, + "loss": -0.0069, + "reward": 1.4990664720535278, + "reward_std": 0.2881520092487335, + "rewards/accuracy_reward_stage2": 0.6396914720535278, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 498 + }, + { + "completion_length": 14.921875, + "epoch": 0.08743648151393026, + "grad_norm": 17.22315948392657, + "kl": 0.1201171875, + "learning_rate": 9.127387418959172e-07, + "loss": 0.0478, + "reward": 1.448919653892517, + "reward_std": 0.1353299617767334, + "rewards/accuracy_reward_stage2": 0.5739197134971619, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 499 + }, + { + "completion_length": 7.46875, + "epoch": 0.08761170492377782, + "grad_norm": 16.0429762085806, + "kl": 0.02734375, + "learning_rate": 9.125635184860697e-07, + "loss": -0.0225, + "reward": 1.3645137548446655, + "reward_std": 0.20214247703552246, + "rewards/accuracy_reward_stage2": 0.38013872504234314, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 500 + }, + { + "completion_length": 7.109375, + "epoch": 0.08778692833362538, + "grad_norm": 13.494480051498577, + "kl": 0.01904296875, + "learning_rate": 9.123882950762222e-07, + "loss": 0.0076, + "reward": 1.6657228469848633, + "reward_std": 0.10837189853191376, + "rewards/accuracy_reward_stage2": 0.6657228469848633, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 501 + }, + { + "completion_length": 11.078125, + "epoch": 0.08796215174347292, + "grad_norm": 20.820818788522654, + "kl": 0.041748046875, + "learning_rate": 9.122130716663746e-07, + "loss": -0.0113, + "reward": 1.6971039772033691, + "reward_std": 0.17723074555397034, + "rewards/accuracy_reward_stage2": 0.7127288579940796, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 502 + }, + { + "completion_length": 8.9375, + "epoch": 0.08813737515332049, + "grad_norm": 20.591576189544814, + "kl": 0.0859375, + "learning_rate": 9.120378482565271e-07, + "loss": 0.0345, + "reward": 1.5953072309494019, + "reward_std": 0.14238935708999634, + "rewards/accuracy_reward_stage2": 0.5953072309494019, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 503 + }, + { + "completion_length": 10.0, + "epoch": 0.08831259856316805, + "grad_norm": 21.333154774702535, + "kl": 0.057861328125, + "learning_rate": 9.118626248466796e-07, + "loss": 0.0232, + "reward": 1.5268161296844482, + "reward_std": 0.17292073369026184, + "rewards/accuracy_reward_stage2": 0.6518160700798035, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 504 + }, + { + "completion_length": 6.71875, + "epoch": 0.08848782197301559, + "grad_norm": 15.041117262517627, + "kl": 0.041259765625, + "learning_rate": 9.116874014368319e-07, + "loss": 0.0166, + "reward": 1.7544504404067993, + "reward_std": 0.07580053806304932, + "rewards/accuracy_reward_stage2": 0.7544504404067993, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 505 + }, + { + "completion_length": 7.59375, + "epoch": 0.08866304538286315, + "grad_norm": 26.12056744776605, + "kl": 0.150390625, + "learning_rate": 9.115121780269844e-07, + "loss": 0.0602, + "reward": 1.3547911643981934, + "reward_std": 0.2755330204963684, + "rewards/accuracy_reward_stage2": 0.47979116439819336, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 506 + }, + { + "completion_length": 9.953125, + "epoch": 0.0888382687927107, + "grad_norm": 29.23205546489329, + "kl": 0.042724609375, + "learning_rate": 9.113369546171367e-07, + "loss": 0.0171, + "reward": 1.3176989555358887, + "reward_std": 0.16319842636585236, + "rewards/accuracy_reward_stage2": 0.44269901514053345, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 507 + }, + { + "completion_length": 9.015625, + "epoch": 0.08901349220255826, + "grad_norm": 20.57669358054299, + "kl": 0.0478515625, + "learning_rate": 9.111617312072892e-07, + "loss": 0.0191, + "reward": 1.525465965270996, + "reward_std": 0.12524645030498505, + "rewards/accuracy_reward_stage2": 0.5254659652709961, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 508 + }, + { + "completion_length": 12.09375, + "epoch": 0.08918871561240582, + "grad_norm": 1464.9969971244047, + "kl": 2.078125, + "learning_rate": 9.109865077974417e-07, + "loss": 0.8816, + "reward": 1.3966069221496582, + "reward_std": 0.1975216120481491, + "rewards/accuracy_reward_stage2": 0.5216069221496582, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 509 + }, + { + "completion_length": 15.703125, + "epoch": 0.08936393902225337, + "grad_norm": 36.949477081771, + "kl": 0.47265625, + "learning_rate": 9.108112843875941e-07, + "loss": 0.1891, + "reward": 1.4456298351287842, + "reward_std": 0.17112982273101807, + "rewards/accuracy_reward_stage2": 0.570629894733429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 510 + }, + { + "completion_length": 11.75, + "epoch": 0.08953916243210093, + "grad_norm": 33.871343877158345, + "kl": 0.21484375, + "learning_rate": 9.106360609777466e-07, + "loss": 0.0186, + "reward": 1.18915593624115, + "reward_std": 0.30198466777801514, + "rewards/accuracy_reward_stage2": 0.4704058766365051, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 511 + }, + { + "completion_length": 15.96875, + "epoch": 0.08971438584194849, + "grad_norm": 20.09570532536062, + "kl": 0.07470703125, + "learning_rate": 9.10460837567899e-07, + "loss": 0.0298, + "reward": 1.4191932678222656, + "reward_std": 0.1313377022743225, + "rewards/accuracy_reward_stage2": 0.41919323801994324, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 512 + }, + { + "completion_length": 8.046875, + "epoch": 0.08988960925179604, + "grad_norm": 22.475214131805384, + "kl": 0.09375, + "learning_rate": 9.102856141580515e-07, + "loss": 0.0108, + "reward": 1.6266400814056396, + "reward_std": 0.2724280059337616, + "rewards/accuracy_reward_stage2": 0.6422651410102844, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 513 + }, + { + "completion_length": 15.125, + "epoch": 0.0900648326616436, + "grad_norm": 19.065789675442435, + "kl": 0.5703125, + "learning_rate": 9.10110390748204e-07, + "loss": 0.2284, + "reward": 1.4352792501449585, + "reward_std": 0.20429880917072296, + "rewards/accuracy_reward_stage2": 0.5602791905403137, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 514 + }, + { + "completion_length": 7.03125, + "epoch": 0.09024005607149115, + "grad_norm": 41.06172188165604, + "kl": 0.298828125, + "learning_rate": 9.099351673383564e-07, + "loss": 0.0861, + "reward": 1.4946836233139038, + "reward_std": 0.3347545266151428, + "rewards/accuracy_reward_stage2": 0.5103086233139038, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 515 + }, + { + "completion_length": 16.71875, + "epoch": 0.0904152794813387, + "grad_norm": 21.74849083261524, + "kl": 0.021728515625, + "learning_rate": 9.097599439285089e-07, + "loss": -0.0355, + "reward": 1.4019594192504883, + "reward_std": 0.2302931249141693, + "rewards/accuracy_reward_stage2": 0.5425843000411987, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 516 + }, + { + "completion_length": 7.921875, + "epoch": 0.09059050289118627, + "grad_norm": 20.46413969828866, + "kl": 0.072265625, + "learning_rate": 9.095847205186612e-07, + "loss": 0.0288, + "reward": 1.7358605861663818, + "reward_std": 0.22160489857196808, + "rewards/accuracy_reward_stage2": 0.7358605265617371, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 517 + }, + { + "completion_length": 8.359375, + "epoch": 0.09076572630103381, + "grad_norm": 19.461831091625143, + "kl": 0.0400390625, + "learning_rate": 9.094094971088136e-07, + "loss": 0.0159, + "reward": 1.5917770862579346, + "reward_std": 0.20575006306171417, + "rewards/accuracy_reward_stage2": 0.5917772054672241, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 518 + }, + { + "completion_length": 23.453125, + "epoch": 0.09094094971088137, + "grad_norm": 28.252730679984083, + "kl": 0.486328125, + "learning_rate": 9.092342736989661e-07, + "loss": 0.1949, + "reward": 1.470628261566162, + "reward_std": 0.19762389361858368, + "rewards/accuracy_reward_stage2": 0.5956283211708069, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 519 + }, + { + "completion_length": 7.578125, + "epoch": 0.09111617312072894, + "grad_norm": 20.000340904813825, + "kl": 0.04638671875, + "learning_rate": 9.090590502891185e-07, + "loss": 0.0185, + "reward": 1.4284979104995728, + "reward_std": 0.12627477943897247, + "rewards/accuracy_reward_stage2": 0.428497850894928, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 520 + }, + { + "completion_length": 10.96875, + "epoch": 0.09129139653057648, + "grad_norm": 19.089316113977233, + "kl": 0.109375, + "learning_rate": 9.08883826879271e-07, + "loss": 0.0438, + "reward": 1.519882321357727, + "reward_std": 0.12701007723808289, + "rewards/accuracy_reward_stage2": 0.644882321357727, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 521 + }, + { + "completion_length": 10.65625, + "epoch": 0.09146661994042404, + "grad_norm": 17.661386673905188, + "kl": 0.051025390625, + "learning_rate": 9.087086034694235e-07, + "loss": 0.0204, + "reward": 1.499420404434204, + "reward_std": 0.16511984169483185, + "rewards/accuracy_reward_stage2": 0.4994203746318817, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 522 + }, + { + "completion_length": 21.40625, + "epoch": 0.09164184335027159, + "grad_norm": 16.986774971855642, + "kl": 0.0859375, + "learning_rate": 9.085333800595759e-07, + "loss": 0.0344, + "reward": 1.2771297693252563, + "reward_std": 0.15670299530029297, + "rewards/accuracy_reward_stage2": 0.40212973952293396, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 523 + }, + { + "completion_length": 6.5625, + "epoch": 0.09181706676011915, + "grad_norm": 13.33918575428173, + "kl": 0.06689453125, + "learning_rate": 9.083581566497284e-07, + "loss": 0.0268, + "reward": 1.2362689971923828, + "reward_std": 0.01895066723227501, + "rewards/accuracy_reward_stage2": 0.36126893758773804, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 524 + }, + { + "completion_length": 8.15625, + "epoch": 0.09199229016996671, + "grad_norm": 18.44676368745436, + "kl": 0.0888671875, + "learning_rate": 9.081829332398809e-07, + "loss": 0.0357, + "reward": 1.4270833730697632, + "reward_std": 0.2298484742641449, + "rewards/accuracy_reward_stage2": 0.4270833730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 525 + }, + { + "completion_length": 11.53125, + "epoch": 0.09216751357981426, + "grad_norm": 24.122259978679597, + "kl": 0.0791015625, + "learning_rate": 9.080077098300333e-07, + "loss": 0.0026, + "reward": 1.4625226259231567, + "reward_std": 0.24698016047477722, + "rewards/accuracy_reward_stage2": 0.603147566318512, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 526 + }, + { + "completion_length": 7.9375, + "epoch": 0.09234273698966182, + "grad_norm": 24.15433980688675, + "kl": 0.03173828125, + "learning_rate": 9.078324864201857e-07, + "loss": 0.0127, + "reward": 1.3791133165359497, + "reward_std": 0.21480947732925415, + "rewards/accuracy_reward_stage2": 0.37911322712898254, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 527 + }, + { + "completion_length": 11.390625, + "epoch": 0.09251796039950938, + "grad_norm": 24.953994379295434, + "kl": 0.1103515625, + "learning_rate": 9.076572630103381e-07, + "loss": 0.0344, + "reward": 1.471587896347046, + "reward_std": 0.2744523882865906, + "rewards/accuracy_reward_stage2": 0.5965878963470459, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 528 + }, + { + "completion_length": 10.671875, + "epoch": 0.09269318380935693, + "grad_norm": 27.436250870990296, + "kl": 0.09521484375, + "learning_rate": 9.074820396004906e-07, + "loss": 0.038, + "reward": 1.618800401687622, + "reward_std": 0.2890097498893738, + "rewards/accuracy_reward_stage2": 0.6188005208969116, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 529 + }, + { + "completion_length": 8.40625, + "epoch": 0.09286840721920449, + "grad_norm": 23.698633010265233, + "kl": 0.09130859375, + "learning_rate": 9.07306816190643e-07, + "loss": -0.0077, + "reward": 1.6275365352630615, + "reward_std": 0.30690711736679077, + "rewards/accuracy_reward_stage2": 0.7681615352630615, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 530 + }, + { + "completion_length": 13.046875, + "epoch": 0.09304363062905203, + "grad_norm": 21.264761614556896, + "kl": 0.06787109375, + "learning_rate": 9.071315927807954e-07, + "loss": 0.0272, + "reward": 1.5593863725662231, + "reward_std": 0.16855500638484955, + "rewards/accuracy_reward_stage2": 0.5593863725662231, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 531 + }, + { + "completion_length": 8.796875, + "epoch": 0.0932188540388996, + "grad_norm": 19.428518945294773, + "kl": 0.0859375, + "learning_rate": 9.069563693709479e-07, + "loss": 0.0344, + "reward": 1.565098762512207, + "reward_std": 0.3714370131492615, + "rewards/accuracy_reward_stage2": 0.5650988221168518, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 532 + }, + { + "completion_length": 10.15625, + "epoch": 0.09339407744874716, + "grad_norm": 20.04251433090847, + "kl": 0.019287109375, + "learning_rate": 9.067811459611004e-07, + "loss": -0.0364, + "reward": 1.355189323425293, + "reward_std": 0.1609458327293396, + "rewards/accuracy_reward_stage2": 0.37081438302993774, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 533 + }, + { + "completion_length": 6.796875, + "epoch": 0.0935693008585947, + "grad_norm": 20.797059105787657, + "kl": 0.039306640625, + "learning_rate": 9.066059225512528e-07, + "loss": -0.0054, + "reward": 1.5364583730697632, + "reward_std": 0.2956216037273407, + "rewards/accuracy_reward_stage2": 0.5520833730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 534 + }, + { + "completion_length": 14.875, + "epoch": 0.09374452426844226, + "grad_norm": 24.920972156704856, + "kl": 0.69140625, + "learning_rate": 9.064306991414053e-07, + "loss": 0.2752, + "reward": 1.4657204151153564, + "reward_std": 0.2909356355667114, + "rewards/accuracy_reward_stage2": 0.5907202959060669, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 535 + }, + { + "completion_length": 5.765625, + "epoch": 0.09391974767828982, + "grad_norm": 11.554153440169287, + "kl": 0.0172119140625, + "learning_rate": 9.062554757315576e-07, + "loss": 0.0069, + "reward": 1.587594747543335, + "reward_std": 0.11471574753522873, + "rewards/accuracy_reward_stage2": 0.7125946879386902, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 536 + }, + { + "completion_length": 14.5, + "epoch": 0.09409497108813737, + "grad_norm": 18.408333407487323, + "kl": 0.06005859375, + "learning_rate": 9.060802523217101e-07, + "loss": 0.0241, + "reward": 1.280574083328247, + "reward_std": 0.1755804419517517, + "rewards/accuracy_reward_stage2": 0.40557414293289185, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 537 + }, + { + "completion_length": 8.171875, + "epoch": 0.09427019449798493, + "grad_norm": 24.09543191878335, + "kl": 0.2158203125, + "learning_rate": 9.059050289118626e-07, + "loss": 0.042, + "reward": 1.2124801874160767, + "reward_std": 0.25265681743621826, + "rewards/accuracy_reward_stage2": 0.47810518741607666, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 538 + }, + { + "completion_length": 29.546875, + "epoch": 0.09444541790783248, + "grad_norm": 21.332602397727992, + "kl": 0.0361328125, + "learning_rate": 9.05729805502015e-07, + "loss": 0.0144, + "reward": 1.2393457889556885, + "reward_std": 0.20028795301914215, + "rewards/accuracy_reward_stage2": 0.36434581875801086, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 539 + }, + { + "completion_length": 19.875, + "epoch": 0.09462064131768004, + "grad_norm": 23.21808555459532, + "kl": 0.7265625, + "learning_rate": 9.055545820921675e-07, + "loss": 0.292, + "reward": 1.217187523841858, + "reward_std": 0.20525680482387543, + "rewards/accuracy_reward_stage2": 0.3421875238418579, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 540 + }, + { + "completion_length": 18.359375, + "epoch": 0.0947958647275276, + "grad_norm": 25.148256796049036, + "kl": 0.1875, + "learning_rate": 9.0537935868232e-07, + "loss": 0.0747, + "reward": 1.2867825031280518, + "reward_std": 0.20961745083332062, + "rewards/accuracy_reward_stage2": 0.411782443523407, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 541 + }, + { + "completion_length": 11.375, + "epoch": 0.09497108813737515, + "grad_norm": 23.908511785902895, + "kl": 0.062255859375, + "learning_rate": 9.052041352724724e-07, + "loss": 0.0249, + "reward": 1.2482308149337769, + "reward_std": 0.21626482903957367, + "rewards/accuracy_reward_stage2": 0.37323081493377686, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 542 + }, + { + "completion_length": 11.96875, + "epoch": 0.09514631154722271, + "grad_norm": 17.570510934597706, + "kl": 0.1220703125, + "learning_rate": 9.050289118626248e-07, + "loss": 0.0136, + "reward": 1.5263545513153076, + "reward_std": 0.18224114179611206, + "rewards/accuracy_reward_stage2": 0.5419795513153076, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 543 + }, + { + "completion_length": 13.421875, + "epoch": 0.09532153495707027, + "grad_norm": 22.0167136435298, + "kl": 0.1083984375, + "learning_rate": 9.048536884527772e-07, + "loss": 0.0434, + "reward": 1.5312836170196533, + "reward_std": 0.2787465751171112, + "rewards/accuracy_reward_stage2": 0.5312834978103638, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 544 + }, + { + "completion_length": 7.484375, + "epoch": 0.09549675836691782, + "grad_norm": 20.119504546456255, + "kl": 0.1279296875, + "learning_rate": 9.046784650429297e-07, + "loss": 0.0513, + "reward": 1.6141095161437988, + "reward_std": 0.19938796758651733, + "rewards/accuracy_reward_stage2": 0.6141095161437988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 545 + }, + { + "completion_length": 9.328125, + "epoch": 0.09567198177676538, + "grad_norm": 21.585749591558706, + "kl": 0.06982421875, + "learning_rate": 9.045032416330821e-07, + "loss": 0.028, + "reward": 1.459768295288086, + "reward_std": 0.1722957044839859, + "rewards/accuracy_reward_stage2": 0.5847682356834412, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 546 + }, + { + "completion_length": 20.6875, + "epoch": 0.09584720518661294, + "grad_norm": 20.96518673634591, + "kl": 0.025634765625, + "learning_rate": 9.043280182232345e-07, + "loss": 0.0103, + "reward": 1.3154137134552002, + "reward_std": 0.2370438277721405, + "rewards/accuracy_reward_stage2": 0.4404137134552002, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 547 + }, + { + "completion_length": 11.328125, + "epoch": 0.09602242859646049, + "grad_norm": 19.927064137421482, + "kl": 0.04931640625, + "learning_rate": 9.04152794813387e-07, + "loss": 0.0198, + "reward": 1.6055917739868164, + "reward_std": 0.11641772091388702, + "rewards/accuracy_reward_stage2": 0.6055917739868164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 548 + }, + { + "completion_length": 8.296875, + "epoch": 0.09619765200630805, + "grad_norm": 21.137165253719424, + "kl": 0.05712890625, + "learning_rate": 9.039775714035395e-07, + "loss": 0.0228, + "reward": 1.746246576309204, + "reward_std": 0.27798545360565186, + "rewards/accuracy_reward_stage2": 0.7462465763092041, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 549 + }, + { + "completion_length": 7.015625, + "epoch": 0.09637287541615559, + "grad_norm": 17.906357465189267, + "kl": 0.10693359375, + "learning_rate": 9.038023479936919e-07, + "loss": 0.0429, + "reward": 1.5435776710510254, + "reward_std": 0.12182464450597763, + "rewards/accuracy_reward_stage2": 0.5435777902603149, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 550 + }, + { + "completion_length": 10.578125, + "epoch": 0.09654809882600315, + "grad_norm": 52.716369091651, + "kl": 0.138671875, + "learning_rate": 9.036271245838444e-07, + "loss": 0.0555, + "reward": 1.751212239265442, + "reward_std": 0.3372414708137512, + "rewards/accuracy_reward_stage2": 0.7512121796607971, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 551 + }, + { + "completion_length": 9.796875, + "epoch": 0.09672332223585071, + "grad_norm": 19.819458175934273, + "kl": 0.11279296875, + "learning_rate": 9.034519011739968e-07, + "loss": 0.0452, + "reward": 1.5237207412719727, + "reward_std": 0.17480897903442383, + "rewards/accuracy_reward_stage2": 0.6487207412719727, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 552 + }, + { + "completion_length": 11.109375, + "epoch": 0.09689854564569826, + "grad_norm": 16.12663207048521, + "kl": 0.0517578125, + "learning_rate": 9.032766777641493e-07, + "loss": 0.0207, + "reward": 1.7208008766174316, + "reward_std": 0.1695939302444458, + "rewards/accuracy_reward_stage2": 0.7208009362220764, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 553 + }, + { + "completion_length": 14.015625, + "epoch": 0.09707376905554582, + "grad_norm": 23.486242967320905, + "kl": 0.0634765625, + "learning_rate": 9.031014543543018e-07, + "loss": 0.0253, + "reward": 1.3676196336746216, + "reward_std": 0.23155313730239868, + "rewards/accuracy_reward_stage2": 0.36761969327926636, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 554 + }, + { + "completion_length": 17.125, + "epoch": 0.09724899246539338, + "grad_norm": 19.557327683653888, + "kl": 0.0546875, + "learning_rate": 9.029262309444542e-07, + "loss": 0.0218, + "reward": 1.3779256343841553, + "reward_std": 0.14323818683624268, + "rewards/accuracy_reward_stage2": 0.5029256343841553, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 555 + }, + { + "completion_length": 17.546875, + "epoch": 0.09742421587524093, + "grad_norm": 17.80871699497735, + "kl": 0.7734375, + "learning_rate": 9.027510075346065e-07, + "loss": 0.3101, + "reward": 1.411677598953247, + "reward_std": 0.13221172988414764, + "rewards/accuracy_reward_stage2": 0.5366775989532471, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 556 + }, + { + "completion_length": 11.46875, + "epoch": 0.09759943928508849, + "grad_norm": 15.659891701415704, + "kl": 0.08642578125, + "learning_rate": 9.02575784124759e-07, + "loss": 0.0056, + "reward": 1.859658122062683, + "reward_std": 0.14201951026916504, + "rewards/accuracy_reward_stage2": 0.8752831220626831, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 557 + }, + { + "completion_length": 14.34375, + "epoch": 0.09777466269493604, + "grad_norm": 24.3519660210772, + "kl": 0.0181884765625, + "learning_rate": 9.024005607149114e-07, + "loss": 0.0073, + "reward": 1.4546735286712646, + "reward_std": 0.31135839223861694, + "rewards/accuracy_reward_stage2": 0.4546734690666199, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 558 + }, + { + "completion_length": 11.953125, + "epoch": 0.0979498861047836, + "grad_norm": 20.241237658071395, + "kl": 0.107421875, + "learning_rate": 9.022253373050639e-07, + "loss": 0.0429, + "reward": 1.2426481246948242, + "reward_std": 0.18194395303726196, + "rewards/accuracy_reward_stage2": 0.3676481544971466, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 559 + }, + { + "completion_length": 6.640625, + "epoch": 0.09812510951463116, + "grad_norm": 24.783421445628193, + "kl": 0.10546875, + "learning_rate": 9.020501138952163e-07, + "loss": 0.0424, + "reward": 1.6440285444259644, + "reward_std": 0.3332129716873169, + "rewards/accuracy_reward_stage2": 0.6440285444259644, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 560 + }, + { + "completion_length": 18.84375, + "epoch": 0.0983003329244787, + "grad_norm": 18.366188266106377, + "kl": 0.34765625, + "learning_rate": 9.018748904853688e-07, + "loss": 0.1393, + "reward": 1.3278311491012573, + "reward_std": 0.17044323682785034, + "rewards/accuracy_reward_stage2": 0.4528311789035797, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 561 + }, + { + "completion_length": 9.625, + "epoch": 0.09847555633432627, + "grad_norm": 23.72519010814911, + "kl": 0.053466796875, + "learning_rate": 9.016996670755213e-07, + "loss": 0.0214, + "reward": 1.7041335105895996, + "reward_std": 0.27296823263168335, + "rewards/accuracy_reward_stage2": 0.7041334509849548, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 562 + }, + { + "completion_length": 14.109375, + "epoch": 0.09865077974417383, + "grad_norm": 15.513103779989715, + "kl": 0.05224609375, + "learning_rate": 9.015244436656737e-07, + "loss": 0.0209, + "reward": 1.4729877710342407, + "reward_std": 0.1910639852285385, + "rewards/accuracy_reward_stage2": 0.4729878604412079, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 563 + }, + { + "completion_length": 10.921875, + "epoch": 0.09882600315402137, + "grad_norm": 24.6052578581256, + "kl": 0.07470703125, + "learning_rate": 9.013492202558262e-07, + "loss": 0.0084, + "reward": 1.7897675037384033, + "reward_std": 0.2912678122520447, + "rewards/accuracy_reward_stage2": 0.8053925037384033, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 564 + }, + { + "completion_length": 12.8125, + "epoch": 0.09900122656386894, + "grad_norm": 23.816204925967753, + "kl": 0.1689453125, + "learning_rate": 9.011739968459787e-07, + "loss": 0.0676, + "reward": 1.3777607679367065, + "reward_std": 0.19424016773700714, + "rewards/accuracy_reward_stage2": 0.7527608275413513, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 565 + }, + { + "completion_length": 11.1875, + "epoch": 0.09917644997371648, + "grad_norm": 23.472488849189013, + "kl": 0.06591796875, + "learning_rate": 9.00998773436131e-07, + "loss": 0.0265, + "reward": 1.4185214042663574, + "reward_std": 0.1836758255958557, + "rewards/accuracy_reward_stage2": 0.41852134466171265, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 566 + }, + { + "completion_length": 10.03125, + "epoch": 0.09935167338356404, + "grad_norm": 21.483289325649206, + "kl": 0.099609375, + "learning_rate": 9.008235500262835e-07, + "loss": 0.0398, + "reward": 1.4853670597076416, + "reward_std": 0.18764030933380127, + "rewards/accuracy_reward_stage2": 0.7353670597076416, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 567 + }, + { + "completion_length": 8.3125, + "epoch": 0.0995268967934116, + "grad_norm": 22.82355246060793, + "kl": 0.08984375, + "learning_rate": 9.006483266164358e-07, + "loss": 0.0359, + "reward": 1.4687992334365845, + "reward_std": 0.3864898681640625, + "rewards/accuracy_reward_stage2": 0.5937991738319397, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 568 + }, + { + "completion_length": 9.296875, + "epoch": 0.09970212020325915, + "grad_norm": 16.1402694836104, + "kl": 0.033935546875, + "learning_rate": 9.004731032065883e-07, + "loss": 0.0136, + "reward": 1.5385760068893433, + "reward_std": 0.17539136111736298, + "rewards/accuracy_reward_stage2": 0.6635760068893433, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 569 + }, + { + "completion_length": 8.828125, + "epoch": 0.09987734361310671, + "grad_norm": 24.375755946771584, + "kl": 0.126953125, + "learning_rate": 9.002978797967408e-07, + "loss": 0.0509, + "reward": 1.4903528690338135, + "reward_std": 0.23065432906150818, + "rewards/accuracy_reward_stage2": 0.6153527498245239, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 570 + }, + { + "completion_length": 13.546875, + "epoch": 0.10005256702295427, + "grad_norm": 14.409670458769568, + "kl": 0.026123046875, + "learning_rate": 9.001226563868932e-07, + "loss": 0.0105, + "reward": 1.639979362487793, + "reward_std": 0.05574566125869751, + "rewards/accuracy_reward_stage2": 0.6399792432785034, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 571 + }, + { + "completion_length": 8.1875, + "epoch": 0.10022779043280182, + "grad_norm": 21.203228039697116, + "kl": 0.033447265625, + "learning_rate": 8.999474329770457e-07, + "loss": 0.0133, + "reward": 1.7701388597488403, + "reward_std": 0.15610602498054504, + "rewards/accuracy_reward_stage2": 0.7701388597488403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 572 + }, + { + "completion_length": 13.515625, + "epoch": 0.10040301384264938, + "grad_norm": 248.60038049667256, + "kl": 1.140625, + "learning_rate": 8.997722095671982e-07, + "loss": 0.4545, + "reward": 1.1850402355194092, + "reward_std": 0.3066478967666626, + "rewards/accuracy_reward_stage2": 0.560040295124054, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 573 + }, + { + "completion_length": 8.09375, + "epoch": 0.10057823725249693, + "grad_norm": 17.849366329021635, + "kl": 0.05224609375, + "learning_rate": 8.995969861573506e-07, + "loss": 0.0209, + "reward": 1.8501524925231934, + "reward_std": 0.1443457454442978, + "rewards/accuracy_reward_stage2": 0.8501523733139038, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 574 + }, + { + "completion_length": 5.625, + "epoch": 0.10075346066234449, + "grad_norm": 15.276267483118692, + "kl": 0.0654296875, + "learning_rate": 8.994217627475031e-07, + "loss": 0.0261, + "reward": 1.4483295679092407, + "reward_std": 0.11541729420423508, + "rewards/accuracy_reward_stage2": 0.5733295679092407, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 575 + }, + { + "completion_length": 37.859375, + "epoch": 0.10092868407219205, + "grad_norm": 3981.8869707992453, + "kl": 23.375, + "learning_rate": 8.992465393376554e-07, + "loss": 9.357, + "reward": 1.3813152313232422, + "reward_std": 0.07771497964859009, + "rewards/accuracy_reward_stage2": 0.7563152313232422, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 576 + }, + { + "completion_length": 13.828125, + "epoch": 0.1011039074820396, + "grad_norm": 22.44950284758865, + "kl": 0.06640625, + "learning_rate": 8.990713159278079e-07, + "loss": 0.0267, + "reward": 1.568946123123169, + "reward_std": 0.2095562368631363, + "rewards/accuracy_reward_stage2": 0.568946123123169, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 577 + }, + { + "completion_length": 7.9375, + "epoch": 0.10127913089188716, + "grad_norm": 26.53547380462697, + "kl": 0.12353515625, + "learning_rate": 8.988960925179604e-07, + "loss": 0.0495, + "reward": 1.6583962440490723, + "reward_std": 0.29453662037849426, + "rewards/accuracy_reward_stage2": 0.658396303653717, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 578 + }, + { + "completion_length": 9.3125, + "epoch": 0.10145435430173472, + "grad_norm": 16.845420567269674, + "kl": 0.030029296875, + "learning_rate": 8.987208691081128e-07, + "loss": 0.012, + "reward": 1.513580560684204, + "reward_std": 0.22537294030189514, + "rewards/accuracy_reward_stage2": 0.5135806202888489, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 579 + }, + { + "completion_length": 9.28125, + "epoch": 0.10162957771158226, + "grad_norm": 23.688189724072313, + "kl": 0.08984375, + "learning_rate": 8.985456456982653e-07, + "loss": 0.0359, + "reward": 1.4752018451690674, + "reward_std": 0.3478262424468994, + "rewards/accuracy_reward_stage2": 0.6002017855644226, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 580 + }, + { + "completion_length": 10.578125, + "epoch": 0.10180480112142982, + "grad_norm": 21.007284172674584, + "kl": 0.099609375, + "learning_rate": 8.983704222884176e-07, + "loss": 0.0397, + "reward": 1.5977373123168945, + "reward_std": 0.2676808834075928, + "rewards/accuracy_reward_stage2": 0.5977373719215393, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 581 + }, + { + "completion_length": 28.8125, + "epoch": 0.10198002453127737, + "grad_norm": 22.332576010592028, + "kl": 0.322265625, + "learning_rate": 8.981951988785701e-07, + "loss": 0.1292, + "reward": 1.1091969013214111, + "reward_std": 0.1680152416229248, + "rewards/accuracy_reward_stage2": 0.3591969609260559, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 582 + }, + { + "completion_length": 9.671875, + "epoch": 0.10215524794112493, + "grad_norm": 18.801648588335883, + "kl": 0.048828125, + "learning_rate": 8.980199754687226e-07, + "loss": 0.0196, + "reward": 1.2164499759674072, + "reward_std": 0.17073309421539307, + "rewards/accuracy_reward_stage2": 0.21645000576972961, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 583 + }, + { + "completion_length": 9.234375, + "epoch": 0.1023304713509725, + "grad_norm": 15.080559029811127, + "kl": 0.0216064453125, + "learning_rate": 8.97844752058875e-07, + "loss": 0.0087, + "reward": 1.6748721599578857, + "reward_std": 0.123601995408535, + "rewards/accuracy_reward_stage2": 0.6748720407485962, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 584 + }, + { + "completion_length": 20.1875, + "epoch": 0.10250569476082004, + "grad_norm": 30.400426260278596, + "kl": 0.53125, + "learning_rate": 8.976695286490275e-07, + "loss": 0.1686, + "reward": 1.4335464239120483, + "reward_std": 0.15213021636009216, + "rewards/accuracy_reward_stage2": 0.5741714239120483, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 585 + }, + { + "completion_length": 10.390625, + "epoch": 0.1026809181706676, + "grad_norm": 22.515022914890093, + "kl": 0.1064453125, + "learning_rate": 8.974943052391799e-07, + "loss": -0.0016, + "reward": 1.328930139541626, + "reward_std": 0.270600289106369, + "rewards/accuracy_reward_stage2": 0.469555139541626, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 586 + }, + { + "completion_length": 9.484375, + "epoch": 0.10285614158051516, + "grad_norm": 22.364167132308026, + "kl": 0.10986328125, + "learning_rate": 8.973190818293323e-07, + "loss": 0.0439, + "reward": 1.5296134948730469, + "reward_std": 0.2580759525299072, + "rewards/accuracy_reward_stage2": 0.5296134948730469, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 587 + }, + { + "completion_length": 18.15625, + "epoch": 0.10303136499036271, + "grad_norm": 21.189378057515125, + "kl": 0.0927734375, + "learning_rate": 8.971438584194848e-07, + "loss": 0.0371, + "reward": 1.5236037969589233, + "reward_std": 0.26827800273895264, + "rewards/accuracy_reward_stage2": 0.5236037969589233, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 588 + }, + { + "completion_length": 10.703125, + "epoch": 0.10320658840021027, + "grad_norm": 17.39955052196623, + "kl": 0.062255859375, + "learning_rate": 8.969686350096372e-07, + "loss": 0.0249, + "reward": 1.7869575023651123, + "reward_std": 0.19973281025886536, + "rewards/accuracy_reward_stage2": 0.7869575619697571, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 589 + }, + { + "completion_length": 14.03125, + "epoch": 0.10338181181005783, + "grad_norm": 25.503001638471407, + "kl": 0.37109375, + "learning_rate": 8.967934115997897e-07, + "loss": 0.1483, + "reward": 1.2879681587219238, + "reward_std": 0.29135996103286743, + "rewards/accuracy_reward_stage2": 0.5379682183265686, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 590 + }, + { + "completion_length": 19.984375, + "epoch": 0.10355703521990538, + "grad_norm": 19.80642056865787, + "kl": 0.3828125, + "learning_rate": 8.966181881899422e-07, + "loss": 0.1527, + "reward": 1.4519280195236206, + "reward_std": 0.10544341057538986, + "rewards/accuracy_reward_stage2": 0.7019280195236206, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 591 + }, + { + "completion_length": 9.0625, + "epoch": 0.10373225862975294, + "grad_norm": 17.164179326779152, + "kl": 0.0208740234375, + "learning_rate": 8.964429647800946e-07, + "loss": -0.0669, + "reward": 1.6845653057098389, + "reward_std": 0.2696949243545532, + "rewards/accuracy_reward_stage2": 0.8408153057098389, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 592 + }, + { + "completion_length": 9.84375, + "epoch": 0.10390748203960049, + "grad_norm": 20.757377644711703, + "kl": 0.03857421875, + "learning_rate": 8.962677413702471e-07, + "loss": 0.0154, + "reward": 1.6968727111816406, + "reward_std": 0.17491915822029114, + "rewards/accuracy_reward_stage2": 0.6968726515769958, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 593 + }, + { + "completion_length": 12.59375, + "epoch": 0.10408270544944805, + "grad_norm": 29.563787899550757, + "kl": 0.044677734375, + "learning_rate": 8.960925179603995e-07, + "loss": 0.0403, + "reward": 1.6349983215332031, + "reward_std": 0.1502247452735901, + "rewards/accuracy_reward_stage2": 0.7599983215332031, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 594 + }, + { + "completion_length": 12.609375, + "epoch": 0.1042579288592956, + "grad_norm": 14.903961141826915, + "kl": 0.0439453125, + "learning_rate": 8.959172945505519e-07, + "loss": -0.0343, + "reward": 1.5937268733978271, + "reward_std": 0.19302524626255035, + "rewards/accuracy_reward_stage2": 0.6249768733978271, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 595 + }, + { + "completion_length": 21.4375, + "epoch": 0.10443315226914315, + "grad_norm": 26.291996159246033, + "kl": 0.038330078125, + "learning_rate": 8.957420711407043e-07, + "loss": -0.0162, + "reward": 1.3569380044937134, + "reward_std": 0.20827914774417877, + "rewards/accuracy_reward_stage2": 0.3725629448890686, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 596 + }, + { + "completion_length": 7.390625, + "epoch": 0.10460837567899071, + "grad_norm": 16.93122883453144, + "kl": 0.052978515625, + "learning_rate": 8.955668477308567e-07, + "loss": 0.0212, + "reward": 1.5769851207733154, + "reward_std": 0.25848501920700073, + "rewards/accuracy_reward_stage2": 0.7019850015640259, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 597 + }, + { + "completion_length": 23.1875, + "epoch": 0.10478359908883828, + "grad_norm": 60.151052040310844, + "kl": 0.81640625, + "learning_rate": 8.953916243210092e-07, + "loss": 0.2832, + "reward": 1.166857123374939, + "reward_std": 0.17063213884830475, + "rewards/accuracy_reward_stage2": 0.30748212337493896, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 598 + }, + { + "completion_length": 9.84375, + "epoch": 0.10495882249868582, + "grad_norm": 29.438033012180327, + "kl": 0.1962890625, + "learning_rate": 8.952164009111617e-07, + "loss": 0.0351, + "reward": 1.6112334728240967, + "reward_std": 0.2669917345046997, + "rewards/accuracy_reward_stage2": 0.6268585324287415, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 599 + }, + { + "completion_length": 15.953125, + "epoch": 0.10513404590853338, + "grad_norm": 19.207758702593235, + "kl": 0.0771484375, + "learning_rate": 8.950411775013141e-07, + "loss": -0.0133, + "reward": 1.3625128269195557, + "reward_std": 0.12542307376861572, + "rewards/accuracy_reward_stage2": 0.37813782691955566, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 600 + }, + { + "completion_length": 13.1875, + "epoch": 0.10530926931838093, + "grad_norm": 22.80070618758305, + "kl": 0.0947265625, + "learning_rate": 8.948659540914666e-07, + "loss": 0.0107, + "reward": 1.4847618341445923, + "reward_std": 0.20820224285125732, + "rewards/accuracy_reward_stage2": 0.5003868341445923, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 601 + }, + { + "completion_length": 13.59375, + "epoch": 0.10548449272822849, + "grad_norm": 16.16392296114711, + "kl": 0.08251953125, + "learning_rate": 8.946907306816191e-07, + "loss": -0.0004, + "reward": 1.6386563777923584, + "reward_std": 0.14577843248844147, + "rewards/accuracy_reward_stage2": 0.6542813181877136, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 602 + }, + { + "completion_length": 9.375, + "epoch": 0.10565971613807605, + "grad_norm": 22.11154070768946, + "kl": 0.04150390625, + "learning_rate": 8.945155072717715e-07, + "loss": 0.0166, + "reward": 1.7849338054656982, + "reward_std": 0.2635495364665985, + "rewards/accuracy_reward_stage2": 0.7849337458610535, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 603 + }, + { + "completion_length": 21.484375, + "epoch": 0.1058349395479236, + "grad_norm": 25.437037649937153, + "kl": 0.33203125, + "learning_rate": 8.94340283861924e-07, + "loss": 0.1333, + "reward": 1.4886643886566162, + "reward_std": 0.19219179451465607, + "rewards/accuracy_reward_stage2": 0.6136643886566162, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 604 + }, + { + "completion_length": 14.390625, + "epoch": 0.10601016295777116, + "grad_norm": 12.178068721041292, + "kl": 0.00909423828125, + "learning_rate": 8.941650604520764e-07, + "loss": 0.0036, + "reward": 1.5776515007019043, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward_stage2": 0.5776515007019043, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 605 + }, + { + "completion_length": 13.109375, + "epoch": 0.10618538636761872, + "grad_norm": 20.590768959219737, + "kl": 0.0311279296875, + "learning_rate": 8.939898370422288e-07, + "loss": -0.0317, + "reward": 1.5676214694976807, + "reward_std": 0.10006687045097351, + "rewards/accuracy_reward_stage2": 0.5832464694976807, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 606 + }, + { + "completion_length": 10.03125, + "epoch": 0.10636060977746627, + "grad_norm": 28.16540860038017, + "kl": 0.224609375, + "learning_rate": 8.938146136323812e-07, + "loss": 0.0899, + "reward": 1.3759479522705078, + "reward_std": 0.07002376019954681, + "rewards/accuracy_reward_stage2": 0.5009479522705078, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 607 + }, + { + "completion_length": 9.515625, + "epoch": 0.10653583318731383, + "grad_norm": 15.000548669218713, + "kl": 0.05859375, + "learning_rate": 8.936393902225336e-07, + "loss": 0.0234, + "reward": 1.5940972566604614, + "reward_std": 0.1635403335094452, + "rewards/accuracy_reward_stage2": 0.7190971970558167, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 608 + }, + { + "completion_length": 7.4375, + "epoch": 0.10671105659716137, + "grad_norm": 19.213288800692773, + "kl": 0.043701171875, + "learning_rate": 8.934641668126861e-07, + "loss": 0.0175, + "reward": 1.2412978410720825, + "reward_std": 0.2668173313140869, + "rewards/accuracy_reward_stage2": 0.3662978708744049, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 609 + }, + { + "completion_length": 16.1875, + "epoch": 0.10688628000700894, + "grad_norm": 22.02974792763698, + "kl": 0.55078125, + "learning_rate": 8.932889434028386e-07, + "loss": 0.2196, + "reward": 1.4358563423156738, + "reward_std": 0.2029583603143692, + "rewards/accuracy_reward_stage2": 0.6858564615249634, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 610 + }, + { + "completion_length": 8.765625, + "epoch": 0.1070615034168565, + "grad_norm": 27.041480284219045, + "kl": 0.017578125, + "learning_rate": 8.93113719992991e-07, + "loss": 0.007, + "reward": 1.6799907684326172, + "reward_std": 0.11624743044376373, + "rewards/accuracy_reward_stage2": 0.6799907088279724, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 611 + }, + { + "completion_length": 9.875, + "epoch": 0.10723672682670404, + "grad_norm": 124.73618694081247, + "kl": 0.037109375, + "learning_rate": 8.929384965831435e-07, + "loss": 0.0149, + "reward": 1.5989768505096436, + "reward_std": 0.1074318140745163, + "rewards/accuracy_reward_stage2": 0.5989767909049988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 612 + }, + { + "completion_length": 8.09375, + "epoch": 0.1074119502365516, + "grad_norm": 21.868434969689023, + "kl": 0.041259765625, + "learning_rate": 8.927632731732959e-07, + "loss": -0.0277, + "reward": 1.1822917461395264, + "reward_std": 0.16993504762649536, + "rewards/accuracy_reward_stage2": 0.1979166716337204, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 613 + }, + { + "completion_length": 10.25, + "epoch": 0.10758717364639916, + "grad_norm": 20.31274312952241, + "kl": 0.061279296875, + "learning_rate": 8.925880497634484e-07, + "loss": -0.0486, + "reward": 1.7639718055725098, + "reward_std": 0.2529860734939575, + "rewards/accuracy_reward_stage2": 0.9202218055725098, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 614 + }, + { + "completion_length": 10.625, + "epoch": 0.10776239705624671, + "grad_norm": 22.26824541507161, + "kl": 0.06103515625, + "learning_rate": 8.924128263536009e-07, + "loss": 0.0245, + "reward": 1.5011794567108154, + "reward_std": 0.23802436888217926, + "rewards/accuracy_reward_stage2": 0.5011795163154602, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 615 + }, + { + "completion_length": 12.109375, + "epoch": 0.10793762046609427, + "grad_norm": 17.995465269921365, + "kl": 0.06689453125, + "learning_rate": 8.922376029437532e-07, + "loss": 0.0268, + "reward": 1.575548529624939, + "reward_std": 0.20315426588058472, + "rewards/accuracy_reward_stage2": 0.700548529624939, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 616 + }, + { + "completion_length": 13.984375, + "epoch": 0.10811284387594182, + "grad_norm": 19.57701236061785, + "kl": 0.36328125, + "learning_rate": 8.920623795339057e-07, + "loss": 0.1457, + "reward": 1.6030621528625488, + "reward_std": 0.13488642871379852, + "rewards/accuracy_reward_stage2": 0.7280622124671936, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 617 + }, + { + "completion_length": 17.46875, + "epoch": 0.10828806728578938, + "grad_norm": 20.320395279142957, + "kl": 0.046142578125, + "learning_rate": 8.918871561240582e-07, + "loss": 0.0184, + "reward": 1.4128468036651611, + "reward_std": 0.19677528738975525, + "rewards/accuracy_reward_stage2": 0.41284680366516113, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 618 + }, + { + "completion_length": 6.234375, + "epoch": 0.10846329069563694, + "grad_norm": 21.100346568277157, + "kl": 0.03173828125, + "learning_rate": 8.917119327142105e-07, + "loss": 0.0126, + "reward": 1.506620168685913, + "reward_std": 0.19347314536571503, + "rewards/accuracy_reward_stage2": 0.5066201686859131, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 619 + }, + { + "completion_length": 10.03125, + "epoch": 0.10863851410548449, + "grad_norm": 20.38714158047014, + "kl": 0.0306396484375, + "learning_rate": 8.91536709304363e-07, + "loss": 0.0122, + "reward": 1.515625, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 620 + }, + { + "completion_length": 6.15625, + "epoch": 0.10881373751533205, + "grad_norm": 13.837433374871935, + "kl": 0.08251953125, + "learning_rate": 8.913614858945154e-07, + "loss": 0.033, + "reward": 1.5983612537384033, + "reward_std": 0.13748112320899963, + "rewards/accuracy_reward_stage2": 0.5983611941337585, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 621 + }, + { + "completion_length": 9.46875, + "epoch": 0.10898896092517961, + "grad_norm": 22.91529731345705, + "kl": 0.1708984375, + "learning_rate": 8.911862624846679e-07, + "loss": 0.0683, + "reward": 1.6282224655151367, + "reward_std": 0.23113414645195007, + "rewards/accuracy_reward_stage2": 0.6282224655151367, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 622 + }, + { + "completion_length": 14.25, + "epoch": 0.10916418433502716, + "grad_norm": 15.083020792961545, + "kl": 0.0264892578125, + "learning_rate": 8.910110390748204e-07, + "loss": 0.0106, + "reward": 1.7609566450119019, + "reward_std": 0.18378415703773499, + "rewards/accuracy_reward_stage2": 0.7609566450119019, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 623 + }, + { + "completion_length": 16.765625, + "epoch": 0.10933940774487472, + "grad_norm": 21.66425736767919, + "kl": 0.306640625, + "learning_rate": 8.908358156649728e-07, + "loss": 0.1057, + "reward": 1.4428138732910156, + "reward_std": 0.12810033559799194, + "rewards/accuracy_reward_stage2": 0.5678137540817261, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 624 + }, + { + "completion_length": 11.375, + "epoch": 0.10951463115472228, + "grad_norm": 22.060448935490477, + "kl": 0.0830078125, + "learning_rate": 8.906605922551253e-07, + "loss": 0.0331, + "reward": 1.6386213302612305, + "reward_std": 0.10365209728479385, + "rewards/accuracy_reward_stage2": 0.6386213302612305, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 625 + }, + { + "completion_length": 9.5625, + "epoch": 0.10968985456456982, + "grad_norm": 15.622154479911362, + "kl": 0.020751953125, + "learning_rate": 8.904853688452777e-07, + "loss": -0.0648, + "reward": 1.841752290725708, + "reward_std": 0.1866808980703354, + "rewards/accuracy_reward_stage2": 0.8730022311210632, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 626 + }, + { + "completion_length": 11.953125, + "epoch": 0.10986507797441739, + "grad_norm": 12.336957618950738, + "kl": 0.07470703125, + "learning_rate": 8.903101454354301e-07, + "loss": 0.0299, + "reward": 1.5518933534622192, + "reward_std": 0.07083739340305328, + "rewards/accuracy_reward_stage2": 0.6768933534622192, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 627 + }, + { + "completion_length": 11.875, + "epoch": 0.11004030138426493, + "grad_norm": 19.012998512084977, + "kl": 0.0830078125, + "learning_rate": 8.901349220255826e-07, + "loss": -0.0109, + "reward": 1.2286701202392578, + "reward_std": 0.21301977336406708, + "rewards/accuracy_reward_stage2": 0.24429510533809662, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 628 + }, + { + "completion_length": 9.28125, + "epoch": 0.1102155247941125, + "grad_norm": 19.74217319899889, + "kl": 0.287109375, + "learning_rate": 8.89959698615735e-07, + "loss": 0.1149, + "reward": 1.5378367900848389, + "reward_std": 0.13060753047466278, + "rewards/accuracy_reward_stage2": 0.6628367900848389, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 629 + }, + { + "completion_length": 9.296875, + "epoch": 0.11039074820396005, + "grad_norm": 19.776418120024495, + "kl": 0.040283203125, + "learning_rate": 8.897844752058875e-07, + "loss": -0.057, + "reward": 1.629166603088379, + "reward_std": 0.21690016984939575, + "rewards/accuracy_reward_stage2": 0.6604166626930237, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 630 + }, + { + "completion_length": 7.625, + "epoch": 0.1105659716138076, + "grad_norm": 17.89716035722575, + "kl": 0.12109375, + "learning_rate": 8.8960925179604e-07, + "loss": 0.009, + "reward": 1.456545114517212, + "reward_std": 0.1892307996749878, + "rewards/accuracy_reward_stage2": 0.47217005491256714, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 631 + }, + { + "completion_length": 7.671875, + "epoch": 0.11074119502365516, + "grad_norm": 19.94611007960208, + "kl": 0.0595703125, + "learning_rate": 8.894340283861923e-07, + "loss": 0.0238, + "reward": 1.5591226816177368, + "reward_std": 0.14498020708560944, + "rewards/accuracy_reward_stage2": 0.5591225624084473, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 632 + }, + { + "completion_length": 15.0, + "epoch": 0.11091641843350272, + "grad_norm": 19.2567062060884, + "kl": 0.062255859375, + "learning_rate": 8.892588049763448e-07, + "loss": -0.0193, + "reward": 1.8932830095291138, + "reward_std": 0.18728239834308624, + "rewards/accuracy_reward_stage2": 0.9089080095291138, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 633 + }, + { + "completion_length": 16.828125, + "epoch": 0.11109164184335027, + "grad_norm": 96.32237761630043, + "kl": 0.2412109375, + "learning_rate": 8.890835815664973e-07, + "loss": 0.0084, + "reward": 1.396424651145935, + "reward_std": 0.2588643431663513, + "rewards/accuracy_reward_stage2": 0.42767465114593506, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 634 + }, + { + "completion_length": 8.90625, + "epoch": 0.11126686525319783, + "grad_norm": 33.13647172827168, + "kl": 0.1201171875, + "learning_rate": 8.889083581566496e-07, + "loss": 0.0482, + "reward": 1.541839361190796, + "reward_std": 0.2884628474712372, + "rewards/accuracy_reward_stage2": 0.6668393611907959, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 635 + }, + { + "completion_length": 10.921875, + "epoch": 0.11144208866304538, + "grad_norm": 22.79993073340755, + "kl": 0.056640625, + "learning_rate": 8.887331347468021e-07, + "loss": 0.0227, + "reward": 1.5613070726394653, + "reward_std": 0.24244731664657593, + "rewards/accuracy_reward_stage2": 0.7019320130348206, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 636 + }, + { + "completion_length": 10.40625, + "epoch": 0.11161731207289294, + "grad_norm": 25.892737746278428, + "kl": 0.040771484375, + "learning_rate": 8.885579113369545e-07, + "loss": 0.0163, + "reward": 1.7931983470916748, + "reward_std": 0.22556662559509277, + "rewards/accuracy_reward_stage2": 0.7931983470916748, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 637 + }, + { + "completion_length": 19.671875, + "epoch": 0.1117925354827405, + "grad_norm": 3265.4880349670693, + "kl": 11.375, + "learning_rate": 8.88382687927107e-07, + "loss": 4.5539, + "reward": 1.350884199142456, + "reward_std": 0.21642833948135376, + "rewards/accuracy_reward_stage2": 0.47588419914245605, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 638 + }, + { + "completion_length": 9.40625, + "epoch": 0.11196775889258805, + "grad_norm": 25.598904130512008, + "kl": 0.07763671875, + "learning_rate": 8.882074645172595e-07, + "loss": -0.0043, + "reward": 1.550414800643921, + "reward_std": 0.26173245906829834, + "rewards/accuracy_reward_stage2": 0.5660399198532104, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 639 + }, + { + "completion_length": 13.75, + "epoch": 0.1121429823024356, + "grad_norm": 435.07023916376096, + "kl": 2.25, + "learning_rate": 8.880322411074119e-07, + "loss": 0.9058, + "reward": 1.3459508419036865, + "reward_std": 0.1702684909105301, + "rewards/accuracy_reward_stage2": 0.4709508419036865, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 640 + }, + { + "completion_length": 9.6875, + "epoch": 0.11231820571228317, + "grad_norm": 19.688791320693465, + "kl": 0.099609375, + "learning_rate": 8.878570176975644e-07, + "loss": 0.0398, + "reward": 1.2360773086547852, + "reward_std": 0.13248330354690552, + "rewards/accuracy_reward_stage2": 0.36107730865478516, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 641 + }, + { + "completion_length": 7.828125, + "epoch": 0.11249342912213071, + "grad_norm": 36.62343718508925, + "kl": 0.06494140625, + "learning_rate": 8.876817942877169e-07, + "loss": 0.026, + "reward": 1.5003522634506226, + "reward_std": 0.1630491018295288, + "rewards/accuracy_reward_stage2": 0.5003523230552673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 642 + }, + { + "completion_length": 11.84375, + "epoch": 0.11266865253197828, + "grad_norm": 24.784720605465544, + "kl": 0.14453125, + "learning_rate": 8.875065708778693e-07, + "loss": 0.0134, + "reward": 1.6077549457550049, + "reward_std": 0.23488447070121765, + "rewards/accuracy_reward_stage2": 0.7483799457550049, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 643 + }, + { + "completion_length": 14.46875, + "epoch": 0.11284387594182582, + "grad_norm": 25.261374722671505, + "kl": 0.09619140625, + "learning_rate": 8.873313474680218e-07, + "loss": 0.0138, + "reward": 1.5423240661621094, + "reward_std": 0.23243725299835205, + "rewards/accuracy_reward_stage2": 0.5579490661621094, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 644 + }, + { + "completion_length": 12.09375, + "epoch": 0.11301909935167338, + "grad_norm": 24.468911275019025, + "kl": 0.310546875, + "learning_rate": 8.87156124058174e-07, + "loss": 0.1233, + "reward": 1.5601372718811035, + "reward_std": 0.12885455787181854, + "rewards/accuracy_reward_stage2": 0.6851372122764587, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 645 + }, + { + "completion_length": 30.671875, + "epoch": 0.11319432276152094, + "grad_norm": 19.03420864463341, + "kl": 0.0155029296875, + "learning_rate": 8.869809006483265e-07, + "loss": 0.0062, + "reward": 1.8722697496414185, + "reward_std": 0.08883378654718399, + "rewards/accuracy_reward_stage2": 0.8722698092460632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 646 + }, + { + "completion_length": 10.03125, + "epoch": 0.11336954617136849, + "grad_norm": 26.39720044309794, + "kl": 0.1533203125, + "learning_rate": 8.86805677238479e-07, + "loss": -0.0185, + "reward": 1.4335455894470215, + "reward_std": 0.3102685213088989, + "rewards/accuracy_reward_stage2": 0.4647955894470215, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 647 + }, + { + "completion_length": 12.25, + "epoch": 0.11354476958121605, + "grad_norm": 20.080247397857068, + "kl": 0.08203125, + "learning_rate": 8.866304538286314e-07, + "loss": 0.0329, + "reward": 1.620429515838623, + "reward_std": 0.22463209927082062, + "rewards/accuracy_reward_stage2": 0.6204294562339783, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 648 + }, + { + "completion_length": 10.703125, + "epoch": 0.11371999299106361, + "grad_norm": 25.501686410903066, + "kl": 0.1396484375, + "learning_rate": 8.864552304187839e-07, + "loss": 0.0559, + "reward": 1.4538609981536865, + "reward_std": 0.2756735682487488, + "rewards/accuracy_reward_stage2": 0.5788609385490417, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 649 + }, + { + "completion_length": 7.484375, + "epoch": 0.11389521640091116, + "grad_norm": 20.12953702589919, + "kl": 0.039794921875, + "learning_rate": 8.862800070089363e-07, + "loss": -0.0234, + "reward": 1.4669541120529175, + "reward_std": 0.1871233880519867, + "rewards/accuracy_reward_stage2": 0.4825791120529175, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 650 + }, + { + "completion_length": 8.859375, + "epoch": 0.11407043981075872, + "grad_norm": 21.64836519336152, + "kl": 0.07666015625, + "learning_rate": 8.861047835990888e-07, + "loss": 0.0307, + "reward": 1.3550217151641846, + "reward_std": 0.2267133742570877, + "rewards/accuracy_reward_stage2": 0.35502177476882935, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 651 + }, + { + "completion_length": 10.359375, + "epoch": 0.11424566322060627, + "grad_norm": 18.437674084025137, + "kl": 0.03271484375, + "learning_rate": 8.859295601892413e-07, + "loss": 0.0346, + "reward": 1.4443836212158203, + "reward_std": 0.22392143309116364, + "rewards/accuracy_reward_stage2": 0.5693836808204651, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 652 + }, + { + "completion_length": 8.609375, + "epoch": 0.11442088663045383, + "grad_norm": 19.893784433796597, + "kl": 0.06982421875, + "learning_rate": 8.857543367793937e-07, + "loss": 0.0004, + "reward": 1.4368394613265991, + "reward_std": 0.24404609203338623, + "rewards/accuracy_reward_stage2": 0.5774644613265991, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 653 + }, + { + "completion_length": 11.859375, + "epoch": 0.11459611004030139, + "grad_norm": 14.241918651909575, + "kl": 0.119140625, + "learning_rate": 8.855791133695462e-07, + "loss": 0.0477, + "reward": 1.4363772869110107, + "reward_std": 0.09710687398910522, + "rewards/accuracy_reward_stage2": 0.6863773465156555, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 654 + }, + { + "completion_length": 9.78125, + "epoch": 0.11477133345014894, + "grad_norm": 19.270036996457037, + "kl": 0.08251953125, + "learning_rate": 8.854038899596987e-07, + "loss": 0.0331, + "reward": 1.6022714376449585, + "reward_std": 0.29964011907577515, + "rewards/accuracy_reward_stage2": 0.6022714376449585, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 655 + }, + { + "completion_length": 10.859375, + "epoch": 0.1149465568599965, + "grad_norm": 26.189268834179053, + "kl": 0.052978515625, + "learning_rate": 8.85228666549851e-07, + "loss": -0.014, + "reward": 1.5754039287567139, + "reward_std": 0.25072386860847473, + "rewards/accuracy_reward_stage2": 0.6066538095474243, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 656 + }, + { + "completion_length": 18.671875, + "epoch": 0.11512178026984406, + "grad_norm": 22.108350234072944, + "kl": 0.015625, + "learning_rate": 8.850534431400035e-07, + "loss": 0.0062, + "reward": 1.5428324937820435, + "reward_std": 0.25868257880210876, + "rewards/accuracy_reward_stage2": 0.5428324937820435, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 657 + }, + { + "completion_length": 10.1875, + "epoch": 0.1152970036796916, + "grad_norm": 16.81031071335677, + "kl": 0.06640625, + "learning_rate": 8.848782197301558e-07, + "loss": 0.0265, + "reward": 1.6095609664916992, + "reward_std": 0.1511930674314499, + "rewards/accuracy_reward_stage2": 0.609561026096344, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 658 + }, + { + "completion_length": 18.171875, + "epoch": 0.11547222708953916, + "grad_norm": 23.544019291161536, + "kl": 0.33203125, + "learning_rate": 8.847029963203083e-07, + "loss": 0.1328, + "reward": 1.2967886924743652, + "reward_std": 0.24356095492839813, + "rewards/accuracy_reward_stage2": 0.42178869247436523, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 659 + }, + { + "completion_length": 9.46875, + "epoch": 0.11564745049938671, + "grad_norm": 20.610555296149347, + "kl": 0.045166015625, + "learning_rate": 8.845277729104608e-07, + "loss": 0.0181, + "reward": 1.5369362831115723, + "reward_std": 0.19677358865737915, + "rewards/accuracy_reward_stage2": 0.5369362235069275, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 660 + }, + { + "completion_length": 11.578125, + "epoch": 0.11582267390923427, + "grad_norm": 23.473030243653238, + "kl": 0.09375, + "learning_rate": 8.843525495006132e-07, + "loss": -0.0066, + "reward": 1.5962979793548584, + "reward_std": 0.24861329793930054, + "rewards/accuracy_reward_stage2": 0.6119229793548584, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 661 + }, + { + "completion_length": 11.921875, + "epoch": 0.11599789731908183, + "grad_norm": 33.93562617380281, + "kl": 0.08251953125, + "learning_rate": 8.841773260907657e-07, + "loss": 0.0329, + "reward": 1.5833332538604736, + "reward_std": 0.212066650390625, + "rewards/accuracy_reward_stage2": 0.5833332538604736, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 662 + }, + { + "completion_length": 9.84375, + "epoch": 0.11617312072892938, + "grad_norm": 26.082326069186735, + "kl": 0.06396484375, + "learning_rate": 8.840021026809182e-07, + "loss": -0.0278, + "reward": 1.449662446975708, + "reward_std": 0.22050856053829193, + "rewards/accuracy_reward_stage2": 0.48091232776641846, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 663 + }, + { + "completion_length": 11.359375, + "epoch": 0.11634834413877694, + "grad_norm": 20.736667075932573, + "kl": 0.05029296875, + "learning_rate": 8.838268792710706e-07, + "loss": 0.0201, + "reward": 1.7296762466430664, + "reward_std": 0.19434456527233124, + "rewards/accuracy_reward_stage2": 0.7296761870384216, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 664 + }, + { + "completion_length": 15.875, + "epoch": 0.1165235675486245, + "grad_norm": 16.50673791343758, + "kl": 0.060791015625, + "learning_rate": 8.83651655861223e-07, + "loss": 0.0243, + "reward": 1.299643635749817, + "reward_std": 0.21787844598293304, + "rewards/accuracy_reward_stage2": 0.2996436357498169, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 665 + }, + { + "completion_length": 18.34375, + "epoch": 0.11669879095847205, + "grad_norm": 22.42753279935099, + "kl": 0.416015625, + "learning_rate": 8.834764324513754e-07, + "loss": 0.1658, + "reward": 1.5624425411224365, + "reward_std": 0.16296470165252686, + "rewards/accuracy_reward_stage2": 0.6874425411224365, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 666 + }, + { + "completion_length": 12.671875, + "epoch": 0.11687401436831961, + "grad_norm": 21.038441282610368, + "kl": 0.0615234375, + "learning_rate": 8.833012090415279e-07, + "loss": 0.0246, + "reward": 1.6154024600982666, + "reward_std": 0.28916823863983154, + "rewards/accuracy_reward_stage2": 0.7404024004936218, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 667 + }, + { + "completion_length": 9.609375, + "epoch": 0.11704923777816717, + "grad_norm": 30.071927015193936, + "kl": 0.08447265625, + "learning_rate": 8.831259856316804e-07, + "loss": 0.0338, + "reward": 1.4298069477081299, + "reward_std": 0.21128447353839874, + "rewards/accuracy_reward_stage2": 0.4298068881034851, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 668 + }, + { + "completion_length": 7.453125, + "epoch": 0.11722446118801472, + "grad_norm": 21.105957298891614, + "kl": 0.06494140625, + "learning_rate": 8.829507622218328e-07, + "loss": -0.0181, + "reward": 1.3958759307861328, + "reward_std": 0.17537108063697815, + "rewards/accuracy_reward_stage2": 0.5365009903907776, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 669 + }, + { + "completion_length": 10.46875, + "epoch": 0.11739968459786228, + "grad_norm": 18.960024727042775, + "kl": 0.056884765625, + "learning_rate": 8.827755388119852e-07, + "loss": -0.0656, + "reward": 1.4355816841125488, + "reward_std": 0.23458895087242126, + "rewards/accuracy_reward_stage2": 0.46683168411254883, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 670 + }, + { + "completion_length": 37.203125, + "epoch": 0.11757490800770982, + "grad_norm": 56.1089179681269, + "kl": 0.3671875, + "learning_rate": 8.826003154021377e-07, + "loss": 0.1468, + "reward": 1.473738193511963, + "reward_std": 0.2174667865037918, + "rewards/accuracy_reward_stage2": 0.5987382531166077, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 671 + }, + { + "completion_length": 7.15625, + "epoch": 0.11775013141755739, + "grad_norm": 13.451270267464256, + "kl": 0.11474609375, + "learning_rate": 8.824250919922901e-07, + "loss": 0.0459, + "reward": 1.5743929147720337, + "reward_std": 0.10862401127815247, + "rewards/accuracy_reward_stage2": 0.6993929147720337, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 672 + }, + { + "completion_length": 20.109375, + "epoch": 0.11792535482740495, + "grad_norm": 26.849572474111802, + "kl": 0.1298828125, + "learning_rate": 8.822498685824426e-07, + "loss": 0.0186, + "reward": 1.508543610572815, + "reward_std": 0.21059757471084595, + "rewards/accuracy_reward_stage2": 0.6491686105728149, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 673 + }, + { + "completion_length": 10.328125, + "epoch": 0.1181005782372525, + "grad_norm": 17.31888261104493, + "kl": 0.03564453125, + "learning_rate": 8.82074645172595e-07, + "loss": 0.0143, + "reward": 1.3142361640930176, + "reward_std": 0.1992053985595703, + "rewards/accuracy_reward_stage2": 0.3142361044883728, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 674 + }, + { + "completion_length": 13.3125, + "epoch": 0.11827580164710005, + "grad_norm": 21.049934078158824, + "kl": 0.140625, + "learning_rate": 8.818994217627474e-07, + "loss": 0.0562, + "reward": 1.5868923664093018, + "reward_std": 0.19813916087150574, + "rewards/accuracy_reward_stage2": 0.586892306804657, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 675 + }, + { + "completion_length": 15.25, + "epoch": 0.11845102505694761, + "grad_norm": 67.66555012601259, + "kl": 0.052001953125, + "learning_rate": 8.817241983528999e-07, + "loss": 0.0209, + "reward": 1.648368000984192, + "reward_std": 0.13681325316429138, + "rewards/accuracy_reward_stage2": 0.6483679413795471, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 676 + }, + { + "completion_length": 8.625, + "epoch": 0.11862624846679516, + "grad_norm": 23.914579904623068, + "kl": 0.0274658203125, + "learning_rate": 8.815489749430523e-07, + "loss": 0.011, + "reward": 1.7271525859832764, + "reward_std": 0.29088348150253296, + "rewards/accuracy_reward_stage2": 0.7271526455879211, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 677 + }, + { + "completion_length": 10.359375, + "epoch": 0.11880147187664272, + "grad_norm": 23.544438238575456, + "kl": 0.06494140625, + "learning_rate": 8.813737515332048e-07, + "loss": -0.0126, + "reward": 1.7988197803497314, + "reward_std": 0.2707360088825226, + "rewards/accuracy_reward_stage2": 0.8144446611404419, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 678 + }, + { + "completion_length": 13.609375, + "epoch": 0.11897669528649027, + "grad_norm": 24.38470438267223, + "kl": 0.06884765625, + "learning_rate": 8.811985281233573e-07, + "loss": -0.0755, + "reward": 1.5430048704147339, + "reward_std": 0.2995755672454834, + "rewards/accuracy_reward_stage2": 0.5898798108100891, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 679 + }, + { + "completion_length": 20.3125, + "epoch": 0.11915191869633783, + "grad_norm": 20.877247111687545, + "kl": 0.55078125, + "learning_rate": 8.810233047135097e-07, + "loss": 0.2208, + "reward": 1.4461277723312378, + "reward_std": 0.10148172080516815, + "rewards/accuracy_reward_stage2": 0.5711277723312378, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 680 + }, + { + "completion_length": 12.59375, + "epoch": 0.11932714210618539, + "grad_norm": 53.43297706665784, + "kl": 0.07080078125, + "learning_rate": 8.808480813036622e-07, + "loss": 0.0284, + "reward": 1.6597884893417358, + "reward_std": 0.29190492630004883, + "rewards/accuracy_reward_stage2": 0.6597884893417358, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 681 + }, + { + "completion_length": 14.15625, + "epoch": 0.11950236551603294, + "grad_norm": 11.011832755978508, + "kl": 0.047119140625, + "learning_rate": 8.806728578938146e-07, + "loss": 0.0188, + "reward": 1.5102589130401611, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.5102588534355164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 682 + }, + { + "completion_length": 9.640625, + "epoch": 0.1196775889258805, + "grad_norm": 25.308451607783756, + "kl": 0.0859375, + "learning_rate": 8.80497634483967e-07, + "loss": -0.0043, + "reward": 1.5487972497940063, + "reward_std": 0.15885029733181, + "rewards/accuracy_reward_stage2": 0.5800472497940063, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 683 + }, + { + "completion_length": 22.40625, + "epoch": 0.11985281233572806, + "grad_norm": 25.29527051256025, + "kl": 0.031982421875, + "learning_rate": 8.803224110741195e-07, + "loss": 0.0129, + "reward": 1.7597458362579346, + "reward_std": 0.18770715594291687, + "rewards/accuracy_reward_stage2": 0.7597458362579346, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 684 + }, + { + "completion_length": 29.03125, + "epoch": 0.1200280357455756, + "grad_norm": 86.19787745150437, + "kl": 0.236328125, + "learning_rate": 8.801471876642718e-07, + "loss": 0.0505, + "reward": 1.5671895742416382, + "reward_std": 0.1661956012248993, + "rewards/accuracy_reward_stage2": 0.7078145742416382, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 685 + }, + { + "completion_length": 6.984375, + "epoch": 0.12020325915542317, + "grad_norm": 13.60605571568435, + "kl": 0.07177734375, + "learning_rate": 8.799719642544243e-07, + "loss": 0.0288, + "reward": 1.4552290439605713, + "reward_std": 0.16398081183433533, + "rewards/accuracy_reward_stage2": 0.4552290439605713, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 686 + }, + { + "completion_length": 10.390625, + "epoch": 0.12037848256527071, + "grad_norm": 25.567410027834242, + "kl": 0.015380859375, + "learning_rate": 8.797967408445768e-07, + "loss": 0.0062, + "reward": 1.7256697416305542, + "reward_std": 0.20889979600906372, + "rewards/accuracy_reward_stage2": 0.7256697416305542, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 687 + }, + { + "completion_length": 8.640625, + "epoch": 0.12055370597511827, + "grad_norm": 16.393250950051225, + "kl": 0.038818359375, + "learning_rate": 8.796215174347292e-07, + "loss": 0.0156, + "reward": 1.6450669765472412, + "reward_std": 0.09931539744138718, + "rewards/accuracy_reward_stage2": 0.6450668573379517, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 688 + }, + { + "completion_length": 19.53125, + "epoch": 0.12072892938496584, + "grad_norm": 20.266352703794034, + "kl": 0.0556640625, + "learning_rate": 8.794462940248817e-07, + "loss": -0.0219, + "reward": 1.4606654644012451, + "reward_std": 0.3459789752960205, + "rewards/accuracy_reward_stage2": 0.47629040479660034, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 689 + }, + { + "completion_length": 12.0625, + "epoch": 0.12090415279481338, + "grad_norm": 45.55200569986124, + "kl": 0.322265625, + "learning_rate": 8.792710706150341e-07, + "loss": 0.1292, + "reward": 1.5311663150787354, + "reward_std": 0.19967830181121826, + "rewards/accuracy_reward_stage2": 0.6561661958694458, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 690 + }, + { + "completion_length": 10.890625, + "epoch": 0.12107937620466094, + "grad_norm": 19.922042543972633, + "kl": 0.0703125, + "learning_rate": 8.790958472051866e-07, + "loss": 0.0281, + "reward": 1.6214659214019775, + "reward_std": 0.17283859848976135, + "rewards/accuracy_reward_stage2": 0.6214658617973328, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 691 + }, + { + "completion_length": 22.9375, + "epoch": 0.1212545996145085, + "grad_norm": 19.99328887677077, + "kl": 0.046142578125, + "learning_rate": 8.789206237953391e-07, + "loss": 0.0185, + "reward": 1.484812617301941, + "reward_std": 0.16919946670532227, + "rewards/accuracy_reward_stage2": 0.48481255769729614, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 692 + }, + { + "completion_length": 8.171875, + "epoch": 0.12142982302435605, + "grad_norm": 24.11697341254596, + "kl": 0.0247802734375, + "learning_rate": 8.787454003854915e-07, + "loss": 0.0099, + "reward": 1.6628926992416382, + "reward_std": 0.2817220687866211, + "rewards/accuracy_reward_stage2": 0.6628926992416382, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 693 + }, + { + "completion_length": 14.6875, + "epoch": 0.12160504643420361, + "grad_norm": 55.143918136953324, + "kl": 0.625, + "learning_rate": 8.78570176975644e-07, + "loss": 0.1772, + "reward": 1.2694811820983887, + "reward_std": 0.259592741727829, + "rewards/accuracy_reward_stage2": 0.41010621190071106, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 694 + }, + { + "completion_length": 14.734375, + "epoch": 0.12178026984405116, + "grad_norm": 21.688875450489633, + "kl": 0.03564453125, + "learning_rate": 8.783949535657964e-07, + "loss": 0.0143, + "reward": 1.6681108474731445, + "reward_std": 0.13850000500679016, + "rewards/accuracy_reward_stage2": 0.6681109666824341, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 695 + }, + { + "completion_length": 9.453125, + "epoch": 0.12195549325389872, + "grad_norm": 20.260297289156117, + "kl": 0.0986328125, + "learning_rate": 8.782197301559487e-07, + "loss": -0.0152, + "reward": 1.6024032831192017, + "reward_std": 0.26473379135131836, + "rewards/accuracy_reward_stage2": 0.6336532831192017, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 696 + }, + { + "completion_length": 13.5625, + "epoch": 0.12213071666374628, + "grad_norm": 52.401001224493655, + "kl": 0.5, + "learning_rate": 8.780445067461012e-07, + "loss": 0.124, + "reward": 1.285620927810669, + "reward_std": 0.34111350774765015, + "rewards/accuracy_reward_stage2": 0.44187092781066895, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 697 + }, + { + "completion_length": 17.609375, + "epoch": 0.12230594007359383, + "grad_norm": 20.79419101396458, + "kl": 0.23828125, + "learning_rate": 8.778692833362536e-07, + "loss": 0.0295, + "reward": 1.3721497058868408, + "reward_std": 0.11534123867750168, + "rewards/accuracy_reward_stage2": 0.5283997058868408, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 698 + }, + { + "completion_length": 19.265625, + "epoch": 0.12248116348344139, + "grad_norm": 16.840847107451385, + "kl": 0.016357421875, + "learning_rate": 8.776940599264061e-07, + "loss": -0.0376, + "reward": 1.3428521156311035, + "reward_std": 0.12679797410964966, + "rewards/accuracy_reward_stage2": 0.3584771156311035, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 699 + }, + { + "completion_length": 7.390625, + "epoch": 0.12265638689328895, + "grad_norm": 10.71083439910046, + "kl": 0.01177978515625, + "learning_rate": 8.775188365165586e-07, + "loss": -0.0395, + "reward": 1.375, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.390625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 700 + }, + { + "completion_length": 23.359375, + "epoch": 0.1228316103031365, + "grad_norm": 127.53890039782576, + "kl": 0.451171875, + "learning_rate": 8.77343613106711e-07, + "loss": 0.1522, + "reward": 1.495405912399292, + "reward_std": 0.22437676787376404, + "rewards/accuracy_reward_stage2": 0.6360308527946472, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 701 + }, + { + "completion_length": 11.90625, + "epoch": 0.12300683371298406, + "grad_norm": 22.438948285025383, + "kl": 0.0966796875, + "learning_rate": 8.771683896968635e-07, + "loss": -0.0445, + "reward": 1.5524406433105469, + "reward_std": 0.2997596263885498, + "rewards/accuracy_reward_stage2": 0.5836907029151917, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 702 + }, + { + "completion_length": 13.21875, + "epoch": 0.1231820571228316, + "grad_norm": 19.987582006618496, + "kl": 0.044921875, + "learning_rate": 8.76993166287016e-07, + "loss": -0.0704, + "reward": 1.6021901369094849, + "reward_std": 0.3515224754810333, + "rewards/accuracy_reward_stage2": 0.6334401369094849, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 703 + }, + { + "completion_length": 17.109375, + "epoch": 0.12335728053267916, + "grad_norm": 23.503168971994356, + "kl": 0.06396484375, + "learning_rate": 8.768179428771684e-07, + "loss": 0.0255, + "reward": 1.6832342147827148, + "reward_std": 0.20066285133361816, + "rewards/accuracy_reward_stage2": 0.6832343339920044, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 704 + }, + { + "completion_length": 12.6875, + "epoch": 0.12353250394252673, + "grad_norm": 19.273611616412307, + "kl": 0.0869140625, + "learning_rate": 8.766427194673208e-07, + "loss": -0.0095, + "reward": 1.6391098499298096, + "reward_std": 0.2224484533071518, + "rewards/accuracy_reward_stage2": 0.6547348499298096, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 705 + }, + { + "completion_length": 7.671875, + "epoch": 0.12370772735237427, + "grad_norm": 18.929037623227902, + "kl": 0.05419921875, + "learning_rate": 8.764674960574732e-07, + "loss": -0.0225, + "reward": 1.328125, + "reward_std": 0.1530819833278656, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 706 + }, + { + "completion_length": 13.21875, + "epoch": 0.12388295076222183, + "grad_norm": 13.248950555078183, + "kl": 0.0189208984375, + "learning_rate": 8.762922726476257e-07, + "loss": 0.0076, + "reward": 1.7340033054351807, + "reward_std": 0.10389992594718933, + "rewards/accuracy_reward_stage2": 0.7340033054351807, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 707 + }, + { + "completion_length": 20.359375, + "epoch": 0.1240581741720694, + "grad_norm": 27.180274349507958, + "kl": 0.2197265625, + "learning_rate": 8.761170492377782e-07, + "loss": 0.0879, + "reward": 1.3913912773132324, + "reward_std": 0.165449857711792, + "rewards/accuracy_reward_stage2": 0.5163911581039429, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 708 + }, + { + "completion_length": 21.78125, + "epoch": 0.12423339758191694, + "grad_norm": 58.96739621642977, + "kl": 0.435546875, + "learning_rate": 8.759418258279305e-07, + "loss": 0.1303, + "reward": 1.2916667461395264, + "reward_std": 0.2051776796579361, + "rewards/accuracy_reward_stage2": 0.5572916865348816, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 709 + }, + { + "completion_length": 8.546875, + "epoch": 0.1244086209917645, + "grad_norm": 25.120922125351125, + "kl": 0.08447265625, + "learning_rate": 8.75766602418083e-07, + "loss": -0.0181, + "reward": 1.6080281734466553, + "reward_std": 0.3198818564414978, + "rewards/accuracy_reward_stage2": 0.6392781734466553, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 710 + }, + { + "completion_length": 8.734375, + "epoch": 0.12458384440161206, + "grad_norm": 28.15242862624003, + "kl": 0.1923828125, + "learning_rate": 8.755913790082355e-07, + "loss": 0.0328, + "reward": 1.4423253536224365, + "reward_std": 0.17335930466651917, + "rewards/accuracy_reward_stage2": 0.45795029401779175, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 711 + }, + { + "completion_length": 10.90625, + "epoch": 0.12475906781145961, + "grad_norm": 21.76944181859614, + "kl": 0.10009765625, + "learning_rate": 8.754161555983879e-07, + "loss": 0.0401, + "reward": 1.505290150642395, + "reward_std": 0.2847074270248413, + "rewards/accuracy_reward_stage2": 0.5052902102470398, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 712 + }, + { + "completion_length": 9.0625, + "epoch": 0.12493429122130717, + "grad_norm": 18.094072371451865, + "kl": 0.0625, + "learning_rate": 8.752409321885404e-07, + "loss": -0.0187, + "reward": 1.6135753393173218, + "reward_std": 0.1741529405117035, + "rewards/accuracy_reward_stage2": 0.6292003393173218, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 713 + }, + { + "completion_length": 8.609375, + "epoch": 0.12510951463115472, + "grad_norm": 15.344482387105208, + "kl": 0.1025390625, + "learning_rate": 8.750657087786927e-07, + "loss": 0.0411, + "reward": 1.7685964107513428, + "reward_std": 0.06149989739060402, + "rewards/accuracy_reward_stage2": 0.768596351146698, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 714 + }, + { + "completion_length": 12.0625, + "epoch": 0.1252847380410023, + "grad_norm": 62.719821119851055, + "kl": 0.0654296875, + "learning_rate": 8.748904853688452e-07, + "loss": 0.0262, + "reward": 1.5387461185455322, + "reward_std": 0.22166486084461212, + "rewards/accuracy_reward_stage2": 0.5387461185455322, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 715 + }, + { + "completion_length": 8.90625, + "epoch": 0.12545996145084984, + "grad_norm": 22.598417730239905, + "kl": 0.08203125, + "learning_rate": 8.747152619589977e-07, + "loss": -0.043, + "reward": 1.7066401243209839, + "reward_std": 0.23297566175460815, + "rewards/accuracy_reward_stage2": 0.8628901839256287, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 716 + }, + { + "completion_length": 13.8125, + "epoch": 0.12563518486069739, + "grad_norm": 19.205609822610867, + "kl": 0.1552734375, + "learning_rate": 8.745400385491501e-07, + "loss": 0.062, + "reward": 1.5689747333526611, + "reward_std": 0.11873051524162292, + "rewards/accuracy_reward_stage2": 0.6939746737480164, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 717 + }, + { + "completion_length": 17.75, + "epoch": 0.12581040827054493, + "grad_norm": 24.131101589429424, + "kl": 0.0277099609375, + "learning_rate": 8.743648151393026e-07, + "loss": 0.0111, + "reward": 1.6367136240005493, + "reward_std": 0.21164877712726593, + "rewards/accuracy_reward_stage2": 0.6367136240005493, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 718 + }, + { + "completion_length": 12.671875, + "epoch": 0.1259856316803925, + "grad_norm": 18.986879097817095, + "kl": 0.16015625, + "learning_rate": 8.741895917294551e-07, + "loss": 0.037, + "reward": 1.5682744979858398, + "reward_std": 0.16256004571914673, + "rewards/accuracy_reward_stage2": 0.7088994383811951, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 719 + }, + { + "completion_length": 26.125, + "epoch": 0.12616085509024005, + "grad_norm": 14.897658742516182, + "kl": 0.033203125, + "learning_rate": 8.740143683196075e-07, + "loss": -0.0309, + "reward": 1.5009183883666992, + "reward_std": 0.135984867811203, + "rewards/accuracy_reward_stage2": 0.516543447971344, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 720 + }, + { + "completion_length": 14.15625, + "epoch": 0.1263360785000876, + "grad_norm": 1887.643603829332, + "kl": 6.03125, + "learning_rate": 8.738391449097599e-07, + "loss": 2.4267, + "reward": 1.3118016719818115, + "reward_std": 0.15657545626163483, + "rewards/accuracy_reward_stage2": 0.4368016719818115, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 721 + }, + { + "completion_length": 8.875, + "epoch": 0.12651130190993518, + "grad_norm": 15.99578727889066, + "kl": 0.07373046875, + "learning_rate": 8.736639214999123e-07, + "loss": -0.0463, + "reward": 1.6581439971923828, + "reward_std": 0.31767192482948303, + "rewards/accuracy_reward_stage2": 0.689393937587738, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 722 + }, + { + "completion_length": 10.125, + "epoch": 0.12668652531978272, + "grad_norm": 21.204954483400815, + "kl": 0.11328125, + "learning_rate": 8.734886980900648e-07, + "loss": -0.0853, + "reward": 1.407235860824585, + "reward_std": 0.3323286771774292, + "rewards/accuracy_reward_stage2": 0.45411089062690735, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 723 + }, + { + "completion_length": 14.296875, + "epoch": 0.12686174872963027, + "grad_norm": 22.800784257270934, + "kl": 0.3984375, + "learning_rate": 8.733134746802173e-07, + "loss": 0.1378, + "reward": 1.3840141296386719, + "reward_std": 0.2325722724199295, + "rewards/accuracy_reward_stage2": 0.5246391296386719, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 724 + }, + { + "completion_length": 8.890625, + "epoch": 0.12703697213947784, + "grad_norm": 20.381938458516128, + "kl": 0.043701171875, + "learning_rate": 8.731382512703696e-07, + "loss": 0.0175, + "reward": 1.5718777179718018, + "reward_std": 0.21215331554412842, + "rewards/accuracy_reward_stage2": 0.5718777179718018, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 725 + }, + { + "completion_length": 8.421875, + "epoch": 0.1272121955493254, + "grad_norm": 21.793579301173956, + "kl": 0.080078125, + "learning_rate": 8.729630278605221e-07, + "loss": 0.0321, + "reward": 1.6613794565200806, + "reward_std": 0.3252708613872528, + "rewards/accuracy_reward_stage2": 0.6613793969154358, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 726 + }, + { + "completion_length": 14.09375, + "epoch": 0.12738741895917294, + "grad_norm": 23.380397758448332, + "kl": 0.0673828125, + "learning_rate": 8.727878044506745e-07, + "loss": -0.0613, + "reward": 1.5214309692382812, + "reward_std": 0.2900305986404419, + "rewards/accuracy_reward_stage2": 0.5526810884475708, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 727 + }, + { + "completion_length": 8.203125, + "epoch": 0.1275626423690205, + "grad_norm": 18.344928650612594, + "kl": 0.11767578125, + "learning_rate": 8.72612581040827e-07, + "loss": 0.0471, + "reward": 1.5166375637054443, + "reward_std": 0.23853465914726257, + "rewards/accuracy_reward_stage2": 0.7666375041007996, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 728 + }, + { + "completion_length": 13.75, + "epoch": 0.12773786577886806, + "grad_norm": 11.786925146699105, + "kl": 0.039306640625, + "learning_rate": 8.724373576309795e-07, + "loss": -0.0644, + "reward": 1.3146369457244873, + "reward_std": 0.16122567653656006, + "rewards/accuracy_reward_stage2": 0.4708869457244873, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 729 + }, + { + "completion_length": 10.59375, + "epoch": 0.1279130891887156, + "grad_norm": 19.45807361148035, + "kl": 0.09423828125, + "learning_rate": 8.722621342211319e-07, + "loss": 0.0378, + "reward": 1.5389803647994995, + "reward_std": 0.22324511408805847, + "rewards/accuracy_reward_stage2": 0.6639803647994995, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 730 + }, + { + "completion_length": 13.296875, + "epoch": 0.12808831259856318, + "grad_norm": 10.960043287191809, + "kl": 0.33203125, + "learning_rate": 8.720869108112844e-07, + "loss": 0.133, + "reward": 1.5178592205047607, + "reward_std": 0.08258861303329468, + "rewards/accuracy_reward_stage2": 0.6428592205047607, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 731 + }, + { + "completion_length": 12.515625, + "epoch": 0.12826353600841073, + "grad_norm": 17.782479377656646, + "kl": 0.043212890625, + "learning_rate": 8.719116874014369e-07, + "loss": -0.0269, + "reward": 1.580472707748413, + "reward_std": 0.23168525099754333, + "rewards/accuracy_reward_stage2": 0.5960977673530579, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 732 + }, + { + "completion_length": 8.3125, + "epoch": 0.12843875941825827, + "grad_norm": 25.95392845141862, + "kl": 0.0732421875, + "learning_rate": 8.717364639915893e-07, + "loss": -0.0464, + "reward": 1.5123786926269531, + "reward_std": 0.2772209644317627, + "rewards/accuracy_reward_stage2": 0.5436286926269531, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 733 + }, + { + "completion_length": 23.703125, + "epoch": 0.12861398282810582, + "grad_norm": 38.20590883091491, + "kl": 0.609375, + "learning_rate": 8.715612405817416e-07, + "loss": 0.2438, + "reward": 1.48157799243927, + "reward_std": 0.26270562410354614, + "rewards/accuracy_reward_stage2": 0.73157799243927, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 734 + }, + { + "completion_length": 12.46875, + "epoch": 0.1287892062379534, + "grad_norm": 24.100699456688726, + "kl": 0.0908203125, + "learning_rate": 8.71386017171894e-07, + "loss": 0.0364, + "reward": 1.5929126739501953, + "reward_std": 0.2643412947654724, + "rewards/accuracy_reward_stage2": 0.5929126739501953, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 735 + }, + { + "completion_length": 9.703125, + "epoch": 0.12896442964780094, + "grad_norm": 10.832855482198559, + "kl": 0.016845703125, + "learning_rate": 8.712107937620465e-07, + "loss": -0.0879, + "reward": 1.484375, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 736 + }, + { + "completion_length": 14.765625, + "epoch": 0.1291396530576485, + "grad_norm": 17.12707163097698, + "kl": 0.2177734375, + "learning_rate": 8.71035570352199e-07, + "loss": 0.0871, + "reward": 1.5152562856674194, + "reward_std": 0.11481408774852753, + "rewards/accuracy_reward_stage2": 0.7652561664581299, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 737 + }, + { + "completion_length": 10.296875, + "epoch": 0.12931487646749606, + "grad_norm": 18.785523386499325, + "kl": 0.095703125, + "learning_rate": 8.708603469423514e-07, + "loss": 0.0382, + "reward": 1.8014370203018188, + "reward_std": 0.17660526931285858, + "rewards/accuracy_reward_stage2": 0.8014370203018188, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 738 + }, + { + "completion_length": 13.875, + "epoch": 0.1294900998773436, + "grad_norm": 24.0749346047583, + "kl": 0.09521484375, + "learning_rate": 8.706851235325039e-07, + "loss": 0.0381, + "reward": 1.5514681339263916, + "reward_std": 0.2884979546070099, + "rewards/accuracy_reward_stage2": 0.5514680743217468, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 739 + }, + { + "completion_length": 13.0, + "epoch": 0.12966532328719116, + "grad_norm": 23.044753771481854, + "kl": 0.09228515625, + "learning_rate": 8.705099001226564e-07, + "loss": 0.037, + "reward": 1.5986329317092896, + "reward_std": 0.29432013630867004, + "rewards/accuracy_reward_stage2": 0.7236329317092896, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 740 + }, + { + "completion_length": 8.84375, + "epoch": 0.12984054669703873, + "grad_norm": 28.607090780035833, + "kl": 0.018798828125, + "learning_rate": 8.703346767128088e-07, + "loss": 0.0075, + "reward": 1.4213995933532715, + "reward_std": 0.2705962657928467, + "rewards/accuracy_reward_stage2": 0.42139962315559387, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 741 + }, + { + "completion_length": 8.078125, + "epoch": 0.13001577010688628, + "grad_norm": 22.206143566559955, + "kl": 0.1025390625, + "learning_rate": 8.701594533029613e-07, + "loss": -0.0031, + "reward": 1.5844957828521729, + "reward_std": 0.2147214710712433, + "rewards/accuracy_reward_stage2": 0.7251207828521729, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 742 + }, + { + "completion_length": 14.46875, + "epoch": 0.13019099351673383, + "grad_norm": 23.76171747296305, + "kl": 0.1259765625, + "learning_rate": 8.699842298931137e-07, + "loss": 0.0081, + "reward": 1.5806400775909424, + "reward_std": 0.20044711232185364, + "rewards/accuracy_reward_stage2": 0.7212650775909424, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 743 + }, + { + "completion_length": 11.28125, + "epoch": 0.1303662169265814, + "grad_norm": 26.143393613825847, + "kl": 0.03759765625, + "learning_rate": 8.698090064832662e-07, + "loss": 0.0151, + "reward": 1.4961230754852295, + "reward_std": 0.25255924463272095, + "rewards/accuracy_reward_stage2": 0.4961230754852295, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 744 + }, + { + "completion_length": 8.46875, + "epoch": 0.13054144033642895, + "grad_norm": 31.395400062191023, + "kl": 0.05419921875, + "learning_rate": 8.696337830734186e-07, + "loss": 0.0217, + "reward": 1.7549707889556885, + "reward_std": 0.2908035218715668, + "rewards/accuracy_reward_stage2": 0.7549707293510437, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 745 + }, + { + "completion_length": 7.0, + "epoch": 0.1307166637462765, + "grad_norm": 19.43893579940994, + "kl": 0.146484375, + "learning_rate": 8.69458559663571e-07, + "loss": 0.0585, + "reward": 1.581559658050537, + "reward_std": 0.23520039021968842, + "rewards/accuracy_reward_stage2": 0.5815596580505371, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 746 + }, + { + "completion_length": 43.46875, + "epoch": 0.13089188715612407, + "grad_norm": 17.920030287701124, + "kl": 0.1015625, + "learning_rate": 8.692833362537234e-07, + "loss": 0.0406, + "reward": 1.3412933349609375, + "reward_std": 0.1644451916217804, + "rewards/accuracy_reward_stage2": 0.4662933945655823, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 747 + }, + { + "completion_length": 7.03125, + "epoch": 0.13106711056597162, + "grad_norm": 20.164146416527387, + "kl": 0.02294921875, + "learning_rate": 8.691081128438759e-07, + "loss": -0.0242, + "reward": 1.5950117111206055, + "reward_std": 0.22564148902893066, + "rewards/accuracy_reward_stage2": 0.6106366515159607, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 748 + }, + { + "completion_length": 13.671875, + "epoch": 0.13124233397581916, + "grad_norm": 24.397572687906287, + "kl": 0.11767578125, + "learning_rate": 8.689328894340283e-07, + "loss": 0.0096, + "reward": 1.5717556476593018, + "reward_std": 0.2309304177761078, + "rewards/accuracy_reward_stage2": 0.5873807072639465, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 749 + }, + { + "completion_length": 10.28125, + "epoch": 0.13141755738566674, + "grad_norm": 18.586354742955837, + "kl": 0.057861328125, + "learning_rate": 8.687576660241808e-07, + "loss": -0.0064, + "reward": 1.49538254737854, + "reward_std": 0.20117239654064178, + "rewards/accuracy_reward_stage2": 0.6360074877738953, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 750 + }, + { + "completion_length": 6.875, + "epoch": 0.13159278079551429, + "grad_norm": 18.772365600988905, + "kl": 0.0546875, + "learning_rate": 8.685824426143332e-07, + "loss": 0.0219, + "reward": 1.5222609043121338, + "reward_std": 0.1611718237400055, + "rewards/accuracy_reward_stage2": 0.647260844707489, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 751 + }, + { + "completion_length": 11.5, + "epoch": 0.13176800420536183, + "grad_norm": 26.936451249334294, + "kl": 0.150390625, + "learning_rate": 8.684072192044857e-07, + "loss": 0.0601, + "reward": 1.3048322200775146, + "reward_std": 0.27304765582084656, + "rewards/accuracy_reward_stage2": 0.4298322796821594, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 752 + }, + { + "completion_length": 6.921875, + "epoch": 0.13194322761520938, + "grad_norm": 13.725810132685641, + "kl": 0.01495361328125, + "learning_rate": 8.682319957946382e-07, + "loss": 0.006, + "reward": 1.6228388547897339, + "reward_std": 0.0823933333158493, + "rewards/accuracy_reward_stage2": 0.6228388547897339, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 753 + }, + { + "completion_length": 12.28125, + "epoch": 0.13211845102505695, + "grad_norm": 17.3231009586425, + "kl": 0.095703125, + "learning_rate": 8.680567723847905e-07, + "loss": 0.0383, + "reward": 1.5200889110565186, + "reward_std": 0.1671619862318039, + "rewards/accuracy_reward_stage2": 0.5200889110565186, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 754 + }, + { + "completion_length": 9.15625, + "epoch": 0.1322936744349045, + "grad_norm": 21.947823238942373, + "kl": 0.123046875, + "learning_rate": 8.67881548974943e-07, + "loss": 0.0494, + "reward": 1.4984718561172485, + "reward_std": 0.1990078091621399, + "rewards/accuracy_reward_stage2": 0.6234718561172485, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 755 + }, + { + "completion_length": 7.484375, + "epoch": 0.13246889784475205, + "grad_norm": 20.55658085463294, + "kl": 0.068359375, + "learning_rate": 8.677063255650955e-07, + "loss": 0.0273, + "reward": 1.631592035293579, + "reward_std": 0.20693038403987885, + "rewards/accuracy_reward_stage2": 0.6315920352935791, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 756 + }, + { + "completion_length": 13.109375, + "epoch": 0.13264412125459962, + "grad_norm": 895.2476838263423, + "kl": 3.9375, + "learning_rate": 8.675311021552479e-07, + "loss": 1.5671, + "reward": 1.2633342742919922, + "reward_std": 0.12322809547185898, + "rewards/accuracy_reward_stage2": 0.3883342742919922, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 757 + }, + { + "completion_length": 9.109375, + "epoch": 0.13281934466444717, + "grad_norm": 15.094495405516248, + "kl": 0.03759765625, + "learning_rate": 8.673558787454004e-07, + "loss": 0.015, + "reward": 1.5943365097045898, + "reward_std": 0.07661169767379761, + "rewards/accuracy_reward_stage2": 0.5943365693092346, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 758 + }, + { + "completion_length": 11.546875, + "epoch": 0.13299456807429472, + "grad_norm": 19.67056194967656, + "kl": 0.050537109375, + "learning_rate": 8.671806553355527e-07, + "loss": 0.0202, + "reward": 1.7037529945373535, + "reward_std": 0.22206325829029083, + "rewards/accuracy_reward_stage2": 0.7037530541419983, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 759 + }, + { + "completion_length": 8.421875, + "epoch": 0.1331697914841423, + "grad_norm": 24.23192740150212, + "kl": 0.083984375, + "learning_rate": 8.670054319257052e-07, + "loss": 0.0047, + "reward": 1.543332815170288, + "reward_std": 0.26957929134368896, + "rewards/accuracy_reward_stage2": 0.5589578151702881, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 760 + }, + { + "completion_length": 11.3125, + "epoch": 0.13334501489398984, + "grad_norm": 23.52029972754725, + "kl": 0.09130859375, + "learning_rate": 8.668302085158577e-07, + "loss": 0.0367, + "reward": 1.6756335496902466, + "reward_std": 0.1896108090877533, + "rewards/accuracy_reward_stage2": 0.6756335496902466, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 761 + }, + { + "completion_length": 8.640625, + "epoch": 0.13352023830383739, + "grad_norm": 19.93103272604443, + "kl": 0.10498046875, + "learning_rate": 8.666549851060101e-07, + "loss": -0.0247, + "reward": 1.2811100482940674, + "reward_std": 0.27886366844177246, + "rewards/accuracy_reward_stage2": 0.31236010789871216, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 762 + }, + { + "completion_length": 8.765625, + "epoch": 0.13369546171368496, + "grad_norm": 28.2057588361612, + "kl": 0.20703125, + "learning_rate": 8.664797616961626e-07, + "loss": 0.083, + "reward": 1.4688949584960938, + "reward_std": 0.1758657693862915, + "rewards/accuracy_reward_stage2": 0.5938950181007385, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 763 + }, + { + "completion_length": 15.296875, + "epoch": 0.1338706851235325, + "grad_norm": 28.10233930198314, + "kl": 0.11328125, + "learning_rate": 8.66304538286315e-07, + "loss": -0.0117, + "reward": 1.5274418592453003, + "reward_std": 0.35585153102874756, + "rewards/accuracy_reward_stage2": 0.5586917996406555, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 764 + }, + { + "completion_length": 12.21875, + "epoch": 0.13404590853338005, + "grad_norm": 22.15661040647775, + "kl": 0.03173828125, + "learning_rate": 8.661293148764674e-07, + "loss": 0.0127, + "reward": 1.6838589906692505, + "reward_std": 0.27672165632247925, + "rewards/accuracy_reward_stage2": 0.6838589310646057, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 765 + }, + { + "completion_length": 19.125, + "epoch": 0.13422113194322763, + "grad_norm": 191.1697869554803, + "kl": 0.053955078125, + "learning_rate": 8.659540914666199e-07, + "loss": 0.0216, + "reward": 1.3028383255004883, + "reward_std": 0.22169262170791626, + "rewards/accuracy_reward_stage2": 0.4278383255004883, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 766 + }, + { + "completion_length": 7.40625, + "epoch": 0.13439635535307518, + "grad_norm": 22.16596563872041, + "kl": 0.09423828125, + "learning_rate": 8.657788680567723e-07, + "loss": -0.0065, + "reward": 1.4294730424880981, + "reward_std": 0.2778435945510864, + "rewards/accuracy_reward_stage2": 0.5700980424880981, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 767 + }, + { + "completion_length": 12.609375, + "epoch": 0.13457157876292272, + "grad_norm": 14.85272111547269, + "kl": 0.0341796875, + "learning_rate": 8.656036446469248e-07, + "loss": 0.0137, + "reward": 1.395758867263794, + "reward_std": 0.15206801891326904, + "rewards/accuracy_reward_stage2": 0.5207589268684387, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 768 + }, + { + "completion_length": 7.359375, + "epoch": 0.13474680217277027, + "grad_norm": 40.27145578675297, + "kl": 0.4140625, + "learning_rate": 8.654284212370773e-07, + "loss": 0.166, + "reward": 1.4100593328475952, + "reward_std": 0.180698424577713, + "rewards/accuracy_reward_stage2": 0.5350593328475952, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 769 + }, + { + "completion_length": 7.96875, + "epoch": 0.13492202558261784, + "grad_norm": 21.098959385331987, + "kl": 0.06640625, + "learning_rate": 8.652531978272297e-07, + "loss": -0.0516, + "reward": 1.5042563676834106, + "reward_std": 0.32883530855178833, + "rewards/accuracy_reward_stage2": 0.5355063676834106, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 770 + }, + { + "completion_length": 9.59375, + "epoch": 0.1350972489924654, + "grad_norm": 21.0235097757217, + "kl": 0.09033203125, + "learning_rate": 8.650779744173822e-07, + "loss": -0.0923, + "reward": 1.3507249355316162, + "reward_std": 0.30107730627059937, + "rewards/accuracy_reward_stage2": 0.4132249057292938, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 771 + }, + { + "completion_length": 10.21875, + "epoch": 0.13527247240231294, + "grad_norm": 19.182190078886475, + "kl": 0.042724609375, + "learning_rate": 8.649027510075346e-07, + "loss": 0.0171, + "reward": 1.4446072578430176, + "reward_std": 0.20177596807479858, + "rewards/accuracy_reward_stage2": 0.569607138633728, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 772 + }, + { + "completion_length": 8.078125, + "epoch": 0.1354476958121605, + "grad_norm": 21.87737505155135, + "kl": 0.0849609375, + "learning_rate": 8.64727527597687e-07, + "loss": 0.034, + "reward": 1.4658043384552002, + "reward_std": 0.2499678134918213, + "rewards/accuracy_reward_stage2": 0.590804398059845, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 773 + }, + { + "completion_length": 9.765625, + "epoch": 0.13562291922200806, + "grad_norm": 13.952596182194974, + "kl": 0.046630859375, + "learning_rate": 8.645523041878394e-07, + "loss": 0.0187, + "reward": 1.5219866037368774, + "reward_std": 0.13566339015960693, + "rewards/accuracy_reward_stage2": 0.5219866037368774, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 774 + }, + { + "completion_length": 8.34375, + "epoch": 0.1357981426318556, + "grad_norm": 20.70125677688085, + "kl": 0.078125, + "learning_rate": 8.643770807779918e-07, + "loss": 0.0312, + "reward": 1.6207479238510132, + "reward_std": 0.1453634649515152, + "rewards/accuracy_reward_stage2": 0.6207479238510132, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 775 + }, + { + "completion_length": 6.53125, + "epoch": 0.13597336604170318, + "grad_norm": 19.221602229273802, + "kl": 0.041259765625, + "learning_rate": 8.642018573681443e-07, + "loss": 0.0165, + "reward": 1.329564094543457, + "reward_std": 0.21231430768966675, + "rewards/accuracy_reward_stage2": 0.32956403493881226, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 776 + }, + { + "completion_length": 7.546875, + "epoch": 0.13614858945155073, + "grad_norm": 20.77515018987291, + "kl": 0.05419921875, + "learning_rate": 8.640266339582968e-07, + "loss": 0.0217, + "reward": 1.5638327598571777, + "reward_std": 0.15072381496429443, + "rewards/accuracy_reward_stage2": 0.563832700252533, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 777 + }, + { + "completion_length": 15.21875, + "epoch": 0.13632381286139827, + "grad_norm": 19.313471380679403, + "kl": 0.02978515625, + "learning_rate": 8.638514105484492e-07, + "loss": 0.0119, + "reward": 1.543156385421753, + "reward_std": 0.09772832691669464, + "rewards/accuracy_reward_stage2": 0.5431563258171082, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 778 + }, + { + "completion_length": 12.40625, + "epoch": 0.13649903627124585, + "grad_norm": 20.7202571891817, + "kl": 0.060791015625, + "learning_rate": 8.636761871386017e-07, + "loss": 0.0243, + "reward": 1.1963204145431519, + "reward_std": 0.21110892295837402, + "rewards/accuracy_reward_stage2": 0.19632048904895782, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 779 + }, + { + "completion_length": 6.34375, + "epoch": 0.1366742596810934, + "grad_norm": 10.142569369338114, + "kl": 0.058837890625, + "learning_rate": 8.635009637287542e-07, + "loss": 0.0235, + "reward": 1.6852272748947144, + "reward_std": 0.051082856953144073, + "rewards/accuracy_reward_stage2": 0.6852272748947144, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 780 + }, + { + "completion_length": 8.46875, + "epoch": 0.13684948309094094, + "grad_norm": 18.331737891811393, + "kl": 0.01263427734375, + "learning_rate": 8.633257403189066e-07, + "loss": 0.005, + "reward": 1.6638405323028564, + "reward_std": 0.1704673171043396, + "rewards/accuracy_reward_stage2": 0.6638404726982117, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 781 + }, + { + "completion_length": 9.125, + "epoch": 0.13702470650078852, + "grad_norm": 10.998835905647725, + "kl": 0.007049560546875, + "learning_rate": 8.631505169090591e-07, + "loss": 0.0028, + "reward": 1.609375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.609375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 782 + }, + { + "completion_length": 16.203125, + "epoch": 0.13719992991063606, + "grad_norm": 121.12707998830088, + "kl": 0.68359375, + "learning_rate": 8.629752934992115e-07, + "loss": 0.2291, + "reward": 1.2269890308380127, + "reward_std": 0.20804274082183838, + "rewards/accuracy_reward_stage2": 0.4926139712333679, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 783 + }, + { + "completion_length": 11.1875, + "epoch": 0.1373751533204836, + "grad_norm": 66.34782787803577, + "kl": 0.671875, + "learning_rate": 8.62800070089364e-07, + "loss": 0.2691, + "reward": 1.4658288955688477, + "reward_std": 0.21245905756950378, + "rewards/accuracy_reward_stage2": 0.5908288359642029, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 784 + }, + { + "completion_length": 10.75, + "epoch": 0.13755037673033116, + "grad_norm": 22.697008213792316, + "kl": 0.062255859375, + "learning_rate": 8.626248466795163e-07, + "loss": 0.025, + "reward": 1.5069878101348877, + "reward_std": 0.15365807712078094, + "rewards/accuracy_reward_stage2": 0.6319879293441772, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 785 + }, + { + "completion_length": 10.171875, + "epoch": 0.13772560014017873, + "grad_norm": 16.450378612004357, + "kl": 0.018798828125, + "learning_rate": 8.624496232696687e-07, + "loss": 0.0075, + "reward": 1.642259120941162, + "reward_std": 0.14816491305828094, + "rewards/accuracy_reward_stage2": 0.6422590017318726, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 786 + }, + { + "completion_length": 7.640625, + "epoch": 0.13790082355002628, + "grad_norm": 20.521265091381824, + "kl": 0.0546875, + "learning_rate": 8.622743998598212e-07, + "loss": 0.0009, + "reward": 1.5071511268615723, + "reward_std": 0.2744485139846802, + "rewards/accuracy_reward_stage2": 0.5227761268615723, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 787 + }, + { + "completion_length": 9.890625, + "epoch": 0.13807604695987383, + "grad_norm": 22.28505104220285, + "kl": 0.07861328125, + "learning_rate": 8.620991764499737e-07, + "loss": 0.0314, + "reward": 1.5369726419448853, + "reward_std": 0.18660400807857513, + "rewards/accuracy_reward_stage2": 0.6619726419448853, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 788 + }, + { + "completion_length": 7.5, + "epoch": 0.1382512703697214, + "grad_norm": 11.461413144710713, + "kl": 0.0228271484375, + "learning_rate": 8.619239530401261e-07, + "loss": 0.0091, + "reward": 1.4349414110183716, + "reward_std": 0.07261689007282257, + "rewards/accuracy_reward_stage2": 0.4349414110183716, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 789 + }, + { + "completion_length": 8.796875, + "epoch": 0.13842649377956895, + "grad_norm": 21.565315889628515, + "kl": 0.09619140625, + "learning_rate": 8.617487296302786e-07, + "loss": 0.0384, + "reward": 1.4510877132415771, + "reward_std": 0.24449574947357178, + "rewards/accuracy_reward_stage2": 0.5760876536369324, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 790 + }, + { + "completion_length": 8.734375, + "epoch": 0.1386017171894165, + "grad_norm": 36.1687120434632, + "kl": 0.09521484375, + "learning_rate": 8.61573506220431e-07, + "loss": 0.0381, + "reward": 1.5556349754333496, + "reward_std": 0.311960905790329, + "rewards/accuracy_reward_stage2": 0.5556348562240601, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 791 + }, + { + "completion_length": 17.453125, + "epoch": 0.13877694059926407, + "grad_norm": 22.713123989105284, + "kl": 0.146484375, + "learning_rate": 8.613982828105835e-07, + "loss": 0.0308, + "reward": 1.6747777462005615, + "reward_std": 0.2890956401824951, + "rewards/accuracy_reward_stage2": 0.6904026865959167, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 792 + }, + { + "completion_length": 7.296875, + "epoch": 0.13895216400911162, + "grad_norm": 22.463928540543556, + "kl": 0.09765625, + "learning_rate": 8.61223059400736e-07, + "loss": 0.0008, + "reward": 1.462833046913147, + "reward_std": 0.1759517639875412, + "rewards/accuracy_reward_stage2": 0.603458046913147, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 793 + }, + { + "completion_length": 8.703125, + "epoch": 0.13912738741895916, + "grad_norm": 22.675946642947583, + "kl": 0.0284423828125, + "learning_rate": 8.610478359908883e-07, + "loss": 0.0114, + "reward": 1.3541667461395264, + "reward_std": 0.21643014252185822, + "rewards/accuracy_reward_stage2": 0.3541666865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 794 + }, + { + "completion_length": 9.90625, + "epoch": 0.13930261082880674, + "grad_norm": 16.915255498782287, + "kl": 0.02685546875, + "learning_rate": 8.608726125810408e-07, + "loss": 0.0107, + "reward": 1.635071039199829, + "reward_std": 0.091707743704319, + "rewards/accuracy_reward_stage2": 0.6350710391998291, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 795 + }, + { + "completion_length": 12.90625, + "epoch": 0.13947783423865429, + "grad_norm": 19.268752675673827, + "kl": 0.053466796875, + "learning_rate": 8.606973891711933e-07, + "loss": 0.0213, + "reward": 1.5058095455169678, + "reward_std": 0.2826850414276123, + "rewards/accuracy_reward_stage2": 0.5058095455169678, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 796 + }, + { + "completion_length": 8.734375, + "epoch": 0.13965305764850183, + "grad_norm": 23.808344356205204, + "kl": 0.138671875, + "learning_rate": 8.605221657613457e-07, + "loss": 0.0554, + "reward": 1.6443569660186768, + "reward_std": 0.33315661549568176, + "rewards/accuracy_reward_stage2": 0.6443569660186768, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 797 + }, + { + "completion_length": 9.171875, + "epoch": 0.1398282810583494, + "grad_norm": 18.136804001317085, + "kl": 0.048583984375, + "learning_rate": 8.603469423514981e-07, + "loss": -0.0076, + "reward": 1.7026225328445435, + "reward_std": 0.20409967005252838, + "rewards/accuracy_reward_stage2": 0.7182475328445435, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 798 + }, + { + "completion_length": 6.40625, + "epoch": 0.14000350446819695, + "grad_norm": 20.57066511895683, + "kl": 0.05224609375, + "learning_rate": 8.601717189416505e-07, + "loss": -0.0232, + "reward": 1.7694220542907715, + "reward_std": 0.30438560247421265, + "rewards/accuracy_reward_stage2": 0.7850470542907715, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 799 + }, + { + "completion_length": 8.125, + "epoch": 0.1401787278780445, + "grad_norm": 22.88045288586515, + "kl": 0.15625, + "learning_rate": 8.59996495531803e-07, + "loss": 0.0313, + "reward": 1.3915396928787231, + "reward_std": 0.2325017750263214, + "rewards/accuracy_reward_stage2": 0.5321646928787231, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 800 + }, + { + "completion_length": 9.96875, + "epoch": 0.14035395128789208, + "grad_norm": 22.457627248125416, + "kl": 0.13671875, + "learning_rate": 8.598212721219555e-07, + "loss": 0.0547, + "reward": 1.691390037536621, + "reward_std": 0.30775442719459534, + "rewards/accuracy_reward_stage2": 0.6913900375366211, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 801 + }, + { + "completion_length": 8.59375, + "epoch": 0.14052917469773962, + "grad_norm": 14.506472327882316, + "kl": 0.04150390625, + "learning_rate": 8.596460487121079e-07, + "loss": 0.0165, + "reward": 1.573103666305542, + "reward_std": 0.07388261705636978, + "rewards/accuracy_reward_stage2": 0.573103666305542, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 802 + }, + { + "completion_length": 12.1875, + "epoch": 0.14070439810758717, + "grad_norm": 13.264301091214213, + "kl": 0.06640625, + "learning_rate": 8.594708253022604e-07, + "loss": 0.0266, + "reward": 1.0838366746902466, + "reward_std": 0.20186206698417664, + "rewards/accuracy_reward_stage2": 0.3338366448879242, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 803 + }, + { + "completion_length": 10.625, + "epoch": 0.14087962151743472, + "grad_norm": 39.71989699208024, + "kl": 0.1767578125, + "learning_rate": 8.592956018924127e-07, + "loss": -0.0047, + "reward": 1.5464773178100586, + "reward_std": 0.21688970923423767, + "rewards/accuracy_reward_stage2": 0.7027274370193481, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 804 + }, + { + "completion_length": 10.953125, + "epoch": 0.1410548449272823, + "grad_norm": 16.35935979261754, + "kl": 0.0703125, + "learning_rate": 8.591203784825652e-07, + "loss": 0.0281, + "reward": 1.3673032522201538, + "reward_std": 0.09058556705713272, + "rewards/accuracy_reward_stage2": 0.3673032522201538, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 805 + }, + { + "completion_length": 10.65625, + "epoch": 0.14123006833712984, + "grad_norm": 30.11380461178658, + "kl": 0.062255859375, + "learning_rate": 8.589451550727177e-07, + "loss": -0.0398, + "reward": 1.544505000114441, + "reward_std": 0.25117573142051697, + "rewards/accuracy_reward_stage2": 0.5757550597190857, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 806 + }, + { + "completion_length": 11.125, + "epoch": 0.14140529174697739, + "grad_norm": 21.97961959396547, + "kl": 0.0673828125, + "learning_rate": 8.587699316628701e-07, + "loss": -0.0172, + "reward": 1.6793184280395508, + "reward_std": 0.2505590617656708, + "rewards/accuracy_reward_stage2": 0.6949434876441956, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 807 + }, + { + "completion_length": 9.21875, + "epoch": 0.14158051515682496, + "grad_norm": 43.11287489187185, + "kl": 0.02392578125, + "learning_rate": 8.585947082530226e-07, + "loss": 0.0096, + "reward": 1.546875, + "reward_std": 0.1530819982290268, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 808 + }, + { + "completion_length": 16.25, + "epoch": 0.1417557385666725, + "grad_norm": 24.6818306471637, + "kl": 0.62890625, + "learning_rate": 8.584194848431751e-07, + "loss": 0.2514, + "reward": 1.486750602722168, + "reward_std": 0.1537483036518097, + "rewards/accuracy_reward_stage2": 0.6117505431175232, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 809 + }, + { + "completion_length": 13.078125, + "epoch": 0.14193096197652005, + "grad_norm": 19.039469497624456, + "kl": 0.064453125, + "learning_rate": 8.582442614333274e-07, + "loss": -0.014, + "reward": 1.4495387077331543, + "reward_std": 0.3235572576522827, + "rewards/accuracy_reward_stage2": 0.4651636779308319, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 810 + }, + { + "completion_length": 7.1875, + "epoch": 0.14210618538636763, + "grad_norm": 24.169194951924073, + "kl": 0.06591796875, + "learning_rate": 8.580690380234799e-07, + "loss": 0.0263, + "reward": 1.7233256101608276, + "reward_std": 0.23263150453567505, + "rewards/accuracy_reward_stage2": 0.7233256101608276, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 811 + }, + { + "completion_length": 12.234375, + "epoch": 0.14228140879621518, + "grad_norm": 22.087116534277232, + "kl": 0.46875, + "learning_rate": 8.578938146136323e-07, + "loss": 0.1455, + "reward": 1.4000294208526611, + "reward_std": 0.2222493290901184, + "rewards/accuracy_reward_stage2": 0.5406544208526611, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 812 + }, + { + "completion_length": 8.171875, + "epoch": 0.14245663220606272, + "grad_norm": 8.13413593004782, + "kl": 0.007415771484375, + "learning_rate": 8.577185912037847e-07, + "loss": 0.003, + "reward": 1.7436164617538452, + "reward_std": 0.018055392429232597, + "rewards/accuracy_reward_stage2": 0.7436164617538452, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 813 + }, + { + "completion_length": 15.84375, + "epoch": 0.1426318556159103, + "grad_norm": 13.88930059775845, + "kl": 0.06005859375, + "learning_rate": 8.575433677939372e-07, + "loss": 0.024, + "reward": 1.5528483390808105, + "reward_std": 0.07238230854272842, + "rewards/accuracy_reward_stage2": 0.5528483390808105, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 814 + }, + { + "completion_length": 12.484375, + "epoch": 0.14280707902575784, + "grad_norm": 22.855578480267933, + "kl": 0.08056640625, + "learning_rate": 8.573681443840896e-07, + "loss": -0.0002, + "reward": 1.4860951900482178, + "reward_std": 0.19443252682685852, + "rewards/accuracy_reward_stage2": 0.501720130443573, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 815 + }, + { + "completion_length": 7.1875, + "epoch": 0.1429823024356054, + "grad_norm": 21.597799572130995, + "kl": 0.126953125, + "learning_rate": 8.571929209742421e-07, + "loss": 0.0507, + "reward": 1.630251169204712, + "reward_std": 0.15653660893440247, + "rewards/accuracy_reward_stage2": 0.6302511096000671, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 816 + }, + { + "completion_length": 10.203125, + "epoch": 0.14315752584545297, + "grad_norm": 17.619651486942193, + "kl": 0.205078125, + "learning_rate": 8.570176975643946e-07, + "loss": 0.0818, + "reward": 1.4163931608200073, + "reward_std": 0.2162127047777176, + "rewards/accuracy_reward_stage2": 0.6663932204246521, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 817 + }, + { + "completion_length": 16.21875, + "epoch": 0.1433327492553005, + "grad_norm": 20.40379259103948, + "kl": 0.09716796875, + "learning_rate": 8.56842474154547e-07, + "loss": 0.0387, + "reward": 1.5769197940826416, + "reward_std": 0.24281121790409088, + "rewards/accuracy_reward_stage2": 0.7019197344779968, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 818 + }, + { + "completion_length": 12.71875, + "epoch": 0.14350797266514806, + "grad_norm": 55.3327278627242, + "kl": 0.0634765625, + "learning_rate": 8.566672507446995e-07, + "loss": -0.0059, + "reward": 1.5887277126312256, + "reward_std": 0.3874633312225342, + "rewards/accuracy_reward_stage2": 0.6043526530265808, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 819 + }, + { + "completion_length": 7.28125, + "epoch": 0.1436831960749956, + "grad_norm": 24.61589629636778, + "kl": 0.045166015625, + "learning_rate": 8.564920273348519e-07, + "loss": 0.018, + "reward": 1.4394075870513916, + "reward_std": 0.1608877032995224, + "rewards/accuracy_reward_stage2": 0.5644077062606812, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 820 + }, + { + "completion_length": 11.78125, + "epoch": 0.14385841948484318, + "grad_norm": 13.219207764251584, + "kl": 0.0120849609375, + "learning_rate": 8.563168039250044e-07, + "loss": 0.0049, + "reward": 1.451958417892456, + "reward_std": 0.12982237339019775, + "rewards/accuracy_reward_stage2": 0.4519583582878113, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 821 + }, + { + "completion_length": 11.734375, + "epoch": 0.14403364289469073, + "grad_norm": 18.588382161945407, + "kl": 0.05810546875, + "learning_rate": 8.561415805151569e-07, + "loss": -0.0119, + "reward": 1.6447747945785522, + "reward_std": 0.20467260479927063, + "rewards/accuracy_reward_stage2": 0.7853997945785522, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 822 + }, + { + "completion_length": 10.0625, + "epoch": 0.14420886630453827, + "grad_norm": 33.79391222827712, + "kl": 0.07421875, + "learning_rate": 8.559663571053091e-07, + "loss": 0.0007, + "reward": 1.5684666633605957, + "reward_std": 0.2820379137992859, + "rewards/accuracy_reward_stage2": 0.7090917229652405, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 823 + }, + { + "completion_length": 11.203125, + "epoch": 0.14438408971438585, + "grad_norm": 19.944653404203173, + "kl": 0.12451171875, + "learning_rate": 8.557911336954616e-07, + "loss": 0.0499, + "reward": 1.621762990951538, + "reward_std": 0.2086431235074997, + "rewards/accuracy_reward_stage2": 0.6217628717422485, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 824 + }, + { + "completion_length": 8.984375, + "epoch": 0.1445593131242334, + "grad_norm": 14.625539721921214, + "kl": 0.04248046875, + "learning_rate": 8.556159102856141e-07, + "loss": 0.017, + "reward": 1.8925212621688843, + "reward_std": 0.09097467362880707, + "rewards/accuracy_reward_stage2": 0.8925212621688843, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 825 + }, + { + "completion_length": 15.859375, + "epoch": 0.14473453653408094, + "grad_norm": 14.368354249599616, + "kl": 0.07958984375, + "learning_rate": 8.554406868757665e-07, + "loss": 0.0319, + "reward": 1.4069151878356934, + "reward_std": 0.10731781274080276, + "rewards/accuracy_reward_stage2": 0.5319151878356934, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 826 + }, + { + "completion_length": 9.0, + "epoch": 0.14490975994392852, + "grad_norm": 29.72020467621293, + "kl": 0.03076171875, + "learning_rate": 8.55265463465919e-07, + "loss": 0.0123, + "reward": 1.6061508655548096, + "reward_std": 0.3720834255218506, + "rewards/accuracy_reward_stage2": 0.7311508059501648, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 827 + }, + { + "completion_length": 11.03125, + "epoch": 0.14508498335377606, + "grad_norm": 23.300291074931497, + "kl": 0.134765625, + "learning_rate": 8.550902400560714e-07, + "loss": 0.0538, + "reward": 1.2861018180847168, + "reward_std": 0.19155770540237427, + "rewards/accuracy_reward_stage2": 0.4111018776893616, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 828 + }, + { + "completion_length": 14.546875, + "epoch": 0.1452602067636236, + "grad_norm": 19.748170632331988, + "kl": 0.08056640625, + "learning_rate": 8.549150166462239e-07, + "loss": 0.0321, + "reward": 1.495689868927002, + "reward_std": 0.09303957223892212, + "rewards/accuracy_reward_stage2": 0.49568989872932434, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 829 + }, + { + "completion_length": 8.9375, + "epoch": 0.1454354301734712, + "grad_norm": 23.557736521778914, + "kl": 0.16015625, + "learning_rate": 8.547397932363764e-07, + "loss": 0.0642, + "reward": 1.2532269954681396, + "reward_std": 0.24097494781017303, + "rewards/accuracy_reward_stage2": 0.5032269358634949, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 830 + }, + { + "completion_length": 11.0625, + "epoch": 0.14561065358331873, + "grad_norm": 17.655466469547825, + "kl": 0.12109375, + "learning_rate": 8.545645698265288e-07, + "loss": 0.0485, + "reward": 1.512831449508667, + "reward_std": 0.14166758954524994, + "rewards/accuracy_reward_stage2": 0.637831449508667, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 831 + }, + { + "completion_length": 14.890625, + "epoch": 0.14578587699316628, + "grad_norm": 45.874030977900944, + "kl": 0.37890625, + "learning_rate": 8.543893464166813e-07, + "loss": 0.1516, + "reward": 1.225730061531067, + "reward_std": 0.21166354417800903, + "rewards/accuracy_reward_stage2": 0.3507300317287445, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 832 + }, + { + "completion_length": 10.046875, + "epoch": 0.14596110040301385, + "grad_norm": 25.695155827712917, + "kl": 0.142578125, + "learning_rate": 8.542141230068338e-07, + "loss": 0.0187, + "reward": 1.499477744102478, + "reward_std": 0.2859499454498291, + "rewards/accuracy_reward_stage2": 0.530727744102478, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 833 + }, + { + "completion_length": 15.96875, + "epoch": 0.1461363238128614, + "grad_norm": 12.749670925067296, + "kl": 0.03564453125, + "learning_rate": 8.540388995969861e-07, + "loss": -0.03, + "reward": 1.5882692337036133, + "reward_std": 0.1215648502111435, + "rewards/accuracy_reward_stage2": 0.6038942933082581, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 834 + }, + { + "completion_length": 7.40625, + "epoch": 0.14631154722270895, + "grad_norm": 20.419820412171152, + "kl": 0.0849609375, + "learning_rate": 8.538636761871386e-07, + "loss": 0.0341, + "reward": 1.7913644313812256, + "reward_std": 0.1585237681865692, + "rewards/accuracy_reward_stage2": 0.7913644313812256, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 835 + }, + { + "completion_length": 13.9375, + "epoch": 0.14648677063255652, + "grad_norm": 14.851993005874192, + "kl": 0.07421875, + "learning_rate": 8.536884527772909e-07, + "loss": -0.0138, + "reward": 1.6181893348693848, + "reward_std": 0.09273967146873474, + "rewards/accuracy_reward_stage2": 0.63381427526474, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 836 + }, + { + "completion_length": 8.953125, + "epoch": 0.14666199404240407, + "grad_norm": 15.156193193114621, + "kl": 0.06103515625, + "learning_rate": 8.535132293674434e-07, + "loss": 0.0244, + "reward": 1.3110003471374512, + "reward_std": 0.11011946201324463, + "rewards/accuracy_reward_stage2": 0.5610003471374512, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 837 + }, + { + "completion_length": 12.5625, + "epoch": 0.14683721745225162, + "grad_norm": 19.635947621442128, + "kl": 0.0908203125, + "learning_rate": 8.533380059575959e-07, + "loss": 0.0363, + "reward": 1.588404655456543, + "reward_std": 0.2183130383491516, + "rewards/accuracy_reward_stage2": 0.5884045958518982, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 838 + }, + { + "completion_length": 10.90625, + "epoch": 0.14701244086209916, + "grad_norm": 21.805824502522523, + "kl": 0.12890625, + "learning_rate": 8.531627825477483e-07, + "loss": 0.0514, + "reward": 1.6451040506362915, + "reward_std": 0.2274044007062912, + "rewards/accuracy_reward_stage2": 0.7701040506362915, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 839 + }, + { + "completion_length": 12.875, + "epoch": 0.14718766427194674, + "grad_norm": 16.040950668543186, + "kl": 0.07958984375, + "learning_rate": 8.529875591379008e-07, + "loss": 0.0318, + "reward": 1.7686783075332642, + "reward_std": 0.1477348506450653, + "rewards/accuracy_reward_stage2": 0.7686783075332642, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 840 + }, + { + "completion_length": 11.5625, + "epoch": 0.14736288768179429, + "grad_norm": 30.501348999856685, + "kl": 0.30859375, + "learning_rate": 8.528123357280533e-07, + "loss": 0.0915, + "reward": 1.3233115673065186, + "reward_std": 0.2250458151102066, + "rewards/accuracy_reward_stage2": 0.46393659710884094, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 841 + }, + { + "completion_length": 12.78125, + "epoch": 0.14753811109164183, + "grad_norm": 22.250109240384603, + "kl": 0.455078125, + "learning_rate": 8.526371123182057e-07, + "loss": 0.149, + "reward": 1.1825652122497559, + "reward_std": 0.19976741075515747, + "rewards/accuracy_reward_stage2": 0.44819021224975586, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 842 + }, + { + "completion_length": 9.28125, + "epoch": 0.1477133345014894, + "grad_norm": 16.5712263276257, + "kl": 0.041015625, + "learning_rate": 8.524618889083582e-07, + "loss": 0.0164, + "reward": 1.6536760330200195, + "reward_std": 0.12514111399650574, + "rewards/accuracy_reward_stage2": 0.6536760926246643, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 843 + }, + { + "completion_length": 27.203125, + "epoch": 0.14788855791133695, + "grad_norm": 17.3294287434311, + "kl": 0.51171875, + "learning_rate": 8.522866654985105e-07, + "loss": 0.2058, + "reward": 1.306678295135498, + "reward_std": 0.21027681231498718, + "rewards/accuracy_reward_stage2": 0.4316784143447876, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 844 + }, + { + "completion_length": 10.640625, + "epoch": 0.1480637813211845, + "grad_norm": 19.513135768745453, + "kl": 0.57421875, + "learning_rate": 8.52111442088663e-07, + "loss": 0.229, + "reward": 1.5448485612869263, + "reward_std": 0.17319580912590027, + "rewards/accuracy_reward_stage2": 0.6698485612869263, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 845 + }, + { + "completion_length": 7.203125, + "epoch": 0.14823900473103208, + "grad_norm": 24.577003518223613, + "kl": 0.03076171875, + "learning_rate": 8.519362186788155e-07, + "loss": 0.0123, + "reward": 1.5872409343719482, + "reward_std": 0.10471472889184952, + "rewards/accuracy_reward_stage2": 0.5872409343719482, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 846 + }, + { + "completion_length": 23.0, + "epoch": 0.14841422814087962, + "grad_norm": 16.92597397416524, + "kl": 0.09228515625, + "learning_rate": 8.517609952689679e-07, + "loss": -0.0072, + "reward": 1.4274213314056396, + "reward_std": 0.15768280625343323, + "rewards/accuracy_reward_stage2": 0.5680463314056396, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 847 + }, + { + "completion_length": 16.5, + "epoch": 0.14858945155072717, + "grad_norm": 22.36619562360302, + "kl": 0.05419921875, + "learning_rate": 8.515857718591204e-07, + "loss": 0.0217, + "reward": 1.5342915058135986, + "reward_std": 0.15957878530025482, + "rewards/accuracy_reward_stage2": 0.5342913866043091, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 848 + }, + { + "completion_length": 13.25, + "epoch": 0.14876467496057474, + "grad_norm": 16.30851395059505, + "kl": 0.060302734375, + "learning_rate": 8.514105484492728e-07, + "loss": 0.0242, + "reward": 1.6138888597488403, + "reward_std": 0.17767907679080963, + "rewards/accuracy_reward_stage2": 0.6138888597488403, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 849 + }, + { + "completion_length": 11.265625, + "epoch": 0.1489398983704223, + "grad_norm": 19.055320023008036, + "kl": 0.083984375, + "learning_rate": 8.512353250394252e-07, + "loss": 0.0335, + "reward": 1.6319842338562012, + "reward_std": 0.2157711535692215, + "rewards/accuracy_reward_stage2": 0.756984293460846, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 850 + }, + { + "completion_length": 10.03125, + "epoch": 0.14911512178026984, + "grad_norm": 18.371430716507827, + "kl": 0.0888671875, + "learning_rate": 8.510601016295777e-07, + "loss": 0.0355, + "reward": 1.6245781183242798, + "reward_std": 0.23073048889636993, + "rewards/accuracy_reward_stage2": 0.6245781779289246, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 851 + }, + { + "completion_length": 7.9375, + "epoch": 0.1492903451901174, + "grad_norm": 20.02482116621384, + "kl": 0.01544189453125, + "learning_rate": 8.508848782197301e-07, + "loss": 0.0062, + "reward": 1.7079994678497314, + "reward_std": 0.1766592115163803, + "rewards/accuracy_reward_stage2": 0.7079994678497314, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 852 + }, + { + "completion_length": 10.234375, + "epoch": 0.14946556859996496, + "grad_norm": 15.743834333932089, + "kl": 0.08251953125, + "learning_rate": 8.507096548098825e-07, + "loss": 0.0331, + "reward": 1.6240935325622559, + "reward_std": 0.11661704629659653, + "rewards/accuracy_reward_stage2": 0.7490935325622559, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 853 + }, + { + "completion_length": 10.28125, + "epoch": 0.1496407920098125, + "grad_norm": 27.186682983752124, + "kl": 0.287109375, + "learning_rate": 8.50534431400035e-07, + "loss": 0.1149, + "reward": 1.171875, + "reward_std": 0.13258251547813416, + "rewards/accuracy_reward_stage2": 0.296875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 854 + }, + { + "completion_length": 11.40625, + "epoch": 0.14981601541966005, + "grad_norm": 26.354753154950743, + "kl": 0.14453125, + "learning_rate": 8.503592079901874e-07, + "loss": 0.0577, + "reward": 1.4655927419662476, + "reward_std": 0.2811081111431122, + "rewards/accuracy_reward_stage2": 0.46559271216392517, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 855 + }, + { + "completion_length": 8.65625, + "epoch": 0.14999123882950763, + "grad_norm": 14.760397091858115, + "kl": 0.07177734375, + "learning_rate": 8.501839845803399e-07, + "loss": -0.0042, + "reward": 1.421875, + "reward_std": 0.2597545385360718, + "rewards/accuracy_reward_stage2": 0.4375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 856 + }, + { + "completion_length": 14.515625, + "epoch": 0.15016646223935518, + "grad_norm": 29.598384378452092, + "kl": 0.07470703125, + "learning_rate": 8.500087611704924e-07, + "loss": 0.0299, + "reward": 1.3475062847137451, + "reward_std": 0.3122522532939911, + "rewards/accuracy_reward_stage2": 0.47250625491142273, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 857 + }, + { + "completion_length": 7.515625, + "epoch": 0.15034168564920272, + "grad_norm": 16.23384962972575, + "kl": 0.0703125, + "learning_rate": 8.498335377606448e-07, + "loss": 0.0282, + "reward": 1.5904297828674316, + "reward_std": 0.19798138737678528, + "rewards/accuracy_reward_stage2": 0.7154297828674316, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 858 + }, + { + "completion_length": 7.625, + "epoch": 0.1505169090590503, + "grad_norm": 24.902021328037982, + "kl": 0.05712890625, + "learning_rate": 8.496583143507973e-07, + "loss": 0.0228, + "reward": 1.7132034301757812, + "reward_std": 0.2248350977897644, + "rewards/accuracy_reward_stage2": 0.7132034301757812, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 859 + }, + { + "completion_length": 12.390625, + "epoch": 0.15069213246889784, + "grad_norm": 25.012761537860488, + "kl": 0.057861328125, + "learning_rate": 8.494830909409497e-07, + "loss": -0.0116, + "reward": 1.5423566102981567, + "reward_std": 0.2658918499946594, + "rewards/accuracy_reward_stage2": 0.557981550693512, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 860 + }, + { + "completion_length": 8.53125, + "epoch": 0.1508673558787454, + "grad_norm": 18.010656267314577, + "kl": 0.033203125, + "learning_rate": 8.493078675311021e-07, + "loss": 0.0133, + "reward": 1.5944479703903198, + "reward_std": 0.09074701368808746, + "rewards/accuracy_reward_stage2": 0.5944479703903198, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 861 + }, + { + "completion_length": 11.125, + "epoch": 0.15104257928859297, + "grad_norm": 19.175468347035594, + "kl": 0.01300048828125, + "learning_rate": 8.491326441212546e-07, + "loss": 0.0052, + "reward": 1.7279086112976074, + "reward_std": 0.11366454511880875, + "rewards/accuracy_reward_stage2": 0.7279086112976074, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 862 + }, + { + "completion_length": 11.59375, + "epoch": 0.1512178026984405, + "grad_norm": 17.84506185110907, + "kl": 0.177734375, + "learning_rate": 8.489574207114069e-07, + "loss": 0.071, + "reward": 1.5485143661499023, + "reward_std": 0.19748035073280334, + "rewards/accuracy_reward_stage2": 0.5485143661499023, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 863 + }, + { + "completion_length": 12.34375, + "epoch": 0.15139302610828806, + "grad_norm": 20.25179638432365, + "kl": 0.048828125, + "learning_rate": 8.487821973015594e-07, + "loss": 0.0195, + "reward": 1.4129174947738647, + "reward_std": 0.3144758641719818, + "rewards/accuracy_reward_stage2": 0.41291752457618713, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 864 + }, + { + "completion_length": 17.390625, + "epoch": 0.15156824951813563, + "grad_norm": 23.015576292466694, + "kl": 0.0673828125, + "learning_rate": 8.486069738917118e-07, + "loss": 0.027, + "reward": 1.5686707496643066, + "reward_std": 0.1687781810760498, + "rewards/accuracy_reward_stage2": 0.5686706304550171, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 865 + }, + { + "completion_length": 10.640625, + "epoch": 0.15174347292798318, + "grad_norm": 25.940419340471752, + "kl": 0.06298828125, + "learning_rate": 8.484317504818643e-07, + "loss": -0.0758, + "reward": 1.4635417461395264, + "reward_std": 0.26842159032821655, + "rewards/accuracy_reward_stage2": 0.5104166865348816, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 866 + }, + { + "completion_length": 7.53125, + "epoch": 0.15191869633783073, + "grad_norm": 21.38780500879031, + "kl": 0.06591796875, + "learning_rate": 8.482565270720168e-07, + "loss": -0.0052, + "reward": 1.7270491123199463, + "reward_std": 0.27854660153388977, + "rewards/accuracy_reward_stage2": 0.7426741719245911, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 867 + }, + { + "completion_length": 12.09375, + "epoch": 0.1520939197476783, + "grad_norm": 16.75516936788131, + "kl": 0.0223388671875, + "learning_rate": 8.480813036621692e-07, + "loss": -0.0345, + "reward": 1.648539423942566, + "reward_std": 0.1907489001750946, + "rewards/accuracy_reward_stage2": 0.6641644239425659, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 868 + }, + { + "completion_length": 9.484375, + "epoch": 0.15226914315752585, + "grad_norm": 16.02390522378857, + "kl": 0.0255126953125, + "learning_rate": 8.479060802523217e-07, + "loss": 0.0102, + "reward": 1.742557406425476, + "reward_std": 0.09017640352249146, + "rewards/accuracy_reward_stage2": 0.7425574064254761, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 869 + }, + { + "completion_length": 12.3125, + "epoch": 0.1524443665673734, + "grad_norm": 19525.62924019597, + "kl": 81.0, + "learning_rate": 8.477308568424742e-07, + "loss": 32.4057, + "reward": 1.4085581302642822, + "reward_std": 0.26161444187164307, + "rewards/accuracy_reward_stage2": 0.5491830706596375, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 870 + }, + { + "completion_length": 8.34375, + "epoch": 0.15261958997722094, + "grad_norm": 14.501936237694595, + "kl": 0.091796875, + "learning_rate": 8.475556334326266e-07, + "loss": 0.0014, + "reward": 1.762599229812622, + "reward_std": 0.11367248743772507, + "rewards/accuracy_reward_stage2": 0.7782242298126221, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 871 + }, + { + "completion_length": 12.5625, + "epoch": 0.15279481338706852, + "grad_norm": 19.60168459278808, + "kl": 0.01531982421875, + "learning_rate": 8.473804100227791e-07, + "loss": 0.0061, + "reward": 1.6145833730697632, + "reward_std": 0.1613328456878662, + "rewards/accuracy_reward_stage2": 0.6145833134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 872 + }, + { + "completion_length": 20.203125, + "epoch": 0.15297003679691606, + "grad_norm": 472.34090688262086, + "kl": 2.40625, + "learning_rate": 8.472051866129316e-07, + "loss": 0.9589, + "reward": 1.3896098136901855, + "reward_std": 0.12324882298707962, + "rewards/accuracy_reward_stage2": 0.514609694480896, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 873 + }, + { + "completion_length": 12.0, + "epoch": 0.1531452602067636, + "grad_norm": 22.644802413426646, + "kl": 0.05517578125, + "learning_rate": 8.470299632030838e-07, + "loss": 0.022, + "reward": 1.5755821466445923, + "reward_std": 0.16714820265769958, + "rewards/accuracy_reward_stage2": 0.5755821466445923, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 874 + }, + { + "completion_length": 8.65625, + "epoch": 0.1533204836166112, + "grad_norm": 20.082514560590244, + "kl": 0.044677734375, + "learning_rate": 8.468547397932363e-07, + "loss": 0.0179, + "reward": 1.7464009523391724, + "reward_std": 0.28311580419540405, + "rewards/accuracy_reward_stage2": 0.7464009523391724, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 875 + }, + { + "completion_length": 11.21875, + "epoch": 0.15349570702645873, + "grad_norm": 14.584972917601204, + "kl": 0.08544921875, + "learning_rate": 8.466795163833887e-07, + "loss": 0.0342, + "reward": 1.7056055068969727, + "reward_std": 0.1420706957578659, + "rewards/accuracy_reward_stage2": 0.7056055068969727, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 876 + }, + { + "completion_length": 10.046875, + "epoch": 0.15367093043630628, + "grad_norm": 15.131222808042867, + "kl": 0.10107421875, + "learning_rate": 8.465042929735412e-07, + "loss": 0.0403, + "reward": 1.6770800352096558, + "reward_std": 0.11454734951257706, + "rewards/accuracy_reward_stage2": 0.677079975605011, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 877 + }, + { + "completion_length": 13.546875, + "epoch": 0.15384615384615385, + "grad_norm": 33.88045076112149, + "kl": 0.349609375, + "learning_rate": 8.463290695636937e-07, + "loss": 0.1401, + "reward": 1.7129206657409668, + "reward_std": 0.19110271334648132, + "rewards/accuracy_reward_stage2": 0.837920606136322, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 878 + }, + { + "completion_length": 13.390625, + "epoch": 0.1540213772560014, + "grad_norm": 18.9186422728538, + "kl": 0.1640625, + "learning_rate": 8.461538461538461e-07, + "loss": 0.0284, + "reward": 1.4437906742095947, + "reward_std": 0.2090407907962799, + "rewards/accuracy_reward_stage2": 0.5844157338142395, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 879 + }, + { + "completion_length": 11.5625, + "epoch": 0.15419660066584895, + "grad_norm": 22.16291882417554, + "kl": 0.0771484375, + "learning_rate": 8.459786227439986e-07, + "loss": 0.018, + "reward": 1.5695393085479736, + "reward_std": 0.2572243809700012, + "rewards/accuracy_reward_stage2": 0.6945393085479736, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 880 + }, + { + "completion_length": 13.09375, + "epoch": 0.15437182407569652, + "grad_norm": 23.00953445571644, + "kl": 0.20703125, + "learning_rate": 8.45803399334151e-07, + "loss": 0.0833, + "reward": 1.3842726945877075, + "reward_std": 0.24705079197883606, + "rewards/accuracy_reward_stage2": 0.5092726945877075, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 881 + }, + { + "completion_length": 13.328125, + "epoch": 0.15454704748554407, + "grad_norm": 21.121694948505297, + "kl": 0.0673828125, + "learning_rate": 8.456281759243035e-07, + "loss": 0.027, + "reward": 1.80861234664917, + "reward_std": 0.12876972556114197, + "rewards/accuracy_reward_stage2": 0.8086122870445251, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 882 + }, + { + "completion_length": 12.84375, + "epoch": 0.15472227089539162, + "grad_norm": 27.381915192859076, + "kl": 0.1103515625, + "learning_rate": 8.45452952514456e-07, + "loss": 0.0055, + "reward": 1.6596755981445312, + "reward_std": 0.31475722789764404, + "rewards/accuracy_reward_stage2": 0.6753007173538208, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 883 + }, + { + "completion_length": 10.390625, + "epoch": 0.1548974943052392, + "grad_norm": 22.464663083727196, + "kl": 0.050048828125, + "learning_rate": 8.452777291046083e-07, + "loss": 0.02, + "reward": 1.599549651145935, + "reward_std": 0.23784250020980835, + "rewards/accuracy_reward_stage2": 0.5995496511459351, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 884 + }, + { + "completion_length": 8.8125, + "epoch": 0.15507271771508674, + "grad_norm": 21.711014490365343, + "kl": 0.10400390625, + "learning_rate": 8.451025056947608e-07, + "loss": 0.016, + "reward": 1.6585588455200195, + "reward_std": 0.2637866735458374, + "rewards/accuracy_reward_stage2": 0.6741837859153748, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 885 + }, + { + "completion_length": 10.015625, + "epoch": 0.15524794112493429, + "grad_norm": 14.963862309249967, + "kl": 0.1533203125, + "learning_rate": 8.449272822849133e-07, + "loss": 0.0252, + "reward": 1.5280214548110962, + "reward_std": 0.15586411952972412, + "rewards/accuracy_reward_stage2": 0.5436464548110962, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 886 + }, + { + "completion_length": 10.859375, + "epoch": 0.15542316453478186, + "grad_norm": 21.292734357299597, + "kl": 0.033203125, + "learning_rate": 8.447520588750656e-07, + "loss": 0.0133, + "reward": 1.5913714170455933, + "reward_std": 0.2325046956539154, + "rewards/accuracy_reward_stage2": 0.5913714170455933, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 887 + }, + { + "completion_length": 11.828125, + "epoch": 0.1555983879446294, + "grad_norm": 16.972353026269435, + "kl": 0.1337890625, + "learning_rate": 8.445768354652181e-07, + "loss": 0.0171, + "reward": 1.586073637008667, + "reward_std": 0.15965032577514648, + "rewards/accuracy_reward_stage2": 0.726698637008667, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 888 + }, + { + "completion_length": 9.5, + "epoch": 0.15577361135447695, + "grad_norm": 18.709718658093724, + "kl": 0.0908203125, + "learning_rate": 8.444016120553705e-07, + "loss": 0.0365, + "reward": 1.4933067560195923, + "reward_std": 0.1306682527065277, + "rewards/accuracy_reward_stage2": 0.6183068156242371, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 889 + }, + { + "completion_length": 11.5625, + "epoch": 0.1559488347643245, + "grad_norm": 26.23996003618781, + "kl": 0.384765625, + "learning_rate": 8.44226388645523e-07, + "loss": 0.1538, + "reward": 1.390785574913025, + "reward_std": 0.24497109651565552, + "rewards/accuracy_reward_stage2": 0.5157855749130249, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 890 + }, + { + "completion_length": 8.640625, + "epoch": 0.15612405817417208, + "grad_norm": 20.44810828757477, + "kl": 0.045166015625, + "learning_rate": 8.440511652356755e-07, + "loss": 0.0181, + "reward": 1.4914238452911377, + "reward_std": 0.11264529824256897, + "rewards/accuracy_reward_stage2": 0.4914238750934601, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 891 + }, + { + "completion_length": 19.890625, + "epoch": 0.15629928158401962, + "grad_norm": 20.186573134056676, + "kl": 0.466796875, + "learning_rate": 8.438759418258279e-07, + "loss": 0.1425, + "reward": 1.1410515308380127, + "reward_std": 0.1473521888256073, + "rewards/accuracy_reward_stage2": 0.28167659044265747, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 892 + }, + { + "completion_length": 7.625, + "epoch": 0.15647450499386717, + "grad_norm": 20.23432496884711, + "kl": 0.0927734375, + "learning_rate": 8.437007184159803e-07, + "loss": 0.0372, + "reward": 1.5041792392730713, + "reward_std": 0.07779411971569061, + "rewards/accuracy_reward_stage2": 0.5198042392730713, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 893 + }, + { + "completion_length": 13.796875, + "epoch": 0.15664972840371474, + "grad_norm": 15.623264418195424, + "kl": 0.01202392578125, + "learning_rate": 8.435254950061328e-07, + "loss": 0.0048, + "reward": 1.471541404724121, + "reward_std": 0.12218821048736572, + "rewards/accuracy_reward_stage2": 0.4715413451194763, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 894 + }, + { + "completion_length": 10.1875, + "epoch": 0.1568249518135623, + "grad_norm": 19.707690843826153, + "kl": 0.056884765625, + "learning_rate": 8.433502715962852e-07, + "loss": -0.0214, + "reward": 1.5333350896835327, + "reward_std": 0.1759674847126007, + "rewards/accuracy_reward_stage2": 0.5489600896835327, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 895 + }, + { + "completion_length": 11.53125, + "epoch": 0.15700017522340984, + "grad_norm": 28.43513771094324, + "kl": 0.052978515625, + "learning_rate": 8.431750481864377e-07, + "loss": 0.0212, + "reward": 1.3929375410079956, + "reward_std": 0.3441798985004425, + "rewards/accuracy_reward_stage2": 0.5179375410079956, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 896 + }, + { + "completion_length": 9.0, + "epoch": 0.1571753986332574, + "grad_norm": 34.6067214018072, + "kl": 0.18359375, + "learning_rate": 8.429998247765901e-07, + "loss": 0.0294, + "reward": 1.5371513366699219, + "reward_std": 0.14979934692382812, + "rewards/accuracy_reward_stage2": 0.5527763366699219, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 897 + }, + { + "completion_length": 9.921875, + "epoch": 0.15735062204310496, + "grad_norm": 29.157215435126794, + "kl": 0.1044921875, + "learning_rate": 8.428246013667426e-07, + "loss": 0.0418, + "reward": 1.6572283506393433, + "reward_std": 0.16798993945121765, + "rewards/accuracy_reward_stage2": 0.6572283506393433, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 898 + }, + { + "completion_length": 11.953125, + "epoch": 0.1575258454529525, + "grad_norm": 18.84363409067946, + "kl": 0.1201171875, + "learning_rate": 8.426493779568951e-07, + "loss": 0.0481, + "reward": 1.5634629726409912, + "reward_std": 0.2565038800239563, + "rewards/accuracy_reward_stage2": 0.5634629726409912, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 899 + }, + { + "completion_length": 7.65625, + "epoch": 0.15770106886280008, + "grad_norm": 17.599262016486215, + "kl": 0.0595703125, + "learning_rate": 8.424741545470474e-07, + "loss": 0.0238, + "reward": 1.7447772026062012, + "reward_std": 0.2339225709438324, + "rewards/accuracy_reward_stage2": 0.7447772026062012, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 900 + }, + { + "completion_length": 9.9375, + "epoch": 0.15787629227264763, + "grad_norm": 25.476523126982208, + "kl": 0.0179443359375, + "learning_rate": 8.422989311371999e-07, + "loss": 0.0072, + "reward": 1.4794890880584717, + "reward_std": 0.23430663347244263, + "rewards/accuracy_reward_stage2": 0.47948914766311646, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 901 + }, + { + "completion_length": 12.46875, + "epoch": 0.15805151568249518, + "grad_norm": 24.442260381874355, + "kl": 0.078125, + "learning_rate": 8.421237077273524e-07, + "loss": 0.0313, + "reward": 1.6743123531341553, + "reward_std": 0.19850794970989227, + "rewards/accuracy_reward_stage2": 0.6743123531341553, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 902 + }, + { + "completion_length": 9.015625, + "epoch": 0.15822673909234275, + "grad_norm": 21.14240758724476, + "kl": 0.11328125, + "learning_rate": 8.419484843175047e-07, + "loss": 0.0056, + "reward": 1.4228363037109375, + "reward_std": 0.25364816188812256, + "rewards/accuracy_reward_stage2": 0.5634613633155823, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 903 + }, + { + "completion_length": 15.0, + "epoch": 0.1584019625021903, + "grad_norm": 20.79188296865029, + "kl": 0.17578125, + "learning_rate": 8.417732609076572e-07, + "loss": -0.0182, + "reward": 1.4797170162200928, + "reward_std": 0.15435296297073364, + "rewards/accuracy_reward_stage2": 0.5109670162200928, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 904 + }, + { + "completion_length": 8.78125, + "epoch": 0.15857718591203784, + "grad_norm": 14.706728396656908, + "kl": 0.08349609375, + "learning_rate": 8.415980374978096e-07, + "loss": 0.0334, + "reward": 1.2869999408721924, + "reward_std": 0.08212931454181671, + "rewards/accuracy_reward_stage2": 0.5370000004768372, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 905 + }, + { + "completion_length": 21.984375, + "epoch": 0.1587524093218854, + "grad_norm": 20.45850299415546, + "kl": 0.1708984375, + "learning_rate": 8.414228140879621e-07, + "loss": 0.0685, + "reward": 1.2784233093261719, + "reward_std": 0.15729627013206482, + "rewards/accuracy_reward_stage2": 0.40342339873313904, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 906 + }, + { + "completion_length": 10.625, + "epoch": 0.15892763273173297, + "grad_norm": 20.26740604800182, + "kl": 0.039794921875, + "learning_rate": 8.412475906781146e-07, + "loss": 0.0159, + "reward": 1.5881588459014893, + "reward_std": 0.14251913130283356, + "rewards/accuracy_reward_stage2": 0.5881587862968445, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 907 + }, + { + "completion_length": 9.09375, + "epoch": 0.1591028561415805, + "grad_norm": 19.96265675198023, + "kl": 0.037353515625, + "learning_rate": 8.41072367268267e-07, + "loss": 0.0149, + "reward": 1.71971595287323, + "reward_std": 0.17054104804992676, + "rewards/accuracy_reward_stage2": 0.71971595287323, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 908 + }, + { + "completion_length": 9.953125, + "epoch": 0.15927807955142806, + "grad_norm": 20.517225673527474, + "kl": 0.06787109375, + "learning_rate": 8.408971438584195e-07, + "loss": 0.027, + "reward": 1.6317434310913086, + "reward_std": 0.21083226799964905, + "rewards/accuracy_reward_stage2": 0.6317434906959534, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 909 + }, + { + "completion_length": 9.765625, + "epoch": 0.15945330296127563, + "grad_norm": 20.84710532062063, + "kl": 0.03076171875, + "learning_rate": 8.40721920448572e-07, + "loss": 0.0123, + "reward": 1.46875, + "reward_std": 0.2619796395301819, + "rewards/accuracy_reward_stage2": 0.46875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 910 + }, + { + "completion_length": 11.0625, + "epoch": 0.15962852637112318, + "grad_norm": 13.841406690310421, + "kl": 0.0196533203125, + "learning_rate": 8.405466970387244e-07, + "loss": -0.0363, + "reward": 1.761332392692566, + "reward_std": 0.11642816662788391, + "rewards/accuracy_reward_stage2": 0.7769573330879211, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 911 + }, + { + "completion_length": 8.28125, + "epoch": 0.15980374978097073, + "grad_norm": 17.91290633966341, + "kl": 0.14453125, + "learning_rate": 8.403714736288767e-07, + "loss": 0.0579, + "reward": 1.7218239307403564, + "reward_std": 0.10979120433330536, + "rewards/accuracy_reward_stage2": 0.7218239307403564, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 912 + }, + { + "completion_length": 16.421875, + "epoch": 0.1599789731908183, + "grad_norm": 7060.804174203911, + "kl": 31.375, + "learning_rate": 8.401962502190291e-07, + "loss": 12.5837, + "reward": 1.574540376663208, + "reward_std": 0.337665855884552, + "rewards/accuracy_reward_stage2": 0.7151654362678528, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 913 + }, + { + "completion_length": 12.03125, + "epoch": 0.16015419660066585, + "grad_norm": 22.485843157999255, + "kl": 0.10498046875, + "learning_rate": 8.400210268091816e-07, + "loss": 0.0109, + "reward": 1.547934651374817, + "reward_std": 0.2350578010082245, + "rewards/accuracy_reward_stage2": 0.6885595917701721, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 914 + }, + { + "completion_length": 16.75, + "epoch": 0.1603294200105134, + "grad_norm": 22.681329137939034, + "kl": 0.0751953125, + "learning_rate": 8.398458033993341e-07, + "loss": -0.1154, + "reward": 1.7184510231018066, + "reward_std": 0.23591922223567963, + "rewards/accuracy_reward_stage2": 0.7809509634971619, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 915 + }, + { + "completion_length": 6.515625, + "epoch": 0.16050464342036097, + "grad_norm": 22.439686965003617, + "kl": 0.050048828125, + "learning_rate": 8.396705799894865e-07, + "loss": 0.0201, + "reward": 1.5222173929214478, + "reward_std": 0.18794281780719757, + "rewards/accuracy_reward_stage2": 0.5222173929214478, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 916 + }, + { + "completion_length": 9.9375, + "epoch": 0.16067986683020852, + "grad_norm": 20.917834139045315, + "kl": 0.09521484375, + "learning_rate": 8.39495356579639e-07, + "loss": 0.0381, + "reward": 1.3864175081253052, + "reward_std": 0.1506006121635437, + "rewards/accuracy_reward_stage2": 0.5114175081253052, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 917 + }, + { + "completion_length": 19.0, + "epoch": 0.16085509024005606, + "grad_norm": 22.230505217804176, + "kl": 0.08984375, + "learning_rate": 8.393201331697915e-07, + "loss": 0.0359, + "reward": 1.5073747634887695, + "reward_std": 0.1665232926607132, + "rewards/accuracy_reward_stage2": 0.50737464427948, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 918 + }, + { + "completion_length": 9.921875, + "epoch": 0.16103031364990364, + "grad_norm": 21.06591612131756, + "kl": 0.11962890625, + "learning_rate": 8.391449097599439e-07, + "loss": 0.0479, + "reward": 1.4632796049118042, + "reward_std": 0.2604491710662842, + "rewards/accuracy_reward_stage2": 0.5882796049118042, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 919 + }, + { + "completion_length": 7.375, + "epoch": 0.16120553705975119, + "grad_norm": 23.745634816698153, + "kl": 0.1220703125, + "learning_rate": 8.389696863500964e-07, + "loss": 0.0046, + "reward": 1.4650936126708984, + "reward_std": 0.25936761498451233, + "rewards/accuracy_reward_stage2": 0.4807187020778656, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 920 + }, + { + "completion_length": 12.0, + "epoch": 0.16138076046959873, + "grad_norm": 19.61377943867712, + "kl": 0.1015625, + "learning_rate": 8.387944629402488e-07, + "loss": 0.0407, + "reward": 1.5062568187713623, + "reward_std": 0.24306440353393555, + "rewards/accuracy_reward_stage2": 0.6312568187713623, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 921 + }, + { + "completion_length": 12.8125, + "epoch": 0.1615559838794463, + "grad_norm": 23.541936474447102, + "kl": 0.2060546875, + "learning_rate": 8.386192395304013e-07, + "loss": 0.0823, + "reward": 1.4451262950897217, + "reward_std": 0.20797914266586304, + "rewards/accuracy_reward_stage2": 0.6951261758804321, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 922 + }, + { + "completion_length": 10.5, + "epoch": 0.16173120728929385, + "grad_norm": 19.920258716182968, + "kl": 0.330078125, + "learning_rate": 8.384440161205537e-07, + "loss": 0.145, + "reward": 1.4631624221801758, + "reward_std": 0.17326810956001282, + "rewards/accuracy_reward_stage2": 0.5881624221801758, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 923 + }, + { + "completion_length": 10.3125, + "epoch": 0.1619064306991414, + "grad_norm": 19.987624769580446, + "kl": 0.07958984375, + "learning_rate": 8.382687927107061e-07, + "loss": -0.0358, + "reward": 1.5927538871765137, + "reward_std": 0.20818987488746643, + "rewards/accuracy_reward_stage2": 0.7490040063858032, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 924 + }, + { + "completion_length": 5.3125, + "epoch": 0.16208165410898895, + "grad_norm": 16.61358413552998, + "kl": 0.01495361328125, + "learning_rate": 8.380935693008585e-07, + "loss": 0.006, + "reward": 1.877314805984497, + "reward_std": 0.14518392086029053, + "rewards/accuracy_reward_stage2": 0.8773148059844971, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 925 + }, + { + "completion_length": 12.421875, + "epoch": 0.16225687751883652, + "grad_norm": 20.542640522580562, + "kl": 0.138671875, + "learning_rate": 8.37918345891011e-07, + "loss": 0.0555, + "reward": 1.7388439178466797, + "reward_std": 0.1671719253063202, + "rewards/accuracy_reward_stage2": 0.7388438582420349, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 926 + }, + { + "completion_length": 12.40625, + "epoch": 0.16243210092868407, + "grad_norm": 15.331506669605071, + "kl": 0.12255859375, + "learning_rate": 8.377431224811634e-07, + "loss": 0.0491, + "reward": 1.3322160243988037, + "reward_std": 0.08134040981531143, + "rewards/accuracy_reward_stage2": 0.4572159945964813, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 927 + }, + { + "completion_length": 11.921875, + "epoch": 0.16260732433853162, + "grad_norm": 19.2444126838387, + "kl": 0.028564453125, + "learning_rate": 8.375678990713159e-07, + "loss": -0.0328, + "reward": 1.5271073579788208, + "reward_std": 0.30369114875793457, + "rewards/accuracy_reward_stage2": 0.6677324175834656, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 928 + }, + { + "completion_length": 8.34375, + "epoch": 0.1627825477483792, + "grad_norm": 13.892006524907126, + "kl": 0.0205078125, + "learning_rate": 8.373926756614683e-07, + "loss": 0.0082, + "reward": 1.6059027910232544, + "reward_std": 0.08084940165281296, + "rewards/accuracy_reward_stage2": 0.6059027910232544, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 929 + }, + { + "completion_length": 16.46875, + "epoch": 0.16295777115822674, + "grad_norm": 25.398573109817477, + "kl": 0.197265625, + "learning_rate": 8.372174522516208e-07, + "loss": 0.0396, + "reward": 1.4554202556610107, + "reward_std": 0.1284077912569046, + "rewards/accuracy_reward_stage2": 0.47104525566101074, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 930 + }, + { + "completion_length": 12.984375, + "epoch": 0.16313299456807429, + "grad_norm": 21.605017942356415, + "kl": 0.0693359375, + "learning_rate": 8.370422288417733e-07, + "loss": -0.0037, + "reward": 1.4630324840545654, + "reward_std": 0.38340917229652405, + "rewards/accuracy_reward_stage2": 0.47865748405456543, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 931 + }, + { + "completion_length": 9.09375, + "epoch": 0.16330821797792186, + "grad_norm": 22.13872771036885, + "kl": 0.046142578125, + "learning_rate": 8.368670054319256e-07, + "loss": -0.0149, + "reward": 1.28238844871521, + "reward_std": 0.16411705315113068, + "rewards/accuracy_reward_stage2": 0.29801347851753235, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 932 + }, + { + "completion_length": 16.5, + "epoch": 0.1634834413877694, + "grad_norm": 18.146799048508555, + "kl": 0.2412109375, + "learning_rate": 8.366917820220781e-07, + "loss": 0.063, + "reward": 1.306018590927124, + "reward_std": 0.2175029218196869, + "rewards/accuracy_reward_stage2": 0.4466434717178345, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 933 + }, + { + "completion_length": 10.03125, + "epoch": 0.16365866479761695, + "grad_norm": 15.35898759648924, + "kl": 0.0517578125, + "learning_rate": 8.365165586122306e-07, + "loss": 0.0207, + "reward": 1.464646339416504, + "reward_std": 0.1268872767686844, + "rewards/accuracy_reward_stage2": 0.46464625000953674, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 934 + }, + { + "completion_length": 7.96875, + "epoch": 0.16383388820746453, + "grad_norm": 13.403024814567052, + "kl": 0.04833984375, + "learning_rate": 8.36341335202383e-07, + "loss": 0.0193, + "reward": 1.6848759651184082, + "reward_std": 0.10018566995859146, + "rewards/accuracy_reward_stage2": 0.6848759651184082, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 935 + }, + { + "completion_length": 9.03125, + "epoch": 0.16400911161731208, + "grad_norm": 17.930941069831324, + "kl": 0.08251953125, + "learning_rate": 8.361661117925355e-07, + "loss": -0.0112, + "reward": 1.4982693195343018, + "reward_std": 0.17483346164226532, + "rewards/accuracy_reward_stage2": 0.513894259929657, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 936 + }, + { + "completion_length": 10.5625, + "epoch": 0.16418433502715962, + "grad_norm": 22.799043223780735, + "kl": 0.07421875, + "learning_rate": 8.359908883826879e-07, + "loss": -0.0142, + "reward": 1.4645646810531616, + "reward_std": 0.3297140598297119, + "rewards/accuracy_reward_stage2": 0.48018965125083923, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 937 + }, + { + "completion_length": 10.8125, + "epoch": 0.1643595584370072, + "grad_norm": 22.565240947724515, + "kl": 0.068359375, + "learning_rate": 8.358156649728403e-07, + "loss": 0.0182, + "reward": 1.7171674966812134, + "reward_std": 0.07919125258922577, + "rewards/accuracy_reward_stage2": 0.7327924966812134, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 938 + }, + { + "completion_length": 10.625, + "epoch": 0.16453478184685474, + "grad_norm": 17.200532557520877, + "kl": 0.04443359375, + "learning_rate": 8.356404415629928e-07, + "loss": -0.0128, + "reward": 1.4361746311187744, + "reward_std": 0.12804433703422546, + "rewards/accuracy_reward_stage2": 0.5767996311187744, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 939 + }, + { + "completion_length": 12.609375, + "epoch": 0.1647100052567023, + "grad_norm": 19.822105679449265, + "kl": 0.30859375, + "learning_rate": 8.354652181531452e-07, + "loss": 0.079, + "reward": 1.7268434762954712, + "reward_std": 0.13828769326210022, + "rewards/accuracy_reward_stage2": 0.8674684166908264, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 940 + }, + { + "completion_length": 19.421875, + "epoch": 0.16488522866654984, + "grad_norm": 21.698097014939922, + "kl": 0.07568359375, + "learning_rate": 8.352899947432977e-07, + "loss": 0.0081, + "reward": 1.5953387022018433, + "reward_std": 0.20249740779399872, + "rewards/accuracy_reward_stage2": 0.6109637022018433, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 941 + }, + { + "completion_length": 12.046875, + "epoch": 0.1650604520763974, + "grad_norm": 19.471630114890893, + "kl": 0.177734375, + "learning_rate": 8.3511477133345e-07, + "loss": 0.071, + "reward": 1.0085077285766602, + "reward_std": 0.19060616195201874, + "rewards/accuracy_reward_stage2": 0.2585076689720154, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 942 + }, + { + "completion_length": 14.0, + "epoch": 0.16523567548624496, + "grad_norm": 18.459723557568463, + "kl": 0.10595703125, + "learning_rate": 8.349395479236025e-07, + "loss": 0.0424, + "reward": 1.3339991569519043, + "reward_std": 0.1780506670475006, + "rewards/accuracy_reward_stage2": 0.4589990973472595, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 943 + }, + { + "completion_length": 8.59375, + "epoch": 0.1654108988960925, + "grad_norm": 21.711342619576424, + "kl": 0.162109375, + "learning_rate": 8.34764324513755e-07, + "loss": -0.021, + "reward": 1.465111494064331, + "reward_std": 0.2542092800140381, + "rewards/accuracy_reward_stage2": 0.621361494064331, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 944 + }, + { + "completion_length": 15.78125, + "epoch": 0.16558612230594008, + "grad_norm": 12.957827896906505, + "kl": 0.0224609375, + "learning_rate": 8.345891011039074e-07, + "loss": 0.009, + "reward": 1.1822917461395264, + "reward_std": 0.08154669404029846, + "rewards/accuracy_reward_stage2": 0.3072916865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 945 + }, + { + "completion_length": 15.90625, + "epoch": 0.16576134571578763, + "grad_norm": 14.16153592665762, + "kl": 0.056396484375, + "learning_rate": 8.344138776940599e-07, + "loss": 0.0226, + "reward": 1.3079233169555664, + "reward_std": 0.1660769283771515, + "rewards/accuracy_reward_stage2": 0.5579233169555664, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 946 + }, + { + "completion_length": 19.765625, + "epoch": 0.16593656912563517, + "grad_norm": 26.178065837337677, + "kl": 0.10595703125, + "learning_rate": 8.342386542842124e-07, + "loss": 0.0107, + "reward": 1.4617750644683838, + "reward_std": 0.29273518919944763, + "rewards/accuracy_reward_stage2": 0.47740012407302856, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 947 + }, + { + "completion_length": 10.71875, + "epoch": 0.16611179253548275, + "grad_norm": 23.879074853142832, + "kl": 0.150390625, + "learning_rate": 8.340634308743648e-07, + "loss": 0.0603, + "reward": 1.5003604888916016, + "reward_std": 0.1546555906534195, + "rewards/accuracy_reward_stage2": 0.6253605484962463, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 948 + }, + { + "completion_length": 11.625, + "epoch": 0.1662870159453303, + "grad_norm": 28.97017047697589, + "kl": 0.08935546875, + "learning_rate": 8.338882074645173e-07, + "loss": 0.0358, + "reward": 1.651204228401184, + "reward_std": 0.17542898654937744, + "rewards/accuracy_reward_stage2": 0.6512041687965393, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 949 + }, + { + "completion_length": 12.015625, + "epoch": 0.16646223935517784, + "grad_norm": 24.650912731008486, + "kl": 0.078125, + "learning_rate": 8.337129840546698e-07, + "loss": 0.0312, + "reward": 1.4317245483398438, + "reward_std": 0.16404840350151062, + "rewards/accuracy_reward_stage2": 0.5567246079444885, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 950 + }, + { + "completion_length": 8.265625, + "epoch": 0.16663746276502542, + "grad_norm": 25.199836114032845, + "kl": 0.019775390625, + "learning_rate": 8.335377606448221e-07, + "loss": 0.0079, + "reward": 1.5420386791229248, + "reward_std": 0.2433536946773529, + "rewards/accuracy_reward_stage2": 0.5420386791229248, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 951 + }, + { + "completion_length": 10.9375, + "epoch": 0.16681268617487297, + "grad_norm": 168.31758714014757, + "kl": 0.61328125, + "learning_rate": 8.333625372349745e-07, + "loss": 0.2342, + "reward": 1.4494047164916992, + "reward_std": 0.2671560049057007, + "rewards/accuracy_reward_stage2": 0.590029776096344, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 952 + }, + { + "completion_length": 11.53125, + "epoch": 0.1669879095847205, + "grad_norm": 21.797026985006323, + "kl": 0.10595703125, + "learning_rate": 8.331873138251269e-07, + "loss": 0.0095, + "reward": 1.448401689529419, + "reward_std": 0.21111756563186646, + "rewards/accuracy_reward_stage2": 0.46402665972709656, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 953 + }, + { + "completion_length": 19.484375, + "epoch": 0.1671631329945681, + "grad_norm": 221.20774745773093, + "kl": 0.8515625, + "learning_rate": 8.330120904152794e-07, + "loss": 0.3394, + "reward": 1.4512382745742798, + "reward_std": 0.15571743249893188, + "rewards/accuracy_reward_stage2": 0.5762382745742798, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 954 + }, + { + "completion_length": 8.921875, + "epoch": 0.16733835640441563, + "grad_norm": 17.292543256694714, + "kl": 0.1318359375, + "learning_rate": 8.328368670054319e-07, + "loss": -0.0206, + "reward": 1.715613603591919, + "reward_std": 0.19096148014068604, + "rewards/accuracy_reward_stage2": 0.746863603591919, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 955 + }, + { + "completion_length": 13.265625, + "epoch": 0.16751357981426318, + "grad_norm": 20.08735695941081, + "kl": 0.11669921875, + "learning_rate": 8.326616435955843e-07, + "loss": 0.0468, + "reward": 1.4697279930114746, + "reward_std": 0.1781584620475769, + "rewards/accuracy_reward_stage2": 0.5947280526161194, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 956 + }, + { + "completion_length": 28.828125, + "epoch": 0.16768880322411073, + "grad_norm": 302.9314365613275, + "kl": 1.640625, + "learning_rate": 8.324864201857368e-07, + "loss": 0.6545, + "reward": 1.36354398727417, + "reward_std": 0.06286264955997467, + "rewards/accuracy_reward_stage2": 0.6135439872741699, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 957 + }, + { + "completion_length": 7.9375, + "epoch": 0.1678640266339583, + "grad_norm": 23.483468245101076, + "kl": 0.10595703125, + "learning_rate": 8.323111967758892e-07, + "loss": 0.0424, + "reward": 1.3482142686843872, + "reward_std": 0.27842962741851807, + "rewards/accuracy_reward_stage2": 0.4732142686843872, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 958 + }, + { + "completion_length": 9.21875, + "epoch": 0.16803925004380585, + "grad_norm": 23.99415537775469, + "kl": 0.12158203125, + "learning_rate": 8.321359733660417e-07, + "loss": 0.0152, + "reward": 1.4995429515838623, + "reward_std": 0.2657421827316284, + "rewards/accuracy_reward_stage2": 0.5307928919792175, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 959 + }, + { + "completion_length": 11.390625, + "epoch": 0.1682144734536534, + "grad_norm": 36.190832877832605, + "kl": 0.0234375, + "learning_rate": 8.319607499561942e-07, + "loss": 0.0094, + "reward": 1.620686650276184, + "reward_std": 0.10265517234802246, + "rewards/accuracy_reward_stage2": 0.6206865906715393, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 960 + }, + { + "completion_length": 8.09375, + "epoch": 0.16838969686350097, + "grad_norm": 18.849443706981564, + "kl": 0.0849609375, + "learning_rate": 8.317855265463466e-07, + "loss": -0.0143, + "reward": 1.3765008449554443, + "reward_std": 0.20061229169368744, + "rewards/accuracy_reward_stage2": 0.4077509045600891, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 961 + }, + { + "completion_length": 16.546875, + "epoch": 0.16856492027334852, + "grad_norm": 27.7507287897152, + "kl": 0.7578125, + "learning_rate": 8.31610303136499e-07, + "loss": 0.3034, + "reward": 1.4773821830749512, + "reward_std": 0.04698540270328522, + "rewards/accuracy_reward_stage2": 0.727382242679596, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 962 + }, + { + "completion_length": 10.0625, + "epoch": 0.16874014368319606, + "grad_norm": 15.944671828158617, + "kl": 0.0322265625, + "learning_rate": 8.314350797266514e-07, + "loss": -0.0643, + "reward": 1.3591651916503906, + "reward_std": 0.22507423162460327, + "rewards/accuracy_reward_stage2": 0.390415221452713, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 963 + }, + { + "completion_length": 13.140625, + "epoch": 0.16891536709304364, + "grad_norm": 21.640372689023383, + "kl": 0.1826171875, + "learning_rate": 8.312598563168038e-07, + "loss": 0.0184, + "reward": 1.2974778413772583, + "reward_std": 0.22381377220153809, + "rewards/accuracy_reward_stage2": 0.4537278115749359, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 964 + }, + { + "completion_length": 22.359375, + "epoch": 0.16909059050289119, + "grad_norm": 21.901735100068972, + "kl": 0.20703125, + "learning_rate": 8.310846329069563e-07, + "loss": 0.0957, + "reward": 1.496659278869629, + "reward_std": 0.1753218173980713, + "rewards/accuracy_reward_stage2": 0.6216592788696289, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 965 + }, + { + "completion_length": 10.328125, + "epoch": 0.16926581391273873, + "grad_norm": 20.0283372209553, + "kl": 0.08740234375, + "learning_rate": 8.309094094971087e-07, + "loss": -0.0093, + "reward": 1.6073633432388306, + "reward_std": 0.212762713432312, + "rewards/accuracy_reward_stage2": 0.6229883432388306, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 966 + }, + { + "completion_length": 7.3125, + "epoch": 0.1694410373225863, + "grad_norm": 20.241618585052276, + "kl": 0.1015625, + "learning_rate": 8.307341860872612e-07, + "loss": -0.0036, + "reward": 1.5482078790664673, + "reward_std": 0.2085924744606018, + "rewards/accuracy_reward_stage2": 0.5638328790664673, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 967 + }, + { + "completion_length": 9.546875, + "epoch": 0.16961626073243385, + "grad_norm": 17.233476394758718, + "kl": 0.04638671875, + "learning_rate": 8.305589626774137e-07, + "loss": -0.0423, + "reward": 1.6302083730697632, + "reward_std": 0.1822493076324463, + "rewards/accuracy_reward_stage2": 0.6614583730697632, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 968 + }, + { + "completion_length": 27.1875, + "epoch": 0.1697914841422814, + "grad_norm": 19.06856915302853, + "kl": 0.039794921875, + "learning_rate": 8.303837392675661e-07, + "loss": -0.1353, + "reward": 1.31388258934021, + "reward_std": 0.23465386033058167, + "rewards/accuracy_reward_stage2": 0.3763824701309204, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 969 + }, + { + "completion_length": 13.625, + "epoch": 0.16996670755212898, + "grad_norm": 21.596284107801807, + "kl": 0.3828125, + "learning_rate": 8.302085158577186e-07, + "loss": 0.1202, + "reward": 1.1787537336349487, + "reward_std": 0.16805896162986755, + "rewards/accuracy_reward_stage2": 0.31937870383262634, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 970 + }, + { + "completion_length": 7.4375, + "epoch": 0.17014193096197652, + "grad_norm": 16.395785575312768, + "kl": 0.038818359375, + "learning_rate": 8.300332924478711e-07, + "loss": -0.0287, + "reward": 1.5564574003219604, + "reward_std": 0.1440478414297104, + "rewards/accuracy_reward_stage2": 0.5720824003219604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 971 + }, + { + "completion_length": 8.671875, + "epoch": 0.17031715437182407, + "grad_norm": 14.197834374550085, + "kl": 0.03125, + "learning_rate": 8.298580690380234e-07, + "loss": 0.0126, + "reward": 1.618015170097351, + "reward_std": 0.0638478696346283, + "rewards/accuracy_reward_stage2": 0.6180151700973511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 972 + }, + { + "completion_length": 9.21875, + "epoch": 0.17049237778167164, + "grad_norm": 18.13707578787983, + "kl": 0.07080078125, + "learning_rate": 8.296828456281759e-07, + "loss": 0.0284, + "reward": 1.5550284385681152, + "reward_std": 0.16677148640155792, + "rewards/accuracy_reward_stage2": 0.5550283789634705, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 973 + }, + { + "completion_length": 9.671875, + "epoch": 0.1706676011915192, + "grad_norm": 22.426704935069942, + "kl": 0.028564453125, + "learning_rate": 8.295076222183283e-07, + "loss": -0.0747, + "reward": 1.7982523441314697, + "reward_std": 0.2379818707704544, + "rewards/accuracy_reward_stage2": 0.8295024633407593, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 974 + }, + { + "completion_length": 11.40625, + "epoch": 0.17084282460136674, + "grad_norm": 23.30227030147312, + "kl": 0.0693359375, + "learning_rate": 8.293323988084808e-07, + "loss": 0.0277, + "reward": 1.4609272480010986, + "reward_std": 0.29046812653541565, + "rewards/accuracy_reward_stage2": 0.46092718839645386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 975 + }, + { + "completion_length": 9.875, + "epoch": 0.17101804801121429, + "grad_norm": 11.607327794748048, + "kl": 0.09375, + "learning_rate": 8.291571753986332e-07, + "loss": 0.0375, + "reward": 1.6811981201171875, + "reward_std": 0.05087604746222496, + "rewards/accuracy_reward_stage2": 0.8061981201171875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 976 + }, + { + "completion_length": 20.390625, + "epoch": 0.17119327142106186, + "grad_norm": 19.407638180474976, + "kl": 0.06396484375, + "learning_rate": 8.289819519887856e-07, + "loss": 0.0256, + "reward": 1.4600509405136108, + "reward_std": 0.21399806439876556, + "rewards/accuracy_reward_stage2": 0.46005094051361084, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 977 + }, + { + "completion_length": 8.484375, + "epoch": 0.1713684948309094, + "grad_norm": 23.060799171899124, + "kl": 0.0703125, + "learning_rate": 8.288067285789381e-07, + "loss": 0.028, + "reward": 1.6289044618606567, + "reward_std": 0.27257654070854187, + "rewards/accuracy_reward_stage2": 0.628904402256012, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 978 + }, + { + "completion_length": 8.84375, + "epoch": 0.17154371824075695, + "grad_norm": 19.641303748012547, + "kl": 0.046142578125, + "learning_rate": 8.286315051690906e-07, + "loss": 0.0185, + "reward": 1.4471064805984497, + "reward_std": 0.18143045902252197, + "rewards/accuracy_reward_stage2": 0.4471064805984497, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 979 + }, + { + "completion_length": 18.9375, + "epoch": 0.17171894165060453, + "grad_norm": 19.427931995450898, + "kl": 0.06298828125, + "learning_rate": 8.28456281759243e-07, + "loss": -0.0189, + "reward": 1.4050720930099487, + "reward_std": 0.19070225954055786, + "rewards/accuracy_reward_stage2": 0.42069703340530396, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 980 + }, + { + "completion_length": 9.203125, + "epoch": 0.17189416506045208, + "grad_norm": 15.666858070238954, + "kl": 0.099609375, + "learning_rate": 8.282810583493955e-07, + "loss": -0.0777, + "reward": 1.551900863647461, + "reward_std": 0.21262939274311066, + "rewards/accuracy_reward_stage2": 0.5987757444381714, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 981 + }, + { + "completion_length": 15.375, + "epoch": 0.17206938847029962, + "grad_norm": 15.354945603315256, + "kl": 0.08251953125, + "learning_rate": 8.281058349395478e-07, + "loss": 0.0331, + "reward": 1.2965600490570068, + "reward_std": 0.09104090929031372, + "rewards/accuracy_reward_stage2": 0.42156004905700684, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 982 + }, + { + "completion_length": 12.09375, + "epoch": 0.1722446118801472, + "grad_norm": 22.37440013286015, + "kl": 0.038330078125, + "learning_rate": 8.279306115297003e-07, + "loss": 0.0153, + "reward": 1.5221551656723022, + "reward_std": 0.2770881652832031, + "rewards/accuracy_reward_stage2": 0.5221551656723022, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 983 + }, + { + "completion_length": 7.234375, + "epoch": 0.17241983528999474, + "grad_norm": 16.939924437153255, + "kl": 0.00830078125, + "learning_rate": 8.277553881198528e-07, + "loss": 0.0033, + "reward": 1.7744736671447754, + "reward_std": 0.17920680344104767, + "rewards/accuracy_reward_stage2": 0.7744735479354858, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 984 + }, + { + "completion_length": 22.359375, + "epoch": 0.1725950586998423, + "grad_norm": 3498.02854669452, + "kl": 9.5, + "learning_rate": 8.275801647100052e-07, + "loss": 3.7769, + "reward": 1.7484500408172607, + "reward_std": 0.09648245573043823, + "rewards/accuracy_reward_stage2": 0.8734498620033264, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 985 + }, + { + "completion_length": 9.734375, + "epoch": 0.17277028210968987, + "grad_norm": 17.801115214882383, + "kl": 0.0634765625, + "learning_rate": 8.274049413001577e-07, + "loss": 0.0253, + "reward": 1.5197932720184326, + "reward_std": 0.16293829679489136, + "rewards/accuracy_reward_stage2": 0.5197933912277222, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 986 + }, + { + "completion_length": 10.5, + "epoch": 0.1729455055195374, + "grad_norm": 17.742698414488256, + "kl": 0.07470703125, + "learning_rate": 8.272297178903102e-07, + "loss": 0.0299, + "reward": 1.6276013851165771, + "reward_std": 0.19804228842258453, + "rewards/accuracy_reward_stage2": 0.6276013851165771, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 987 + }, + { + "completion_length": 10.75, + "epoch": 0.17312072892938496, + "grad_norm": 20.169397919693026, + "kl": 0.054443359375, + "learning_rate": 8.270544944804626e-07, + "loss": 0.0218, + "reward": 1.4787770509719849, + "reward_std": 0.2628193199634552, + "rewards/accuracy_reward_stage2": 0.4787770211696625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 988 + }, + { + "completion_length": 15.921875, + "epoch": 0.17329595233923253, + "grad_norm": 19.564975038357, + "kl": 0.11767578125, + "learning_rate": 8.26879271070615e-07, + "loss": 0.0472, + "reward": 1.4452990293502808, + "reward_std": 0.1860145926475525, + "rewards/accuracy_reward_stage2": 0.6952989101409912, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 989 + }, + { + "completion_length": 17.71875, + "epoch": 0.17347117574908008, + "grad_norm": 24.568791594952728, + "kl": 0.034423828125, + "learning_rate": 8.267040476607674e-07, + "loss": 0.0137, + "reward": 1.5325812101364136, + "reward_std": 0.26802152395248413, + "rewards/accuracy_reward_stage2": 0.5325811505317688, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 990 + }, + { + "completion_length": 11.234375, + "epoch": 0.17364639915892763, + "grad_norm": 31.406451632987274, + "kl": 0.046142578125, + "learning_rate": 8.265288242509199e-07, + "loss": 0.0185, + "reward": 1.516391634941101, + "reward_std": 0.1867346167564392, + "rewards/accuracy_reward_stage2": 0.5163915753364563, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 991 + }, + { + "completion_length": 14.453125, + "epoch": 0.17382162256877517, + "grad_norm": 24.623943884661465, + "kl": 0.1875, + "learning_rate": 8.263536008410723e-07, + "loss": 0.0753, + "reward": 1.708542823791504, + "reward_std": 0.16841836273670197, + "rewards/accuracy_reward_stage2": 0.8335429430007935, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 992 + }, + { + "completion_length": 8.453125, + "epoch": 0.17399684597862275, + "grad_norm": 566.4321454989408, + "kl": 3.0625, + "learning_rate": 8.261783774312247e-07, + "loss": 1.1852, + "reward": 1.7703094482421875, + "reward_std": 0.19710037112236023, + "rewards/accuracy_reward_stage2": 0.785934329032898, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 993 + }, + { + "completion_length": 9.65625, + "epoch": 0.1741720693884703, + "grad_norm": 21.60589644918224, + "kl": 0.166015625, + "learning_rate": 8.260031540213772e-07, + "loss": 0.0663, + "reward": 1.2493162155151367, + "reward_std": 0.2732129991054535, + "rewards/accuracy_reward_stage2": 0.37431615591049194, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 994 + }, + { + "completion_length": 9.140625, + "epoch": 0.17434729279831784, + "grad_norm": 567.350235789533, + "kl": 3.3125, + "learning_rate": 8.258279306115297e-07, + "loss": 1.2209, + "reward": 1.3685557842254639, + "reward_std": 0.2889344096183777, + "rewards/accuracy_reward_stage2": 0.41543081402778625, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 995 + }, + { + "completion_length": 13.421875, + "epoch": 0.17452251620816542, + "grad_norm": 132.5916422675832, + "kl": 1.015625, + "learning_rate": 8.256527072016821e-07, + "loss": 0.3629, + "reward": 1.3894778490066528, + "reward_std": 0.082199826836586, + "rewards/accuracy_reward_stage2": 0.6551028490066528, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 996 + }, + { + "completion_length": 7.515625, + "epoch": 0.17469773961801296, + "grad_norm": 14.370187452272509, + "kl": 0.1826171875, + "learning_rate": 8.254774837918346e-07, + "loss": 0.0285, + "reward": 1.5352981090545654, + "reward_std": 0.11823684722185135, + "rewards/accuracy_reward_stage2": 0.5509230494499207, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 997 + }, + { + "completion_length": 8.09375, + "epoch": 0.1748729630278605, + "grad_norm": 24.826995542235746, + "kl": 0.04345703125, + "learning_rate": 8.25302260381987e-07, + "loss": 0.0174, + "reward": 1.6802245378494263, + "reward_std": 0.20285210013389587, + "rewards/accuracy_reward_stage2": 0.6802244782447815, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 998 + }, + { + "completion_length": 12.46875, + "epoch": 0.1750481864377081, + "grad_norm": 35.334960123002354, + "kl": 0.0517578125, + "learning_rate": 8.251270369721395e-07, + "loss": 0.0209, + "reward": 1.8040918111801147, + "reward_std": 0.2325887829065323, + "rewards/accuracy_reward_stage2": 0.8040918111801147, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 999 + }, + { + "completion_length": 9.328125, + "epoch": 0.17522340984755563, + "grad_norm": 24.793032711533158, + "kl": 0.10009765625, + "learning_rate": 8.24951813562292e-07, + "loss": 0.0399, + "reward": 1.706731915473938, + "reward_std": 0.34516337513923645, + "rewards/accuracy_reward_stage2": 0.7067318558692932, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1000 + }, + { + "completion_length": 12.09375, + "epoch": 0.17539863325740318, + "grad_norm": 21.874107086733733, + "kl": 0.05615234375, + "learning_rate": 8.247765901524442e-07, + "loss": -0.0217, + "reward": 1.6877598762512207, + "reward_std": 0.2215467244386673, + "rewards/accuracy_reward_stage2": 0.7033848762512207, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1001 + }, + { + "completion_length": 11.265625, + "epoch": 0.17557385666725076, + "grad_norm": 18.000764914593567, + "kl": 0.30078125, + "learning_rate": 8.246013667425967e-07, + "loss": 0.0758, + "reward": 1.5376827716827393, + "reward_std": 0.24442484974861145, + "rewards/accuracy_reward_stage2": 0.6783077120780945, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1002 + }, + { + "completion_length": 7.265625, + "epoch": 0.1757490800770983, + "grad_norm": 18.565844961588045, + "kl": 0.05517578125, + "learning_rate": 8.244261433327491e-07, + "loss": 0.0221, + "reward": 1.7236640453338623, + "reward_std": 0.2000371664762497, + "rewards/accuracy_reward_stage2": 0.7236641049385071, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1003 + }, + { + "completion_length": 13.6875, + "epoch": 0.17592430348694585, + "grad_norm": 20.088877733252122, + "kl": 0.396484375, + "learning_rate": 8.242509199229016e-07, + "loss": 0.1152, + "reward": 1.3435370922088623, + "reward_std": 0.14601978659629822, + "rewards/accuracy_reward_stage2": 0.48416221141815186, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1004 + }, + { + "completion_length": 13.53125, + "epoch": 0.17609952689679342, + "grad_norm": 16.471332796592286, + "kl": 0.061767578125, + "learning_rate": 8.240756965130541e-07, + "loss": 0.0247, + "reward": 1.5413925647735596, + "reward_std": 0.1500665247440338, + "rewards/accuracy_reward_stage2": 0.5413926243782043, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1005 + }, + { + "completion_length": 8.765625, + "epoch": 0.17627475030664097, + "grad_norm": 27.168697897342867, + "kl": 0.1875, + "learning_rate": 8.239004731032065e-07, + "loss": -0.0037, + "reward": 1.568939447402954, + "reward_std": 0.2704058885574341, + "rewards/accuracy_reward_stage2": 0.7251893877983093, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1006 + }, + { + "completion_length": 10.8125, + "epoch": 0.17644997371648852, + "grad_norm": 22.345118749931096, + "kl": 0.2060546875, + "learning_rate": 8.23725249693359e-07, + "loss": 0.0025, + "reward": 1.7439064979553223, + "reward_std": 0.2780749499797821, + "rewards/accuracy_reward_stage2": 0.7751563787460327, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1007 + }, + { + "completion_length": 19.296875, + "epoch": 0.1766251971263361, + "grad_norm": 29.612030789420974, + "kl": 0.07177734375, + "learning_rate": 8.235500262835115e-07, + "loss": 0.0288, + "reward": 1.6531291007995605, + "reward_std": 0.1276826113462448, + "rewards/accuracy_reward_stage2": 0.6531291007995605, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1008 + }, + { + "completion_length": 7.890625, + "epoch": 0.17680042053618364, + "grad_norm": 20.102240938891864, + "kl": 0.044921875, + "learning_rate": 8.233748028736639e-07, + "loss": -0.0262, + "reward": 1.6397186517715454, + "reward_std": 0.2941434979438782, + "rewards/accuracy_reward_stage2": 0.6553436517715454, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1009 + }, + { + "completion_length": 12.5, + "epoch": 0.17697564394603119, + "grad_norm": 27.096622355934347, + "kl": 0.035888671875, + "learning_rate": 8.231995794638164e-07, + "loss": -0.0298, + "reward": 1.3705095052719116, + "reward_std": 0.1347227394580841, + "rewards/accuracy_reward_stage2": 0.38613444566726685, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1010 + }, + { + "completion_length": 11.421875, + "epoch": 0.17715086735587873, + "grad_norm": 19.0051264155405, + "kl": 0.046142578125, + "learning_rate": 8.230243560539689e-07, + "loss": 0.0185, + "reward": 1.4608999490737915, + "reward_std": 0.1947353333234787, + "rewards/accuracy_reward_stage2": 0.5858998894691467, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1011 + }, + { + "completion_length": 10.65625, + "epoch": 0.1773260907657263, + "grad_norm": 13.807597260017692, + "kl": 0.072265625, + "learning_rate": 8.228491326441212e-07, + "loss": -0.1012, + "reward": 1.7135417461395264, + "reward_std": 0.17712606489658356, + "rewards/accuracy_reward_stage2": 0.7604166269302368, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1012 + }, + { + "completion_length": 14.15625, + "epoch": 0.17750131417557385, + "grad_norm": 16.648019138029532, + "kl": 0.1962890625, + "learning_rate": 8.226739092342737e-07, + "loss": 0.0784, + "reward": 1.5950548648834229, + "reward_std": 0.12263785302639008, + "rewards/accuracy_reward_stage2": 0.7200549840927124, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1013 + }, + { + "completion_length": 10.0, + "epoch": 0.1776765375854214, + "grad_norm": 22.397243983004504, + "kl": 0.1416015625, + "learning_rate": 8.22498685824426e-07, + "loss": -0.0199, + "reward": 1.4478588104248047, + "reward_std": 0.23456881940364838, + "rewards/accuracy_reward_stage2": 0.4791087210178375, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1014 + }, + { + "completion_length": 11.5, + "epoch": 0.17785176099526898, + "grad_norm": 17.19528296606665, + "kl": 0.08544921875, + "learning_rate": 8.223234624145785e-07, + "loss": -0.0101, + "reward": 1.459334135055542, + "reward_std": 0.2601989209651947, + "rewards/accuracy_reward_stage2": 0.474959135055542, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1015 + }, + { + "completion_length": 11.171875, + "epoch": 0.17802698440511652, + "grad_norm": 22.32363164934085, + "kl": 0.052001953125, + "learning_rate": 8.22148239004731e-07, + "loss": -0.0141, + "reward": 1.3699970245361328, + "reward_std": 0.1954904943704605, + "rewards/accuracy_reward_stage2": 0.5106220245361328, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1016 + }, + { + "completion_length": 9.171875, + "epoch": 0.17820220781496407, + "grad_norm": 21.83425997837965, + "kl": 0.06201171875, + "learning_rate": 8.219730155948834e-07, + "loss": -0.0194, + "reward": 1.7290661334991455, + "reward_std": 0.18619462847709656, + "rewards/accuracy_reward_stage2": 0.7446911931037903, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1017 + }, + { + "completion_length": 8.015625, + "epoch": 0.17837743122481164, + "grad_norm": 14.028706814796353, + "kl": 0.040771484375, + "learning_rate": 8.217977921850359e-07, + "loss": 0.0163, + "reward": 1.59375, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1018 + }, + { + "completion_length": 12.828125, + "epoch": 0.1785526546346592, + "grad_norm": 16.785372668563895, + "kl": 0.042236328125, + "learning_rate": 8.216225687751883e-07, + "loss": 0.0169, + "reward": 1.6821075677871704, + "reward_std": 0.17112760245800018, + "rewards/accuracy_reward_stage2": 0.6821075677871704, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1019 + }, + { + "completion_length": 17.859375, + "epoch": 0.17872787804450674, + "grad_norm": 17.11698704652708, + "kl": 0.015869140625, + "learning_rate": 8.214473453653408e-07, + "loss": -0.0378, + "reward": 1.546691656112671, + "reward_std": 0.06553763151168823, + "rewards/accuracy_reward_stage2": 0.5623167157173157, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1020 + }, + { + "completion_length": 8.203125, + "epoch": 0.1789031014543543, + "grad_norm": 23.871929503809273, + "kl": 0.095703125, + "learning_rate": 8.212721219554933e-07, + "loss": -0.063, + "reward": 1.7383593320846558, + "reward_std": 0.20009878277778625, + "rewards/accuracy_reward_stage2": 0.7852343320846558, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1021 + }, + { + "completion_length": 9.84375, + "epoch": 0.17907832486420186, + "grad_norm": 30.169501250704013, + "kl": 0.10888671875, + "learning_rate": 8.210968985456456e-07, + "loss": 0.0435, + "reward": 1.5823220014572144, + "reward_std": 0.22581787407398224, + "rewards/accuracy_reward_stage2": 0.7073220014572144, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1022 + }, + { + "completion_length": 11.484375, + "epoch": 0.1792535482740494, + "grad_norm": 20.79046787185343, + "kl": 0.10888671875, + "learning_rate": 8.209216751357981e-07, + "loss": -0.0007, + "reward": 1.5117167234420776, + "reward_std": 0.21293523907661438, + "rewards/accuracy_reward_stage2": 0.5273416638374329, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1023 + }, + { + "completion_length": 12.6875, + "epoch": 0.17942877168389698, + "grad_norm": 21.841933395446556, + "kl": 0.058837890625, + "learning_rate": 8.207464517259506e-07, + "loss": 0.002, + "reward": 1.5082931518554688, + "reward_std": 0.2580886483192444, + "rewards/accuracy_reward_stage2": 0.5239181518554688, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1024 + }, + { + "completion_length": 10.703125, + "epoch": 0.17960399509374453, + "grad_norm": 27.915163388669363, + "kl": 0.13671875, + "learning_rate": 8.20571228316103e-07, + "loss": 0.0115, + "reward": 1.5165634155273438, + "reward_std": 0.245716854929924, + "rewards/accuracy_reward_stage2": 0.5478134155273438, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1025 + }, + { + "completion_length": 6.109375, + "epoch": 0.17977921850359208, + "grad_norm": 12.013743742666504, + "kl": 0.010498046875, + "learning_rate": 8.203960049062555e-07, + "loss": 0.0042, + "reward": 1.8125, + "reward_std": 0.06681530922651291, + "rewards/accuracy_reward_stage2": 0.8125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1026 + }, + { + "completion_length": 11.15625, + "epoch": 0.17995444191343962, + "grad_norm": 160.2641583697851, + "kl": 0.59765625, + "learning_rate": 8.202207814964078e-07, + "loss": 0.2043, + "reward": 1.437111496925354, + "reward_std": 0.32900726795196533, + "rewards/accuracy_reward_stage2": 0.702736496925354, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1027 + }, + { + "completion_length": 10.65625, + "epoch": 0.1801296653232872, + "grad_norm": 25.617864049464025, + "kl": 0.03515625, + "learning_rate": 8.200455580865603e-07, + "loss": 0.0141, + "reward": 1.6387648582458496, + "reward_std": 0.2850415110588074, + "rewards/accuracy_reward_stage2": 0.6387649178504944, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1028 + }, + { + "completion_length": 8.015625, + "epoch": 0.18030488873313474, + "grad_norm": 22.632944374449437, + "kl": 0.04150390625, + "learning_rate": 8.198703346767128e-07, + "loss": 0.0166, + "reward": 1.4933924674987793, + "reward_std": 0.17765173316001892, + "rewards/accuracy_reward_stage2": 0.7433923482894897, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1029 + }, + { + "completion_length": 7.609375, + "epoch": 0.1804801121429823, + "grad_norm": 11.832671025211214, + "kl": 0.00701904296875, + "learning_rate": 8.196951112668652e-07, + "loss": 0.0028, + "reward": 1.65625, + "reward_std": 0.0578637570142746, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1030 + }, + { + "completion_length": 11.25, + "epoch": 0.18065533555282987, + "grad_norm": 21.565992219102643, + "kl": 0.07861328125, + "learning_rate": 8.195198878570176e-07, + "loss": 0.0315, + "reward": 1.5944758653640747, + "reward_std": 0.23725871741771698, + "rewards/accuracy_reward_stage2": 0.5944758653640747, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1031 + }, + { + "completion_length": 9.25, + "epoch": 0.1808305589626774, + "grad_norm": 19.56751291277132, + "kl": 0.058837890625, + "learning_rate": 8.193446644471701e-07, + "loss": 0.0235, + "reward": 1.4681397676467896, + "reward_std": 0.2315388321876526, + "rewards/accuracy_reward_stage2": 0.5931397676467896, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1032 + }, + { + "completion_length": 11.8125, + "epoch": 0.18100578237252496, + "grad_norm": 21.39438530811492, + "kl": 0.052734375, + "learning_rate": 8.191694410373225e-07, + "loss": 0.0211, + "reward": 1.8061552047729492, + "reward_std": 0.13493405282497406, + "rewards/accuracy_reward_stage2": 0.8061552047729492, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1033 + }, + { + "completion_length": 9.546875, + "epoch": 0.18118100578237253, + "grad_norm": 21.69549112720359, + "kl": 0.060302734375, + "learning_rate": 8.18994217627475e-07, + "loss": -0.0642, + "reward": 1.5303882360458374, + "reward_std": 0.3947955071926117, + "rewards/accuracy_reward_stage2": 0.5616382360458374, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1034 + }, + { + "completion_length": 13.515625, + "epoch": 0.18135622919222008, + "grad_norm": 16.958675766024093, + "kl": 0.1796875, + "learning_rate": 8.188189942176274e-07, + "loss": 0.0719, + "reward": 1.3252782821655273, + "reward_std": 0.2296641618013382, + "rewards/accuracy_reward_stage2": 0.4502781629562378, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1035 + }, + { + "completion_length": 12.28125, + "epoch": 0.18153145260206763, + "grad_norm": 14.465418387381794, + "kl": 0.17578125, + "learning_rate": 8.186437708077799e-07, + "loss": 0.0261, + "reward": 1.515625, + "reward_std": 0.1530819982290268, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1036 + }, + { + "completion_length": 9.265625, + "epoch": 0.1817066760119152, + "grad_norm": 20.9024336904025, + "kl": 0.14453125, + "learning_rate": 8.184685473979324e-07, + "loss": -0.0128, + "reward": 1.6768765449523926, + "reward_std": 0.1317557990550995, + "rewards/accuracy_reward_stage2": 0.7081265449523926, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1037 + }, + { + "completion_length": 11.71875, + "epoch": 0.18188189942176275, + "grad_norm": 25.559950008766783, + "kl": 0.0810546875, + "learning_rate": 8.182933239880848e-07, + "loss": -0.0117, + "reward": 1.5764124393463135, + "reward_std": 0.23896048963069916, + "rewards/accuracy_reward_stage2": 0.5920374393463135, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1038 + }, + { + "completion_length": 34.03125, + "epoch": 0.1820571228316103, + "grad_norm": 25.275923160610986, + "kl": 0.09765625, + "learning_rate": 8.181181005782373e-07, + "loss": 0.0392, + "reward": 1.6226279735565186, + "reward_std": 0.087415412068367, + "rewards/accuracy_reward_stage2": 0.6226279735565186, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1039 + }, + { + "completion_length": 7.84375, + "epoch": 0.18223234624145787, + "grad_norm": 23.407709353813917, + "kl": 0.061767578125, + "learning_rate": 8.179428771683897e-07, + "loss": -0.0194, + "reward": 1.4883452653884888, + "reward_std": 0.21674522757530212, + "rewards/accuracy_reward_stage2": 0.5039702653884888, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1040 + }, + { + "completion_length": 12.578125, + "epoch": 0.18240756965130542, + "grad_norm": 18.522952285363942, + "kl": 0.07568359375, + "learning_rate": 8.17767653758542e-07, + "loss": -0.0076, + "reward": 1.6132853031158447, + "reward_std": 0.1885463446378708, + "rewards/accuracy_reward_stage2": 0.6289101839065552, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1041 + }, + { + "completion_length": 10.578125, + "epoch": 0.18258279306115296, + "grad_norm": 33.825003381200695, + "kl": 0.1611328125, + "learning_rate": 8.175924303486945e-07, + "loss": -0.0031, + "reward": 1.6792283058166504, + "reward_std": 0.24878743290901184, + "rewards/accuracy_reward_stage2": 0.7104784250259399, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1042 + }, + { + "completion_length": 15.03125, + "epoch": 0.18275801647100054, + "grad_norm": 20.278543151963856, + "kl": 0.06298828125, + "learning_rate": 8.174172069388469e-07, + "loss": 0.0253, + "reward": 1.1826815605163574, + "reward_std": 0.17803940176963806, + "rewards/accuracy_reward_stage2": 0.30768144130706787, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1043 + }, + { + "completion_length": 10.90625, + "epoch": 0.1829332398808481, + "grad_norm": 24.402430870338062, + "kl": 0.0732421875, + "learning_rate": 8.172419835289994e-07, + "loss": 0.0294, + "reward": 1.543660283088684, + "reward_std": 0.1427423655986786, + "rewards/accuracy_reward_stage2": 0.5436602830886841, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1044 + }, + { + "completion_length": 14.140625, + "epoch": 0.18310846329069563, + "grad_norm": 32.14133710365091, + "kl": 0.37109375, + "learning_rate": 8.170667601191519e-07, + "loss": 0.149, + "reward": 1.4166667461395264, + "reward_std": 0.3040403723716736, + "rewards/accuracy_reward_stage2": 0.5416666865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1045 + }, + { + "completion_length": 5.453125, + "epoch": 0.18328368670054318, + "grad_norm": 15.213673839148703, + "kl": 0.13671875, + "learning_rate": 8.168915367093043e-07, + "loss": 0.0103, + "reward": 1.5307811498641968, + "reward_std": 0.07436943054199219, + "rewards/accuracy_reward_stage2": 0.6714061498641968, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1046 + }, + { + "completion_length": 10.34375, + "epoch": 0.18345891011039075, + "grad_norm": 19.228850233741067, + "kl": 0.06298828125, + "learning_rate": 8.167163132994568e-07, + "loss": 0.0253, + "reward": 1.7453439235687256, + "reward_std": 0.1812073290348053, + "rewards/accuracy_reward_stage2": 0.7453439235687256, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1047 + }, + { + "completion_length": 9.078125, + "epoch": 0.1836341335202383, + "grad_norm": 23.540814203428233, + "kl": 0.1689453125, + "learning_rate": 8.165410898896093e-07, + "loss": -0.008, + "reward": 1.5110549926757812, + "reward_std": 0.22013264894485474, + "rewards/accuracy_reward_stage2": 0.5423049330711365, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1048 + }, + { + "completion_length": 7.328125, + "epoch": 0.18380935693008585, + "grad_norm": 40.582636908844044, + "kl": 0.3046875, + "learning_rate": 8.163658664797617e-07, + "loss": 0.0398, + "reward": 1.451476812362671, + "reward_std": 0.20129188895225525, + "rewards/accuracy_reward_stage2": 0.4827268123626709, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1049 + }, + { + "completion_length": 10.84375, + "epoch": 0.18398458033993342, + "grad_norm": 27.574486341804228, + "kl": 0.07421875, + "learning_rate": 8.161906430699142e-07, + "loss": 0.0297, + "reward": 1.5007617473602295, + "reward_std": 0.35384151339530945, + "rewards/accuracy_reward_stage2": 0.5007617473602295, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1050 + }, + { + "completion_length": 11.828125, + "epoch": 0.18415980374978097, + "grad_norm": 32.54566477332695, + "kl": 0.353515625, + "learning_rate": 8.160154196600665e-07, + "loss": 0.1412, + "reward": 1.5478098392486572, + "reward_std": 0.31436848640441895, + "rewards/accuracy_reward_stage2": 0.6728098392486572, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1051 + }, + { + "completion_length": 14.953125, + "epoch": 0.18433502715962852, + "grad_norm": 20.455856754457844, + "kl": 0.0849609375, + "learning_rate": 8.158401962502189e-07, + "loss": 0.0341, + "reward": 1.5930328369140625, + "reward_std": 0.17691928148269653, + "rewards/accuracy_reward_stage2": 0.593032956123352, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1052 + }, + { + "completion_length": 9.65625, + "epoch": 0.1845102505694761, + "grad_norm": 26.652573712542743, + "kl": 0.21484375, + "learning_rate": 8.156649728403714e-07, + "loss": 0.0573, + "reward": 1.5435185432434082, + "reward_std": 0.28644296526908875, + "rewards/accuracy_reward_stage2": 0.6841434836387634, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1053 + }, + { + "completion_length": 10.015625, + "epoch": 0.18468547397932364, + "grad_norm": 23.14445149802474, + "kl": 0.060546875, + "learning_rate": 8.154897494305238e-07, + "loss": 0.0242, + "reward": 1.634928584098816, + "reward_std": 0.2715989947319031, + "rewards/accuracy_reward_stage2": 0.7599285840988159, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1054 + }, + { + "completion_length": 9.59375, + "epoch": 0.18486069738917119, + "grad_norm": 15.48288506366053, + "kl": 0.34765625, + "learning_rate": 8.153145260206763e-07, + "loss": 0.0949, + "reward": 1.5993139743804932, + "reward_std": 0.16807261109352112, + "rewards/accuracy_reward_stage2": 0.7399389743804932, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1055 + }, + { + "completion_length": 12.4375, + "epoch": 0.18503592079901876, + "grad_norm": 22.38243459661095, + "kl": 0.2216796875, + "learning_rate": 8.151393026108288e-07, + "loss": -0.0003, + "reward": 1.511639952659607, + "reward_std": 0.2581867277622223, + "rewards/accuracy_reward_stage2": 0.5585149526596069, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1056 + }, + { + "completion_length": 10.203125, + "epoch": 0.1852111442088663, + "grad_norm": 20.25981681681545, + "kl": 0.0859375, + "learning_rate": 8.149640792009812e-07, + "loss": 0.0343, + "reward": 1.7087209224700928, + "reward_std": 0.14371052384376526, + "rewards/accuracy_reward_stage2": 0.7087209224700928, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1057 + }, + { + "completion_length": 12.109375, + "epoch": 0.18538636761871385, + "grad_norm": 22.906658516478977, + "kl": 0.2890625, + "learning_rate": 8.147888557911337e-07, + "loss": 0.0715, + "reward": 1.4852299690246582, + "reward_std": 0.17032143473625183, + "rewards/accuracy_reward_stage2": 0.6258548498153687, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1058 + }, + { + "completion_length": 9.859375, + "epoch": 0.18556159102856143, + "grad_norm": 26.553366802136384, + "kl": 0.09375, + "learning_rate": 8.146136323812861e-07, + "loss": 0.0159, + "reward": 1.4717214107513428, + "reward_std": 0.28934115171432495, + "rewards/accuracy_reward_stage2": 0.48734647035598755, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1059 + }, + { + "completion_length": 10.28125, + "epoch": 0.18573681443840898, + "grad_norm": 21.046621537109452, + "kl": 0.056396484375, + "learning_rate": 8.144384089714386e-07, + "loss": 0.0225, + "reward": 1.7228630781173706, + "reward_std": 0.0990498960018158, + "rewards/accuracy_reward_stage2": 0.7228630185127258, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1060 + }, + { + "completion_length": 9.53125, + "epoch": 0.18591203784825652, + "grad_norm": 18.7875170669259, + "kl": 0.072265625, + "learning_rate": 8.14263185561591e-07, + "loss": 0.029, + "reward": 1.4805216789245605, + "reward_std": 0.11815441399812698, + "rewards/accuracy_reward_stage2": 0.48052167892456055, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1061 + }, + { + "completion_length": 6.921875, + "epoch": 0.18608726125810407, + "grad_norm": 18.409645335365852, + "kl": 0.0693359375, + "learning_rate": 8.140879621517434e-07, + "loss": -0.0163, + "reward": 1.6867990493774414, + "reward_std": 0.21753624081611633, + "rewards/accuracy_reward_stage2": 0.7024240493774414, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1062 + }, + { + "completion_length": 11.0, + "epoch": 0.18626248466795164, + "grad_norm": 20.982584348741664, + "kl": 0.08740234375, + "learning_rate": 8.139127387418959e-07, + "loss": 0.0351, + "reward": 1.4232040643692017, + "reward_std": 0.16690698266029358, + "rewards/accuracy_reward_stage2": 0.5482040643692017, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1063 + }, + { + "completion_length": 22.359375, + "epoch": 0.1864377080777992, + "grad_norm": 17.957689558341478, + "kl": 0.1806640625, + "learning_rate": 8.137375153320484e-07, + "loss": -0.0343, + "reward": 1.0880229473114014, + "reward_std": 0.25343039631843567, + "rewards/accuracy_reward_stage2": 0.5098979473114014, + "rewards/format_reward_stage1_pointerpad": 0.578125, + "scores/accuracy_reward_stage2": 0.578125, + "step": 1064 + }, + { + "completion_length": 7.015625, + "epoch": 0.18661293148764674, + "grad_norm": 19.408175653268714, + "kl": 0.05712890625, + "learning_rate": 8.135622919222007e-07, + "loss": 0.0229, + "reward": 1.560366153717041, + "reward_std": 0.19479292631149292, + "rewards/accuracy_reward_stage2": 0.560366153717041, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1065 + }, + { + "completion_length": 7.578125, + "epoch": 0.1867881548974943, + "grad_norm": 31.666787791274242, + "kl": 0.0908203125, + "learning_rate": 8.133870685123532e-07, + "loss": -0.029, + "reward": 1.700068473815918, + "reward_std": 0.2957872152328491, + "rewards/accuracy_reward_stage2": 0.7313185334205627, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1066 + }, + { + "completion_length": 9.203125, + "epoch": 0.18696337830734186, + "grad_norm": 22.06110956212385, + "kl": 0.0712890625, + "learning_rate": 8.132118451025056e-07, + "loss": 0.0286, + "reward": 1.3757433891296387, + "reward_std": 0.24035391211509705, + "rewards/accuracy_reward_stage2": 0.5007432699203491, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1067 + }, + { + "completion_length": 7.265625, + "epoch": 0.1871386017171894, + "grad_norm": 21.13342642811075, + "kl": 0.12353515625, + "learning_rate": 8.130366216926581e-07, + "loss": 0.0495, + "reward": 1.4848458766937256, + "reward_std": 0.18105000257492065, + "rewards/accuracy_reward_stage2": 0.48484593629837036, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1068 + }, + { + "completion_length": 5.4375, + "epoch": 0.18731382512703698, + "grad_norm": 10.078001794329074, + "kl": 0.03662109375, + "learning_rate": 8.128613982828106e-07, + "loss": 0.0146, + "reward": 1.4926791191101074, + "reward_std": 0.020706364884972572, + "rewards/accuracy_reward_stage2": 0.4926791787147522, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1069 + }, + { + "completion_length": 11.90625, + "epoch": 0.18748904853688453, + "grad_norm": 31.03697558939259, + "kl": 0.09375, + "learning_rate": 8.12686174872963e-07, + "loss": -0.0068, + "reward": 1.4946585893630981, + "reward_std": 0.18645590543746948, + "rewards/accuracy_reward_stage2": 0.6352835893630981, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1070 + }, + { + "completion_length": 8.234375, + "epoch": 0.18766427194673208, + "grad_norm": 27.770742176690753, + "kl": 0.146484375, + "learning_rate": 8.125109514631154e-07, + "loss": -0.03, + "reward": 1.5777851343154907, + "reward_std": 0.27430057525634766, + "rewards/accuracy_reward_stage2": 0.6090351343154907, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1071 + }, + { + "completion_length": 13.1875, + "epoch": 0.18783949535657965, + "grad_norm": 635.6301357426747, + "kl": 3.203125, + "learning_rate": 8.123357280532679e-07, + "loss": 1.2792, + "reward": 1.5925309658050537, + "reward_std": 0.18635889887809753, + "rewards/accuracy_reward_stage2": 0.7175308465957642, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1072 + }, + { + "completion_length": 11.5, + "epoch": 0.1880147187664272, + "grad_norm": 23.2727223905994, + "kl": 0.1533203125, + "learning_rate": 8.121605046434203e-07, + "loss": 0.0613, + "reward": 1.76097571849823, + "reward_std": 0.15913929045200348, + "rewards/accuracy_reward_stage2": 0.76097571849823, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1073 + }, + { + "completion_length": 5.921875, + "epoch": 0.18818994217627474, + "grad_norm": 20.897642898148753, + "kl": 0.04052734375, + "learning_rate": 8.119852812335728e-07, + "loss": -0.0279, + "reward": 1.6360900402069092, + "reward_std": 0.19334176182746887, + "rewards/accuracy_reward_stage2": 0.6517150402069092, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1074 + }, + { + "completion_length": 9.625, + "epoch": 0.18836516558612232, + "grad_norm": 13.357495831373486, + "kl": 0.033203125, + "learning_rate": 8.118100578237252e-07, + "loss": 0.0133, + "reward": 1.6267361640930176, + "reward_std": 0.17079266905784607, + "rewards/accuracy_reward_stage2": 0.6267361044883728, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1075 + }, + { + "completion_length": 10.109375, + "epoch": 0.18854038899596987, + "grad_norm": 31.228774447933233, + "kl": 0.162109375, + "learning_rate": 8.116348344138777e-07, + "loss": 0.0648, + "reward": 1.598435640335083, + "reward_std": 0.2621033191680908, + "rewards/accuracy_reward_stage2": 0.5984355807304382, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1076 + }, + { + "completion_length": 11.078125, + "epoch": 0.1887156124058174, + "grad_norm": 18.03238472395094, + "kl": 0.0849609375, + "learning_rate": 8.114596110040302e-07, + "loss": 0.0051, + "reward": 1.6736887693405151, + "reward_std": 0.14449408650398254, + "rewards/accuracy_reward_stage2": 0.6893137693405151, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1077 + }, + { + "completion_length": 13.921875, + "epoch": 0.18889083581566496, + "grad_norm": 31.39284474795272, + "kl": 0.068359375, + "learning_rate": 8.112843875941825e-07, + "loss": -0.0723, + "reward": 1.5917601585388184, + "reward_std": 0.2058051973581314, + "rewards/accuracy_reward_stage2": 0.6386352181434631, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1078 + }, + { + "completion_length": 10.9375, + "epoch": 0.18906605922551253, + "grad_norm": 22.829727849729412, + "kl": 0.0693359375, + "learning_rate": 8.11109164184335e-07, + "loss": -0.0002, + "reward": 1.7911438941955566, + "reward_std": 0.19995662569999695, + "rewards/accuracy_reward_stage2": 0.8067688941955566, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1079 + }, + { + "completion_length": 6.75, + "epoch": 0.18924128263536008, + "grad_norm": 35.82822640876689, + "kl": 0.1279296875, + "learning_rate": 8.109339407744873e-07, + "loss": 0.0513, + "reward": 1.7231206893920898, + "reward_std": 0.14698222279548645, + "rewards/accuracy_reward_stage2": 0.8481206297874451, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1080 + }, + { + "completion_length": 9.359375, + "epoch": 0.18941650604520763, + "grad_norm": 20.760161439985456, + "kl": 0.08984375, + "learning_rate": 8.107587173646398e-07, + "loss": -0.0082, + "reward": 1.7030006647109985, + "reward_std": 0.2652503252029419, + "rewards/accuracy_reward_stage2": 0.7186257243156433, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1081 + }, + { + "completion_length": 6.28125, + "epoch": 0.1895917294550552, + "grad_norm": 20.00055457981059, + "kl": 0.013916015625, + "learning_rate": 8.105834939547923e-07, + "loss": -0.0362, + "reward": 1.7184606790542603, + "reward_std": 0.1488310694694519, + "rewards/accuracy_reward_stage2": 0.7340856790542603, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1082 + }, + { + "completion_length": 11.796875, + "epoch": 0.18976695286490275, + "grad_norm": 15.90440340391825, + "kl": 0.0966796875, + "learning_rate": 8.104082705449447e-07, + "loss": 0.0218, + "reward": 1.650546669960022, + "reward_std": 0.1746128350496292, + "rewards/accuracy_reward_stage2": 0.666171669960022, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1083 + }, + { + "completion_length": 13.96875, + "epoch": 0.1899421762747503, + "grad_norm": 15.21184938869327, + "kl": 0.054931640625, + "learning_rate": 8.102330471350972e-07, + "loss": 0.022, + "reward": 1.2956702709197998, + "reward_std": 0.12772494554519653, + "rewards/accuracy_reward_stage2": 0.29567036032676697, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1084 + }, + { + "completion_length": 5.984375, + "epoch": 0.19011739968459787, + "grad_norm": 13.046954389444721, + "kl": 0.037841796875, + "learning_rate": 8.100578237252497e-07, + "loss": 0.0152, + "reward": 1.6959822177886963, + "reward_std": 0.08044600486755371, + "rewards/accuracy_reward_stage2": 0.6959822177886963, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1085 + }, + { + "completion_length": 8.765625, + "epoch": 0.19029262309444542, + "grad_norm": 21.808137766383577, + "kl": 0.10791015625, + "learning_rate": 8.098826003154021e-07, + "loss": -0.0244, + "reward": 1.6888558864593506, + "reward_std": 0.17180359363555908, + "rewards/accuracy_reward_stage2": 0.8451060056686401, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1086 + }, + { + "completion_length": 11.671875, + "epoch": 0.19046784650429296, + "grad_norm": 14.883602876711144, + "kl": 0.0252685546875, + "learning_rate": 8.097073769055546e-07, + "loss": 0.0101, + "reward": 1.473452091217041, + "reward_std": 0.12437894195318222, + "rewards/accuracy_reward_stage2": 0.473452091217041, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1087 + }, + { + "completion_length": 14.59375, + "epoch": 0.19064306991414054, + "grad_norm": 12.088597325510479, + "kl": 0.1767578125, + "learning_rate": 8.095321534957071e-07, + "loss": 0.0708, + "reward": 1.215935468673706, + "reward_std": 0.06121998280286789, + "rewards/accuracy_reward_stage2": 0.34093552827835083, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1088 + }, + { + "completion_length": 8.984375, + "epoch": 0.1908182933239881, + "grad_norm": 15.176806742901167, + "kl": 0.0947265625, + "learning_rate": 8.093569300858595e-07, + "loss": -0.0115, + "reward": 1.570344090461731, + "reward_std": 0.22428151965141296, + "rewards/accuracy_reward_stage2": 0.601594090461731, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1089 + }, + { + "completion_length": 13.96875, + "epoch": 0.19099351673383563, + "grad_norm": 24.956813132983317, + "kl": 0.10791015625, + "learning_rate": 8.09181706676012e-07, + "loss": 0.0873, + "reward": 1.2724158763885498, + "reward_std": 0.2870734632015228, + "rewards/accuracy_reward_stage2": 0.5224158763885498, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1090 + }, + { + "completion_length": 8.8125, + "epoch": 0.1911687401436832, + "grad_norm": 19.027927979774955, + "kl": 0.09228515625, + "learning_rate": 8.090064832661642e-07, + "loss": 0.0112, + "reward": 1.6313860416412354, + "reward_std": 0.20960725843906403, + "rewards/accuracy_reward_stage2": 0.6626360416412354, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1091 + }, + { + "completion_length": 9.6875, + "epoch": 0.19134396355353075, + "grad_norm": 22.860291887433835, + "kl": 0.0908203125, + "learning_rate": 8.088312598563167e-07, + "loss": -0.0496, + "reward": 1.5270261764526367, + "reward_std": 0.3534308969974518, + "rewards/accuracy_reward_stage2": 0.5582762360572815, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1092 + }, + { + "completion_length": 11.421875, + "epoch": 0.1915191869633783, + "grad_norm": 22.108352820227587, + "kl": 0.1376953125, + "learning_rate": 8.086560364464692e-07, + "loss": 0.011, + "reward": 1.4140586853027344, + "reward_std": 0.20068290829658508, + "rewards/accuracy_reward_stage2": 0.4296835660934448, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1093 + }, + { + "completion_length": 5.984375, + "epoch": 0.19169441037322588, + "grad_norm": 16.302770410900887, + "kl": 0.10009765625, + "learning_rate": 8.084808130366216e-07, + "loss": -0.004, + "reward": 1.6134690046310425, + "reward_std": 0.1999814510345459, + "rewards/accuracy_reward_stage2": 0.6290940046310425, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1094 + }, + { + "completion_length": 19.375, + "epoch": 0.19186963378307342, + "grad_norm": 19.348644015651782, + "kl": 0.07763671875, + "learning_rate": 8.083055896267741e-07, + "loss": -0.0012, + "reward": 1.392347812652588, + "reward_std": 0.1986556202173233, + "rewards/accuracy_reward_stage2": 0.5329726934432983, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1095 + }, + { + "completion_length": 11.34375, + "epoch": 0.19204485719292097, + "grad_norm": 20.1216235120964, + "kl": 0.08642578125, + "learning_rate": 8.081303662169265e-07, + "loss": 0.0461, + "reward": 1.4377480745315552, + "reward_std": 0.22237080335617065, + "rewards/accuracy_reward_stage2": 0.5627480745315552, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1096 + }, + { + "completion_length": 9.875, + "epoch": 0.19222008060276852, + "grad_norm": 17.62212799015951, + "kl": 0.0654296875, + "learning_rate": 8.07955142807079e-07, + "loss": 0.0262, + "reward": 1.527910828590393, + "reward_std": 0.1326691210269928, + "rewards/accuracy_reward_stage2": 0.5279108881950378, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1097 + }, + { + "completion_length": 8.984375, + "epoch": 0.1923953040126161, + "grad_norm": 20.687971074598163, + "kl": 0.11328125, + "learning_rate": 8.077799193972315e-07, + "loss": 0.0454, + "reward": 1.5507917404174805, + "reward_std": 0.336439311504364, + "rewards/accuracy_reward_stage2": 0.5507918000221252, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1098 + }, + { + "completion_length": 10.421875, + "epoch": 0.19257052742246364, + "grad_norm": 10.686266812372155, + "kl": 0.01470947265625, + "learning_rate": 8.076046959873839e-07, + "loss": 0.0059, + "reward": 1.5433006286621094, + "reward_std": 0.0748034194111824, + "rewards/accuracy_reward_stage2": 0.6683006286621094, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1099 + }, + { + "completion_length": 8.78125, + "epoch": 0.19274575083231119, + "grad_norm": 20.305827220265968, + "kl": 0.10986328125, + "learning_rate": 8.074294725775364e-07, + "loss": 0.0141, + "reward": 1.7699222564697266, + "reward_std": 0.1974988877773285, + "rewards/accuracy_reward_stage2": 0.785547137260437, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1100 + }, + { + "completion_length": 6.78125, + "epoch": 0.19292097424215876, + "grad_norm": 46.005300340627144, + "kl": 0.3046875, + "learning_rate": 8.072542491676888e-07, + "loss": 0.0777, + "reward": 1.5677083730697632, + "reward_std": 0.3144148588180542, + "rewards/accuracy_reward_stage2": 0.5833333134651184, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1101 + }, + { + "completion_length": 12.8125, + "epoch": 0.1930961976520063, + "grad_norm": 23.389305778485923, + "kl": 0.076171875, + "learning_rate": 8.070790257578412e-07, + "loss": 0.0304, + "reward": 1.4332172870635986, + "reward_std": 0.20425169169902802, + "rewards/accuracy_reward_stage2": 0.5582171678543091, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1102 + }, + { + "completion_length": 8.546875, + "epoch": 0.19327142106185385, + "grad_norm": 25.892819741346866, + "kl": 0.19921875, + "learning_rate": 8.069038023479936e-07, + "loss": -0.063, + "reward": 1.413844347000122, + "reward_std": 0.30733755230903625, + "rewards/accuracy_reward_stage2": 0.4919692873954773, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 1103 + }, + { + "completion_length": 8.203125, + "epoch": 0.19344664447170143, + "grad_norm": 26.958837496349805, + "kl": 0.0712890625, + "learning_rate": 8.06728578938146e-07, + "loss": 0.0285, + "reward": 1.6971174478530884, + "reward_std": 0.23234084248542786, + "rewards/accuracy_reward_stage2": 0.6971173882484436, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1104 + }, + { + "completion_length": 12.140625, + "epoch": 0.19362186788154898, + "grad_norm": 23.112960942928332, + "kl": 0.020263671875, + "learning_rate": 8.065533555282985e-07, + "loss": 0.0144, + "reward": 1.2263470888137817, + "reward_std": 0.10987623780965805, + "rewards/accuracy_reward_stage2": 0.4763471186161041, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1105 + }, + { + "completion_length": 11.515625, + "epoch": 0.19379709129139652, + "grad_norm": 19.83519566726814, + "kl": 0.0673828125, + "learning_rate": 8.06378132118451e-07, + "loss": 0.027, + "reward": 1.6052569150924683, + "reward_std": 0.1178978905081749, + "rewards/accuracy_reward_stage2": 0.6052569150924683, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1106 + }, + { + "completion_length": 13.5625, + "epoch": 0.1939723147012441, + "grad_norm": 1856.6081789063355, + "kl": 2.390625, + "learning_rate": 8.062029087086034e-07, + "loss": 0.9175, + "reward": 1.3263888359069824, + "reward_std": 0.12646648287773132, + "rewards/accuracy_reward_stage2": 0.3420138955116272, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1107 + }, + { + "completion_length": 11.640625, + "epoch": 0.19414753811109164, + "grad_norm": 22.62046556055641, + "kl": 0.08447265625, + "learning_rate": 8.060276852987559e-07, + "loss": -0.0104, + "reward": 1.5089672803878784, + "reward_std": 0.24158672988414764, + "rewards/accuracy_reward_stage2": 0.6495921611785889, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1108 + }, + { + "completion_length": 19.890625, + "epoch": 0.1943227615209392, + "grad_norm": 21.380509612699353, + "kl": 0.041259765625, + "learning_rate": 8.058524618889084e-07, + "loss": 0.0165, + "reward": 1.525514841079712, + "reward_std": 0.1627964973449707, + "rewards/accuracy_reward_stage2": 0.5255147814750671, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1109 + }, + { + "completion_length": 11.3125, + "epoch": 0.19449798493078677, + "grad_norm": 18.69847620677502, + "kl": 0.08349609375, + "learning_rate": 8.056772384790608e-07, + "loss": 0.0334, + "reward": 1.5814133882522583, + "reward_std": 0.1631113588809967, + "rewards/accuracy_reward_stage2": 0.5814133882522583, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1110 + }, + { + "completion_length": 9.25, + "epoch": 0.1946732083406343, + "grad_norm": 18.246474115447, + "kl": 0.04345703125, + "learning_rate": 8.055020150692132e-07, + "loss": -0.0166, + "reward": 1.7617158889770508, + "reward_std": 0.17398208379745483, + "rewards/accuracy_reward_stage2": 0.7773408889770508, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1111 + }, + { + "completion_length": 22.890625, + "epoch": 0.19484843175048186, + "grad_norm": 17.920499342663035, + "kl": 0.11083984375, + "learning_rate": 8.053267916593656e-07, + "loss": -0.0287, + "reward": 1.2582231760025024, + "reward_std": 0.18175308406352997, + "rewards/accuracy_reward_stage2": 0.41447317600250244, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1112 + }, + { + "completion_length": 9.328125, + "epoch": 0.1950236551603294, + "grad_norm": 18.210669269174822, + "kl": 0.03759765625, + "learning_rate": 8.051515682495181e-07, + "loss": -0.0606, + "reward": 1.638547658920288, + "reward_std": 0.2656075954437256, + "rewards/accuracy_reward_stage2": 0.6697976589202881, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1113 + }, + { + "completion_length": 9.984375, + "epoch": 0.19519887857017698, + "grad_norm": 22.267678059768336, + "kl": 0.0703125, + "learning_rate": 8.049763448396706e-07, + "loss": 0.0281, + "reward": 1.6199893951416016, + "reward_std": 0.29019030928611755, + "rewards/accuracy_reward_stage2": 0.6199893951416016, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1114 + }, + { + "completion_length": 11.65625, + "epoch": 0.19537410198002453, + "grad_norm": 22.77008648581453, + "kl": 0.0419921875, + "learning_rate": 8.04801121429823e-07, + "loss": -0.0713, + "reward": 1.839109182357788, + "reward_std": 0.1781412959098816, + "rewards/accuracy_reward_stage2": 0.8703591823577881, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1115 + }, + { + "completion_length": 11.0, + "epoch": 0.19554932538987208, + "grad_norm": 19.049510532142538, + "kl": 0.08251953125, + "learning_rate": 8.046258980199754e-07, + "loss": -0.0112, + "reward": 1.5514042377471924, + "reward_std": 0.17855620384216309, + "rewards/accuracy_reward_stage2": 0.6920292377471924, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1116 + }, + { + "completion_length": 10.140625, + "epoch": 0.19572454879971965, + "grad_norm": 25.80724586964345, + "kl": 0.103515625, + "learning_rate": 8.044506746101279e-07, + "loss": 0.0414, + "reward": 1.433285117149353, + "reward_std": 0.16345283389091492, + "rewards/accuracy_reward_stage2": 0.558285117149353, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1117 + }, + { + "completion_length": 9.6875, + "epoch": 0.1958997722095672, + "grad_norm": 19.673693253177163, + "kl": 0.056640625, + "learning_rate": 8.042754512002803e-07, + "loss": 0.0227, + "reward": 1.4013640880584717, + "reward_std": 0.14320652186870575, + "rewards/accuracy_reward_stage2": 0.7763641476631165, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1118 + }, + { + "completion_length": 17.109375, + "epoch": 0.19607499561941474, + "grad_norm": 95.22382663635376, + "kl": 0.416015625, + "learning_rate": 8.041002277904328e-07, + "loss": 0.1661, + "reward": 1.4747977256774902, + "reward_std": 0.2025090903043747, + "rewards/accuracy_reward_stage2": 0.5997976660728455, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1119 + }, + { + "completion_length": 10.046875, + "epoch": 0.19625021902926232, + "grad_norm": 23.39398079319575, + "kl": 0.0546875, + "learning_rate": 8.039250043805851e-07, + "loss": 0.0219, + "reward": 1.6447781324386597, + "reward_std": 0.22827255725860596, + "rewards/accuracy_reward_stage2": 0.6447781324386597, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1120 + }, + { + "completion_length": 15.765625, + "epoch": 0.19642544243910987, + "grad_norm": 18.078899714493488, + "kl": 0.033447265625, + "learning_rate": 8.037497809707376e-07, + "loss": -0.0308, + "reward": 1.4864583015441895, + "reward_std": 0.2050531953573227, + "rewards/accuracy_reward_stage2": 0.5020833015441895, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1121 + }, + { + "completion_length": 8.6875, + "epoch": 0.1966006658489574, + "grad_norm": 21.8183949180115, + "kl": 0.03271484375, + "learning_rate": 8.035745575608901e-07, + "loss": 0.013, + "reward": 1.651584506034851, + "reward_std": 0.2105352282524109, + "rewards/accuracy_reward_stage2": 0.6515845060348511, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1122 + }, + { + "completion_length": 10.15625, + "epoch": 0.196775889258805, + "grad_norm": 15.930285731895992, + "kl": 0.0634765625, + "learning_rate": 8.033993341510425e-07, + "loss": 0.0252, + "reward": 1.454774260520935, + "reward_std": 0.1642688512802124, + "rewards/accuracy_reward_stage2": 0.47039929032325745, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1123 + }, + { + "completion_length": 5.15625, + "epoch": 0.19695111266865253, + "grad_norm": 21.751143246670633, + "kl": 0.1025390625, + "learning_rate": 8.03224110741195e-07, + "loss": 0.0126, + "reward": 1.7925353050231934, + "reward_std": 0.1622324138879776, + "rewards/accuracy_reward_stage2": 0.8081602454185486, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1124 + }, + { + "completion_length": 10.375, + "epoch": 0.19712633607850008, + "grad_norm": 11.346635286919138, + "kl": 0.05859375, + "learning_rate": 8.030488873313475e-07, + "loss": -0.0011, + "reward": 1.5891244411468506, + "reward_std": 0.08004673570394516, + "rewards/accuracy_reward_stage2": 0.6047494411468506, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1125 + }, + { + "completion_length": 7.421875, + "epoch": 0.19730155948834766, + "grad_norm": 26.259271451383956, + "kl": 0.173828125, + "learning_rate": 8.028736639214999e-07, + "loss": 0.0304, + "reward": 1.3985657691955566, + "reward_std": 0.21947166323661804, + "rewards/accuracy_reward_stage2": 0.5391908288002014, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1126 + }, + { + "completion_length": 10.203125, + "epoch": 0.1974767828981952, + "grad_norm": 30.238017587159945, + "kl": 0.3515625, + "learning_rate": 8.026984405116524e-07, + "loss": 0.1409, + "reward": 1.228277564048767, + "reward_std": 0.133104607462883, + "rewards/accuracy_reward_stage2": 0.6032775640487671, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1127 + }, + { + "completion_length": 7.203125, + "epoch": 0.19765200630804275, + "grad_norm": 17.460221312721277, + "kl": 0.04833984375, + "learning_rate": 8.025232171018048e-07, + "loss": 0.0192, + "reward": 1.3928313255310059, + "reward_std": 0.15808694064617157, + "rewards/accuracy_reward_stage2": 0.39283138513565063, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1128 + }, + { + "completion_length": 9.53125, + "epoch": 0.19782722971789032, + "grad_norm": 18.46095666098341, + "kl": 0.051025390625, + "learning_rate": 8.023479936919572e-07, + "loss": 0.0075, + "reward": 1.6616019010543823, + "reward_std": 0.13137364387512207, + "rewards/accuracy_reward_stage2": 0.6772269010543823, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1129 + }, + { + "completion_length": 11.09375, + "epoch": 0.19800245312773787, + "grad_norm": 20.90678156081599, + "kl": 0.0947265625, + "learning_rate": 8.021727702821096e-07, + "loss": -0.0064, + "reward": 1.7562756538391113, + "reward_std": 0.35668328404426575, + "rewards/accuracy_reward_stage2": 0.7719005346298218, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1130 + }, + { + "completion_length": 8.90625, + "epoch": 0.19817767653758542, + "grad_norm": 13.7045249582709, + "kl": 0.04150390625, + "learning_rate": 8.01997546872262e-07, + "loss": 0.0166, + "reward": 1.6348446607589722, + "reward_std": 0.08211646229028702, + "rewards/accuracy_reward_stage2": 0.6348447203636169, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1131 + }, + { + "completion_length": 11.25, + "epoch": 0.19835289994743296, + "grad_norm": 16.81366964105771, + "kl": 0.1279296875, + "learning_rate": 8.018223234624145e-07, + "loss": 0.0071, + "reward": 1.7176910638809204, + "reward_std": 0.28899407386779785, + "rewards/accuracy_reward_stage2": 0.7333160042762756, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1132 + }, + { + "completion_length": 9.4375, + "epoch": 0.19852812335728054, + "grad_norm": 19.902013500283253, + "kl": 0.09912109375, + "learning_rate": 8.01647100052567e-07, + "loss": 0.0397, + "reward": 1.7990940809249878, + "reward_std": 0.21646492183208466, + "rewards/accuracy_reward_stage2": 0.799094021320343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1133 + }, + { + "completion_length": 9.234375, + "epoch": 0.1987033467671281, + "grad_norm": 13.540574761629962, + "kl": 0.040283203125, + "learning_rate": 8.014718766427194e-07, + "loss": 0.0161, + "reward": 1.744128704071045, + "reward_std": 0.05768556892871857, + "rewards/accuracy_reward_stage2": 0.7441287040710449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1134 + }, + { + "completion_length": 13.34375, + "epoch": 0.19887857017697563, + "grad_norm": 25.82564634090041, + "kl": 0.1923828125, + "learning_rate": 8.012966532328719e-07, + "loss": 0.0769, + "reward": 1.1752233505249023, + "reward_std": 0.09604233503341675, + "rewards/accuracy_reward_stage2": 0.42522335052490234, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1135 + }, + { + "completion_length": 10.3125, + "epoch": 0.1990537935868232, + "grad_norm": 19.314766378810056, + "kl": 0.1220703125, + "learning_rate": 8.011214298230243e-07, + "loss": 0.0046, + "reward": 1.4528274536132812, + "reward_std": 0.18750616908073425, + "rewards/accuracy_reward_stage2": 0.4684523642063141, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1136 + }, + { + "completion_length": 14.109375, + "epoch": 0.19922901699667075, + "grad_norm": 16.821980930138615, + "kl": 0.0615234375, + "learning_rate": 8.009462064131768e-07, + "loss": 0.0247, + "reward": 1.4835102558135986, + "reward_std": 0.166833758354187, + "rewards/accuracy_reward_stage2": 0.48351022601127625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1137 + }, + { + "completion_length": 15.625, + "epoch": 0.1994042404065183, + "grad_norm": 31.868072743598333, + "kl": 0.109375, + "learning_rate": 8.007709830033293e-07, + "loss": -0.0005, + "reward": 1.5000779628753662, + "reward_std": 0.3490258753299713, + "rewards/accuracy_reward_stage2": 0.5157029628753662, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1138 + }, + { + "completion_length": 11.453125, + "epoch": 0.19957946381636588, + "grad_norm": 25.6116140210764, + "kl": 0.07080078125, + "learning_rate": 8.005957595934817e-07, + "loss": 0.0284, + "reward": 1.364166498184204, + "reward_std": 0.20888856053352356, + "rewards/accuracy_reward_stage2": 0.4891664981842041, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1139 + }, + { + "completion_length": 7.890625, + "epoch": 0.19975468722621342, + "grad_norm": 24.85245877767008, + "kl": 0.072265625, + "learning_rate": 8.004205361836342e-07, + "loss": 0.0288, + "reward": 1.5358493328094482, + "reward_std": 0.27124035358428955, + "rewards/accuracy_reward_stage2": 0.5358492732048035, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1140 + }, + { + "completion_length": 12.8125, + "epoch": 0.19992991063606097, + "grad_norm": 14.61945538091605, + "kl": 0.0308837890625, + "learning_rate": 8.002453127737866e-07, + "loss": -0.0043, + "reward": 1.4979475736618042, + "reward_std": 0.11111369729042053, + "rewards/accuracy_reward_stage2": 0.5135725736618042, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1141 + }, + { + "completion_length": 12.53125, + "epoch": 0.20010513404590854, + "grad_norm": 16.880834934148936, + "kl": 0.038818359375, + "learning_rate": 8.000700893639389e-07, + "loss": 0.0156, + "reward": 1.6113297939300537, + "reward_std": 0.13249364495277405, + "rewards/accuracy_reward_stage2": 0.7363297343254089, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1142 + }, + { + "completion_length": 9.140625, + "epoch": 0.2002803574557561, + "grad_norm": 15.358944238027423, + "kl": 0.06396484375, + "learning_rate": 7.998948659540914e-07, + "loss": -0.0186, + "reward": 1.5073845386505127, + "reward_std": 0.10182757675647736, + "rewards/accuracy_reward_stage2": 0.5230096578598022, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1143 + }, + { + "completion_length": 10.6875, + "epoch": 0.20045558086560364, + "grad_norm": 19.06697388533954, + "kl": 0.1533203125, + "learning_rate": 7.997196425442438e-07, + "loss": 0.0613, + "reward": 1.7727296352386475, + "reward_std": 0.1398918628692627, + "rewards/accuracy_reward_stage2": 0.7727296948432922, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1144 + }, + { + "completion_length": 8.390625, + "epoch": 0.2006308042754512, + "grad_norm": 17.057848521186898, + "kl": 0.1728515625, + "learning_rate": 7.995444191343963e-07, + "loss": 0.0691, + "reward": 1.6468448638916016, + "reward_std": 0.1826433539390564, + "rewards/accuracy_reward_stage2": 0.6468449234962463, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1145 + }, + { + "completion_length": 9.703125, + "epoch": 0.20080602768529876, + "grad_norm": 14.97255121728084, + "kl": 0.0556640625, + "learning_rate": 7.993691957245488e-07, + "loss": -0.0111, + "reward": 1.500192642211914, + "reward_std": 0.16978204250335693, + "rewards/accuracy_reward_stage2": 0.5158176422119141, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1146 + }, + { + "completion_length": 9.21875, + "epoch": 0.2009812510951463, + "grad_norm": 12.285037631467146, + "kl": 0.08203125, + "learning_rate": 7.991939723147012e-07, + "loss": 0.0329, + "reward": 1.5083717107772827, + "reward_std": 0.09912580996751785, + "rewards/accuracy_reward_stage2": 0.6333716511726379, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1147 + }, + { + "completion_length": 9.84375, + "epoch": 0.20115647450499385, + "grad_norm": 10.819531810185309, + "kl": 0.01275634765625, + "learning_rate": 7.990187489048537e-07, + "loss": 0.0051, + "reward": 1.8347173929214478, + "reward_std": 0.06732519716024399, + "rewards/accuracy_reward_stage2": 0.8347173929214478, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1148 + }, + { + "completion_length": 10.46875, + "epoch": 0.20133169791484143, + "grad_norm": 17.510513441111854, + "kl": 0.08203125, + "learning_rate": 7.988435254950062e-07, + "loss": -0.1252, + "reward": 1.4475042819976807, + "reward_std": 0.3052405118942261, + "rewards/accuracy_reward_stage2": 0.5100042819976807, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1149 + }, + { + "completion_length": 9.71875, + "epoch": 0.20150692132468898, + "grad_norm": 24.563211909751576, + "kl": 0.11328125, + "learning_rate": 7.986683020851585e-07, + "loss": 0.0517, + "reward": 1.265645980834961, + "reward_std": 0.2737762928009033, + "rewards/accuracy_reward_stage2": 0.39064595103263855, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1150 + }, + { + "completion_length": 8.609375, + "epoch": 0.20168214473453652, + "grad_norm": 20.56948420996003, + "kl": 0.07568359375, + "learning_rate": 7.98493078675311e-07, + "loss": 0.0304, + "reward": 1.5552361011505127, + "reward_std": 0.23706388473510742, + "rewards/accuracy_reward_stage2": 0.5552360415458679, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1151 + }, + { + "completion_length": 28.578125, + "epoch": 0.2018573681443841, + "grad_norm": 21.06436355277442, + "kl": 0.2578125, + "learning_rate": 7.983178552654634e-07, + "loss": 0.1027, + "reward": 1.3616572618484497, + "reward_std": 0.19103842973709106, + "rewards/accuracy_reward_stage2": 0.48665720224380493, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1152 + }, + { + "completion_length": 12.359375, + "epoch": 0.20203259155423164, + "grad_norm": 23.185828052219872, + "kl": 0.0361328125, + "learning_rate": 7.981426318556159e-07, + "loss": 0.0144, + "reward": 1.584661602973938, + "reward_std": 0.22834071516990662, + "rewards/accuracy_reward_stage2": 0.5846616625785828, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1153 + }, + { + "completion_length": 18.421875, + "epoch": 0.2022078149640792, + "grad_norm": 18.080209220125052, + "kl": 0.06982421875, + "learning_rate": 7.979674084457683e-07, + "loss": 0.028, + "reward": 1.4647139310836792, + "reward_std": 0.16236665844917297, + "rewards/accuracy_reward_stage2": 0.4647139012813568, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1154 + }, + { + "completion_length": 12.59375, + "epoch": 0.20238303837392677, + "grad_norm": 41.51210090003271, + "kl": 0.06982421875, + "learning_rate": 7.977921850359207e-07, + "loss": -0.0054, + "reward": 1.5676136016845703, + "reward_std": 0.2357185333967209, + "rewards/accuracy_reward_stage2": 0.5832385420799255, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1155 + }, + { + "completion_length": 10.515625, + "epoch": 0.2025582617837743, + "grad_norm": 12.776722404361198, + "kl": 0.1318359375, + "learning_rate": 7.976169616260732e-07, + "loss": 0.0095, + "reward": 1.0949840545654297, + "reward_std": 0.2613898515701294, + "rewards/accuracy_reward_stage2": 0.2356090545654297, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1156 + }, + { + "completion_length": 11.875, + "epoch": 0.20273348519362186, + "grad_norm": 12.378745536969769, + "kl": 0.06689453125, + "learning_rate": 7.974417382162256e-07, + "loss": 0.0269, + "reward": 1.6979291439056396, + "reward_std": 0.04445386305451393, + "rewards/accuracy_reward_stage2": 0.6979291439056396, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1157 + }, + { + "completion_length": 11.34375, + "epoch": 0.20290870860346943, + "grad_norm": 15.21828623768596, + "kl": 0.04052734375, + "learning_rate": 7.972665148063781e-07, + "loss": 0.0162, + "reward": 1.7189762592315674, + "reward_std": 0.17833425104618073, + "rewards/accuracy_reward_stage2": 0.7189762592315674, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1158 + }, + { + "completion_length": 11.125, + "epoch": 0.20308393201331698, + "grad_norm": 23.90652706021063, + "kl": 0.06787109375, + "learning_rate": 7.970912913965306e-07, + "loss": -0.0171, + "reward": 1.751429796218872, + "reward_std": 0.2318522334098816, + "rewards/accuracy_reward_stage2": 0.7670547366142273, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1159 + }, + { + "completion_length": 11.53125, + "epoch": 0.20325915542316453, + "grad_norm": 19.569111992947953, + "kl": 0.0966796875, + "learning_rate": 7.969160679866829e-07, + "loss": 0.0385, + "reward": 1.4768517017364502, + "reward_std": 0.2140122354030609, + "rewards/accuracy_reward_stage2": 0.6018517017364502, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1160 + }, + { + "completion_length": 11.453125, + "epoch": 0.2034343788330121, + "grad_norm": 17.326516841882064, + "kl": 0.0291748046875, + "learning_rate": 7.967408445768354e-07, + "loss": 0.0117, + "reward": 1.625, + "reward_std": 0.22236785292625427, + "rewards/accuracy_reward_stage2": 0.625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1161 + }, + { + "completion_length": 12.296875, + "epoch": 0.20360960224285965, + "grad_norm": 20.93028425320662, + "kl": 0.134765625, + "learning_rate": 7.965656211669879e-07, + "loss": -0.0578, + "reward": 1.5847158432006836, + "reward_std": 0.3005771040916443, + "rewards/accuracy_reward_stage2": 0.6315909624099731, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1162 + }, + { + "completion_length": 10.75, + "epoch": 0.2037848256527072, + "grad_norm": 17.116689083154068, + "kl": 0.36328125, + "learning_rate": 7.963903977571403e-07, + "loss": 0.1448, + "reward": 1.4689933061599731, + "reward_std": 0.18849441409111023, + "rewards/accuracy_reward_stage2": 0.8439933061599731, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1163 + }, + { + "completion_length": 15.296875, + "epoch": 0.20396004906255474, + "grad_norm": 20.782040905035508, + "kl": 0.04638671875, + "learning_rate": 7.962151743472928e-07, + "loss": 0.0186, + "reward": 1.5337340831756592, + "reward_std": 0.23394903540611267, + "rewards/accuracy_reward_stage2": 0.533734142780304, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1164 + }, + { + "completion_length": 10.953125, + "epoch": 0.20413527247240232, + "grad_norm": 20.41830434742461, + "kl": 0.08984375, + "learning_rate": 7.960399509374453e-07, + "loss": 0.0359, + "reward": 1.777416706085205, + "reward_std": 0.35240471363067627, + "rewards/accuracy_reward_stage2": 0.7774167060852051, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1165 + }, + { + "completion_length": 8.765625, + "epoch": 0.20431049588224987, + "grad_norm": 23.215274778028444, + "kl": 0.2060546875, + "learning_rate": 7.958647275275977e-07, + "loss": 0.0823, + "reward": 1.5440936088562012, + "reward_std": 0.24966692924499512, + "rewards/accuracy_reward_stage2": 0.5440936088562012, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1166 + }, + { + "completion_length": 8.59375, + "epoch": 0.2044857192920974, + "grad_norm": 16.453486111218634, + "kl": 0.0400390625, + "learning_rate": 7.956895041177501e-07, + "loss": 0.016, + "reward": 1.7762818336486816, + "reward_std": 0.11226281523704529, + "rewards/accuracy_reward_stage2": 0.7762819528579712, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1167 + }, + { + "completion_length": 11.875, + "epoch": 0.204660942701945, + "grad_norm": 20.38053897005864, + "kl": 0.10107421875, + "learning_rate": 7.955142807079025e-07, + "loss": 0.0096, + "reward": 1.6993811130523682, + "reward_std": 0.14725381135940552, + "rewards/accuracy_reward_stage2": 0.7150062322616577, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1168 + }, + { + "completion_length": 9.46875, + "epoch": 0.20483616611179253, + "grad_norm": 20.873197553058322, + "kl": 0.0277099609375, + "learning_rate": 7.95339057298055e-07, + "loss": 0.0111, + "reward": 1.6127396821975708, + "reward_std": 0.1205105409026146, + "rewards/accuracy_reward_stage2": 0.6127396821975708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1169 + }, + { + "completion_length": 24.578125, + "epoch": 0.20501138952164008, + "grad_norm": 22.375963993171645, + "kl": 0.25, + "learning_rate": 7.951638338882074e-07, + "loss": 0.1002, + "reward": 1.1214206218719482, + "reward_std": 0.24306708574295044, + "rewards/accuracy_reward_stage2": 0.24642051756381989, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1170 + }, + { + "completion_length": 8.796875, + "epoch": 0.20518661293148766, + "grad_norm": 27.950636957218467, + "kl": 0.027099609375, + "learning_rate": 7.949886104783598e-07, + "loss": 0.0108, + "reward": 1.6019957065582275, + "reward_std": 0.245945006608963, + "rewards/accuracy_reward_stage2": 0.6019957661628723, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1171 + }, + { + "completion_length": 6.828125, + "epoch": 0.2053618363413352, + "grad_norm": 15.034136443009805, + "kl": 0.0751953125, + "learning_rate": 7.948133870685123e-07, + "loss": 0.0301, + "reward": 1.8496700525283813, + "reward_std": 0.05726194754242897, + "rewards/accuracy_reward_stage2": 0.8496700525283813, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1172 + }, + { + "completion_length": 9.03125, + "epoch": 0.20553705975118275, + "grad_norm": 24.505177220334467, + "kl": 0.109375, + "learning_rate": 7.946381636586647e-07, + "loss": 0.0438, + "reward": 1.6173192262649536, + "reward_std": 0.3106708824634552, + "rewards/accuracy_reward_stage2": 0.6173191666603088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1173 + }, + { + "completion_length": 14.28125, + "epoch": 0.20571228316103032, + "grad_norm": 21.066084623597465, + "kl": 0.0250244140625, + "learning_rate": 7.944629402488172e-07, + "loss": 0.01, + "reward": 1.5202445983886719, + "reward_std": 0.15491852164268494, + "rewards/accuracy_reward_stage2": 0.5202445983886719, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1174 + }, + { + "completion_length": 11.0625, + "epoch": 0.20588750657087787, + "grad_norm": 20.666768324535514, + "kl": 0.032470703125, + "learning_rate": 7.942877168389697e-07, + "loss": 0.013, + "reward": 1.59375, + "reward_std": 0.29143062233924866, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1175 + }, + { + "completion_length": 7.59375, + "epoch": 0.20606272998072542, + "grad_norm": 18.042454348711036, + "kl": 0.107421875, + "learning_rate": 7.941124934291221e-07, + "loss": 0.0145, + "reward": 1.7791666984558105, + "reward_std": 0.20184138417243958, + "rewards/accuracy_reward_stage2": 0.9197916984558105, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1176 + }, + { + "completion_length": 11.78125, + "epoch": 0.206237953390573, + "grad_norm": 7.186882587218823, + "kl": 0.035400390625, + "learning_rate": 7.939372700192746e-07, + "loss": 0.0142, + "reward": 1.4192759990692139, + "reward_std": 0.02953476831316948, + "rewards/accuracy_reward_stage2": 0.41927602887153625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1177 + }, + { + "completion_length": 7.8125, + "epoch": 0.20641317680042054, + "grad_norm": 23.64243820773421, + "kl": 0.058837890625, + "learning_rate": 7.937620466094271e-07, + "loss": -0.051, + "reward": 1.6805853843688965, + "reward_std": 0.2879638075828552, + "rewards/accuracy_reward_stage2": 0.7274603247642517, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1178 + }, + { + "completion_length": 12.90625, + "epoch": 0.20658840021026809, + "grad_norm": 19.158613745103754, + "kl": 0.08056640625, + "learning_rate": 7.935868231995795e-07, + "loss": -0.0373, + "reward": 1.4834885597229004, + "reward_std": 0.23706209659576416, + "rewards/accuracy_reward_stage2": 0.5303636193275452, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1179 + }, + { + "completion_length": 11.453125, + "epoch": 0.20676362362011566, + "grad_norm": 17.980312882318678, + "kl": 0.11328125, + "learning_rate": 7.934115997897318e-07, + "loss": -0.031, + "reward": 1.6598703861236572, + "reward_std": 0.1916605830192566, + "rewards/accuracy_reward_stage2": 0.691120445728302, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1180 + }, + { + "completion_length": 12.734375, + "epoch": 0.2069388470299632, + "grad_norm": 23.35800848338547, + "kl": 0.453125, + "learning_rate": 7.932363763798842e-07, + "loss": 0.1307, + "reward": 1.2424618005752563, + "reward_std": 0.17332936823368073, + "rewards/accuracy_reward_stage2": 0.39871180057525635, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1181 + }, + { + "completion_length": 10.03125, + "epoch": 0.20711407043981075, + "grad_norm": 34.072947702124445, + "kl": 0.32421875, + "learning_rate": 7.930611529700367e-07, + "loss": 0.1296, + "reward": 1.5852689743041992, + "reward_std": 0.29314032196998596, + "rewards/accuracy_reward_stage2": 0.7102688550949097, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1182 + }, + { + "completion_length": 12.03125, + "epoch": 0.2072892938496583, + "grad_norm": 22.76788574523607, + "kl": 0.07568359375, + "learning_rate": 7.928859295601892e-07, + "loss": -0.0027, + "reward": 1.4920098781585693, + "reward_std": 0.2621656060218811, + "rewards/accuracy_reward_stage2": 0.5076348781585693, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1183 + }, + { + "completion_length": 10.015625, + "epoch": 0.20746451725950588, + "grad_norm": 17.1175348012611, + "kl": 0.09130859375, + "learning_rate": 7.927107061503416e-07, + "loss": -0.0052, + "reward": 1.480435848236084, + "reward_std": 0.1929093450307846, + "rewards/accuracy_reward_stage2": 0.621060848236084, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1184 + }, + { + "completion_length": 11.046875, + "epoch": 0.20763974066935342, + "grad_norm": 22.068041434728716, + "kl": 0.0341796875, + "learning_rate": 7.925354827404941e-07, + "loss": -0.0369, + "reward": 1.7756726741790771, + "reward_std": 0.11437465250492096, + "rewards/accuracy_reward_stage2": 0.8069226741790771, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1185 + }, + { + "completion_length": 9.0625, + "epoch": 0.20781496407920097, + "grad_norm": 18.76005311417912, + "kl": 0.054931640625, + "learning_rate": 7.923602593306466e-07, + "loss": 0.022, + "reward": 1.5227527618408203, + "reward_std": 0.2675768733024597, + "rewards/accuracy_reward_stage2": 0.5227527618408203, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1186 + }, + { + "completion_length": 8.953125, + "epoch": 0.20799018748904854, + "grad_norm": 14.812565593998304, + "kl": 0.042724609375, + "learning_rate": 7.92185035920799e-07, + "loss": -0.0246, + "reward": 1.7401741743087769, + "reward_std": 0.17801398038864136, + "rewards/accuracy_reward_stage2": 0.7557991147041321, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1187 + }, + { + "completion_length": 11.75, + "epoch": 0.2081654108988961, + "grad_norm": 21.42100080905678, + "kl": 0.03857421875, + "learning_rate": 7.920098125109515e-07, + "loss": 0.0154, + "reward": 1.7521522045135498, + "reward_std": 0.2301727831363678, + "rewards/accuracy_reward_stage2": 0.7521520853042603, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1188 + }, + { + "completion_length": 16.96875, + "epoch": 0.20834063430874364, + "grad_norm": 25.1811684105259, + "kl": 0.10009765625, + "learning_rate": 7.918345891011039e-07, + "loss": 0.0168, + "reward": 1.161747694015503, + "reward_std": 0.22458958625793457, + "rewards/accuracy_reward_stage2": 0.4273727238178253, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1189 + }, + { + "completion_length": 9.609375, + "epoch": 0.2085158577185912, + "grad_norm": 26.014563985480596, + "kl": 0.271484375, + "learning_rate": 7.916593656912563e-07, + "loss": 0.0525, + "reward": 1.4985435009002686, + "reward_std": 0.34751009941101074, + "rewards/accuracy_reward_stage2": 0.6547934412956238, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1190 + }, + { + "completion_length": 30.4375, + "epoch": 0.20869108112843876, + "grad_norm": 20.238894284968815, + "kl": 0.07763671875, + "learning_rate": 7.914841422814088e-07, + "loss": -0.0129, + "reward": 1.6505463123321533, + "reward_std": 0.21846714615821838, + "rewards/accuracy_reward_stage2": 0.6661714315414429, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1191 + }, + { + "completion_length": 9.4375, + "epoch": 0.2088663045382863, + "grad_norm": 18.85616159916294, + "kl": 0.08544921875, + "learning_rate": 7.913089188715612e-07, + "loss": -0.0099, + "reward": 1.4091585874557495, + "reward_std": 0.20862269401550293, + "rewards/accuracy_reward_stage2": 0.4404085874557495, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1192 + }, + { + "completion_length": 11.34375, + "epoch": 0.20904152794813388, + "grad_norm": 20.685152739147835, + "kl": 0.1259765625, + "learning_rate": 7.911336954617136e-07, + "loss": 0.0132, + "reward": 1.276153326034546, + "reward_std": 0.15010762214660645, + "rewards/accuracy_reward_stage2": 0.5417782664299011, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1193 + }, + { + "completion_length": 12.796875, + "epoch": 0.20921675135798143, + "grad_norm": 31.895420228966728, + "kl": 0.1953125, + "learning_rate": 7.909584720518661e-07, + "loss": 0.0562, + "reward": 1.4700446128845215, + "reward_std": 0.21943055093288422, + "rewards/accuracy_reward_stage2": 0.4856695532798767, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1194 + }, + { + "completion_length": 11.90625, + "epoch": 0.20939197476782898, + "grad_norm": 22.69955377588612, + "kl": 0.0439453125, + "learning_rate": 7.907832486420185e-07, + "loss": -0.0104, + "reward": 1.5846116542816162, + "reward_std": 0.22462745010852814, + "rewards/accuracy_reward_stage2": 0.6002365946769714, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1195 + }, + { + "completion_length": 19.3125, + "epoch": 0.20956719817767655, + "grad_norm": 34.454333953277235, + "kl": 0.1357421875, + "learning_rate": 7.90608025232171e-07, + "loss": 0.0543, + "reward": 1.2979505062103271, + "reward_std": 0.309200644493103, + "rewards/accuracy_reward_stage2": 0.42295050621032715, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1196 + }, + { + "completion_length": 8.125, + "epoch": 0.2097424215875241, + "grad_norm": 18.722270676400463, + "kl": 0.07470703125, + "learning_rate": 7.904328018223234e-07, + "loss": 0.03, + "reward": 1.7971199750900269, + "reward_std": 0.13342170417308807, + "rewards/accuracy_reward_stage2": 0.7971200942993164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1197 + }, + { + "completion_length": 16.125, + "epoch": 0.20991764499737164, + "grad_norm": 24.03594003478929, + "kl": 0.08447265625, + "learning_rate": 7.902575784124759e-07, + "loss": -0.0022, + "reward": 1.4187324047088623, + "reward_std": 0.24392619729042053, + "rewards/accuracy_reward_stage2": 0.43435734510421753, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1198 + }, + { + "completion_length": 8.6875, + "epoch": 0.2100928684072192, + "grad_norm": 25.46547914674772, + "kl": 0.09716796875, + "learning_rate": 7.900823550026284e-07, + "loss": 0.0174, + "reward": 1.6383514404296875, + "reward_std": 0.3255687654018402, + "rewards/accuracy_reward_stage2": 0.6539763808250427, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1199 + }, + { + "completion_length": 7.234375, + "epoch": 0.21026809181706677, + "grad_norm": 16.712020493406808, + "kl": 0.06298828125, + "learning_rate": 7.899071315927807e-07, + "loss": -0.0016, + "reward": 1.3724149465560913, + "reward_std": 0.25119179487228394, + "rewards/accuracy_reward_stage2": 0.3880399465560913, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1200 + }, + { + "completion_length": 10.859375, + "epoch": 0.2104433152269143, + "grad_norm": 22.661832595121936, + "kl": 0.072265625, + "learning_rate": 7.897319081829332e-07, + "loss": -0.0128, + "reward": 1.7846884727478027, + "reward_std": 0.2079046070575714, + "rewards/accuracy_reward_stage2": 0.800313413143158, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1201 + }, + { + "completion_length": 9.421875, + "epoch": 0.21061853863676186, + "grad_norm": 27.33161021288942, + "kl": 0.041259765625, + "learning_rate": 7.895566847730857e-07, + "loss": 0.0165, + "reward": 1.3854670524597168, + "reward_std": 0.2343364655971527, + "rewards/accuracy_reward_stage2": 0.3854671120643616, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1202 + }, + { + "completion_length": 6.78125, + "epoch": 0.21079376204660943, + "grad_norm": 23.866240175046933, + "kl": 0.020751953125, + "learning_rate": 7.893814613632381e-07, + "loss": 0.0083, + "reward": 1.6647517681121826, + "reward_std": 0.19862306118011475, + "rewards/accuracy_reward_stage2": 0.6647517085075378, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1203 + }, + { + "completion_length": 9.125, + "epoch": 0.21096898545645698, + "grad_norm": 18.7644930982759, + "kl": 0.07763671875, + "learning_rate": 7.892062379533906e-07, + "loss": 0.0309, + "reward": 1.561574101448059, + "reward_std": 0.16064518690109253, + "rewards/accuracy_reward_stage2": 0.5615741610527039, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1204 + }, + { + "completion_length": 8.515625, + "epoch": 0.21114420886630453, + "grad_norm": 16.056818653041567, + "kl": 0.0419921875, + "learning_rate": 7.890310145435429e-07, + "loss": 0.0169, + "reward": 1.705617904663086, + "reward_std": 0.09264673292636871, + "rewards/accuracy_reward_stage2": 0.8306180238723755, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1205 + }, + { + "completion_length": 12.6875, + "epoch": 0.2113194322761521, + "grad_norm": 15.735374459387911, + "kl": 0.034423828125, + "learning_rate": 7.888557911336954e-07, + "loss": 0.0138, + "reward": 1.639738917350769, + "reward_std": 0.12377573549747467, + "rewards/accuracy_reward_stage2": 0.6397388577461243, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1206 + }, + { + "completion_length": 7.40625, + "epoch": 0.21149465568599965, + "grad_norm": 18.19948266499784, + "kl": 0.08935546875, + "learning_rate": 7.886805677238479e-07, + "loss": 0.0356, + "reward": 1.5436455011367798, + "reward_std": 0.17799049615859985, + "rewards/accuracy_reward_stage2": 0.5436455011367798, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1207 + }, + { + "completion_length": 10.875, + "epoch": 0.2116698790958472, + "grad_norm": 18.79429944475187, + "kl": 0.11865234375, + "learning_rate": 7.885053443140003e-07, + "loss": 0.0174, + "reward": 1.5807538032531738, + "reward_std": 0.2054576575756073, + "rewards/accuracy_reward_stage2": 0.5963788628578186, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1208 + }, + { + "completion_length": 12.5625, + "epoch": 0.21184510250569477, + "grad_norm": 24.119986223232797, + "kl": 0.16796875, + "learning_rate": 7.883301209041528e-07, + "loss": 0.0291, + "reward": 1.428783655166626, + "reward_std": 0.3158418834209442, + "rewards/accuracy_reward_stage2": 0.44440874457359314, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1209 + }, + { + "completion_length": 18.4375, + "epoch": 0.21202032591554232, + "grad_norm": 20.70340367230994, + "kl": 0.11181640625, + "learning_rate": 7.881548974943052e-07, + "loss": 0.0446, + "reward": 1.3978643417358398, + "reward_std": 0.23703636229038239, + "rewards/accuracy_reward_stage2": 0.39786434173583984, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1210 + }, + { + "completion_length": 11.09375, + "epoch": 0.21219554932538987, + "grad_norm": 17.57744479600521, + "kl": 0.08642578125, + "learning_rate": 7.879796740844576e-07, + "loss": -0.0095, + "reward": 1.2551759481430054, + "reward_std": 0.1964961737394333, + "rewards/accuracy_reward_stage2": 0.27080094814300537, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1211 + }, + { + "completion_length": 6.140625, + "epoch": 0.21237077273523744, + "grad_norm": 14.552088548686209, + "kl": 0.0260009765625, + "learning_rate": 7.878044506746101e-07, + "loss": -0.023, + "reward": 1.8353174924850464, + "reward_std": 0.20921599864959717, + "rewards/accuracy_reward_stage2": 0.8509424924850464, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1212 + }, + { + "completion_length": 15.125, + "epoch": 0.212545996145085, + "grad_norm": 13.133946279031946, + "kl": 0.022705078125, + "learning_rate": 7.876292272647625e-07, + "loss": -0.0351, + "reward": 1.4045956134796143, + "reward_std": 0.1509314924478531, + "rewards/accuracy_reward_stage2": 0.42022058367729187, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1213 + }, + { + "completion_length": 10.421875, + "epoch": 0.21272121955493253, + "grad_norm": 30.50049286912514, + "kl": 0.04736328125, + "learning_rate": 7.87454003854915e-07, + "loss": 0.0189, + "reward": 1.411908507347107, + "reward_std": 0.18127146363258362, + "rewards/accuracy_reward_stage2": 0.41190850734710693, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1214 + }, + { + "completion_length": 15.3125, + "epoch": 0.2128964429647801, + "grad_norm": 18.48778725455288, + "kl": 0.0269775390625, + "learning_rate": 7.872787804450675e-07, + "loss": -0.0334, + "reward": 1.590174913406372, + "reward_std": 0.20116549730300903, + "rewards/accuracy_reward_stage2": 0.6057999134063721, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1215 + }, + { + "completion_length": 11.296875, + "epoch": 0.21307166637462766, + "grad_norm": 20.392004552064943, + "kl": 0.134765625, + "learning_rate": 7.871035570352199e-07, + "loss": 0.0098, + "reward": 1.435154914855957, + "reward_std": 0.1450117528438568, + "rewards/accuracy_reward_stage2": 0.4507799446582794, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1216 + }, + { + "completion_length": 12.21875, + "epoch": 0.2132468897844752, + "grad_norm": 13.498308548221969, + "kl": 0.12060546875, + "learning_rate": 7.869283336253724e-07, + "loss": 0.0171, + "reward": 1.0430498123168945, + "reward_std": 0.09386852383613586, + "rewards/accuracy_reward_stage2": 0.18367479741573334, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1217 + }, + { + "completion_length": 8.34375, + "epoch": 0.21342211319432275, + "grad_norm": 22.306457599821844, + "kl": 0.2197265625, + "learning_rate": 7.867531102155247e-07, + "loss": 0.0879, + "reward": 1.6171928644180298, + "reward_std": 0.2317761927843094, + "rewards/accuracy_reward_stage2": 0.6171928644180298, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1218 + }, + { + "completion_length": 9.546875, + "epoch": 0.21359733660417032, + "grad_norm": 11.867011647782718, + "kl": 0.044189453125, + "learning_rate": 7.865778868056771e-07, + "loss": 0.0177, + "reward": 1.38582181930542, + "reward_std": 0.06935185939073563, + "rewards/accuracy_reward_stage2": 0.5108217000961304, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1219 + }, + { + "completion_length": 10.890625, + "epoch": 0.21377256001401787, + "grad_norm": 15.016949281265788, + "kl": 0.051513671875, + "learning_rate": 7.864026633958296e-07, + "loss": 0.0206, + "reward": 1.1094141006469727, + "reward_std": 0.1364666223526001, + "rewards/accuracy_reward_stage2": 0.10941408574581146, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1220 + }, + { + "completion_length": 20.1875, + "epoch": 0.21394778342386542, + "grad_norm": 18.42504754043461, + "kl": 0.068359375, + "learning_rate": 7.86227439985982e-07, + "loss": -0.0826, + "reward": 1.5647317171096802, + "reward_std": 0.17734460532665253, + "rewards/accuracy_reward_stage2": 0.6116067171096802, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1221 + }, + { + "completion_length": 8.0625, + "epoch": 0.214123006833713, + "grad_norm": 21.446315689231362, + "kl": 0.032470703125, + "learning_rate": 7.860522165761345e-07, + "loss": 0.013, + "reward": 1.7758066654205322, + "reward_std": 0.19553758203983307, + "rewards/accuracy_reward_stage2": 0.7758066058158875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1222 + }, + { + "completion_length": 14.265625, + "epoch": 0.21429823024356054, + "grad_norm": 23.97497416542969, + "kl": 0.07666015625, + "learning_rate": 7.85876993166287e-07, + "loss": 0.0306, + "reward": 1.5564537048339844, + "reward_std": 0.21574008464813232, + "rewards/accuracy_reward_stage2": 0.5564536452293396, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1223 + }, + { + "completion_length": 10.875, + "epoch": 0.21447345365340809, + "grad_norm": 18.180349359021452, + "kl": 0.10205078125, + "learning_rate": 7.857017697564394e-07, + "loss": 0.0407, + "reward": 1.3234200477600098, + "reward_std": 0.18061794340610504, + "rewards/accuracy_reward_stage2": 0.44842010736465454, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1224 + }, + { + "completion_length": 13.578125, + "epoch": 0.21464867706325566, + "grad_norm": 18.325128493418834, + "kl": 0.07373046875, + "learning_rate": 7.855265463465919e-07, + "loss": 0.0296, + "reward": 1.7562847137451172, + "reward_std": 0.22711655497550964, + "rewards/accuracy_reward_stage2": 0.7562847137451172, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1225 + }, + { + "completion_length": 11.859375, + "epoch": 0.2148239004731032, + "grad_norm": 17.05566843473295, + "kl": 0.040283203125, + "learning_rate": 7.853513229367444e-07, + "loss": -0.0987, + "reward": 1.5513169765472412, + "reward_std": 0.16350050270557404, + "rewards/accuracy_reward_stage2": 0.7231919765472412, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1226 + }, + { + "completion_length": 12.59375, + "epoch": 0.21499912388295075, + "grad_norm": 26.692888583630907, + "kl": 0.296875, + "learning_rate": 7.851760995268968e-07, + "loss": 0.114, + "reward": 1.2322908639907837, + "reward_std": 0.3121843636035919, + "rewards/accuracy_reward_stage2": 0.4822908937931061, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1227 + }, + { + "completion_length": 9.828125, + "epoch": 0.21517434729279833, + "grad_norm": 21.447089137118855, + "kl": 0.0947265625, + "learning_rate": 7.850008761170493e-07, + "loss": 0.0378, + "reward": 1.6512415409088135, + "reward_std": 0.2510579824447632, + "rewards/accuracy_reward_stage2": 0.6512414813041687, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1228 + }, + { + "completion_length": 13.859375, + "epoch": 0.21534957070264588, + "grad_norm": 64.9879149157075, + "kl": 0.33203125, + "learning_rate": 7.848256527072016e-07, + "loss": 0.1329, + "reward": 1.4411249160766602, + "reward_std": 0.1767362356185913, + "rewards/accuracy_reward_stage2": 0.6911249160766602, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1229 + }, + { + "completion_length": 9.421875, + "epoch": 0.21552479411249342, + "grad_norm": 73.17992385714928, + "kl": 0.404296875, + "learning_rate": 7.846504292973541e-07, + "loss": 0.0985, + "reward": 1.5135424137115479, + "reward_std": 0.28798261284828186, + "rewards/accuracy_reward_stage2": 0.6697924733161926, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1230 + }, + { + "completion_length": 6.25, + "epoch": 0.215700017522341, + "grad_norm": 17.784801022236678, + "kl": 0.1298828125, + "learning_rate": 7.844752058875065e-07, + "loss": 0.008, + "reward": 1.4939236640930176, + "reward_std": 0.21711787581443787, + "rewards/accuracy_reward_stage2": 0.5095486640930176, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1231 + }, + { + "completion_length": 8.671875, + "epoch": 0.21587524093218854, + "grad_norm": 24.758583579901067, + "kl": 0.1044921875, + "learning_rate": 7.842999824776589e-07, + "loss": -0.0597, + "reward": 1.3905413150787354, + "reward_std": 0.24760988354682922, + "rewards/accuracy_reward_stage2": 0.4374162256717682, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1232 + }, + { + "completion_length": 11.640625, + "epoch": 0.2160504643420361, + "grad_norm": 21.77876149803177, + "kl": 0.20703125, + "learning_rate": 7.841247590678114e-07, + "loss": 0.0494, + "reward": 1.3181720972061157, + "reward_std": 0.32347288727760315, + "rewards/accuracy_reward_stage2": 0.4587971866130829, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1233 + }, + { + "completion_length": 7.390625, + "epoch": 0.21622568775188364, + "grad_norm": 22.39564970611295, + "kl": 0.1357421875, + "learning_rate": 7.839495356579638e-07, + "loss": 0.0376, + "reward": 1.6510417461395264, + "reward_std": 0.22779880464076996, + "rewards/accuracy_reward_stage2": 0.6666666865348816, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1234 + }, + { + "completion_length": 8.265625, + "epoch": 0.2164009111617312, + "grad_norm": 25.959874464914165, + "kl": 0.027099609375, + "learning_rate": 7.837743122481163e-07, + "loss": 0.0109, + "reward": 1.3759396076202393, + "reward_std": 0.29031646251678467, + "rewards/accuracy_reward_stage2": 0.3759395480155945, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1235 + }, + { + "completion_length": 7.234375, + "epoch": 0.21657613457157876, + "grad_norm": 20.21760185077339, + "kl": 0.1298828125, + "learning_rate": 7.835990888382688e-07, + "loss": 0.0519, + "reward": 1.5539495944976807, + "reward_std": 0.2241932451725006, + "rewards/accuracy_reward_stage2": 0.5539496541023254, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1236 + }, + { + "completion_length": 20.5625, + "epoch": 0.2167513579814263, + "grad_norm": 35.69379614088005, + "kl": 0.142578125, + "learning_rate": 7.834238654284212e-07, + "loss": 0.0634, + "reward": 1.6434786319732666, + "reward_std": 0.18241646885871887, + "rewards/accuracy_reward_stage2": 0.7684785723686218, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1237 + }, + { + "completion_length": 11.265625, + "epoch": 0.21692658139127388, + "grad_norm": 14.892077914037673, + "kl": 0.044921875, + "learning_rate": 7.832486420185737e-07, + "loss": 0.018, + "reward": 1.382194995880127, + "reward_std": 0.12057159096002579, + "rewards/accuracy_reward_stage2": 0.3821950852870941, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1238 + }, + { + "completion_length": 8.421875, + "epoch": 0.21710180480112143, + "grad_norm": 17.29611686120686, + "kl": 0.09765625, + "learning_rate": 7.830734186087262e-07, + "loss": 0.039, + "reward": 1.4690710306167603, + "reward_std": 0.15366077423095703, + "rewards/accuracy_reward_stage2": 0.46907100081443787, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1239 + }, + { + "completion_length": 12.59375, + "epoch": 0.21727702821096898, + "grad_norm": 15.11141683071002, + "kl": 0.10009765625, + "learning_rate": 7.828981951988785e-07, + "loss": -0.0029, + "reward": 1.627720832824707, + "reward_std": 0.15333834290504456, + "rewards/accuracy_reward_stage2": 0.643345832824707, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1240 + }, + { + "completion_length": 14.609375, + "epoch": 0.21745225162081655, + "grad_norm": 21.22067711348212, + "kl": 0.024169921875, + "learning_rate": 7.82722971789031e-07, + "loss": 0.0097, + "reward": 1.6614583730697632, + "reward_std": 0.1928693801164627, + "rewards/accuracy_reward_stage2": 0.6614582538604736, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1241 + }, + { + "completion_length": 7.609375, + "epoch": 0.2176274750306641, + "grad_norm": 13.858555219264197, + "kl": 0.0849609375, + "learning_rate": 7.825477483791834e-07, + "loss": -0.0452, + "reward": 1.7296041250228882, + "reward_std": 0.1613890528678894, + "rewards/accuracy_reward_stage2": 0.760854184627533, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1242 + }, + { + "completion_length": 10.5, + "epoch": 0.21780269844051164, + "grad_norm": 23.196623299024466, + "kl": 0.208984375, + "learning_rate": 7.823725249693359e-07, + "loss": 0.0503, + "reward": 1.2230807542800903, + "reward_std": 0.24334552884101868, + "rewards/accuracy_reward_stage2": 0.36370575428009033, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1243 + }, + { + "completion_length": 14.28125, + "epoch": 0.21797792185035922, + "grad_norm": 12.800626978253057, + "kl": 0.08154296875, + "learning_rate": 7.821973015594883e-07, + "loss": -0.0013, + "reward": 1.4294836521148682, + "reward_std": 0.10467779636383057, + "rewards/accuracy_reward_stage2": 0.6951085925102234, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1244 + }, + { + "completion_length": 34.484375, + "epoch": 0.21815314526020677, + "grad_norm": 11.287570658853458, + "kl": 0.0517578125, + "learning_rate": 7.820220781496407e-07, + "loss": 0.0207, + "reward": 1.5413293838500977, + "reward_std": 0.17220129072666168, + "rewards/accuracy_reward_stage2": 0.5413292646408081, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1245 + }, + { + "completion_length": 8.890625, + "epoch": 0.2183283686700543, + "grad_norm": 86.19789461067052, + "kl": 0.1123046875, + "learning_rate": 7.818468547397932e-07, + "loss": 0.0449, + "reward": 1.6259727478027344, + "reward_std": 0.21742461621761322, + "rewards/accuracy_reward_stage2": 0.6259727478027344, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1246 + }, + { + "completion_length": 12.125, + "epoch": 0.2185035920799019, + "grad_norm": 19.16149566715807, + "kl": 0.06787109375, + "learning_rate": 7.816716313299457e-07, + "loss": -0.0171, + "reward": 1.2638311386108398, + "reward_std": 0.20646998286247253, + "rewards/accuracy_reward_stage2": 0.2794560194015503, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1247 + }, + { + "completion_length": 14.609375, + "epoch": 0.21867881548974943, + "grad_norm": 25.56266166845523, + "kl": 0.059814453125, + "learning_rate": 7.814964079200981e-07, + "loss": 0.024, + "reward": 1.5029523372650146, + "reward_std": 0.205928772687912, + "rewards/accuracy_reward_stage2": 0.5029522776603699, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1248 + }, + { + "completion_length": 13.171875, + "epoch": 0.21885403889959698, + "grad_norm": 17.61052129867045, + "kl": 0.10400390625, + "learning_rate": 7.813211845102505e-07, + "loss": -0.0468, + "reward": 1.2945280075073242, + "reward_std": 0.2459794282913208, + "rewards/accuracy_reward_stage2": 0.3257780075073242, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1249 + }, + { + "completion_length": 11.71875, + "epoch": 0.21902926230944456, + "grad_norm": 11.221758396438073, + "kl": 0.0120849609375, + "learning_rate": 7.811459611004029e-07, + "loss": 0.0048, + "reward": 1.6181175708770752, + "reward_std": 0.01946648210287094, + "rewards/accuracy_reward_stage2": 0.6181175708770752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1250 + }, + { + "completion_length": 7.0625, + "epoch": 0.2192044857192921, + "grad_norm": 16.126607670282013, + "kl": 0.07958984375, + "learning_rate": 7.809707376905554e-07, + "loss": -0.0123, + "reward": 1.7962268590927124, + "reward_std": 0.10672344267368317, + "rewards/accuracy_reward_stage2": 0.8118518590927124, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1251 + }, + { + "completion_length": 12.671875, + "epoch": 0.21937970912913965, + "grad_norm": 29.519790301106305, + "kl": 0.0277099609375, + "learning_rate": 7.807955142807079e-07, + "loss": 0.0111, + "reward": 1.421875, + "reward_std": 0.38664889335632324, + "rewards/accuracy_reward_stage2": 0.421875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1252 + }, + { + "completion_length": 9.4375, + "epoch": 0.2195549325389872, + "grad_norm": 31.182060614934088, + "kl": 0.056884765625, + "learning_rate": 7.806202908708603e-07, + "loss": 0.0228, + "reward": 1.4860175848007202, + "reward_std": 0.2832135558128357, + "rewards/accuracy_reward_stage2": 0.4860175848007202, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1253 + }, + { + "completion_length": 7.921875, + "epoch": 0.21973015594883477, + "grad_norm": 11.564218986630497, + "kl": 0.033935546875, + "learning_rate": 7.804450674610128e-07, + "loss": 0.0136, + "reward": 1.6184473037719727, + "reward_std": 0.11314624547958374, + "rewards/accuracy_reward_stage2": 0.6184473633766174, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1254 + }, + { + "completion_length": 11.9375, + "epoch": 0.21990537935868232, + "grad_norm": 6.513019818636513, + "kl": 0.039306640625, + "learning_rate": 7.802698440511653e-07, + "loss": 0.0158, + "reward": 1.671875, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1255 + }, + { + "completion_length": 9.203125, + "epoch": 0.22008060276852986, + "grad_norm": 20.747896613755444, + "kl": 0.059814453125, + "learning_rate": 7.800946206413176e-07, + "loss": 0.024, + "reward": 1.3773555755615234, + "reward_std": 0.25220632553100586, + "rewards/accuracy_reward_stage2": 0.3773554861545563, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1256 + }, + { + "completion_length": 9.265625, + "epoch": 0.22025582617837744, + "grad_norm": 24.392216642627467, + "kl": 0.1396484375, + "learning_rate": 7.799193972314701e-07, + "loss": 0.0117, + "reward": 1.505408763885498, + "reward_std": 0.16076895594596863, + "rewards/accuracy_reward_stage2": 0.6460338234901428, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1257 + }, + { + "completion_length": 11.015625, + "epoch": 0.220431049588225, + "grad_norm": 187.6336735161918, + "kl": 1.0, + "learning_rate": 7.797441738216225e-07, + "loss": 0.3622, + "reward": 1.6911745071411133, + "reward_std": 0.21796134114265442, + "rewards/accuracy_reward_stage2": 0.8317995071411133, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1258 + }, + { + "completion_length": 9.1875, + "epoch": 0.22060627299807253, + "grad_norm": 17.763683470304855, + "kl": 0.10595703125, + "learning_rate": 7.795689504117749e-07, + "loss": 0.0424, + "reward": 1.6380715370178223, + "reward_std": 0.17473512887954712, + "rewards/accuracy_reward_stage2": 0.6380715370178223, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1259 + }, + { + "completion_length": 16.09375, + "epoch": 0.2207814964079201, + "grad_norm": 18.883167626689094, + "kl": 0.09521484375, + "learning_rate": 7.793937270019274e-07, + "loss": 0.0012, + "reward": 1.2897546291351318, + "reward_std": 0.21671342849731445, + "rewards/accuracy_reward_stage2": 0.30537962913513184, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1260 + }, + { + "completion_length": 7.859375, + "epoch": 0.22095671981776766, + "grad_norm": 24.10659942453599, + "kl": 0.09375, + "learning_rate": 7.792185035920798e-07, + "loss": 0.0375, + "reward": 1.5981502532958984, + "reward_std": 0.27921926975250244, + "rewards/accuracy_reward_stage2": 0.5981503129005432, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1261 + }, + { + "completion_length": 7.390625, + "epoch": 0.2211319432276152, + "grad_norm": 14.815017372927116, + "kl": 0.05908203125, + "learning_rate": 7.790432801822323e-07, + "loss": 0.0237, + "reward": 1.590078592300415, + "reward_std": 0.21127043664455414, + "rewards/accuracy_reward_stage2": 0.590078592300415, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1262 + }, + { + "completion_length": 8.875, + "epoch": 0.22130716663746278, + "grad_norm": 18.08682737782561, + "kl": 0.10302734375, + "learning_rate": 7.788680567723848e-07, + "loss": 0.0014, + "reward": 1.4229505062103271, + "reward_std": 0.25344976782798767, + "rewards/accuracy_reward_stage2": 0.5635755062103271, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1263 + }, + { + "completion_length": 9.8125, + "epoch": 0.22148239004731032, + "grad_norm": 17.866828523480077, + "kl": 0.06103515625, + "learning_rate": 7.786928333625372e-07, + "loss": 0.0244, + "reward": 1.3703603744506836, + "reward_std": 0.19101378321647644, + "rewards/accuracy_reward_stage2": 0.495360404253006, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1264 + }, + { + "completion_length": 12.46875, + "epoch": 0.22165761345715787, + "grad_norm": 17.874063479543306, + "kl": 0.1728515625, + "learning_rate": 7.785176099526897e-07, + "loss": 0.0245, + "reward": 1.5815012454986572, + "reward_std": 0.23980316519737244, + "rewards/accuracy_reward_stage2": 0.7221262454986572, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1265 + }, + { + "completion_length": 18.40625, + "epoch": 0.22183283686700545, + "grad_norm": 25.11799851246929, + "kl": 0.1953125, + "learning_rate": 7.783423865428421e-07, + "loss": 0.0466, + "reward": 1.4459002017974854, + "reward_std": 0.24602550268173218, + "rewards/accuracy_reward_stage2": 0.5865253210067749, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1266 + }, + { + "completion_length": 13.265625, + "epoch": 0.222008060276853, + "grad_norm": 21.582078379159743, + "kl": 0.07275390625, + "learning_rate": 7.781671631329946e-07, + "loss": 0.029, + "reward": 1.214440107345581, + "reward_std": 0.16473287343978882, + "rewards/accuracy_reward_stage2": 0.3394400477409363, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1267 + }, + { + "completion_length": 9.75, + "epoch": 0.22218328368670054, + "grad_norm": 24.881310227565013, + "kl": 0.62890625, + "learning_rate": 7.779919397231471e-07, + "loss": 0.2498, + "reward": 1.3854460716247559, + "reward_std": 0.29034459590911865, + "rewards/accuracy_reward_stage2": 0.5104460716247559, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1268 + }, + { + "completion_length": 7.34375, + "epoch": 0.22235850709654809, + "grad_norm": 14.940564340974653, + "kl": 0.0810546875, + "learning_rate": 7.778167163132993e-07, + "loss": 0.0323, + "reward": 1.6923959255218506, + "reward_std": 0.09215311706066132, + "rewards/accuracy_reward_stage2": 0.6923958659172058, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1269 + }, + { + "completion_length": 12.25, + "epoch": 0.22253373050639566, + "grad_norm": 14.231371081239205, + "kl": 0.045654296875, + "learning_rate": 7.776414929034518e-07, + "loss": 0.0182, + "reward": 1.698338270187378, + "reward_std": 0.13502109050750732, + "rewards/accuracy_reward_stage2": 0.6983382701873779, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1270 + }, + { + "completion_length": 17.015625, + "epoch": 0.2227089539162432, + "grad_norm": 17.951746853772647, + "kl": 0.0537109375, + "learning_rate": 7.774662694936043e-07, + "loss": 0.0215, + "reward": 1.1886450052261353, + "reward_std": 0.17170041799545288, + "rewards/accuracy_reward_stage2": 0.31364506483078003, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1271 + }, + { + "completion_length": 13.234375, + "epoch": 0.22288417732609075, + "grad_norm": 20.725766439084733, + "kl": 0.08154296875, + "learning_rate": 7.772910460837567e-07, + "loss": 0.0159, + "reward": 1.6198360919952393, + "reward_std": 0.25739431381225586, + "rewards/accuracy_reward_stage2": 0.6354610919952393, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1272 + }, + { + "completion_length": 7.28125, + "epoch": 0.22305940073593833, + "grad_norm": 23.434106099546728, + "kl": 0.1669921875, + "learning_rate": 7.771158226739092e-07, + "loss": 0.0227, + "reward": 1.688348650932312, + "reward_std": 0.2744218707084656, + "rewards/accuracy_reward_stage2": 0.719598650932312, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1273 + }, + { + "completion_length": 11.734375, + "epoch": 0.22323462414578588, + "grad_norm": 23.463084611699912, + "kl": 0.0498046875, + "learning_rate": 7.769405992640616e-07, + "loss": -0.0242, + "reward": 1.5922805070877075, + "reward_std": 0.2871755361557007, + "rewards/accuracy_reward_stage2": 0.7329055666923523, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1274 + }, + { + "completion_length": 6.921875, + "epoch": 0.22340984755563342, + "grad_norm": 22.14878463098126, + "kl": 0.134765625, + "learning_rate": 7.767653758542141e-07, + "loss": 0.0096, + "reward": 1.6665723323822021, + "reward_std": 0.27279797196388245, + "rewards/accuracy_reward_stage2": 0.6821973323822021, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1275 + }, + { + "completion_length": 6.96875, + "epoch": 0.223585070965481, + "grad_norm": 17.98879927843346, + "kl": 0.09619140625, + "learning_rate": 7.765901524443666e-07, + "loss": -0.0058, + "reward": 1.4087541103363037, + "reward_std": 0.24091657996177673, + "rewards/accuracy_reward_stage2": 0.4243791103363037, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1276 + }, + { + "completion_length": 13.03125, + "epoch": 0.22376029437532854, + "grad_norm": 22.424179520437004, + "kl": 0.080078125, + "learning_rate": 7.76414929034519e-07, + "loss": 0.011, + "reward": 1.6969799995422363, + "reward_std": 0.1915295571088791, + "rewards/accuracy_reward_stage2": 0.7126048803329468, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1277 + }, + { + "completion_length": 7.765625, + "epoch": 0.2239355177851761, + "grad_norm": 17.401912665765387, + "kl": 0.091796875, + "learning_rate": 7.762397056246715e-07, + "loss": -0.0055, + "reward": 1.5669753551483154, + "reward_std": 0.09806131571531296, + "rewards/accuracy_reward_stage2": 0.5826001763343811, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1278 + }, + { + "completion_length": 16.234375, + "epoch": 0.22411074119502367, + "grad_norm": 19.61910716871486, + "kl": 0.08349609375, + "learning_rate": 7.76064482214824e-07, + "loss": -0.0385, + "reward": 1.4222311973571777, + "reward_std": 0.29086655378341675, + "rewards/accuracy_reward_stage2": 0.45348113775253296, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1279 + }, + { + "completion_length": 15.765625, + "epoch": 0.2242859646048712, + "grad_norm": 20.170390141515178, + "kl": 0.384765625, + "learning_rate": 7.758892588049763e-07, + "loss": 0.1094, + "reward": 1.4278593063354492, + "reward_std": 0.14910349249839783, + "rewards/accuracy_reward_stage2": 0.5684843063354492, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1280 + }, + { + "completion_length": 11.40625, + "epoch": 0.22446118801471876, + "grad_norm": 17.25825844167911, + "kl": 0.0615234375, + "learning_rate": 7.757140353951288e-07, + "loss": -0.0195, + "reward": 1.255530595779419, + "reward_std": 0.2321069836616516, + "rewards/accuracy_reward_stage2": 0.39615553617477417, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1281 + }, + { + "completion_length": 10.6875, + "epoch": 0.22463641142456633, + "grad_norm": 16.864030485310032, + "kl": 0.0673828125, + "learning_rate": 7.755388119852811e-07, + "loss": -0.0173, + "reward": 1.738398551940918, + "reward_std": 0.1555314064025879, + "rewards/accuracy_reward_stage2": 0.7540234923362732, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1282 + }, + { + "completion_length": 9.59375, + "epoch": 0.22481163483441388, + "grad_norm": 21.18331323146374, + "kl": 0.14453125, + "learning_rate": 7.753635885754336e-07, + "loss": 0.0577, + "reward": 1.6540005207061768, + "reward_std": 0.1803339272737503, + "rewards/accuracy_reward_stage2": 0.6540004014968872, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1283 + }, + { + "completion_length": 12.9375, + "epoch": 0.22498685824426143, + "grad_norm": 23.49818812479695, + "kl": 0.056884765625, + "learning_rate": 7.751883651655861e-07, + "loss": -0.0179, + "reward": 1.664846658706665, + "reward_std": 0.19526709616184235, + "rewards/accuracy_reward_stage2": 0.680471658706665, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1284 + }, + { + "completion_length": 13.890625, + "epoch": 0.22516208165410898, + "grad_norm": 23.28590729981405, + "kl": 0.057373046875, + "learning_rate": 7.750131417557385e-07, + "loss": -0.0213, + "reward": 1.4284430742263794, + "reward_std": 0.2543635070323944, + "rewards/accuracy_reward_stage2": 0.4440680146217346, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1285 + }, + { + "completion_length": 22.96875, + "epoch": 0.22533730506395655, + "grad_norm": 19.21656824927377, + "kl": 0.09912109375, + "learning_rate": 7.74837918345891e-07, + "loss": 0.0395, + "reward": 1.3398176431655884, + "reward_std": 0.16608867049217224, + "rewards/accuracy_reward_stage2": 0.4648175835609436, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1286 + }, + { + "completion_length": 13.171875, + "epoch": 0.2255125284738041, + "grad_norm": 23.36717503355835, + "kl": 0.083984375, + "learning_rate": 7.746626949360435e-07, + "loss": 0.0335, + "reward": 1.4650541543960571, + "reward_std": 0.22038257122039795, + "rewards/accuracy_reward_stage2": 0.46505406498908997, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1287 + }, + { + "completion_length": 11.65625, + "epoch": 0.22568775188365164, + "grad_norm": 19.443791282030777, + "kl": 0.0203857421875, + "learning_rate": 7.744874715261959e-07, + "loss": 0.0081, + "reward": 1.7337589263916016, + "reward_std": 0.21359241008758545, + "rewards/accuracy_reward_stage2": 0.7337589263916016, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1288 + }, + { + "completion_length": 12.578125, + "epoch": 0.22586297529349922, + "grad_norm": 14.696962381102287, + "kl": 0.0546875, + "learning_rate": 7.743122481163483e-07, + "loss": 0.0219, + "reward": 1.4507033824920654, + "reward_std": 0.11517933756113052, + "rewards/accuracy_reward_stage2": 0.45070335268974304, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1289 + }, + { + "completion_length": 12.140625, + "epoch": 0.22603819870334677, + "grad_norm": 18.039059813289775, + "kl": 0.06591796875, + "learning_rate": 7.741370247065007e-07, + "loss": 0.0135, + "reward": 1.5439950227737427, + "reward_std": 0.13915899395942688, + "rewards/accuracy_reward_stage2": 0.6689950227737427, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1290 + }, + { + "completion_length": 13.875, + "epoch": 0.2262134221131943, + "grad_norm": 14.804979613619244, + "kl": 0.05859375, + "learning_rate": 7.739618012966532e-07, + "loss": 0.0234, + "reward": 1.4256266355514526, + "reward_std": 0.12437914311885834, + "rewards/accuracy_reward_stage2": 0.42562660574913025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1291 + }, + { + "completion_length": 8.71875, + "epoch": 0.2263886455230419, + "grad_norm": 27.679772931226807, + "kl": 0.251953125, + "learning_rate": 7.737865778868057e-07, + "loss": 0.1072, + "reward": 1.5422618389129639, + "reward_std": 0.14960823953151703, + "rewards/accuracy_reward_stage2": 0.7922618985176086, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1292 + }, + { + "completion_length": 8.796875, + "epoch": 0.22656386893288943, + "grad_norm": 16.907333227979905, + "kl": 0.12109375, + "learning_rate": 7.736113544769581e-07, + "loss": 0.014, + "reward": 1.551778793334961, + "reward_std": 0.12833553552627563, + "rewards/accuracy_reward_stage2": 0.5674037337303162, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1293 + }, + { + "completion_length": 7.765625, + "epoch": 0.22673909234273698, + "grad_norm": 20.22861234970417, + "kl": 0.048583984375, + "learning_rate": 7.734361310671105e-07, + "loss": 0.0195, + "reward": 1.5390467643737793, + "reward_std": 0.2336047738790512, + "rewards/accuracy_reward_stage2": 0.6640467047691345, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1294 + }, + { + "completion_length": 11.203125, + "epoch": 0.22691431575258456, + "grad_norm": 26.05748474925187, + "kl": 0.07666015625, + "learning_rate": 7.732609076572629e-07, + "loss": 0.014, + "reward": 1.462762713432312, + "reward_std": 0.2649151384830475, + "rewards/accuracy_reward_stage2": 0.603387713432312, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1295 + }, + { + "completion_length": 11.0, + "epoch": 0.2270895391624321, + "grad_norm": 25.09086900081551, + "kl": 0.57421875, + "learning_rate": 7.730856842474154e-07, + "loss": 0.2078, + "reward": 1.268276572227478, + "reward_std": 0.2907959818840027, + "rewards/accuracy_reward_stage2": 0.518276572227478, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1296 + }, + { + "completion_length": 13.515625, + "epoch": 0.22726476257227965, + "grad_norm": 19.14340541773585, + "kl": 0.037109375, + "learning_rate": 7.729104608375679e-07, + "loss": 0.0149, + "reward": 1.7883508205413818, + "reward_std": 0.16622239351272583, + "rewards/accuracy_reward_stage2": 0.7883508801460266, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1297 + }, + { + "completion_length": 11.15625, + "epoch": 0.22743998598212722, + "grad_norm": 21.16259113723349, + "kl": 0.0751953125, + "learning_rate": 7.727352374277202e-07, + "loss": -0.0475, + "reward": 1.6002087593078613, + "reward_std": 0.25808942317962646, + "rewards/accuracy_reward_stage2": 0.6314586997032166, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1298 + }, + { + "completion_length": 10.765625, + "epoch": 0.22761520939197477, + "grad_norm": 19.104041721217694, + "kl": 0.1318359375, + "learning_rate": 7.725600140178727e-07, + "loss": 0.0527, + "reward": 1.3541667461395264, + "reward_std": 0.26745420694351196, + "rewards/accuracy_reward_stage2": 0.4791666567325592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1299 + }, + { + "completion_length": 9.3125, + "epoch": 0.22779043280182232, + "grad_norm": 9.065751220738832, + "kl": 0.00518798828125, + "learning_rate": 7.723847906080252e-07, + "loss": 0.0021, + "reward": 1.7539682388305664, + "reward_std": 0.01122391689568758, + "rewards/accuracy_reward_stage2": 0.7539682388305664, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1300 + }, + { + "completion_length": 10.796875, + "epoch": 0.2279656562116699, + "grad_norm": 16.62573543229576, + "kl": 0.07958984375, + "learning_rate": 7.722095671981776e-07, + "loss": 0.0003, + "reward": 1.459397792816162, + "reward_std": 0.28754448890686035, + "rewards/accuracy_reward_stage2": 0.6000228524208069, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1301 + }, + { + "completion_length": 13.859375, + "epoch": 0.22814087962151744, + "grad_norm": 18.95066625577687, + "kl": 0.08203125, + "learning_rate": 7.720343437883301e-07, + "loss": 0.0327, + "reward": 1.5195292234420776, + "reward_std": 0.1240844875574112, + "rewards/accuracy_reward_stage2": 0.5195292234420776, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1302 + }, + { + "completion_length": 7.234375, + "epoch": 0.228316103031365, + "grad_norm": 27.896227499752165, + "kl": 0.022216796875, + "learning_rate": 7.718591203784826e-07, + "loss": 0.0089, + "reward": 1.6503667831420898, + "reward_std": 0.19524559378623962, + "rewards/accuracy_reward_stage2": 0.7753667831420898, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1303 + }, + { + "completion_length": 9.5625, + "epoch": 0.22849132644121253, + "grad_norm": 21.168475667163364, + "kl": 0.138671875, + "learning_rate": 7.71683896968635e-07, + "loss": -0.0267, + "reward": 1.5251177549362183, + "reward_std": 0.23324428498744965, + "rewards/accuracy_reward_stage2": 0.5563677549362183, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1304 + }, + { + "completion_length": 11.03125, + "epoch": 0.2286665498510601, + "grad_norm": 17.068660299280747, + "kl": 0.1181640625, + "learning_rate": 7.715086735587875e-07, + "loss": 0.0471, + "reward": 1.2996182441711426, + "reward_std": 0.19639158248901367, + "rewards/accuracy_reward_stage2": 0.2996181845664978, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1305 + }, + { + "completion_length": 10.859375, + "epoch": 0.22884177326090765, + "grad_norm": 22.08925315478146, + "kl": 0.2216796875, + "learning_rate": 7.713334501489399e-07, + "loss": 0.0091, + "reward": 1.5402624607086182, + "reward_std": 0.22633978724479675, + "rewards/accuracy_reward_stage2": 0.5871374607086182, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1306 + }, + { + "completion_length": 13.625, + "epoch": 0.2290169966707552, + "grad_norm": 18.914502186490715, + "kl": 0.0888671875, + "learning_rate": 7.711582267390923e-07, + "loss": -0.0086, + "reward": 1.4000566005706787, + "reward_std": 0.14851221442222595, + "rewards/accuracy_reward_stage2": 0.5406815409660339, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1307 + }, + { + "completion_length": 13.140625, + "epoch": 0.22919222008060278, + "grad_norm": 24.534198975775887, + "kl": 0.058837890625, + "learning_rate": 7.709830033292448e-07, + "loss": 0.0236, + "reward": 1.3969494104385376, + "reward_std": 0.3108880817890167, + "rewards/accuracy_reward_stage2": 0.3969494104385376, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1308 + }, + { + "completion_length": 15.03125, + "epoch": 0.22936744349045032, + "grad_norm": 24.113349660176898, + "kl": 0.06982421875, + "learning_rate": 7.708077799193971e-07, + "loss": -0.0163, + "reward": 1.5292431116104126, + "reward_std": 0.2674151659011841, + "rewards/accuracy_reward_stage2": 0.5448680520057678, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1309 + }, + { + "completion_length": 11.625, + "epoch": 0.22954266690029787, + "grad_norm": 17.911011858964358, + "kl": 0.0888671875, + "learning_rate": 7.706325565095496e-07, + "loss": -0.0087, + "reward": 1.62459397315979, + "reward_std": 0.20538245141506195, + "rewards/accuracy_reward_stage2": 0.6402188539505005, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1310 + }, + { + "completion_length": 6.0625, + "epoch": 0.22971789031014545, + "grad_norm": 19.31570319495111, + "kl": 0.0546875, + "learning_rate": 7.70457333099702e-07, + "loss": -0.0115, + "reward": 1.784255862236023, + "reward_std": 0.25300198793411255, + "rewards/accuracy_reward_stage2": 0.799880862236023, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1311 + }, + { + "completion_length": 10.828125, + "epoch": 0.229893113719993, + "grad_norm": 20.540568906206264, + "kl": 0.05224609375, + "learning_rate": 7.702821096898545e-07, + "loss": 0.0209, + "reward": 1.6209710836410522, + "reward_std": 0.1423826813697815, + "rewards/accuracy_reward_stage2": 0.6209710836410522, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1312 + }, + { + "completion_length": 9.703125, + "epoch": 0.23006833712984054, + "grad_norm": 28.448715831191063, + "kl": 0.11083984375, + "learning_rate": 7.70106886280007e-07, + "loss": 0.0443, + "reward": 1.6625604629516602, + "reward_std": 0.336778849363327, + "rewards/accuracy_reward_stage2": 0.6625604629516602, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1313 + }, + { + "completion_length": 9.75, + "epoch": 0.2302435605396881, + "grad_norm": 23.162356470137542, + "kl": 0.150390625, + "learning_rate": 7.699316628701594e-07, + "loss": 0.0108, + "reward": 1.3917927742004395, + "reward_std": 0.36611586809158325, + "rewards/accuracy_reward_stage2": 0.5480427742004395, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1314 + }, + { + "completion_length": 8.015625, + "epoch": 0.23041878394953566, + "grad_norm": 18.946663303104412, + "kl": 0.041259765625, + "learning_rate": 7.697564394603119e-07, + "loss": -0.0277, + "reward": 1.6170215606689453, + "reward_std": 0.20247286558151245, + "rewards/accuracy_reward_stage2": 0.6326465606689453, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1315 + }, + { + "completion_length": 15.703125, + "epoch": 0.2305940073593832, + "grad_norm": 24.83974962007514, + "kl": 0.046142578125, + "learning_rate": 7.695812160504644e-07, + "loss": 0.0184, + "reward": 1.5687530040740967, + "reward_std": 0.29624682664871216, + "rewards/accuracy_reward_stage2": 0.5687530040740967, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1316 + }, + { + "completion_length": 7.578125, + "epoch": 0.23076923076923078, + "grad_norm": 18.37249053762782, + "kl": 0.0849609375, + "learning_rate": 7.694059926406168e-07, + "loss": -0.0102, + "reward": 1.5225424766540527, + "reward_std": 0.23137205839157104, + "rewards/accuracy_reward_stage2": 0.5381674766540527, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1317 + }, + { + "completion_length": 8.265625, + "epoch": 0.23094445417907833, + "grad_norm": 24.39965195019764, + "kl": 0.10302734375, + "learning_rate": 7.692307692307693e-07, + "loss": 0.0412, + "reward": 1.7179219722747803, + "reward_std": 0.25506922602653503, + "rewards/accuracy_reward_stage2": 0.7179219126701355, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1318 + }, + { + "completion_length": 10.90625, + "epoch": 0.23111967758892588, + "grad_norm": 22.134302049761775, + "kl": 0.10498046875, + "learning_rate": 7.690555458209216e-07, + "loss": -0.0022, + "reward": 1.3819736242294312, + "reward_std": 0.24998074769973755, + "rewards/accuracy_reward_stage2": 0.3975986838340759, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1319 + }, + { + "completion_length": 13.703125, + "epoch": 0.23129490099877342, + "grad_norm": 28.60472856199761, + "kl": 0.048583984375, + "learning_rate": 7.68880322411074e-07, + "loss": 0.0194, + "reward": 1.5254038572311401, + "reward_std": 0.2446960210800171, + "rewards/accuracy_reward_stage2": 0.5254038572311401, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1320 + }, + { + "completion_length": 9.9375, + "epoch": 0.231470124408621, + "grad_norm": 13.988686696055257, + "kl": 0.03271484375, + "learning_rate": 7.687050990012265e-07, + "loss": -0.0312, + "reward": 1.84375, + "reward_std": 0.1462520956993103, + "rewards/accuracy_reward_stage2": 0.859375, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1321 + }, + { + "completion_length": 11.578125, + "epoch": 0.23164534781846854, + "grad_norm": 18.07158760655491, + "kl": 0.1171875, + "learning_rate": 7.685298755913789e-07, + "loss": 0.0033, + "reward": 1.8339933156967163, + "reward_std": 0.2281760275363922, + "rewards/accuracy_reward_stage2": 0.8496183156967163, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1322 + }, + { + "completion_length": 7.75, + "epoch": 0.2318205712283161, + "grad_norm": 26.39845456134637, + "kl": 0.0947265625, + "learning_rate": 7.683546521815314e-07, + "loss": 0.0023, + "reward": 1.6090365648269653, + "reward_std": 0.2769169807434082, + "rewards/accuracy_reward_stage2": 0.6246616244316101, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1323 + }, + { + "completion_length": 9.8125, + "epoch": 0.23199579463816367, + "grad_norm": 22.387896866550953, + "kl": 0.08544921875, + "learning_rate": 7.681794287716839e-07, + "loss": 0.0341, + "reward": 1.5603952407836914, + "reward_std": 0.253578782081604, + "rewards/accuracy_reward_stage2": 0.5603952407836914, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1324 + }, + { + "completion_length": 10.96875, + "epoch": 0.2321710180480112, + "grad_norm": 17.495621738171565, + "kl": 0.01275634765625, + "learning_rate": 7.680042053618363e-07, + "loss": 0.0051, + "reward": 1.5471508502960205, + "reward_std": 0.09170855581760406, + "rewards/accuracy_reward_stage2": 0.5471509099006653, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1325 + }, + { + "completion_length": 7.453125, + "epoch": 0.23234624145785876, + "grad_norm": 20.347795944323746, + "kl": 0.083984375, + "learning_rate": 7.678289819519888e-07, + "loss": 0.0336, + "reward": 1.5921326875686646, + "reward_std": 0.21103455126285553, + "rewards/accuracy_reward_stage2": 0.7171327471733093, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1326 + }, + { + "completion_length": 8.140625, + "epoch": 0.23252146486770633, + "grad_norm": 18.193345428480434, + "kl": 0.1015625, + "learning_rate": 7.676537585421412e-07, + "loss": 0.0072, + "reward": 1.6968261003494263, + "reward_std": 0.2153952419757843, + "rewards/accuracy_reward_stage2": 0.7124510407447815, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1327 + }, + { + "completion_length": 11.46875, + "epoch": 0.23269668827755388, + "grad_norm": 26.928705144530184, + "kl": 0.056640625, + "learning_rate": 7.674785351322936e-07, + "loss": 0.0227, + "reward": 1.645999789237976, + "reward_std": 0.19765979051589966, + "rewards/accuracy_reward_stage2": 0.6459997892379761, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1328 + }, + { + "completion_length": 11.828125, + "epoch": 0.23287191168740143, + "grad_norm": 31.28999988787986, + "kl": 0.1591796875, + "learning_rate": 7.673033117224461e-07, + "loss": 0.03, + "reward": 1.1927083730697632, + "reward_std": 0.2777610421180725, + "rewards/accuracy_reward_stage2": 0.4583333134651184, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1329 + }, + { + "completion_length": 7.453125, + "epoch": 0.233047135097249, + "grad_norm": 12.114613846952071, + "kl": 0.14453125, + "learning_rate": 7.671280883125985e-07, + "loss": 0.0579, + "reward": 1.640345811843872, + "reward_std": 0.20387586951255798, + "rewards/accuracy_reward_stage2": 0.6403458118438721, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1330 + }, + { + "completion_length": 6.4375, + "epoch": 0.23322235850709655, + "grad_norm": 36.1777020029657, + "kl": 0.04833984375, + "learning_rate": 7.66952864902751e-07, + "loss": 0.0194, + "reward": 1.5745203495025635, + "reward_std": 0.12884950637817383, + "rewards/accuracy_reward_stage2": 0.5745203495025635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1331 + }, + { + "completion_length": 5.765625, + "epoch": 0.2333975819169441, + "grad_norm": 21.644659188058338, + "kl": 0.0390625, + "learning_rate": 7.667776414929035e-07, + "loss": 0.0156, + "reward": 1.721125602722168, + "reward_std": 0.21974197030067444, + "rewards/accuracy_reward_stage2": 0.7211256623268127, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1332 + }, + { + "completion_length": 8.046875, + "epoch": 0.23357280532679167, + "grad_norm": 21.40238993393703, + "kl": 0.11376953125, + "learning_rate": 7.666024180830558e-07, + "loss": 0.0455, + "reward": 1.632363200187683, + "reward_std": 0.19989193975925446, + "rewards/accuracy_reward_stage2": 0.6323632001876831, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1333 + }, + { + "completion_length": 10.4375, + "epoch": 0.23374802873663922, + "grad_norm": 17.30648927448642, + "kl": 0.1318359375, + "learning_rate": 7.664271946732083e-07, + "loss": 0.0529, + "reward": 1.3868898153305054, + "reward_std": 0.08634155243635178, + "rewards/accuracy_reward_stage2": 0.6368898153305054, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1334 + }, + { + "completion_length": 7.875, + "epoch": 0.23392325214648677, + "grad_norm": 17.350471710700887, + "kl": 0.0625, + "learning_rate": 7.662519712633607e-07, + "loss": 0.025, + "reward": 1.398812174797058, + "reward_std": 0.14234712719917297, + "rewards/accuracy_reward_stage2": 0.3988121747970581, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1335 + }, + { + "completion_length": 12.3125, + "epoch": 0.23409847555633434, + "grad_norm": 3030.131469498715, + "kl": 10.0, + "learning_rate": 7.660767478535132e-07, + "loss": 3.9625, + "reward": 1.5227144956588745, + "reward_std": 0.24410173296928406, + "rewards/accuracy_reward_stage2": 0.5383394956588745, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1336 + }, + { + "completion_length": 12.609375, + "epoch": 0.2342736989661819, + "grad_norm": 15.117023305589338, + "kl": 0.07958984375, + "learning_rate": 7.659015244436657e-07, + "loss": -0.0124, + "reward": 1.3785531520843506, + "reward_std": 0.16757111251354218, + "rewards/accuracy_reward_stage2": 0.5191780924797058, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1337 + }, + { + "completion_length": 8.03125, + "epoch": 0.23444892237602943, + "grad_norm": 22.17685357158358, + "kl": 0.036865234375, + "learning_rate": 7.65726301033818e-07, + "loss": 0.0147, + "reward": 1.6041667461395264, + "reward_std": 0.26043471693992615, + "rewards/accuracy_reward_stage2": 0.6041666269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1338 + }, + { + "completion_length": 10.921875, + "epoch": 0.23462414578587698, + "grad_norm": 22.342387603619894, + "kl": 0.177734375, + "learning_rate": 7.655510776239705e-07, + "loss": 0.0711, + "reward": 1.3732129335403442, + "reward_std": 0.24331887066364288, + "rewards/accuracy_reward_stage2": 0.49821293354034424, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1339 + }, + { + "completion_length": 21.734375, + "epoch": 0.23479936919572456, + "grad_norm": 16.78143151695997, + "kl": 0.05615234375, + "learning_rate": 7.65375854214123e-07, + "loss": 0.0225, + "reward": 1.254507064819336, + "reward_std": 0.12991392612457275, + "rewards/accuracy_reward_stage2": 0.3795071244239807, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1340 + }, + { + "completion_length": 10.90625, + "epoch": 0.2349745926055721, + "grad_norm": 22.350945720327807, + "kl": 0.0986328125, + "learning_rate": 7.652006308042754e-07, + "loss": 0.0396, + "reward": 1.622948408126831, + "reward_std": 0.242707759141922, + "rewards/accuracy_reward_stage2": 0.622948408126831, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1341 + }, + { + "completion_length": 9.578125, + "epoch": 0.23514981601541965, + "grad_norm": 9.096876079815528, + "kl": 0.0308837890625, + "learning_rate": 7.650254073944279e-07, + "loss": 0.0124, + "reward": 1.5761775970458984, + "reward_std": 0.022409232333302498, + "rewards/accuracy_reward_stage2": 0.7011775970458984, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1342 + }, + { + "completion_length": 7.984375, + "epoch": 0.23532503942526722, + "grad_norm": 17.327402970574514, + "kl": 0.10009765625, + "learning_rate": 7.648501839845803e-07, + "loss": -0.0041, + "reward": 1.7431111335754395, + "reward_std": 0.2279052436351776, + "rewards/accuracy_reward_stage2": 0.7587360143661499, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1343 + }, + { + "completion_length": 11.03125, + "epoch": 0.23550026283511477, + "grad_norm": 20.08448032414623, + "kl": 0.10009765625, + "learning_rate": 7.646749605747328e-07, + "loss": 0.0066, + "reward": 1.4869825839996338, + "reward_std": 0.26277047395706177, + "rewards/accuracy_reward_stage2": 0.5026075839996338, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1344 + }, + { + "completion_length": 6.359375, + "epoch": 0.23567548624496232, + "grad_norm": 16.92679779383406, + "kl": 0.078125, + "learning_rate": 7.644997371648852e-07, + "loss": -0.013, + "reward": 1.6867897510528564, + "reward_std": 0.15283793210983276, + "rewards/accuracy_reward_stage2": 0.7024147510528564, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1345 + }, + { + "completion_length": 10.765625, + "epoch": 0.2358507096548099, + "grad_norm": 14.929758046595353, + "kl": 0.037109375, + "learning_rate": 7.643245137550376e-07, + "loss": -0.0293, + "reward": 1.4871182441711426, + "reward_std": 0.12831583619117737, + "rewards/accuracy_reward_stage2": 0.5027433037757874, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1346 + }, + { + "completion_length": 15.0625, + "epoch": 0.23602593306465744, + "grad_norm": 26.23246360003974, + "kl": 0.052978515625, + "learning_rate": 7.641492903451901e-07, + "loss": 0.0211, + "reward": 1.2774174213409424, + "reward_std": 0.26563411951065063, + "rewards/accuracy_reward_stage2": 0.4024173617362976, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1347 + }, + { + "completion_length": 14.171875, + "epoch": 0.236201156474505, + "grad_norm": 26.44971956693815, + "kl": 0.08056640625, + "learning_rate": 7.639740669353425e-07, + "loss": 0.0322, + "reward": 1.3588321208953857, + "reward_std": 0.3321700990200043, + "rewards/accuracy_reward_stage2": 0.48383209109306335, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1348 + }, + { + "completion_length": 6.609375, + "epoch": 0.23637637988435256, + "grad_norm": 17.11005799023166, + "kl": 0.0849609375, + "learning_rate": 7.637988435254949e-07, + "loss": -0.0103, + "reward": 1.85406494140625, + "reward_std": 0.1711689531803131, + "rewards/accuracy_reward_stage2": 0.8696897625923157, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1349 + }, + { + "completion_length": 11.359375, + "epoch": 0.2365516032942001, + "grad_norm": 5.892884889471868, + "kl": 0.0179443359375, + "learning_rate": 7.636236201156474e-07, + "loss": 0.0072, + "reward": 1.640625, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.640625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1350 + }, + { + "completion_length": 9.171875, + "epoch": 0.23672682670404765, + "grad_norm": 20.400597342361245, + "kl": 0.0966796875, + "learning_rate": 7.634483967057998e-07, + "loss": 0.0386, + "reward": 1.3030681610107422, + "reward_std": 0.18530681729316711, + "rewards/accuracy_reward_stage2": 0.5530681610107422, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1351 + }, + { + "completion_length": 6.84375, + "epoch": 0.23690205011389523, + "grad_norm": 26.95413948025918, + "kl": 0.150390625, + "learning_rate": 7.632731732959523e-07, + "loss": 0.0602, + "reward": 1.445420742034912, + "reward_std": 0.3342668116092682, + "rewards/accuracy_reward_stage2": 0.5860457420349121, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1352 + }, + { + "completion_length": 11.703125, + "epoch": 0.23707727352374278, + "grad_norm": 26.432871876528985, + "kl": 0.057373046875, + "learning_rate": 7.630979498861048e-07, + "loss": 0.0229, + "reward": 1.5638206005096436, + "reward_std": 0.2039935439825058, + "rewards/accuracy_reward_stage2": 0.5638206005096436, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1353 + }, + { + "completion_length": 23.75, + "epoch": 0.23725249693359032, + "grad_norm": 24.686417389604138, + "kl": 0.09716796875, + "learning_rate": 7.629227264762572e-07, + "loss": 0.0389, + "reward": 1.6011197566986084, + "reward_std": 0.13856875896453857, + "rewards/accuracy_reward_stage2": 0.601119875907898, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1354 + }, + { + "completion_length": 12.828125, + "epoch": 0.23742772034343787, + "grad_norm": 21.37557254466655, + "kl": 0.064453125, + "learning_rate": 7.627475030664097e-07, + "loss": 0.0259, + "reward": 1.537844181060791, + "reward_std": 0.2930099666118622, + "rewards/accuracy_reward_stage2": 0.5378442406654358, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1355 + }, + { + "completion_length": 14.71875, + "epoch": 0.23760294375328544, + "grad_norm": 23.396625871669126, + "kl": 0.251953125, + "learning_rate": 7.625722796565622e-07, + "loss": 0.062, + "reward": 1.6967604160308838, + "reward_std": 0.1177949458360672, + "rewards/accuracy_reward_stage2": 0.8373852968215942, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1356 + }, + { + "completion_length": 9.09375, + "epoch": 0.237778167163133, + "grad_norm": 17.31944187214885, + "kl": 0.12255859375, + "learning_rate": 7.623970562467146e-07, + "loss": 0.0274, + "reward": 1.5858019590377808, + "reward_std": 0.09252850711345673, + "rewards/accuracy_reward_stage2": 0.7264269590377808, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1357 + }, + { + "completion_length": 7.328125, + "epoch": 0.23795339057298054, + "grad_norm": 30.80854639682047, + "kl": 0.11474609375, + "learning_rate": 7.622218328368669e-07, + "loss": 0.0459, + "reward": 1.5312702655792236, + "reward_std": 0.3283507525920868, + "rewards/accuracy_reward_stage2": 0.5312702655792236, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1358 + }, + { + "completion_length": 14.46875, + "epoch": 0.2381286139828281, + "grad_norm": 23.954151165638518, + "kl": 0.1103515625, + "learning_rate": 7.620466094270193e-07, + "loss": 0.0442, + "reward": 1.4710814952850342, + "reward_std": 0.2931392788887024, + "rewards/accuracy_reward_stage2": 0.5960814952850342, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1359 + }, + { + "completion_length": 6.703125, + "epoch": 0.23830383739267566, + "grad_norm": 17.772351813432653, + "kl": 0.03369140625, + "learning_rate": 7.618713860171718e-07, + "loss": 0.0135, + "reward": 1.5546684265136719, + "reward_std": 0.2100159227848053, + "rewards/accuracy_reward_stage2": 0.5546684265136719, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1360 + }, + { + "completion_length": 14.09375, + "epoch": 0.2384790608025232, + "grad_norm": 26.058587993641353, + "kl": 0.080078125, + "learning_rate": 7.616961626073243e-07, + "loss": 0.032, + "reward": 1.7208819389343262, + "reward_std": 0.2127433568239212, + "rewards/accuracy_reward_stage2": 0.7208819389343262, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1361 + }, + { + "completion_length": 5.625, + "epoch": 0.23865428421237078, + "grad_norm": 12.00886325535751, + "kl": 0.035400390625, + "learning_rate": 7.615209391974767e-07, + "loss": 0.0142, + "reward": 1.6302083730697632, + "reward_std": 0.16082212328910828, + "rewards/accuracy_reward_stage2": 0.6302083730697632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1362 + }, + { + "completion_length": 10.0625, + "epoch": 0.23882950762221833, + "grad_norm": 18.772731912132876, + "kl": 0.0673828125, + "learning_rate": 7.613457157876292e-07, + "loss": 0.0269, + "reward": 1.3479543924331665, + "reward_std": 0.2074214518070221, + "rewards/accuracy_reward_stage2": 0.47295433282852173, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1363 + }, + { + "completion_length": 10.3125, + "epoch": 0.23900473103206588, + "grad_norm": 34.46678613000854, + "kl": 0.11865234375, + "learning_rate": 7.611704923777817e-07, + "loss": 0.0475, + "reward": 1.659529209136963, + "reward_std": 0.219321608543396, + "rewards/accuracy_reward_stage2": 0.6595291495323181, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1364 + }, + { + "completion_length": 10.40625, + "epoch": 0.23917995444191345, + "grad_norm": 17.89859350913036, + "kl": 0.049560546875, + "learning_rate": 7.609952689679341e-07, + "loss": 0.0198, + "reward": 1.6767587661743164, + "reward_std": 0.16518397629261017, + "rewards/accuracy_reward_stage2": 0.6767587661743164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1365 + }, + { + "completion_length": 10.171875, + "epoch": 0.239355177851761, + "grad_norm": 19.233591594971617, + "kl": 0.1337890625, + "learning_rate": 7.608200455580866e-07, + "loss": 0.0176, + "reward": 1.380042314529419, + "reward_std": 0.17864108085632324, + "rewards/accuracy_reward_stage2": 0.3956674337387085, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1366 + }, + { + "completion_length": 19.609375, + "epoch": 0.23953040126160854, + "grad_norm": 23.0646300156008, + "kl": 0.1943359375, + "learning_rate": 7.60644822148239e-07, + "loss": 0.0336, + "reward": 1.3207111358642578, + "reward_std": 0.2439693659543991, + "rewards/accuracy_reward_stage2": 0.46133607625961304, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1367 + }, + { + "completion_length": 6.234375, + "epoch": 0.23970562467145612, + "grad_norm": 31.815325142177, + "kl": 0.04248046875, + "learning_rate": 7.604695987383914e-07, + "loss": 0.017, + "reward": 1.5883839130401611, + "reward_std": 0.17570313811302185, + "rewards/accuracy_reward_stage2": 0.5883838534355164, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1368 + }, + { + "completion_length": 11.6875, + "epoch": 0.23988084808130367, + "grad_norm": 21.302564625440397, + "kl": 0.11279296875, + "learning_rate": 7.602943753285439e-07, + "loss": 0.045, + "reward": 1.4860469102859497, + "reward_std": 0.26849985122680664, + "rewards/accuracy_reward_stage2": 0.4860469698905945, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1369 + }, + { + "completion_length": 10.890625, + "epoch": 0.2400560714911512, + "grad_norm": 24.77217627681895, + "kl": 0.09228515625, + "learning_rate": 7.601191519186963e-07, + "loss": -0.0067, + "reward": 1.4838837385177612, + "reward_std": 0.2148168683052063, + "rewards/accuracy_reward_stage2": 0.49950873851776123, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1370 + }, + { + "completion_length": 9.28125, + "epoch": 0.24023129490099876, + "grad_norm": 17.564101825345425, + "kl": 0.07275390625, + "learning_rate": 7.599439285088487e-07, + "loss": -0.0152, + "reward": 1.560983657836914, + "reward_std": 0.17692884802818298, + "rewards/accuracy_reward_stage2": 0.5766085982322693, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1371 + }, + { + "completion_length": 16.21875, + "epoch": 0.24040651831084633, + "grad_norm": 26.161195250356116, + "kl": 0.0289306640625, + "learning_rate": 7.597687050990011e-07, + "loss": -0.0326, + "reward": 1.6595048904418945, + "reward_std": 0.16430558264255524, + "rewards/accuracy_reward_stage2": 0.6751298308372498, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1372 + }, + { + "completion_length": 10.515625, + "epoch": 0.24058174172069388, + "grad_norm": 18.986393795275184, + "kl": 0.09375, + "learning_rate": 7.595934816891536e-07, + "loss": 0.0072, + "reward": 1.644540548324585, + "reward_std": 0.18388310074806213, + "rewards/accuracy_reward_stage2": 0.6601656675338745, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1373 + }, + { + "completion_length": 20.140625, + "epoch": 0.24075696513054143, + "grad_norm": 24.46065165314992, + "kl": 0.1103515625, + "learning_rate": 7.594182582793061e-07, + "loss": 0.0441, + "reward": 1.6360423564910889, + "reward_std": 0.13997207581996918, + "rewards/accuracy_reward_stage2": 0.6360422372817993, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1374 + }, + { + "completion_length": 12.078125, + "epoch": 0.240932188540389, + "grad_norm": 33.907210977500775, + "kl": 0.0947265625, + "learning_rate": 7.592430348694585e-07, + "loss": 0.0379, + "reward": 1.600438117980957, + "reward_std": 0.2929736375808716, + "rewards/accuracy_reward_stage2": 0.6004381775856018, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1375 + }, + { + "completion_length": 8.40625, + "epoch": 0.24110741195023655, + "grad_norm": 47.093239141282, + "kl": 0.298828125, + "learning_rate": 7.59067811459611e-07, + "loss": 0.0861, + "reward": 1.4094713926315308, + "reward_std": 0.1661936342716217, + "rewards/accuracy_reward_stage2": 0.5500965118408203, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1376 + }, + { + "completion_length": 9.734375, + "epoch": 0.2412826353600841, + "grad_norm": 27.281382236546175, + "kl": 0.0859375, + "learning_rate": 7.588925880497635e-07, + "loss": 0.0343, + "reward": 1.6173287630081177, + "reward_std": 0.25000786781311035, + "rewards/accuracy_reward_stage2": 0.6173287630081177, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1377 + }, + { + "completion_length": 18.359375, + "epoch": 0.24145785876993167, + "grad_norm": 25.230940558343345, + "kl": 0.1435546875, + "learning_rate": 7.587173646399158e-07, + "loss": 0.0576, + "reward": 1.389910340309143, + "reward_std": 0.24498048424720764, + "rewards/accuracy_reward_stage2": 0.5149103999137878, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1378 + }, + { + "completion_length": 19.234375, + "epoch": 0.24163308217977922, + "grad_norm": 16.820536169236927, + "kl": 0.01904296875, + "learning_rate": 7.585421412300683e-07, + "loss": 0.0076, + "reward": 1.838128685951233, + "reward_std": 0.15188559889793396, + "rewards/accuracy_reward_stage2": 0.8381286859512329, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1379 + }, + { + "completion_length": 13.34375, + "epoch": 0.24180830558962677, + "grad_norm": 32.13695119853367, + "kl": 0.1474609375, + "learning_rate": 7.583669178202208e-07, + "loss": 0.0245, + "reward": 1.5661542415618896, + "reward_std": 0.20494423806667328, + "rewards/accuracy_reward_stage2": 0.7067792415618896, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1380 + }, + { + "completion_length": 13.828125, + "epoch": 0.24198352899947434, + "grad_norm": 25.714738880686244, + "kl": 0.10986328125, + "learning_rate": 7.581916944103732e-07, + "loss": 0.0437, + "reward": 1.4965579509735107, + "reward_std": 0.2931256890296936, + "rewards/accuracy_reward_stage2": 0.4965580105781555, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1381 + }, + { + "completion_length": 16.625, + "epoch": 0.2421587524093219, + "grad_norm": 25.706780804110668, + "kl": 0.1826171875, + "learning_rate": 7.580164710005257e-07, + "loss": 0.073, + "reward": 1.5389642715454102, + "reward_std": 0.20363156497478485, + "rewards/accuracy_reward_stage2": 0.6639642715454102, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1382 + }, + { + "completion_length": 13.140625, + "epoch": 0.24233397581916943, + "grad_norm": 22.98550952191299, + "kl": 0.056640625, + "learning_rate": 7.578412475906781e-07, + "loss": 0.0226, + "reward": 1.11177396774292, + "reward_std": 0.140178382396698, + "rewards/accuracy_reward_stage2": 0.3617740869522095, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1383 + }, + { + "completion_length": 11.890625, + "epoch": 0.242509199229017, + "grad_norm": 11.553671486294625, + "kl": 0.0208740234375, + "learning_rate": 7.576660241808305e-07, + "loss": 0.0083, + "reward": 1.509393572807312, + "reward_std": 0.061819229274988174, + "rewards/accuracy_reward_stage2": 0.509393572807312, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1384 + }, + { + "completion_length": 13.65625, + "epoch": 0.24268442263886456, + "grad_norm": 16.0570012110359, + "kl": 0.0830078125, + "learning_rate": 7.57490800770983e-07, + "loss": -0.012, + "reward": 1.3893593549728394, + "reward_std": 0.1712827980518341, + "rewards/accuracy_reward_stage2": 0.42060935497283936, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1385 + }, + { + "completion_length": 10.484375, + "epoch": 0.2428596460487121, + "grad_norm": 21.494736968570507, + "kl": 0.12158203125, + "learning_rate": 7.573155773611354e-07, + "loss": 0.0044, + "reward": 1.6255983114242554, + "reward_std": 0.26709023118019104, + "rewards/accuracy_reward_stage2": 0.6412232518196106, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1386 + }, + { + "completion_length": 11.96875, + "epoch": 0.24303486945855968, + "grad_norm": 24.6418008716554, + "kl": 0.07958984375, + "learning_rate": 7.571403539512879e-07, + "loss": -0.0123, + "reward": 1.5523234605789185, + "reward_std": 0.3348226249217987, + "rewards/accuracy_reward_stage2": 0.5679484009742737, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1387 + }, + { + "completion_length": 10.203125, + "epoch": 0.24321009286840722, + "grad_norm": 16.023201165440923, + "kl": 0.0966796875, + "learning_rate": 7.569651305414402e-07, + "loss": 0.0004, + "reward": 1.6837525367736816, + "reward_std": 0.1582503616809845, + "rewards/accuracy_reward_stage2": 0.6993776559829712, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1388 + }, + { + "completion_length": 6.203125, + "epoch": 0.24338531627825477, + "grad_norm": 21.374905539716238, + "kl": 0.103515625, + "learning_rate": 7.567899071315927e-07, + "loss": -0.0381, + "reward": 1.5039560794830322, + "reward_std": 0.363511860370636, + "rewards/accuracy_reward_stage2": 0.535206139087677, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1389 + }, + { + "completion_length": 9.9375, + "epoch": 0.24356053968810232, + "grad_norm": 20.894175897950706, + "kl": 0.072265625, + "learning_rate": 7.566146837217452e-07, + "loss": -0.0154, + "reward": 1.888974666595459, + "reward_std": 0.16132690012454987, + "rewards/accuracy_reward_stage2": 0.9045996069908142, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1390 + }, + { + "completion_length": 10.515625, + "epoch": 0.2437357630979499, + "grad_norm": 21.923229610078614, + "kl": 0.1572265625, + "learning_rate": 7.564394603118976e-07, + "loss": 0.0628, + "reward": 1.4778380393981934, + "reward_std": 0.2118120789527893, + "rewards/accuracy_reward_stage2": 0.6028379797935486, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1391 + }, + { + "completion_length": 8.359375, + "epoch": 0.24391098650779744, + "grad_norm": 25.056572660788657, + "kl": 0.11474609375, + "learning_rate": 7.562642369020501e-07, + "loss": 0.0459, + "reward": 1.3547465801239014, + "reward_std": 0.26677781343460083, + "rewards/accuracy_reward_stage2": 0.3547465205192566, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1392 + }, + { + "completion_length": 10.171875, + "epoch": 0.244086209917645, + "grad_norm": 19.10810836995206, + "kl": 0.166015625, + "learning_rate": 7.560890134922026e-07, + "loss": 0.0118, + "reward": 1.3078572750091553, + "reward_std": 0.2675933539867401, + "rewards/accuracy_reward_stage2": 0.4641074240207672, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1393 + }, + { + "completion_length": 9.6875, + "epoch": 0.24426143332749256, + "grad_norm": 20.465538641356297, + "kl": 0.0869140625, + "learning_rate": 7.55913790082355e-07, + "loss": 0.0059, + "reward": 1.4665038585662842, + "reward_std": 0.20319469273090363, + "rewards/accuracy_reward_stage2": 0.6071288585662842, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1394 + }, + { + "completion_length": 7.703125, + "epoch": 0.2444366567373401, + "grad_norm": 102.19387315630294, + "kl": 0.0849609375, + "learning_rate": 7.557385666725075e-07, + "loss": 0.0228, + "reward": 1.081196904182434, + "reward_std": 0.2433708757162094, + "rewards/accuracy_reward_stage2": 0.3311968743801117, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1395 + }, + { + "completion_length": 7.875, + "epoch": 0.24461188014718765, + "grad_norm": 20.99816953808676, + "kl": 0.056640625, + "learning_rate": 7.555633432626598e-07, + "loss": 0.0227, + "reward": 1.4936261177062988, + "reward_std": 0.24948295950889587, + "rewards/accuracy_reward_stage2": 0.6186261773109436, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1396 + }, + { + "completion_length": 9.28125, + "epoch": 0.24478710355703523, + "grad_norm": 20.247613253089263, + "kl": 0.041748046875, + "learning_rate": 7.553881198528122e-07, + "loss": -0.0146, + "reward": 1.596388816833496, + "reward_std": 0.24301937222480774, + "rewards/accuracy_reward_stage2": 0.6120138168334961, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1397 + }, + { + "completion_length": 12.84375, + "epoch": 0.24496232696688278, + "grad_norm": 18.695765728375, + "kl": 0.11767578125, + "learning_rate": 7.552128964429647e-07, + "loss": 0.0472, + "reward": 1.503840684890747, + "reward_std": 0.07957211136817932, + "rewards/accuracy_reward_stage2": 0.6288406848907471, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1398 + }, + { + "completion_length": 10.828125, + "epoch": 0.24513755037673032, + "grad_norm": 18.080296158546545, + "kl": 0.06396484375, + "learning_rate": 7.550376730331171e-07, + "loss": 0.0256, + "reward": 1.3729963302612305, + "reward_std": 0.14897583425045013, + "rewards/accuracy_reward_stage2": 0.4979962408542633, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1399 + }, + { + "completion_length": 7.625, + "epoch": 0.2453127737865779, + "grad_norm": 15.46236736095634, + "kl": 0.045654296875, + "learning_rate": 7.548624496232696e-07, + "loss": 0.0183, + "reward": 1.765625, + "reward_std": 0.1804211586713791, + "rewards/accuracy_reward_stage2": 0.765625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1400 + }, + { + "completion_length": 6.640625, + "epoch": 0.24548799719642544, + "grad_norm": 24.35220226987739, + "kl": 0.083984375, + "learning_rate": 7.546872262134221e-07, + "loss": -0.0106, + "reward": 1.58424711227417, + "reward_std": 0.31146925687789917, + "rewards/accuracy_reward_stage2": 0.5998721122741699, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1401 + }, + { + "completion_length": 12.609375, + "epoch": 0.245663220606273, + "grad_norm": 12.630191645005317, + "kl": 0.058349609375, + "learning_rate": 7.545120028035745e-07, + "loss": 0.0234, + "reward": 1.3900747299194336, + "reward_std": 0.11785703897476196, + "rewards/accuracy_reward_stage2": 0.39007464051246643, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1402 + }, + { + "completion_length": 9.984375, + "epoch": 0.24583844401612057, + "grad_norm": 19.35825390615252, + "kl": 0.053955078125, + "learning_rate": 7.54336779393727e-07, + "loss": -0.0226, + "reward": 1.5636982917785645, + "reward_std": 0.2708263397216797, + "rewards/accuracy_reward_stage2": 0.5793232321739197, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1403 + }, + { + "completion_length": 9.625, + "epoch": 0.2460136674259681, + "grad_norm": 21.76949510719285, + "kl": 0.037353515625, + "learning_rate": 7.541615559838794e-07, + "loss": 0.0149, + "reward": 1.489206075668335, + "reward_std": 0.16451683640480042, + "rewards/accuracy_reward_stage2": 0.6142061948776245, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1404 + }, + { + "completion_length": 9.046875, + "epoch": 0.24618889083581566, + "grad_norm": 60.765544867318496, + "kl": 0.404296875, + "learning_rate": 7.539863325740319e-07, + "loss": 0.162, + "reward": 1.5769790410995483, + "reward_std": 0.16119015216827393, + "rewards/accuracy_reward_stage2": 0.8269790410995483, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1405 + }, + { + "completion_length": 19.171875, + "epoch": 0.2463641142456632, + "grad_norm": 27.58976702655693, + "kl": 0.15234375, + "learning_rate": 7.538111091641844e-07, + "loss": 0.061, + "reward": 1.7771577835083008, + "reward_std": 0.20885083079338074, + "rewards/accuracy_reward_stage2": 0.902157723903656, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1406 + }, + { + "completion_length": 13.96875, + "epoch": 0.24653933765551078, + "grad_norm": 15.13368734660908, + "kl": 0.0654296875, + "learning_rate": 7.536358857543368e-07, + "loss": -0.0586, + "reward": 1.3415193557739258, + "reward_std": 0.15027299523353577, + "rewards/accuracy_reward_stage2": 0.37276941537857056, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1407 + }, + { + "completion_length": 12.5, + "epoch": 0.24671456106535833, + "grad_norm": 18.73681435107374, + "kl": 0.040771484375, + "learning_rate": 7.534606623444892e-07, + "loss": 0.0163, + "reward": 1.562386155128479, + "reward_std": 0.201747328042984, + "rewards/accuracy_reward_stage2": 0.562386155128479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1408 + }, + { + "completion_length": 9.21875, + "epoch": 0.24688978447520588, + "grad_norm": 22.44108350092471, + "kl": 0.044677734375, + "learning_rate": 7.532854389346416e-07, + "loss": 0.0179, + "reward": 1.6416747570037842, + "reward_std": 0.19642874598503113, + "rewards/accuracy_reward_stage2": 0.7666747570037842, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1409 + }, + { + "completion_length": 16.28125, + "epoch": 0.24706500788505345, + "grad_norm": 15.592344850737916, + "kl": 0.1376953125, + "learning_rate": 7.53110215524794e-07, + "loss": 0.055, + "reward": 1.1909624338150024, + "reward_std": 0.13738305866718292, + "rewards/accuracy_reward_stage2": 0.44096243381500244, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1410 + }, + { + "completion_length": 17.859375, + "epoch": 0.247240231294901, + "grad_norm": 29.324014213943627, + "kl": 0.244140625, + "learning_rate": 7.529349921149465e-07, + "loss": 0.0586, + "reward": 1.564818024635315, + "reward_std": 0.3653239607810974, + "rewards/accuracy_reward_stage2": 0.7054431438446045, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1411 + }, + { + "completion_length": 4.921875, + "epoch": 0.24741545470474854, + "grad_norm": 20.83423318481693, + "kl": 0.1259765625, + "learning_rate": 7.527597687050989e-07, + "loss": 0.0377, + "reward": 1.7137820720672607, + "reward_std": 0.2640119194984436, + "rewards/accuracy_reward_stage2": 0.7294071316719055, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1412 + }, + { + "completion_length": 12.53125, + "epoch": 0.24759067811459612, + "grad_norm": 18.45961976763546, + "kl": 0.06396484375, + "learning_rate": 7.525845452952514e-07, + "loss": 0.0255, + "reward": 1.313084363937378, + "reward_std": 0.25324708223342896, + "rewards/accuracy_reward_stage2": 0.4380842447280884, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1413 + }, + { + "completion_length": 12.421875, + "epoch": 0.24776590152444367, + "grad_norm": 36.03340900309115, + "kl": 0.049072265625, + "learning_rate": 7.524093218854039e-07, + "loss": 0.0196, + "reward": 1.501349925994873, + "reward_std": 0.25983327627182007, + "rewards/accuracy_reward_stage2": 0.5013498663902283, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1414 + }, + { + "completion_length": 12.8125, + "epoch": 0.2479411249342912, + "grad_norm": 32.04826873057793, + "kl": 0.2109375, + "learning_rate": 7.522340984755563e-07, + "loss": 0.0842, + "reward": 1.3114793300628662, + "reward_std": 0.19367793202400208, + "rewards/accuracy_reward_stage2": 0.5614794492721558, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1415 + }, + { + "completion_length": 6.90625, + "epoch": 0.2481163483441388, + "grad_norm": 17.831093251512257, + "kl": 0.03271484375, + "learning_rate": 7.520588750657088e-07, + "loss": 0.0131, + "reward": 1.59375, + "reward_std": 0.3255898952484131, + "rewards/accuracy_reward_stage2": 0.71875, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1416 + }, + { + "completion_length": 12.8125, + "epoch": 0.24829157175398633, + "grad_norm": 17.810874882373575, + "kl": 0.271484375, + "learning_rate": 7.518836516558613e-07, + "loss": 0.0648, + "reward": 1.532361626625061, + "reward_std": 0.21110737323760986, + "rewards/accuracy_reward_stage2": 0.6729865670204163, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1417 + }, + { + "completion_length": 9.84375, + "epoch": 0.24846679516383388, + "grad_norm": 15.057571439227704, + "kl": 0.0927734375, + "learning_rate": 7.517084282460136e-07, + "loss": 0.007, + "reward": 1.6341720819473267, + "reward_std": 0.13025765120983124, + "rewards/accuracy_reward_stage2": 0.6497971415519714, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1418 + }, + { + "completion_length": 11.65625, + "epoch": 0.24864201857368146, + "grad_norm": 14.375263032052567, + "kl": 0.0308837890625, + "learning_rate": 7.515332048361661e-07, + "loss": 0.0123, + "reward": 1.642590880393982, + "reward_std": 0.10421305894851685, + "rewards/accuracy_reward_stage2": 0.7675908803939819, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1419 + }, + { + "completion_length": 15.3125, + "epoch": 0.248817241983529, + "grad_norm": 23.254299329378373, + "kl": 0.04541015625, + "learning_rate": 7.513579814263185e-07, + "loss": 0.0182, + "reward": 1.7045319080352783, + "reward_std": 0.18549993634223938, + "rewards/accuracy_reward_stage2": 0.7045319080352783, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1420 + }, + { + "completion_length": 9.3125, + "epoch": 0.24899246539337655, + "grad_norm": 13.079817489563679, + "kl": 0.03515625, + "learning_rate": 7.51182758016471e-07, + "loss": -0.0184, + "reward": 1.5532286167144775, + "reward_std": 0.10524085909128189, + "rewards/accuracy_reward_stage2": 0.5688536167144775, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1421 + }, + { + "completion_length": 10.25, + "epoch": 0.24916768880322412, + "grad_norm": 23.29879296353814, + "kl": 0.0732421875, + "learning_rate": 7.510075346066234e-07, + "loss": 0.0014, + "reward": 1.4815268516540527, + "reward_std": 0.29361575841903687, + "rewards/accuracy_reward_stage2": 0.4971519112586975, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1422 + }, + { + "completion_length": 10.40625, + "epoch": 0.24934291221307167, + "grad_norm": 21.83091816412804, + "kl": 0.07763671875, + "learning_rate": 7.508323111967758e-07, + "loss": 0.0311, + "reward": 1.6659326553344727, + "reward_std": 0.23042136430740356, + "rewards/accuracy_reward_stage2": 0.6659327149391174, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1423 + }, + { + "completion_length": 7.125, + "epoch": 0.24951813562291922, + "grad_norm": 14.444543879346396, + "kl": 0.1572265625, + "learning_rate": 7.506570877869283e-07, + "loss": 0.0629, + "reward": 1.6293728351593018, + "reward_std": 0.1331539899110794, + "rewards/accuracy_reward_stage2": 0.6293728351593018, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1424 + }, + { + "completion_length": 11.453125, + "epoch": 0.24969335903276677, + "grad_norm": 18.51638467771239, + "kl": 0.03515625, + "learning_rate": 7.504818643770808e-07, + "loss": -0.0301, + "reward": 1.515625, + "reward_std": 0.20569033920764923, + "rewards/accuracy_reward_stage2": 0.53125, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1425 + }, + { + "completion_length": 8.5, + "epoch": 0.24986858244261434, + "grad_norm": 11.394857987905583, + "kl": 0.07275390625, + "learning_rate": 7.503066409672332e-07, + "loss": 0.0291, + "reward": 1.5371688604354858, + "reward_std": 0.10468746721744537, + "rewards/accuracy_reward_stage2": 0.5371688008308411, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1426 + }, + { + "completion_length": 11.859375, + "epoch": 0.2500438058524619, + "grad_norm": 19.613433131991332, + "kl": 0.09814453125, + "learning_rate": 7.501314175573856e-07, + "loss": 0.0393, + "reward": 1.4760699272155762, + "reward_std": 0.13601933419704437, + "rewards/accuracy_reward_stage2": 0.4760698676109314, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1427 + }, + { + "completion_length": 9.953125, + "epoch": 0.25021902926230943, + "grad_norm": 18.820985905928033, + "kl": 0.2041015625, + "learning_rate": 7.49956194147538e-07, + "loss": 0.0815, + "reward": 1.6965408325195312, + "reward_std": 0.18144288659095764, + "rewards/accuracy_reward_stage2": 0.6965407729148865, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1428 + }, + { + "completion_length": 10.0625, + "epoch": 0.250394252672157, + "grad_norm": 13.106608164948959, + "kl": 0.0242919921875, + "learning_rate": 7.497809707376905e-07, + "loss": 0.0097, + "reward": 1.4696911573410034, + "reward_std": 0.07452090829610825, + "rewards/accuracy_reward_stage2": 0.4696912169456482, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1429 + }, + { + "completion_length": 12.984375, + "epoch": 0.2505694760820046, + "grad_norm": 17.266848797712317, + "kl": 0.044677734375, + "learning_rate": 7.49605747327843e-07, + "loss": 0.0179, + "reward": 1.7010854482650757, + "reward_std": 0.13501934707164764, + "rewards/accuracy_reward_stage2": 0.7010855078697205, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1430 + }, + { + "completion_length": 6.875, + "epoch": 0.25074469949185213, + "grad_norm": 21.6197429247069, + "kl": 0.203125, + "learning_rate": 7.494305239179954e-07, + "loss": -0.0074, + "reward": 1.5226540565490723, + "reward_std": 0.29842448234558105, + "rewards/accuracy_reward_stage2": 0.5539040565490723, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1431 + }, + { + "completion_length": 8.390625, + "epoch": 0.2509199229016997, + "grad_norm": 28.866050092959014, + "kl": 0.042236328125, + "learning_rate": 7.492553005081479e-07, + "loss": 0.0169, + "reward": 1.445624828338623, + "reward_std": 0.173716202378273, + "rewards/accuracy_reward_stage2": 0.5706248879432678, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1432 + }, + { + "completion_length": 10.46875, + "epoch": 0.2510951463115472, + "grad_norm": 27.56143086547294, + "kl": 0.10107421875, + "learning_rate": 7.490800770983004e-07, + "loss": 0.0211, + "reward": 1.434044361114502, + "reward_std": 0.30634164810180664, + "rewards/accuracy_reward_stage2": 0.4496694803237915, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1433 + }, + { + "completion_length": 13.015625, + "epoch": 0.25127036972139477, + "grad_norm": 73.54643410324547, + "kl": 0.5546875, + "learning_rate": 7.489048536884528e-07, + "loss": 0.1939, + "reward": 1.631661295890808, + "reward_std": 0.22395049035549164, + "rewards/accuracy_reward_stage2": 0.7722861766815186, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1434 + }, + { + "completion_length": 5.46875, + "epoch": 0.2514455931312423, + "grad_norm": 21.097042169226135, + "kl": 0.2431640625, + "learning_rate": 7.487296302786052e-07, + "loss": 0.0198, + "reward": 1.3362268209457397, + "reward_std": 0.21675795316696167, + "rewards/accuracy_reward_stage2": 0.36747682094573975, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1435 + }, + { + "completion_length": 22.015625, + "epoch": 0.25162081654108986, + "grad_norm": 24.83305083490462, + "kl": 0.06689453125, + "learning_rate": 7.485544068687576e-07, + "loss": 0.0268, + "reward": 1.452586054801941, + "reward_std": 0.22090375423431396, + "rewards/accuracy_reward_stage2": 0.45258599519729614, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1436 + }, + { + "completion_length": 12.609375, + "epoch": 0.25179603995093747, + "grad_norm": 17.161128543951413, + "kl": 0.09912109375, + "learning_rate": 7.4837918345891e-07, + "loss": -0.0045, + "reward": 1.6552212238311768, + "reward_std": 0.17116889357566833, + "rewards/accuracy_reward_stage2": 0.670846164226532, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1437 + }, + { + "completion_length": 9.234375, + "epoch": 0.251971263360785, + "grad_norm": 19.6058479034139, + "kl": 0.09619140625, + "learning_rate": 7.482039600490625e-07, + "loss": 0.0384, + "reward": 1.6015892028808594, + "reward_std": 0.22905901074409485, + "rewards/accuracy_reward_stage2": 0.6015892624855042, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1438 + }, + { + "completion_length": 13.65625, + "epoch": 0.25214648677063256, + "grad_norm": 27.10964415369589, + "kl": 0.119140625, + "learning_rate": 7.480287366392149e-07, + "loss": 0.0297, + "reward": 1.3641960620880127, + "reward_std": 0.28837230801582336, + "rewards/accuracy_reward_stage2": 0.5048211812973022, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1439 + }, + { + "completion_length": 23.796875, + "epoch": 0.2523217101804801, + "grad_norm": 21.790024565848892, + "kl": 0.10498046875, + "learning_rate": 7.478535132293674e-07, + "loss": -0.0436, + "reward": 1.3245376348495483, + "reward_std": 0.2398291975259781, + "rewards/accuracy_reward_stage2": 0.35578760504722595, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1440 + }, + { + "completion_length": 13.921875, + "epoch": 0.25249693359032765, + "grad_norm": 25.97179485729417, + "kl": 0.25390625, + "learning_rate": 7.476782898195199e-07, + "loss": 0.0571, + "reward": 1.390436053276062, + "reward_std": 0.31526440382003784, + "rewards/accuracy_reward_stage2": 0.5310611128807068, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1441 + }, + { + "completion_length": 7.203125, + "epoch": 0.2526721570001752, + "grad_norm": 22.677557170391783, + "kl": 0.1640625, + "learning_rate": 7.475030664096723e-07, + "loss": -0.0452, + "reward": 1.5565773248672485, + "reward_std": 0.3190111517906189, + "rewards/accuracy_reward_stage2": 0.6034523248672485, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1442 + }, + { + "completion_length": 14.625, + "epoch": 0.2528473804100228, + "grad_norm": 17.705483157782556, + "kl": 0.09326171875, + "learning_rate": 7.473278429998248e-07, + "loss": 0.0374, + "reward": 1.362941026687622, + "reward_std": 0.1572730839252472, + "rewards/accuracy_reward_stage2": 0.36294102668762207, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1443 + }, + { + "completion_length": 10.171875, + "epoch": 0.25302260381987035, + "grad_norm": 17.715526509646416, + "kl": 0.0186767578125, + "learning_rate": 7.471526195899772e-07, + "loss": 0.0075, + "reward": 1.6852679252624512, + "reward_std": 0.21711409091949463, + "rewards/accuracy_reward_stage2": 0.6852678656578064, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1444 + }, + { + "completion_length": 8.96875, + "epoch": 0.2531978272297179, + "grad_norm": 21.70342349079483, + "kl": 0.046875, + "learning_rate": 7.469773961801297e-07, + "loss": 0.0188, + "reward": 1.4218826293945312, + "reward_std": 0.14263302087783813, + "rewards/accuracy_reward_stage2": 0.5468826293945312, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1445 + }, + { + "completion_length": 7.765625, + "epoch": 0.25337305063956544, + "grad_norm": 49.26411936487985, + "kl": 0.0189208984375, + "learning_rate": 7.468021727702822e-07, + "loss": -0.013, + "reward": 1.8323465585708618, + "reward_std": 0.1732073575258255, + "rewards/accuracy_reward_stage2": 0.8479715585708618, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1446 + }, + { + "completion_length": 13.59375, + "epoch": 0.253548274049413, + "grad_norm": 22.701810094153558, + "kl": 0.33984375, + "learning_rate": 7.466269493604344e-07, + "loss": 0.0922, + "reward": 1.13534414768219, + "reward_std": 0.25967109203338623, + "rewards/accuracy_reward_stage2": 0.4009692072868347, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1447 + }, + { + "completion_length": 14.109375, + "epoch": 0.25372349745926054, + "grad_norm": 15.881379556902235, + "kl": 0.08056640625, + "learning_rate": 7.464517259505869e-07, + "loss": 0.0104, + "reward": 1.5367052555084229, + "reward_std": 0.11349460482597351, + "rewards/accuracy_reward_stage2": 0.6773301362991333, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1448 + }, + { + "completion_length": 11.265625, + "epoch": 0.2538987208691081, + "grad_norm": 26.014993849480607, + "kl": 0.1591796875, + "learning_rate": 7.462765025407393e-07, + "loss": 0.0636, + "reward": 1.4904820919036865, + "reward_std": 0.25210410356521606, + "rewards/accuracy_reward_stage2": 0.6154820322990417, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1449 + }, + { + "completion_length": 13.640625, + "epoch": 0.2540739442789557, + "grad_norm": 21.598784966647813, + "kl": 0.09716796875, + "learning_rate": 7.461012791308918e-07, + "loss": 0.0106, + "reward": 1.38657546043396, + "reward_std": 0.1828770488500595, + "rewards/accuracy_reward_stage2": 0.51157546043396, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1450 + }, + { + "completion_length": 9.828125, + "epoch": 0.25424916768880323, + "grad_norm": 20.03192383804041, + "kl": 0.06787109375, + "learning_rate": 7.459260557210443e-07, + "loss": 0.0272, + "reward": 1.3679254055023193, + "reward_std": 0.20850923657417297, + "rewards/accuracy_reward_stage2": 0.3679255247116089, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1451 + }, + { + "completion_length": 9.640625, + "epoch": 0.2544243910986508, + "grad_norm": 19.753770296880482, + "kl": 0.0263671875, + "learning_rate": 7.457508323111967e-07, + "loss": 0.0105, + "reward": 1.7602910995483398, + "reward_std": 0.14263275265693665, + "rewards/accuracy_reward_stage2": 0.7602912187576294, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1452 + }, + { + "completion_length": 10.078125, + "epoch": 0.25459961450849833, + "grad_norm": 19.34286584256072, + "kl": 0.0849609375, + "learning_rate": 7.455756089013492e-07, + "loss": -0.0017, + "reward": 1.4176316261291504, + "reward_std": 0.205317884683609, + "rewards/accuracy_reward_stage2": 0.5582566261291504, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1453 + }, + { + "completion_length": 9.25, + "epoch": 0.2547748379183459, + "grad_norm": 19.34879254141021, + "kl": 0.0869140625, + "learning_rate": 7.454003854915017e-07, + "loss": 0.0348, + "reward": 1.580161690711975, + "reward_std": 0.18217667937278748, + "rewards/accuracy_reward_stage2": 0.5801616907119751, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1454 + }, + { + "completion_length": 18.734375, + "epoch": 0.2549500613281934, + "grad_norm": 39.93095842420503, + "kl": 0.19921875, + "learning_rate": 7.452251620816541e-07, + "loss": 0.0794, + "reward": 1.459136724472046, + "reward_std": 0.3002258241176605, + "rewards/accuracy_reward_stage2": 0.5841366052627563, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1455 + }, + { + "completion_length": 16.375, + "epoch": 0.255125284738041, + "grad_norm": 22.736610580425168, + "kl": 0.0269775390625, + "learning_rate": 7.450499386718066e-07, + "loss": 0.0011, + "reward": 1.651605248451233, + "reward_std": 0.13762909173965454, + "rewards/accuracy_reward_stage2": 0.6672303080558777, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1456 + }, + { + "completion_length": 11.953125, + "epoch": 0.25530050814788857, + "grad_norm": 30.042272676858413, + "kl": 0.11279296875, + "learning_rate": 7.448747152619589e-07, + "loss": 0.0009, + "reward": 1.6088519096374512, + "reward_std": 0.19328728318214417, + "rewards/accuracy_reward_stage2": 0.6244767904281616, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1457 + }, + { + "completion_length": 10.046875, + "epoch": 0.2554757315577361, + "grad_norm": 22.167031177976327, + "kl": 0.0537109375, + "learning_rate": 7.446994918521114e-07, + "loss": 0.0215, + "reward": 1.6259219646453857, + "reward_std": 0.15675854682922363, + "rewards/accuracy_reward_stage2": 0.6259219646453857, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1458 + }, + { + "completion_length": 23.0625, + "epoch": 0.25565095496758367, + "grad_norm": 21.075811345440805, + "kl": 0.038330078125, + "learning_rate": 7.445242684422639e-07, + "loss": 0.0153, + "reward": 1.518410325050354, + "reward_std": 0.11710774898529053, + "rewards/accuracy_reward_stage2": 0.5184102654457092, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1459 + }, + { + "completion_length": 18.875, + "epoch": 0.2558261783774312, + "grad_norm": 14.518518964574008, + "kl": 0.0791015625, + "learning_rate": 7.443490450324162e-07, + "loss": -0.0087, + "reward": 1.4062702655792236, + "reward_std": 0.11130297183990479, + "rewards/accuracy_reward_stage2": 0.42189526557922363, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1460 + }, + { + "completion_length": 12.015625, + "epoch": 0.25600140178727876, + "grad_norm": 28.656267793463243, + "kl": 0.0771484375, + "learning_rate": 7.441738216225687e-07, + "loss": -0.0133, + "reward": 1.7806577682495117, + "reward_std": 0.26290401816368103, + "rewards/accuracy_reward_stage2": 0.7962826490402222, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1461 + }, + { + "completion_length": 7.796875, + "epoch": 0.25617662519712636, + "grad_norm": 11.394494329527667, + "kl": 0.0169677734375, + "learning_rate": 7.439985982127212e-07, + "loss": -0.0262, + "reward": 1.609375, + "reward_std": 0.15981829166412354, + "rewards/accuracy_reward_stage2": 0.625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1462 + }, + { + "completion_length": 10.828125, + "epoch": 0.2563518486069739, + "grad_norm": 18.163585583393655, + "kl": 0.0625, + "learning_rate": 7.438233748028736e-07, + "loss": -0.0481, + "reward": 1.711039423942566, + "reward_std": 0.25919869542121887, + "rewards/accuracy_reward_stage2": 0.7422893643379211, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1463 + }, + { + "completion_length": 5.53125, + "epoch": 0.25652707201682146, + "grad_norm": 17.824082159016235, + "kl": 0.03759765625, + "learning_rate": 7.436481513930261e-07, + "loss": 0.015, + "reward": 1.53125, + "reward_std": 0.23356688022613525, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1464 + }, + { + "completion_length": 11.59375, + "epoch": 0.256702295426669, + "grad_norm": 21.450314735509018, + "kl": 0.11181640625, + "learning_rate": 7.434729279831785e-07, + "loss": 0.0034, + "reward": 1.5948938131332397, + "reward_std": 0.2250853031873703, + "rewards/accuracy_reward_stage2": 0.8605188727378845, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1465 + }, + { + "completion_length": 15.875, + "epoch": 0.25687751883651655, + "grad_norm": 27.628838534330345, + "kl": 0.2099609375, + "learning_rate": 7.43297704573331e-07, + "loss": 0.0837, + "reward": 1.5314993858337402, + "reward_std": 0.15965047478675842, + "rewards/accuracy_reward_stage2": 0.6564993858337402, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1466 + }, + { + "completion_length": 10.0, + "epoch": 0.2570527422463641, + "grad_norm": 22.88498429016478, + "kl": 0.11572265625, + "learning_rate": 7.431224811634834e-07, + "loss": 0.0461, + "reward": 1.4264659881591797, + "reward_std": 0.2359931915998459, + "rewards/accuracy_reward_stage2": 0.4264659583568573, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1467 + }, + { + "completion_length": 8.625, + "epoch": 0.25722796565621164, + "grad_norm": 28.53875785233994, + "kl": 0.2080078125, + "learning_rate": 7.429472577536358e-07, + "loss": 0.0521, + "reward": 1.5848909616470337, + "reward_std": 0.30258065462112427, + "rewards/accuracy_reward_stage2": 0.6005159616470337, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1468 + }, + { + "completion_length": 12.375, + "epoch": 0.25740318906605925, + "grad_norm": 20.73955273991918, + "kl": 0.1123046875, + "learning_rate": 7.427720343437883e-07, + "loss": 0.0448, + "reward": 1.2760417461395264, + "reward_std": 0.19150808453559875, + "rewards/accuracy_reward_stage2": 0.4010416567325592, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1469 + }, + { + "completion_length": 8.703125, + "epoch": 0.2575784124759068, + "grad_norm": 20.030808607531387, + "kl": 0.220703125, + "learning_rate": 7.425968109339408e-07, + "loss": 0.0609, + "reward": 1.2748761177062988, + "reward_std": 0.2619929611682892, + "rewards/accuracy_reward_stage2": 0.5405011773109436, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1470 + }, + { + "completion_length": 14.65625, + "epoch": 0.25775363588575434, + "grad_norm": 20.08715223958517, + "kl": 0.0625, + "learning_rate": 7.424215875240932e-07, + "loss": 0.025, + "reward": 1.4923529624938965, + "reward_std": 0.14947402477264404, + "rewards/accuracy_reward_stage2": 0.49235299229621887, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1471 + }, + { + "completion_length": 11.09375, + "epoch": 0.2579288592956019, + "grad_norm": 25.864038997006453, + "kl": 0.12060546875, + "learning_rate": 7.422463641142457e-07, + "loss": 0.0483, + "reward": 1.5058326721191406, + "reward_std": 0.22785821557044983, + "rewards/accuracy_reward_stage2": 0.6308326721191406, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1472 + }, + { + "completion_length": 14.640625, + "epoch": 0.25810408270544943, + "grad_norm": 19.42238006236761, + "kl": 0.1748046875, + "learning_rate": 7.42071140704398e-07, + "loss": 0.07, + "reward": 1.171360969543457, + "reward_std": 0.22907811403274536, + "rewards/accuracy_reward_stage2": 0.42136093974113464, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1473 + }, + { + "completion_length": 11.9375, + "epoch": 0.258279306115297, + "grad_norm": 57.78994671492662, + "kl": 0.34375, + "learning_rate": 7.418959172945505e-07, + "loss": 0.0761, + "reward": 1.6343741416931152, + "reward_std": 0.21707776188850403, + "rewards/accuracy_reward_stage2": 0.79062420129776, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1474 + }, + { + "completion_length": 15.828125, + "epoch": 0.2584545295251446, + "grad_norm": 21.16854422932251, + "kl": 0.359375, + "learning_rate": 7.41720693884703e-07, + "loss": 0.1228, + "reward": 1.4675710201263428, + "reward_std": 0.11915571242570877, + "rewards/accuracy_reward_stage2": 0.7175710797309875, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1475 + }, + { + "completion_length": 7.265625, + "epoch": 0.25862975293499213, + "grad_norm": 18.533242185784058, + "kl": 0.024169921875, + "learning_rate": 7.415454704748554e-07, + "loss": 0.0097, + "reward": 1.8850057125091553, + "reward_std": 0.0957728922367096, + "rewards/accuracy_reward_stage2": 0.8850055932998657, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1476 + }, + { + "completion_length": 11.4375, + "epoch": 0.2588049763448397, + "grad_norm": 21.933109256368923, + "kl": 0.11328125, + "learning_rate": 7.413702470650078e-07, + "loss": -0.0156, + "reward": 1.615840196609497, + "reward_std": 0.32640892267227173, + "rewards/accuracy_reward_stage2": 0.6470901966094971, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1477 + }, + { + "completion_length": 9.671875, + "epoch": 0.2589801997546872, + "grad_norm": 20.716459022775172, + "kl": 0.06591796875, + "learning_rate": 7.411950236551603e-07, + "loss": -0.0179, + "reward": 1.5069777965545654, + "reward_std": 0.22965574264526367, + "rewards/accuracy_reward_stage2": 0.5226027965545654, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1478 + }, + { + "completion_length": 14.640625, + "epoch": 0.25915542316453477, + "grad_norm": 21.596170252166306, + "kl": 0.11376953125, + "learning_rate": 7.410198002453127e-07, + "loss": 0.0455, + "reward": 1.479250192642212, + "reward_std": 0.15525218844413757, + "rewards/accuracy_reward_stage2": 0.6042501330375671, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1479 + }, + { + "completion_length": 20.0, + "epoch": 0.2593306465743823, + "grad_norm": 22.154253712235363, + "kl": 0.126953125, + "learning_rate": 7.408445768354652e-07, + "loss": 0.0071, + "reward": 1.4506645202636719, + "reward_std": 0.3448774814605713, + "rewards/accuracy_reward_stage2": 0.4662895202636719, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1480 + }, + { + "completion_length": 9.828125, + "epoch": 0.2595058699842299, + "grad_norm": 19.49886095101816, + "kl": 0.15234375, + "learning_rate": 7.406693534256176e-07, + "loss": -0.0273, + "reward": 1.681386947631836, + "reward_std": 0.26997825503349304, + "rewards/accuracy_reward_stage2": 0.7126370072364807, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1481 + }, + { + "completion_length": 9.328125, + "epoch": 0.25968109339407747, + "grad_norm": 15.037718624074698, + "kl": 0.057861328125, + "learning_rate": 7.404941300157701e-07, + "loss": 0.0231, + "reward": 1.8315874338150024, + "reward_std": 0.16824200749397278, + "rewards/accuracy_reward_stage2": 0.8315874338150024, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1482 + }, + { + "completion_length": 12.15625, + "epoch": 0.259856316803925, + "grad_norm": 22.13869404334615, + "kl": 0.09423828125, + "learning_rate": 7.403189066059226e-07, + "loss": 0.0377, + "reward": 1.4400691986083984, + "reward_std": 0.10665388405323029, + "rewards/accuracy_reward_stage2": 0.5650691986083984, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1483 + }, + { + "completion_length": 6.875, + "epoch": 0.26003154021377256, + "grad_norm": 18.15221953566913, + "kl": 0.11328125, + "learning_rate": 7.40143683196075e-07, + "loss": 0.001, + "reward": 1.7418863773345947, + "reward_std": 0.21698671579360962, + "rewards/accuracy_reward_stage2": 0.8825114369392395, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1484 + }, + { + "completion_length": 7.78125, + "epoch": 0.2602067636236201, + "grad_norm": 6.688679135897942, + "kl": 0.052734375, + "learning_rate": 7.399684597862275e-07, + "loss": -0.0144, + "reward": 1.5801225900650024, + "reward_std": 0.05681667849421501, + "rewards/accuracy_reward_stage2": 0.7207475900650024, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1485 + }, + { + "completion_length": 11.15625, + "epoch": 0.26038198703346765, + "grad_norm": 22.284164463542524, + "kl": 0.05859375, + "learning_rate": 7.397932363763799e-07, + "loss": 0.0171, + "reward": 1.3674495220184326, + "reward_std": 0.2036455124616623, + "rewards/accuracy_reward_stage2": 0.4924495220184326, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1486 + }, + { + "completion_length": 12.984375, + "epoch": 0.2605572104433152, + "grad_norm": 12.447484630901936, + "kl": 0.0274658203125, + "learning_rate": 7.396180129665322e-07, + "loss": -0.0179, + "reward": 1.6566715240478516, + "reward_std": 0.08770053833723068, + "rewards/accuracy_reward_stage2": 0.6722966432571411, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1487 + }, + { + "completion_length": 7.859375, + "epoch": 0.2607324338531628, + "grad_norm": 25.099938792494854, + "kl": 0.1435546875, + "learning_rate": 7.394427895566847e-07, + "loss": 0.0574, + "reward": 1.4594025611877441, + "reward_std": 0.24416542053222656, + "rewards/accuracy_reward_stage2": 0.5844025015830994, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1488 + }, + { + "completion_length": 17.625, + "epoch": 0.26090765726301035, + "grad_norm": 21.788132997945162, + "kl": 0.296875, + "learning_rate": 7.392675661468371e-07, + "loss": 0.0749, + "reward": 1.1694362163543701, + "reward_std": 0.19415882229804993, + "rewards/accuracy_reward_stage2": 0.4350612759590149, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1489 + }, + { + "completion_length": 12.6875, + "epoch": 0.2610828806728579, + "grad_norm": 24.197340000226113, + "kl": 0.208984375, + "learning_rate": 7.390923427369896e-07, + "loss": 0.0392, + "reward": 1.5895700454711914, + "reward_std": 0.33698683977127075, + "rewards/accuracy_reward_stage2": 0.605195164680481, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1490 + }, + { + "completion_length": 8.015625, + "epoch": 0.26125810408270544, + "grad_norm": 23.261036424611014, + "kl": 0.109375, + "learning_rate": 7.389171193271421e-07, + "loss": 0.0437, + "reward": 1.7723209857940674, + "reward_std": 0.23626047372817993, + "rewards/accuracy_reward_stage2": 0.7723209857940674, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1491 + }, + { + "completion_length": 8.5625, + "epoch": 0.261433327492553, + "grad_norm": 21.332989290106536, + "kl": 0.1494140625, + "learning_rate": 7.387418959172945e-07, + "loss": -0.0579, + "reward": 1.338010311126709, + "reward_std": 0.3674984872341156, + "rewards/accuracy_reward_stage2": 0.384885311126709, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1492 + }, + { + "completion_length": 9.328125, + "epoch": 0.26160855090240054, + "grad_norm": 25.19229994115063, + "kl": 0.16015625, + "learning_rate": 7.38566672507447e-07, + "loss": 0.0641, + "reward": 1.725899338722229, + "reward_std": 0.33280277252197266, + "rewards/accuracy_reward_stage2": 0.725899338722229, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1493 + }, + { + "completion_length": 7.875, + "epoch": 0.26178377431224814, + "grad_norm": 19.955869600132683, + "kl": 0.0869140625, + "learning_rate": 7.383914490975995e-07, + "loss": 0.0346, + "reward": 1.4154143333435059, + "reward_std": 0.22046023607254028, + "rewards/accuracy_reward_stage2": 0.41541436314582825, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1494 + }, + { + "completion_length": 6.15625, + "epoch": 0.2619589977220957, + "grad_norm": 17.110046537530014, + "kl": 0.091796875, + "learning_rate": 7.382162256877519e-07, + "loss": -0.029, + "reward": 1.28125, + "reward_std": 0.26409146189689636, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1495 + }, + { + "completion_length": 14.171875, + "epoch": 0.26213422113194323, + "grad_norm": 20.595751893456807, + "kl": 0.376953125, + "learning_rate": 7.380410022779044e-07, + "loss": 0.1507, + "reward": 1.3054620027542114, + "reward_std": 0.13554486632347107, + "rewards/accuracy_reward_stage2": 0.43046194314956665, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1496 + }, + { + "completion_length": 11.078125, + "epoch": 0.2623094445417908, + "grad_norm": 26.140411911761632, + "kl": 0.126953125, + "learning_rate": 7.378657788680567e-07, + "loss": -0.0612, + "reward": 1.340702772140503, + "reward_std": 0.2606419324874878, + "rewards/accuracy_reward_stage2": 0.38757771253585815, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1497 + }, + { + "completion_length": 8.953125, + "epoch": 0.26248466795163833, + "grad_norm": 20.82795983912578, + "kl": 0.0615234375, + "learning_rate": 7.376905554582091e-07, + "loss": -0.0043, + "reward": 1.5155892372131348, + "reward_std": 0.20563608407974243, + "rewards/accuracy_reward_stage2": 0.5312142968177795, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1498 + }, + { + "completion_length": 9.796875, + "epoch": 0.2626598913614859, + "grad_norm": 33.9523855883196, + "kl": 0.080078125, + "learning_rate": 7.375153320483616e-07, + "loss": 0.0319, + "reward": 1.6113669872283936, + "reward_std": 0.2674233317375183, + "rewards/accuracy_reward_stage2": 0.6113669276237488, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1499 + }, + { + "completion_length": 8.234375, + "epoch": 0.2628351147713335, + "grad_norm": 24.67586190521906, + "kl": 0.1005859375, + "learning_rate": 7.37340108638514e-07, + "loss": 0.0404, + "reward": 1.703125, + "reward_std": 0.16887325048446655, + "rewards/accuracy_reward_stage2": 0.703125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1500 + }, + { + "completion_length": 9.109375, + "epoch": 0.263010338181181, + "grad_norm": 24.975522476038243, + "kl": 0.0927734375, + "learning_rate": 7.371648852286665e-07, + "loss": 0.0037, + "reward": 1.3885433673858643, + "reward_std": 0.36126166582107544, + "rewards/accuracy_reward_stage2": 0.40416842699050903, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1501 + }, + { + "completion_length": 8.6875, + "epoch": 0.26318556159102857, + "grad_norm": 23.351333659034683, + "kl": 0.17578125, + "learning_rate": 7.36989661818819e-07, + "loss": 0.0177, + "reward": 1.5084370374679565, + "reward_std": 0.2623979449272156, + "rewards/accuracy_reward_stage2": 0.5396870374679565, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1502 + }, + { + "completion_length": 8.75, + "epoch": 0.2633607850008761, + "grad_norm": 22.303221688076096, + "kl": 0.125, + "learning_rate": 7.368144384089714e-07, + "loss": 0.0503, + "reward": 1.6632972955703735, + "reward_std": 0.3265223205089569, + "rewards/accuracy_reward_stage2": 0.6632972955703735, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1503 + }, + { + "completion_length": 10.609375, + "epoch": 0.26353600841072367, + "grad_norm": 21.468972803101558, + "kl": 0.03125, + "learning_rate": 7.366392149991239e-07, + "loss": 0.0125, + "reward": 1.4165986776351929, + "reward_std": 0.3401222229003906, + "rewards/accuracy_reward_stage2": 0.41659867763519287, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1504 + }, + { + "completion_length": 14.84375, + "epoch": 0.2637112318205712, + "grad_norm": 20.28889674676962, + "kl": 0.041015625, + "learning_rate": 7.364639915892763e-07, + "loss": 0.0165, + "reward": 1.507493495941162, + "reward_std": 0.2466462105512619, + "rewards/accuracy_reward_stage2": 0.5074934959411621, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1505 + }, + { + "completion_length": 8.8125, + "epoch": 0.26388645523041876, + "grad_norm": 13.069562216022993, + "kl": 0.054443359375, + "learning_rate": 7.362887681794288e-07, + "loss": -0.0224, + "reward": 1.5927482843399048, + "reward_std": 0.08447122573852539, + "rewards/accuracy_reward_stage2": 0.7333732843399048, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1506 + }, + { + "completion_length": 11.25, + "epoch": 0.26406167864026636, + "grad_norm": 19.479084888275086, + "kl": 0.107421875, + "learning_rate": 7.361135447695812e-07, + "loss": -0.0244, + "reward": 1.7037560939788818, + "reward_std": 0.21897412836551666, + "rewards/accuracy_reward_stage2": 0.7350061535835266, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1507 + }, + { + "completion_length": 13.3125, + "epoch": 0.2642369020501139, + "grad_norm": 17.467329132593147, + "kl": 0.051513671875, + "learning_rate": 7.359383213597336e-07, + "loss": 0.0206, + "reward": 1.7382456064224243, + "reward_std": 0.16379417479038239, + "rewards/accuracy_reward_stage2": 0.7382456064224243, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1508 + }, + { + "completion_length": 13.859375, + "epoch": 0.26441212545996146, + "grad_norm": 33.108877608375465, + "kl": 0.2470703125, + "learning_rate": 7.357630979498861e-07, + "loss": 0.0991, + "reward": 1.4780302047729492, + "reward_std": 0.03881131857633591, + "rewards/accuracy_reward_stage2": 0.6030303239822388, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1509 + }, + { + "completion_length": 13.984375, + "epoch": 0.264587348869809, + "grad_norm": 27.470318929032366, + "kl": 0.1005859375, + "learning_rate": 7.355878745400386e-07, + "loss": -0.0041, + "reward": 1.5446429252624512, + "reward_std": 0.3013976216316223, + "rewards/accuracy_reward_stage2": 0.6852678060531616, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1510 + }, + { + "completion_length": 13.328125, + "epoch": 0.26476257227965655, + "grad_norm": 23.887624091153057, + "kl": 0.1376953125, + "learning_rate": 7.354126511301909e-07, + "loss": 0.0552, + "reward": 1.4459335803985596, + "reward_std": 0.3139682412147522, + "rewards/accuracy_reward_stage2": 0.5709335207939148, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1511 + }, + { + "completion_length": 13.859375, + "epoch": 0.2649377956895041, + "grad_norm": 20.174374690765312, + "kl": 0.181640625, + "learning_rate": 7.352374277203434e-07, + "loss": 0.0727, + "reward": 1.328125, + "reward_std": 0.2109457552433014, + "rewards/accuracy_reward_stage2": 0.453125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1512 + }, + { + "completion_length": 9.640625, + "epoch": 0.2651130190993517, + "grad_norm": 23.224227516558585, + "kl": 0.10888671875, + "learning_rate": 7.350622043104958e-07, + "loss": 0.0436, + "reward": 1.6329894065856934, + "reward_std": 0.21760083734989166, + "rewards/accuracy_reward_stage2": 0.6329893469810486, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1513 + }, + { + "completion_length": 13.375, + "epoch": 0.26528824250919925, + "grad_norm": 16.8682280462148, + "kl": 0.051025390625, + "learning_rate": 7.348869809006483e-07, + "loss": 0.0204, + "reward": 1.3874523639678955, + "reward_std": 0.14845938980579376, + "rewards/accuracy_reward_stage2": 0.38745230436325073, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1514 + }, + { + "completion_length": 8.734375, + "epoch": 0.2654634659190468, + "grad_norm": 24.619000253706965, + "kl": 0.12451171875, + "learning_rate": 7.347117574908008e-07, + "loss": -0.0269, + "reward": 1.5498409271240234, + "reward_std": 0.2750667929649353, + "rewards/accuracy_reward_stage2": 0.7060908079147339, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1515 + }, + { + "completion_length": 10.640625, + "epoch": 0.26563868932889434, + "grad_norm": 19.46372314296072, + "kl": 0.1396484375, + "learning_rate": 7.345365340809531e-07, + "loss": 0.0196, + "reward": 1.3163824081420898, + "reward_std": 0.19993111491203308, + "rewards/accuracy_reward_stage2": 0.33200740814208984, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1516 + }, + { + "completion_length": 8.59375, + "epoch": 0.2658139127387419, + "grad_norm": 16.929295225810957, + "kl": 0.044921875, + "learning_rate": 7.343613106711056e-07, + "loss": 0.018, + "reward": 1.6382789611816406, + "reward_std": 0.19527596235275269, + "rewards/accuracy_reward_stage2": 0.6382789611816406, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1517 + }, + { + "completion_length": 18.453125, + "epoch": 0.26598913614858943, + "grad_norm": 18.964953779951724, + "kl": 0.2216796875, + "learning_rate": 7.341860872612581e-07, + "loss": 0.0446, + "reward": 1.358322024345398, + "reward_std": 0.1473166048526764, + "rewards/accuracy_reward_stage2": 0.623947024345398, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1518 + }, + { + "completion_length": 12.140625, + "epoch": 0.266164359558437, + "grad_norm": 28.538663763536473, + "kl": 0.05078125, + "learning_rate": 7.340108638514105e-07, + "loss": 0.0203, + "reward": 1.677398681640625, + "reward_std": 0.1924341320991516, + "rewards/accuracy_reward_stage2": 0.677398681640625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1519 + }, + { + "completion_length": 10.21875, + "epoch": 0.2663395829682846, + "grad_norm": 22.416505790465667, + "kl": 0.1357421875, + "learning_rate": 7.33835640441563e-07, + "loss": -0.0172, + "reward": 1.6864490509033203, + "reward_std": 0.31137195229530334, + "rewards/accuracy_reward_stage2": 0.7176990509033203, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1520 + }, + { + "completion_length": 10.515625, + "epoch": 0.26651480637813213, + "grad_norm": 31.324550878208168, + "kl": 0.007598876953125, + "learning_rate": 7.336604170317154e-07, + "loss": 0.003, + "reward": 1.8011363744735718, + "reward_std": 0.07158337533473969, + "rewards/accuracy_reward_stage2": 0.8011363744735718, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1521 + }, + { + "completion_length": 9.78125, + "epoch": 0.2666900297879797, + "grad_norm": 24.600260940307486, + "kl": 0.0966796875, + "learning_rate": 7.334851936218679e-07, + "loss": 0.0387, + "reward": 1.659136176109314, + "reward_std": 0.14838439226150513, + "rewards/accuracy_reward_stage2": 0.659136176109314, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1522 + }, + { + "completion_length": 10.484375, + "epoch": 0.2668652531978272, + "grad_norm": 43.7767226666006, + "kl": 0.2431640625, + "learning_rate": 7.333099702120204e-07, + "loss": 0.0426, + "reward": 1.4813872575759888, + "reward_std": 0.34337955713272095, + "rewards/accuracy_reward_stage2": 0.637637197971344, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1523 + }, + { + "completion_length": 15.9375, + "epoch": 0.26704047660767477, + "grad_norm": 19.529746543141552, + "kl": 0.138671875, + "learning_rate": 7.331347468021727e-07, + "loss": 0.0554, + "reward": 1.5571314096450806, + "reward_std": 0.18025755882263184, + "rewards/accuracy_reward_stage2": 0.5571314692497253, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1524 + }, + { + "completion_length": 13.765625, + "epoch": 0.2672157000175223, + "grad_norm": 22.63237928794489, + "kl": 0.0859375, + "learning_rate": 7.329595233923252e-07, + "loss": -0.0073, + "reward": 1.4489909410476685, + "reward_std": 0.15715591609477997, + "rewards/accuracy_reward_stage2": 0.7146159410476685, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1525 + }, + { + "completion_length": 11.8125, + "epoch": 0.2673909234273699, + "grad_norm": 24.14759904765817, + "kl": 0.1318359375, + "learning_rate": 7.327842999824775e-07, + "loss": 0.0526, + "reward": 1.4918327331542969, + "reward_std": 0.2508474290370941, + "rewards/accuracy_reward_stage2": 0.4918326735496521, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1526 + }, + { + "completion_length": 6.75, + "epoch": 0.26756614683721747, + "grad_norm": 20.278598793689493, + "kl": 0.07373046875, + "learning_rate": 7.3260907657263e-07, + "loss": 0.0296, + "reward": 1.3896163702011108, + "reward_std": 0.3074378967285156, + "rewards/accuracy_reward_stage2": 0.6396163702011108, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1527 + }, + { + "completion_length": 7.484375, + "epoch": 0.267741370247065, + "grad_norm": 19.871356492961542, + "kl": 0.056640625, + "learning_rate": 7.324338531627825e-07, + "loss": 0.0226, + "reward": 1.7014296054840088, + "reward_std": 0.1917523890733719, + "rewards/accuracy_reward_stage2": 0.7014296054840088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1528 + }, + { + "completion_length": 9.015625, + "epoch": 0.26791659365691256, + "grad_norm": 18.416822579452255, + "kl": 0.04052734375, + "learning_rate": 7.322586297529349e-07, + "loss": 0.0162, + "reward": 1.6586830615997314, + "reward_std": 0.15771014988422394, + "rewards/accuracy_reward_stage2": 0.6586830615997314, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1529 + }, + { + "completion_length": 13.03125, + "epoch": 0.2680918170667601, + "grad_norm": 157.14169101763926, + "kl": 0.91015625, + "learning_rate": 7.320834063430874e-07, + "loss": 0.3213, + "reward": 1.418661117553711, + "reward_std": 0.2747938632965088, + "rewards/accuracy_reward_stage2": 0.6842861175537109, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1530 + }, + { + "completion_length": 7.8125, + "epoch": 0.26826704047660765, + "grad_norm": 17.26589900919268, + "kl": 0.05908203125, + "learning_rate": 7.319081829332399e-07, + "loss": 0.0237, + "reward": 1.5729167461395264, + "reward_std": 0.14022307097911835, + "rewards/accuracy_reward_stage2": 0.5729166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1531 + }, + { + "completion_length": 21.59375, + "epoch": 0.26844226388645526, + "grad_norm": 16.03669517849557, + "kl": 0.0203857421875, + "learning_rate": 7.317329595233923e-07, + "loss": 0.0082, + "reward": 1.642247200012207, + "reward_std": 0.13648752868175507, + "rewards/accuracy_reward_stage2": 0.7672471404075623, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1532 + }, + { + "completion_length": 8.609375, + "epoch": 0.2686174872963028, + "grad_norm": 21.116938570932263, + "kl": 0.06396484375, + "learning_rate": 7.315577361135448e-07, + "loss": -0.0187, + "reward": 1.3490604162216187, + "reward_std": 0.34063291549682617, + "rewards/accuracy_reward_stage2": 0.48968538641929626, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1533 + }, + { + "completion_length": 13.015625, + "epoch": 0.26879271070615035, + "grad_norm": 21.123971038254172, + "kl": 0.08837890625, + "learning_rate": 7.313825127036972e-07, + "loss": 0.0354, + "reward": 1.433570146560669, + "reward_std": 0.17886027693748474, + "rewards/accuracy_reward_stage2": 0.5585700869560242, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1534 + }, + { + "completion_length": 12.40625, + "epoch": 0.2689679341159979, + "grad_norm": 19.209208955755024, + "kl": 0.083984375, + "learning_rate": 7.312072892938497e-07, + "loss": -0.0107, + "reward": 1.373374104499817, + "reward_std": 0.22035284340381622, + "rewards/accuracy_reward_stage2": 0.5139991044998169, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1535 + }, + { + "completion_length": 11.046875, + "epoch": 0.26914315752584544, + "grad_norm": 17.626216892205676, + "kl": 0.040283203125, + "learning_rate": 7.310320658840022e-07, + "loss": 0.0161, + "reward": 1.4475104808807373, + "reward_std": 0.16717243194580078, + "rewards/accuracy_reward_stage2": 0.4475104808807373, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1536 + }, + { + "completion_length": 12.828125, + "epoch": 0.269318380935693, + "grad_norm": 20.258858135507833, + "kl": 0.08984375, + "learning_rate": 7.308568424741544e-07, + "loss": 0.036, + "reward": 1.6602098941802979, + "reward_std": 0.1800985336303711, + "rewards/accuracy_reward_stage2": 0.6602099537849426, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1537 + }, + { + "completion_length": 11.953125, + "epoch": 0.26949360434554054, + "grad_norm": 18.407349718764067, + "kl": 0.056396484375, + "learning_rate": 7.306816190643069e-07, + "loss": 0.0225, + "reward": 1.3184478282928467, + "reward_std": 0.12989208102226257, + "rewards/accuracy_reward_stage2": 0.3184478282928467, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1538 + }, + { + "completion_length": 9.390625, + "epoch": 0.26966882775538814, + "grad_norm": 19.289921141580795, + "kl": 0.08203125, + "learning_rate": 7.305063956544594e-07, + "loss": 0.0328, + "reward": 1.4996446371078491, + "reward_std": 0.2246101200580597, + "rewards/accuracy_reward_stage2": 0.7496446371078491, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1539 + }, + { + "completion_length": 6.484375, + "epoch": 0.2698440511652357, + "grad_norm": 12.441059142254845, + "kl": 0.04541015625, + "learning_rate": 7.303311722446118e-07, + "loss": 0.0181, + "reward": 1.5642765760421753, + "reward_std": 0.08475670963525772, + "rewards/accuracy_reward_stage2": 0.6892765760421753, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1540 + }, + { + "completion_length": 12.359375, + "epoch": 0.27001927457508323, + "grad_norm": 24.390854012867567, + "kl": 0.333984375, + "learning_rate": 7.301559488347643e-07, + "loss": 0.0978, + "reward": 1.6280450820922852, + "reward_std": 0.20801720023155212, + "rewards/accuracy_reward_stage2": 0.7686700820922852, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1541 + }, + { + "completion_length": 10.609375, + "epoch": 0.2701944979849308, + "grad_norm": 19.020692683382507, + "kl": 0.01507568359375, + "learning_rate": 7.299807254249167e-07, + "loss": 0.006, + "reward": 1.4479167461395264, + "reward_std": 0.1515468955039978, + "rewards/accuracy_reward_stage2": 0.4479166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1542 + }, + { + "completion_length": 9.546875, + "epoch": 0.27036972139477833, + "grad_norm": 18.919351767185397, + "kl": 0.0703125, + "learning_rate": 7.298055020150692e-07, + "loss": 0.0282, + "reward": 1.7478828430175781, + "reward_std": 0.18501171469688416, + "rewards/accuracy_reward_stage2": 0.7478827834129333, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1543 + }, + { + "completion_length": 8.015625, + "epoch": 0.2705449448046259, + "grad_norm": 19.402100954334866, + "kl": 0.1044921875, + "learning_rate": 7.296302786052217e-07, + "loss": 0.0417, + "reward": 1.5251411199569702, + "reward_std": 0.26569750905036926, + "rewards/accuracy_reward_stage2": 0.5251411199569702, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1544 + }, + { + "completion_length": 10.28125, + "epoch": 0.2707201682144735, + "grad_norm": 15.03881206013984, + "kl": 0.02001953125, + "learning_rate": 7.294550551953741e-07, + "loss": 0.008, + "reward": 1.7756855487823486, + "reward_std": 0.12909118831157684, + "rewards/accuracy_reward_stage2": 0.7756855487823486, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1545 + }, + { + "completion_length": 6.96875, + "epoch": 0.270895391624321, + "grad_norm": 13.6036257647761, + "kl": 0.361328125, + "learning_rate": 7.292798317855265e-07, + "loss": 0.1449, + "reward": 1.4483622312545776, + "reward_std": 0.13131847977638245, + "rewards/accuracy_reward_stage2": 0.6983622312545776, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1546 + }, + { + "completion_length": 8.296875, + "epoch": 0.27107061503416857, + "grad_norm": 19.47664920431056, + "kl": 0.1357421875, + "learning_rate": 7.29104608375679e-07, + "loss": 0.0119, + "reward": 1.723066806793213, + "reward_std": 0.2823743224143982, + "rewards/accuracy_reward_stage2": 0.7386916875839233, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1547 + }, + { + "completion_length": 7.84375, + "epoch": 0.2712458384440161, + "grad_norm": 17.67104325953167, + "kl": 0.08935546875, + "learning_rate": 7.289293849658314e-07, + "loss": 0.0358, + "reward": 1.5002610683441162, + "reward_std": 0.27437451481819153, + "rewards/accuracy_reward_stage2": 0.6252610683441162, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1548 + }, + { + "completion_length": 14.328125, + "epoch": 0.27142106185386367, + "grad_norm": 21.85574395429999, + "kl": 0.0220947265625, + "learning_rate": 7.287541615559838e-07, + "loss": 0.0088, + "reward": 1.491215467453003, + "reward_std": 0.20244181156158447, + "rewards/accuracy_reward_stage2": 0.4912155270576477, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1549 + }, + { + "completion_length": 12.46875, + "epoch": 0.2715962852637112, + "grad_norm": 21.99441168996388, + "kl": 0.11376953125, + "learning_rate": 7.285789381461362e-07, + "loss": 0.0454, + "reward": 1.4256680011749268, + "reward_std": 0.2561571002006531, + "rewards/accuracy_reward_stage2": 0.5506680607795715, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1550 + }, + { + "completion_length": 10.84375, + "epoch": 0.2717715086735588, + "grad_norm": 19.485417665693184, + "kl": 0.1865234375, + "learning_rate": 7.284037147362887e-07, + "loss": -0.0068, + "reward": 1.4601422548294067, + "reward_std": 0.2415839433670044, + "rewards/accuracy_reward_stage2": 0.4913923144340515, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1551 + }, + { + "completion_length": 6.484375, + "epoch": 0.27194673208340636, + "grad_norm": 19.868362707801886, + "kl": 0.1474609375, + "learning_rate": 7.282284913264412e-07, + "loss": 0.03, + "reward": 1.6962921619415283, + "reward_std": 0.23688717186450958, + "rewards/accuracy_reward_stage2": 0.7119171619415283, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1552 + }, + { + "completion_length": 8.765625, + "epoch": 0.2721219554932539, + "grad_norm": 21.00231032813797, + "kl": 0.0947265625, + "learning_rate": 7.280532679165936e-07, + "loss": 0.038, + "reward": 1.3415729999542236, + "reward_std": 0.28956979513168335, + "rewards/accuracy_reward_stage2": 0.46657294034957886, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1553 + }, + { + "completion_length": 10.171875, + "epoch": 0.27229717890310146, + "grad_norm": 16.49101090303168, + "kl": 0.048095703125, + "learning_rate": 7.278780445067461e-07, + "loss": -0.0791, + "reward": 1.3982372283935547, + "reward_std": 0.2809445261955261, + "rewards/accuracy_reward_stage2": 0.4451121687889099, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1554 + }, + { + "completion_length": 9.15625, + "epoch": 0.272472402312949, + "grad_norm": 20.813752137808752, + "kl": 0.10986328125, + "learning_rate": 7.277028210968986e-07, + "loss": -0.0759, + "reward": 1.5947017669677734, + "reward_std": 0.34263890981674194, + "rewards/accuracy_reward_stage2": 0.6415768265724182, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1555 + }, + { + "completion_length": 14.953125, + "epoch": 0.27264762572279655, + "grad_norm": 19.569881932631915, + "kl": 0.050048828125, + "learning_rate": 7.275275976870509e-07, + "loss": 0.02, + "reward": 1.2932356595993042, + "reward_std": 0.14172068238258362, + "rewards/accuracy_reward_stage2": 0.4182356595993042, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1556 + }, + { + "completion_length": 10.984375, + "epoch": 0.2728228491326441, + "grad_norm": 23.275062004848152, + "kl": 0.2412109375, + "learning_rate": 7.273523742772034e-07, + "loss": 0.0523, + "reward": 1.5078812837600708, + "reward_std": 0.26423394680023193, + "rewards/accuracy_reward_stage2": 0.5235062837600708, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1557 + }, + { + "completion_length": 8.265625, + "epoch": 0.2729980725424917, + "grad_norm": 16.222941710110852, + "kl": 0.08251953125, + "learning_rate": 7.271771508673558e-07, + "loss": 0.0329, + "reward": 1.6588354110717773, + "reward_std": 0.08715774118900299, + "rewards/accuracy_reward_stage2": 0.6588354110717773, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1558 + }, + { + "completion_length": 9.96875, + "epoch": 0.27317329595233925, + "grad_norm": 15.832014677569221, + "kl": 0.061767578125, + "learning_rate": 7.270019274575083e-07, + "loss": -0.0195, + "reward": 1.5660628080368042, + "reward_std": 0.17610442638397217, + "rewards/accuracy_reward_stage2": 0.5816878080368042, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1559 + }, + { + "completion_length": 8.203125, + "epoch": 0.2733485193621868, + "grad_norm": 22.731903546764354, + "kl": 0.1298828125, + "learning_rate": 7.268267040476608e-07, + "loss": 0.008, + "reward": 1.6920348405838013, + "reward_std": 0.24214372038841248, + "rewards/accuracy_reward_stage2": 0.7076598405838013, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1560 + }, + { + "completion_length": 9.96875, + "epoch": 0.27352374277203434, + "grad_norm": 23.863983210931238, + "kl": 0.16015625, + "learning_rate": 7.266514806378132e-07, + "loss": 0.0203, + "reward": 1.4968124628067017, + "reward_std": 0.13817864656448364, + "rewards/accuracy_reward_stage2": 0.6374374628067017, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1561 + }, + { + "completion_length": 10.765625, + "epoch": 0.2736989661818819, + "grad_norm": 21.012253949955664, + "kl": 0.08154296875, + "learning_rate": 7.264762572279656e-07, + "loss": -0.0115, + "reward": 1.4106385707855225, + "reward_std": 0.292081743478775, + "rewards/accuracy_reward_stage2": 0.42626360058784485, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1562 + }, + { + "completion_length": 10.28125, + "epoch": 0.27387418959172943, + "grad_norm": 18.270956325101377, + "kl": 0.2431640625, + "learning_rate": 7.263010338181181e-07, + "loss": 0.0971, + "reward": 1.5469186305999756, + "reward_std": 0.25838232040405273, + "rewards/accuracy_reward_stage2": 0.6719185709953308, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1563 + }, + { + "completion_length": 6.90625, + "epoch": 0.27404941300157704, + "grad_norm": 16.509044988635893, + "kl": 0.03125, + "learning_rate": 7.261258104082705e-07, + "loss": 0.0126, + "reward": 1.7471439838409424, + "reward_std": 0.11338774114847183, + "rewards/accuracy_reward_stage2": 0.7471439838409424, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1564 + }, + { + "completion_length": 13.078125, + "epoch": 0.2742246364114246, + "grad_norm": 23.28838918712886, + "kl": 0.1279296875, + "learning_rate": 7.25950586998423e-07, + "loss": 0.0512, + "reward": 1.424917459487915, + "reward_std": 0.12856319546699524, + "rewards/accuracy_reward_stage2": 0.674917459487915, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1565 + }, + { + "completion_length": 12.46875, + "epoch": 0.27439985982127213, + "grad_norm": 23.4382924648424, + "kl": 0.12109375, + "learning_rate": 7.257753635885753e-07, + "loss": 0.0483, + "reward": 1.663330316543579, + "reward_std": 0.17533919215202332, + "rewards/accuracy_reward_stage2": 0.7883303165435791, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1566 + }, + { + "completion_length": 12.0625, + "epoch": 0.2745750832311197, + "grad_norm": 29.567678011581055, + "kl": 0.03271484375, + "learning_rate": 7.256001401787278e-07, + "loss": -0.0312, + "reward": 1.4522767066955566, + "reward_std": 0.2672945261001587, + "rewards/accuracy_reward_stage2": 0.46790170669555664, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1567 + }, + { + "completion_length": 12.6875, + "epoch": 0.2747503066409672, + "grad_norm": 29.62746153373393, + "kl": 0.01361083984375, + "learning_rate": 7.254249167688803e-07, + "loss": 0.0054, + "reward": 1.375, + "reward_std": 0.2709311842918396, + "rewards/accuracy_reward_stage2": 0.375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1568 + }, + { + "completion_length": 11.375, + "epoch": 0.27492553005081477, + "grad_norm": 17.557370589904153, + "kl": 0.1123046875, + "learning_rate": 7.252496933590327e-07, + "loss": -0.0266, + "reward": 1.325448989868164, + "reward_std": 0.2274552583694458, + "rewards/accuracy_reward_stage2": 0.48169901967048645, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1569 + }, + { + "completion_length": 10.890625, + "epoch": 0.2751007534606623, + "grad_norm": 27.34549604652153, + "kl": 0.11328125, + "learning_rate": 7.250744699491852e-07, + "loss": -0.0535, + "reward": 1.5265979766845703, + "reward_std": 0.3133997321128845, + "rewards/accuracy_reward_stage2": 0.5734728574752808, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1570 + }, + { + "completion_length": 7.421875, + "epoch": 0.2752759768705099, + "grad_norm": 18.701687375757633, + "kl": 0.2158203125, + "learning_rate": 7.248992465393377e-07, + "loss": 0.0444, + "reward": 1.6914236545562744, + "reward_std": 0.1895194947719574, + "rewards/accuracy_reward_stage2": 0.7070485353469849, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1571 + }, + { + "completion_length": 9.109375, + "epoch": 0.27545120028035747, + "grad_norm": 14.077681371050371, + "kl": 0.06103515625, + "learning_rate": 7.247240231294901e-07, + "loss": -0.0434, + "reward": 1.7748515605926514, + "reward_std": 0.1633080542087555, + "rewards/accuracy_reward_stage2": 0.8061015009880066, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1572 + }, + { + "completion_length": 10.90625, + "epoch": 0.275626423690205, + "grad_norm": 23.67382172325355, + "kl": 0.2373046875, + "learning_rate": 7.245487997196426e-07, + "loss": 0.0242, + "reward": 1.2528319358825684, + "reward_std": 0.2686885595321655, + "rewards/accuracy_reward_stage2": 0.40908199548721313, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1573 + }, + { + "completion_length": 8.8125, + "epoch": 0.27580164710005256, + "grad_norm": 22.988428113639372, + "kl": 0.09716796875, + "learning_rate": 7.24373576309795e-07, + "loss": 0.039, + "reward": 1.665795087814331, + "reward_std": 0.23197835683822632, + "rewards/accuracy_reward_stage2": 0.665795087814331, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1574 + }, + { + "completion_length": 7.859375, + "epoch": 0.2759768705099001, + "grad_norm": 19.34328584263878, + "kl": 0.07275390625, + "learning_rate": 7.241983528999474e-07, + "loss": 0.029, + "reward": 1.4947218894958496, + "reward_std": 0.1720176637172699, + "rewards/accuracy_reward_stage2": 0.6197218894958496, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1575 + }, + { + "completion_length": 15.5625, + "epoch": 0.27615209391974765, + "grad_norm": 22.570700861758745, + "kl": 0.041748046875, + "learning_rate": 7.240231294900998e-07, + "loss": 0.0167, + "reward": 1.4003034830093384, + "reward_std": 0.22432449460029602, + "rewards/accuracy_reward_stage2": 0.5253034830093384, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1576 + }, + { + "completion_length": 9.609375, + "epoch": 0.27632731732959526, + "grad_norm": 19.67214373662556, + "kl": 0.11083984375, + "learning_rate": 7.238479060802522e-07, + "loss": -0.0314, + "reward": 1.4500322341918945, + "reward_std": 0.21745267510414124, + "rewards/accuracy_reward_stage2": 0.48128223419189453, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1577 + }, + { + "completion_length": 23.21875, + "epoch": 0.2765025407394428, + "grad_norm": 22.79993085876522, + "kl": 0.06982421875, + "learning_rate": 7.236726826704047e-07, + "loss": 0.0139, + "reward": 1.3926570415496826, + "reward_std": 0.17935630679130554, + "rewards/accuracy_reward_stage2": 0.5489070415496826, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1578 + }, + { + "completion_length": 10.1875, + "epoch": 0.27667776414929035, + "grad_norm": 23.08514350231986, + "kl": 0.0888671875, + "learning_rate": 7.234974592605572e-07, + "loss": -0.0061, + "reward": 1.7529489994049072, + "reward_std": 0.20225925743579865, + "rewards/accuracy_reward_stage2": 0.7685739398002625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1579 + }, + { + "completion_length": 13.5625, + "epoch": 0.2768529875591379, + "grad_norm": 21.042427743109787, + "kl": 0.04345703125, + "learning_rate": 7.233222358507096e-07, + "loss": -0.0268, + "reward": 1.4279401302337646, + "reward_std": 0.16784167289733887, + "rewards/accuracy_reward_stage2": 0.44356510043144226, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1580 + }, + { + "completion_length": 10.390625, + "epoch": 0.27702821096898544, + "grad_norm": 77.07830159883746, + "kl": 0.4609375, + "learning_rate": 7.231470124408621e-07, + "loss": 0.1237, + "reward": 1.6272919178009033, + "reward_std": 0.2883310914039612, + "rewards/accuracy_reward_stage2": 0.6585419178009033, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1581 + }, + { + "completion_length": 11.0, + "epoch": 0.277203434378833, + "grad_norm": 18.558360183545652, + "kl": 0.055908203125, + "learning_rate": 7.229717890310145e-07, + "loss": -0.0218, + "reward": 1.8247655630111694, + "reward_std": 0.20700310170650482, + "rewards/accuracy_reward_stage2": 0.8403905034065247, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1582 + }, + { + "completion_length": 10.546875, + "epoch": 0.2773786577886806, + "grad_norm": 14.116427509022715, + "kl": 0.1572265625, + "learning_rate": 7.22796565621167e-07, + "loss": 0.0345, + "reward": 0.8919172286987305, + "reward_std": 0.153579980134964, + "rewards/accuracy_reward_stage2": 0.28254222869873047, + "rewards/format_reward_stage1_pointerpad": 0.609375, + "scores/accuracy_reward_stage2": 0.609375, + "step": 1583 + }, + { + "completion_length": 8.0625, + "epoch": 0.27755388119852814, + "grad_norm": 20.72078723909815, + "kl": 0.1025390625, + "learning_rate": 7.226213422113195e-07, + "loss": -0.0296, + "reward": 1.6172088384628296, + "reward_std": 0.2659575641155243, + "rewards/accuracy_reward_stage2": 0.6484588384628296, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1584 + }, + { + "completion_length": 11.34375, + "epoch": 0.2777291046083757, + "grad_norm": 26.16018586016227, + "kl": 0.208984375, + "learning_rate": 7.224461188014719e-07, + "loss": 0.0837, + "reward": 1.5841631889343262, + "reward_std": 0.2787715196609497, + "rewards/accuracy_reward_stage2": 0.5841631889343262, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1585 + }, + { + "completion_length": 10.296875, + "epoch": 0.27790432801822323, + "grad_norm": 19.981196513188134, + "kl": 0.048095703125, + "learning_rate": 7.222708953916243e-07, + "loss": -0.025, + "reward": 1.5554091930389404, + "reward_std": 0.24271945655345917, + "rewards/accuracy_reward_stage2": 0.6960341930389404, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1586 + }, + { + "completion_length": 9.140625, + "epoch": 0.2780795514280708, + "grad_norm": 23.940325381236757, + "kl": 0.2490234375, + "learning_rate": 7.220956719817766e-07, + "loss": 0.0579, + "reward": 1.2890129089355469, + "reward_std": 0.29061371088027954, + "rewards/accuracy_reward_stage2": 0.4452629089355469, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1587 + }, + { + "completion_length": 7.84375, + "epoch": 0.27825477483791833, + "grad_norm": 20.466731457336554, + "kl": 0.054443359375, + "learning_rate": 7.219204485719291e-07, + "loss": 0.0218, + "reward": 1.5818238258361816, + "reward_std": 0.15346041321754456, + "rewards/accuracy_reward_stage2": 0.5818238258361816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1588 + }, + { + "completion_length": 7.6875, + "epoch": 0.2784299982477659, + "grad_norm": 27.632750470965156, + "kl": 0.09228515625, + "learning_rate": 7.217452251620816e-07, + "loss": 0.0176, + "reward": 1.4603047370910645, + "reward_std": 0.25379306077957153, + "rewards/accuracy_reward_stage2": 0.6009297370910645, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1589 + }, + { + "completion_length": 7.453125, + "epoch": 0.2786052216576135, + "grad_norm": 19.974365273381522, + "kl": 0.06884765625, + "learning_rate": 7.21570001752234e-07, + "loss": 0.0276, + "reward": 1.343886375427246, + "reward_std": 0.21338877081871033, + "rewards/accuracy_reward_stage2": 0.46888643503189087, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1590 + }, + { + "completion_length": 8.296875, + "epoch": 0.278780445067461, + "grad_norm": 21.637228824670125, + "kl": 0.060302734375, + "learning_rate": 7.213947783423865e-07, + "loss": -0.0038, + "reward": 1.4773318767547607, + "reward_std": 0.2645827531814575, + "rewards/accuracy_reward_stage2": 0.49295687675476074, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1591 + }, + { + "completion_length": 10.375, + "epoch": 0.27895566847730857, + "grad_norm": 19.634208559796395, + "kl": 0.07177734375, + "learning_rate": 7.21219554932539e-07, + "loss": -0.0597, + "reward": 1.3822365999221802, + "reward_std": 0.33031585812568665, + "rewards/accuracy_reward_stage2": 0.5384865999221802, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1592 + }, + { + "completion_length": 11.109375, + "epoch": 0.2791308918871561, + "grad_norm": 36.81881300779984, + "kl": 0.384765625, + "learning_rate": 7.210443315226914e-07, + "loss": 0.1539, + "reward": 1.5430142879486084, + "reward_std": 0.26516449451446533, + "rewards/accuracy_reward_stage2": 0.7930142283439636, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1593 + }, + { + "completion_length": 13.84375, + "epoch": 0.27930611529700367, + "grad_norm": 21.62546848781223, + "kl": 0.02099609375, + "learning_rate": 7.208691081128439e-07, + "loss": -0.0349, + "reward": 1.7699782848358154, + "reward_std": 0.22004219889640808, + "rewards/accuracy_reward_stage2": 0.7856031656265259, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1594 + }, + { + "completion_length": 12.875, + "epoch": 0.2794813387068512, + "grad_norm": 18.89892017703249, + "kl": 0.0712890625, + "learning_rate": 7.206938847029962e-07, + "loss": -0.0105, + "reward": 1.6201822757720947, + "reward_std": 0.22205275297164917, + "rewards/accuracy_reward_stage2": 0.6358071565628052, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1595 + }, + { + "completion_length": 10.96875, + "epoch": 0.2796565621166988, + "grad_norm": 9.189918380732902, + "kl": 0.03955078125, + "learning_rate": 7.205186612931487e-07, + "loss": -0.0726, + "reward": 1.7642738819122314, + "reward_std": 0.10237517952919006, + "rewards/accuracy_reward_stage2": 0.7955238223075867, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1596 + }, + { + "completion_length": 7.4375, + "epoch": 0.27983178552654636, + "grad_norm": 15.274296589152023, + "kl": 0.0693359375, + "learning_rate": 7.203434378833012e-07, + "loss": -0.0771, + "reward": 1.8618519306182861, + "reward_std": 0.2493779957294464, + "rewards/accuracy_reward_stage2": 0.9087268710136414, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1597 + }, + { + "completion_length": 11.5, + "epoch": 0.2800070089363939, + "grad_norm": 42.76093281964334, + "kl": 0.1259765625, + "learning_rate": 7.201682144734536e-07, + "loss": 0.006, + "reward": 1.4967622756958008, + "reward_std": 0.19433635473251343, + "rewards/accuracy_reward_stage2": 0.512387216091156, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1598 + }, + { + "completion_length": 17.078125, + "epoch": 0.28018223234624146, + "grad_norm": 22.19032833031183, + "kl": 0.248046875, + "learning_rate": 7.199929910636061e-07, + "loss": 0.0313, + "reward": 1.3876421451568604, + "reward_std": 0.24027925729751587, + "rewards/accuracy_reward_stage2": 0.6688920259475708, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1599 + }, + { + "completion_length": 11.890625, + "epoch": 0.280357455756089, + "grad_norm": 22.30237398012051, + "kl": 0.2353515625, + "learning_rate": 7.198177676537585e-07, + "loss": -0.0071, + "reward": 1.7141770124435425, + "reward_std": 0.3087601959705353, + "rewards/accuracy_reward_stage2": 0.7610519528388977, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1600 + }, + { + "completion_length": 9.21875, + "epoch": 0.28053267916593655, + "grad_norm": 8.829576239530924, + "kl": 0.0703125, + "learning_rate": 7.196425442439109e-07, + "loss": 0.0281, + "reward": 1.7324020862579346, + "reward_std": 0.03725196793675423, + "rewards/accuracy_reward_stage2": 0.8574021458625793, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1601 + }, + { + "completion_length": 12.5, + "epoch": 0.28070790257578415, + "grad_norm": 15.991249185161834, + "kl": 0.0791015625, + "learning_rate": 7.194673208340634e-07, + "loss": 0.0317, + "reward": 1.5364727973937988, + "reward_std": 0.11035994440317154, + "rewards/accuracy_reward_stage2": 0.5364727973937988, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1602 + }, + { + "completion_length": 8.03125, + "epoch": 0.2808831259856317, + "grad_norm": 17.575426900536577, + "kl": 0.1298828125, + "learning_rate": 7.192920974242158e-07, + "loss": -0.0363, + "reward": 1.6127972602844238, + "reward_std": 0.15401007235050201, + "rewards/accuracy_reward_stage2": 0.6440472602844238, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1603 + }, + { + "completion_length": 16.203125, + "epoch": 0.28105834939547925, + "grad_norm": 17.905045894586475, + "kl": 0.11474609375, + "learning_rate": 7.191168740143683e-07, + "loss": -0.0864, + "reward": 1.4100637435913086, + "reward_std": 0.2545863091945648, + "rewards/accuracy_reward_stage2": 0.4569387435913086, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1604 + }, + { + "completion_length": 8.640625, + "epoch": 0.2812335728053268, + "grad_norm": 20.18219517559298, + "kl": 0.0556640625, + "learning_rate": 7.189416506045208e-07, + "loss": -0.0219, + "reward": 1.4022883176803589, + "reward_std": 0.21513070166110992, + "rewards/accuracy_reward_stage2": 0.41791337728500366, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1605 + }, + { + "completion_length": 9.34375, + "epoch": 0.28140879621517434, + "grad_norm": 22.68462923752165, + "kl": 0.0947265625, + "learning_rate": 7.187664271946731e-07, + "loss": -0.0504, + "reward": 1.4744114875793457, + "reward_std": 0.28335297107696533, + "rewards/accuracy_reward_stage2": 0.5056614875793457, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1606 + }, + { + "completion_length": 10.375, + "epoch": 0.2815840196250219, + "grad_norm": 52.28396048008046, + "kl": 0.205078125, + "learning_rate": 7.185912037848256e-07, + "loss": 0.0466, + "reward": 1.6137253046035767, + "reward_std": 0.33331912755966187, + "rewards/accuracy_reward_stage2": 0.6293503046035767, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1607 + }, + { + "completion_length": 8.875, + "epoch": 0.28175924303486943, + "grad_norm": 28.125928030587517, + "kl": 0.13671875, + "learning_rate": 7.184159803749781e-07, + "loss": 0.0229, + "reward": 1.5231481790542603, + "reward_std": 0.31302833557128906, + "rewards/accuracy_reward_stage2": 0.6637731790542603, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1608 + }, + { + "completion_length": 14.96875, + "epoch": 0.28193446644471704, + "grad_norm": 31.768402575173674, + "kl": 0.08349609375, + "learning_rate": 7.182407569651305e-07, + "loss": -0.0108, + "reward": 1.2850186824798584, + "reward_std": 0.22496335208415985, + "rewards/accuracy_reward_stage2": 0.3006437122821808, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1609 + }, + { + "completion_length": 14.546875, + "epoch": 0.2821096898545646, + "grad_norm": 21.607326402315472, + "kl": 0.1416015625, + "learning_rate": 7.18065533555283e-07, + "loss": 0.022, + "reward": 1.6873608827590942, + "reward_std": 0.19603906571865082, + "rewards/accuracy_reward_stage2": 0.7029858231544495, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1610 + }, + { + "completion_length": 13.75, + "epoch": 0.28228491326441213, + "grad_norm": 15.443224102301414, + "kl": 0.05859375, + "learning_rate": 7.178903101454354e-07, + "loss": 0.0234, + "reward": 1.625, + "reward_std": 0.1157275140285492, + "rewards/accuracy_reward_stage2": 0.75, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1611 + }, + { + "completion_length": 21.28125, + "epoch": 0.2824601366742597, + "grad_norm": 27.345953796390596, + "kl": 0.23046875, + "learning_rate": 7.177150867355879e-07, + "loss": -0.0345, + "reward": 1.4842654466629028, + "reward_std": 0.28159990906715393, + "rewards/accuracy_reward_stage2": 0.5311404466629028, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1612 + }, + { + "completion_length": 10.28125, + "epoch": 0.2826353600841072, + "grad_norm": 10.134441654602675, + "kl": 0.091796875, + "learning_rate": 7.175398633257403e-07, + "loss": -0.0052, + "reward": 1.9131689071655273, + "reward_std": 0.1398359090089798, + "rewards/accuracy_reward_stage2": 0.9287939667701721, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1613 + }, + { + "completion_length": 9.28125, + "epoch": 0.28281058349395477, + "grad_norm": 21.48645768776718, + "kl": 0.138671875, + "learning_rate": 7.173646399158927e-07, + "loss": 0.0092, + "reward": 1.359375, + "reward_std": 0.32878512144088745, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1614 + }, + { + "completion_length": 15.421875, + "epoch": 0.2829858069038024, + "grad_norm": 19.737474472499912, + "kl": 0.06787109375, + "learning_rate": 7.171894165060451e-07, + "loss": 0.0272, + "reward": 1.5567772388458252, + "reward_std": 0.11542443186044693, + "rewards/accuracy_reward_stage2": 0.68177729845047, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1615 + }, + { + "completion_length": 13.578125, + "epoch": 0.2831610303136499, + "grad_norm": 24.78032036079619, + "kl": 0.1142578125, + "learning_rate": 7.170141930961976e-07, + "loss": 0.0457, + "reward": 1.5786793231964111, + "reward_std": 0.21386365592479706, + "rewards/accuracy_reward_stage2": 0.5786793828010559, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1616 + }, + { + "completion_length": 6.640625, + "epoch": 0.28333625372349747, + "grad_norm": 23.418408766851822, + "kl": 0.2265625, + "learning_rate": 7.1683896968635e-07, + "loss": -0.0405, + "reward": 1.5210037231445312, + "reward_std": 0.34530240297317505, + "rewards/accuracy_reward_stage2": 0.567878782749176, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1617 + }, + { + "completion_length": 9.03125, + "epoch": 0.283511477133345, + "grad_norm": 18.082312553657825, + "kl": 0.11767578125, + "learning_rate": 7.166637462765025e-07, + "loss": 0.0111, + "reward": 1.6185312271118164, + "reward_std": 0.1870284378528595, + "rewards/accuracy_reward_stage2": 0.6497811079025269, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1618 + }, + { + "completion_length": 9.078125, + "epoch": 0.28368670054319256, + "grad_norm": 17.717906115355813, + "kl": 0.031982421875, + "learning_rate": 7.164885228666549e-07, + "loss": 0.0128, + "reward": 1.6676509380340576, + "reward_std": 0.13465890288352966, + "rewards/accuracy_reward_stage2": 0.6676508784294128, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1619 + }, + { + "completion_length": 13.53125, + "epoch": 0.2838619239530401, + "grad_norm": 20.824326637564756, + "kl": 0.138671875, + "learning_rate": 7.163132994568074e-07, + "loss": 0.0557, + "reward": 1.265625, + "reward_std": 0.19044628739356995, + "rewards/accuracy_reward_stage2": 0.390625, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1620 + }, + { + "completion_length": 13.578125, + "epoch": 0.28403714736288765, + "grad_norm": 22.11145305537472, + "kl": 0.1728515625, + "learning_rate": 7.161380760469599e-07, + "loss": 0.0624, + "reward": 1.6914026737213135, + "reward_std": 0.1933891624212265, + "rewards/accuracy_reward_stage2": 0.8320276737213135, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1621 + }, + { + "completion_length": 15.359375, + "epoch": 0.28421237077273526, + "grad_norm": 18.671780608441452, + "kl": 0.04443359375, + "learning_rate": 7.159628526371123e-07, + "loss": -0.0599, + "reward": 1.3597071170806885, + "reward_std": 0.16765311360359192, + "rewards/accuracy_reward_stage2": 0.6409571170806885, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 1622 + }, + { + "completion_length": 9.5, + "epoch": 0.2843875941825828, + "grad_norm": 19.783864825763466, + "kl": 0.07080078125, + "learning_rate": 7.157876292272648e-07, + "loss": -0.0091, + "reward": 1.5630043745040894, + "reward_std": 0.2638583779335022, + "rewards/accuracy_reward_stage2": 0.7036293745040894, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1623 + }, + { + "completion_length": 11.09375, + "epoch": 0.28456281759243035, + "grad_norm": 18.045229465095055, + "kl": 0.0634765625, + "learning_rate": 7.156124058174173e-07, + "loss": 0.0254, + "reward": 1.6278626918792725, + "reward_std": 0.16786369681358337, + "rewards/accuracy_reward_stage2": 0.627862811088562, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1624 + }, + { + "completion_length": 11.0, + "epoch": 0.2847380410022779, + "grad_norm": 22.45995158178718, + "kl": 0.126953125, + "learning_rate": 7.154371824075697e-07, + "loss": 0.0505, + "reward": 1.616892695426941, + "reward_std": 0.14647267758846283, + "rewards/accuracy_reward_stage2": 0.6168926954269409, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1625 + }, + { + "completion_length": 9.015625, + "epoch": 0.28491326441212544, + "grad_norm": 16.218214133076454, + "kl": 0.224609375, + "learning_rate": 7.15261958997722e-07, + "loss": 0.0478, + "reward": 1.5607107877731323, + "reward_std": 0.15622293949127197, + "rewards/accuracy_reward_stage2": 0.7013357281684875, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1626 + }, + { + "completion_length": 10.0, + "epoch": 0.285088487821973, + "grad_norm": 18.28017136924628, + "kl": 0.1630859375, + "learning_rate": 7.150867355878744e-07, + "loss": 0.0653, + "reward": 1.3840994834899902, + "reward_std": 0.1972706913948059, + "rewards/accuracy_reward_stage2": 0.38409942388534546, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1627 + }, + { + "completion_length": 10.34375, + "epoch": 0.2852637112318206, + "grad_norm": 18.30002553390304, + "kl": 0.0162353515625, + "learning_rate": 7.149115121780269e-07, + "loss": 0.0065, + "reward": 1.625, + "reward_std": 0.0883883461356163, + "rewards/accuracy_reward_stage2": 0.75, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1628 + }, + { + "completion_length": 6.28125, + "epoch": 0.28543893464166814, + "grad_norm": 18.105026194272845, + "kl": 0.09619140625, + "learning_rate": 7.147362887681794e-07, + "loss": 0.0385, + "reward": 1.465050220489502, + "reward_std": 0.14513222873210907, + "rewards/accuracy_reward_stage2": 0.46505028009414673, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1629 + }, + { + "completion_length": 10.890625, + "epoch": 0.2856141580515157, + "grad_norm": 16.56397158084368, + "kl": 0.1298828125, + "learning_rate": 7.145610653583318e-07, + "loss": 0.0077, + "reward": 1.2697676420211792, + "reward_std": 0.25002607703208923, + "rewards/accuracy_reward_stage2": 0.5353926420211792, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1630 + }, + { + "completion_length": 14.4375, + "epoch": 0.28578938146136323, + "grad_norm": 19.65409355902699, + "kl": 0.1279296875, + "learning_rate": 7.143858419484843e-07, + "loss": 0.0514, + "reward": 1.4833333492279053, + "reward_std": 0.21255075931549072, + "rewards/accuracy_reward_stage2": 0.8583333492279053, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1631 + }, + { + "completion_length": 6.5, + "epoch": 0.2859646048712108, + "grad_norm": 25.046007821442476, + "kl": 0.076171875, + "learning_rate": 7.142106185386368e-07, + "loss": 0.0305, + "reward": 1.5952610969543457, + "reward_std": 0.22790226340293884, + "rewards/accuracy_reward_stage2": 0.5952612161636353, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1632 + }, + { + "completion_length": 9.78125, + "epoch": 0.28613982828105833, + "grad_norm": 22.298551409705386, + "kl": 0.078125, + "learning_rate": 7.140353951287892e-07, + "loss": -0.0022, + "reward": 1.5244977474212646, + "reward_std": 0.33707937598228455, + "rewards/accuracy_reward_stage2": 0.6651226282119751, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1633 + }, + { + "completion_length": 13.3125, + "epoch": 0.28631505169090593, + "grad_norm": 19.77506952533496, + "kl": 0.26171875, + "learning_rate": 7.138601717189417e-07, + "loss": 0.0391, + "reward": 1.274181604385376, + "reward_std": 0.18674418330192566, + "rewards/accuracy_reward_stage2": 0.4304315447807312, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1634 + }, + { + "completion_length": 7.359375, + "epoch": 0.2864902751007535, + "grad_norm": 22.0114841950705, + "kl": 0.087890625, + "learning_rate": 7.13684948309094e-07, + "loss": -0.009, + "reward": 1.5875247716903687, + "reward_std": 0.31310462951660156, + "rewards/accuracy_reward_stage2": 0.6031497716903687, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1635 + }, + { + "completion_length": 17.546875, + "epoch": 0.286665498510601, + "grad_norm": 17.453595183630732, + "kl": 0.028564453125, + "learning_rate": 7.135097248992465e-07, + "loss": -0.0294, + "reward": 1.7066229581832886, + "reward_std": 0.1509627103805542, + "rewards/accuracy_reward_stage2": 0.7222478985786438, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1636 + }, + { + "completion_length": 16.171875, + "epoch": 0.28684072192044857, + "grad_norm": 25.26047295052692, + "kl": 0.345703125, + "learning_rate": 7.13334501489399e-07, + "loss": 0.1135, + "reward": 1.2085933685302734, + "reward_std": 0.22050847113132477, + "rewards/accuracy_reward_stage2": 0.47421833872795105, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1637 + }, + { + "completion_length": 10.828125, + "epoch": 0.2870159453302961, + "grad_norm": 29.875900138268193, + "kl": 0.07861328125, + "learning_rate": 7.131592780795513e-07, + "loss": 0.0316, + "reward": 1.5491917133331299, + "reward_std": 0.2803182899951935, + "rewards/accuracy_reward_stage2": 0.5491916537284851, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1638 + }, + { + "completion_length": 25.75, + "epoch": 0.28719116874014367, + "grad_norm": 20.93227222442535, + "kl": 0.1103515625, + "learning_rate": 7.129840546697038e-07, + "loss": -0.0201, + "reward": 1.324310302734375, + "reward_std": 0.26539939641952515, + "rewards/accuracy_reward_stage2": 0.4805603623390198, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1639 + }, + { + "completion_length": 8.5625, + "epoch": 0.2873663921499912, + "grad_norm": 14.537766364808704, + "kl": 0.1328125, + "learning_rate": 7.128088312598563e-07, + "loss": -0.0141, + "reward": 1.6957449913024902, + "reward_std": 0.17225536704063416, + "rewards/accuracy_reward_stage2": 0.7269949913024902, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1640 + }, + { + "completion_length": 11.609375, + "epoch": 0.2875416155598388, + "grad_norm": 20.96258079590175, + "kl": 0.056884765625, + "learning_rate": 7.126336078500087e-07, + "loss": -0.0858, + "reward": 1.7480556964874268, + "reward_std": 0.3304206430912018, + "rewards/accuracy_reward_stage2": 0.7949306964874268, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1641 + }, + { + "completion_length": 7.65625, + "epoch": 0.28771683896968636, + "grad_norm": 23.292493455122358, + "kl": 0.0751953125, + "learning_rate": 7.124583844401612e-07, + "loss": 0.03, + "reward": 1.7483609914779663, + "reward_std": 0.19954615831375122, + "rewards/accuracy_reward_stage2": 0.7483609914779663, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1642 + }, + { + "completion_length": 26.328125, + "epoch": 0.2878920623795339, + "grad_norm": 17.8083297445197, + "kl": 0.0439453125, + "learning_rate": 7.122831610303136e-07, + "loss": 0.0176, + "reward": 1.6547987461090088, + "reward_std": 0.17785227298736572, + "rewards/accuracy_reward_stage2": 0.6547987461090088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1643 + }, + { + "completion_length": 12.703125, + "epoch": 0.28806728578938146, + "grad_norm": 30.90286646759068, + "kl": 0.1396484375, + "learning_rate": 7.121079376204661e-07, + "loss": 0.0559, + "reward": 1.5476398468017578, + "reward_std": 0.23930677771568298, + "rewards/accuracy_reward_stage2": 0.6726399660110474, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1644 + }, + { + "completion_length": 8.625, + "epoch": 0.288242509199229, + "grad_norm": 19.56818403132419, + "kl": 0.1806640625, + "learning_rate": 7.119327142106185e-07, + "loss": 0.0346, + "reward": 1.5129913091659546, + "reward_std": 0.22683054208755493, + "rewards/accuracy_reward_stage2": 0.6536163091659546, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1645 + }, + { + "completion_length": 8.5, + "epoch": 0.28841773260907655, + "grad_norm": 19.170340208463543, + "kl": 0.212890625, + "learning_rate": 7.117574908007709e-07, + "loss": 0.0346, + "reward": 1.6334724426269531, + "reward_std": 0.17795699834823608, + "rewards/accuracy_reward_stage2": 0.7897223830223083, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1646 + }, + { + "completion_length": 9.375, + "epoch": 0.28859295601892415, + "grad_norm": 15.89994694611484, + "kl": 0.12890625, + "learning_rate": 7.115822673909234e-07, + "loss": 0.0516, + "reward": 1.3058521747589111, + "reward_std": 0.17969033122062683, + "rewards/accuracy_reward_stage2": 0.43085217475891113, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1647 + }, + { + "completion_length": 7.84375, + "epoch": 0.2887681794287717, + "grad_norm": 23.01958952222859, + "kl": 0.1611328125, + "learning_rate": 7.114070439810759e-07, + "loss": -0.0225, + "reward": 1.4997072219848633, + "reward_std": 0.208816260099411, + "rewards/accuracy_reward_stage2": 0.5309572219848633, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1648 + }, + { + "completion_length": 10.84375, + "epoch": 0.28894340283861925, + "grad_norm": 17.99364116397282, + "kl": 0.0400390625, + "learning_rate": 7.112318205712283e-07, + "loss": 0.016, + "reward": 1.4152864217758179, + "reward_std": 0.16304050385951996, + "rewards/accuracy_reward_stage2": 0.5402864217758179, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1649 + }, + { + "completion_length": 10.765625, + "epoch": 0.2891186262484668, + "grad_norm": 22.290912720435063, + "kl": 0.032958984375, + "learning_rate": 7.110565971613808e-07, + "loss": 0.0132, + "reward": 1.2604167461395264, + "reward_std": 0.20825409889221191, + "rewards/accuracy_reward_stage2": 0.2604166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1650 + }, + { + "completion_length": 6.515625, + "epoch": 0.28929384965831434, + "grad_norm": 16.86284479055235, + "kl": 0.049072265625, + "learning_rate": 7.108813737515331e-07, + "loss": -0.0245, + "reward": 1.6870609521865845, + "reward_std": 0.14026181399822235, + "rewards/accuracy_reward_stage2": 0.7026859521865845, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1651 + }, + { + "completion_length": 8.046875, + "epoch": 0.2894690730681619, + "grad_norm": 23.0356528349917, + "kl": 0.055419921875, + "learning_rate": 7.107061503416856e-07, + "loss": -0.0152, + "reward": 1.5876586437225342, + "reward_std": 0.2479107826948166, + "rewards/accuracy_reward_stage2": 0.6032836437225342, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1652 + }, + { + "completion_length": 11.5625, + "epoch": 0.2896442964780095, + "grad_norm": 26.79041801998364, + "kl": 0.07568359375, + "learning_rate": 7.105309269318381e-07, + "loss": 0.0014, + "reward": 1.5761586427688599, + "reward_std": 0.2695969343185425, + "rewards/accuracy_reward_stage2": 0.7167835831642151, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1653 + }, + { + "completion_length": 13.171875, + "epoch": 0.28981951988785704, + "grad_norm": 12.232697936613171, + "kl": 0.06640625, + "learning_rate": 7.103557035219905e-07, + "loss": 0.0266, + "reward": 1.479015588760376, + "reward_std": 0.10361681878566742, + "rewards/accuracy_reward_stage2": 0.4790155291557312, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1654 + }, + { + "completion_length": 10.953125, + "epoch": 0.2899947432977046, + "grad_norm": 18.931813636993038, + "kl": 0.11767578125, + "learning_rate": 7.101804801121429e-07, + "loss": 0.0156, + "reward": 1.6782610416412354, + "reward_std": 0.2642056345939636, + "rewards/accuracy_reward_stage2": 0.8188860416412354, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1655 + }, + { + "completion_length": 19.78125, + "epoch": 0.29016996670755213, + "grad_norm": 29.698298899171085, + "kl": 0.166015625, + "learning_rate": 7.100052567022954e-07, + "loss": 0.0663, + "reward": 1.455777645111084, + "reward_std": 0.2696455121040344, + "rewards/accuracy_reward_stage2": 0.45577773451805115, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1656 + }, + { + "completion_length": 9.390625, + "epoch": 0.2903451901173997, + "grad_norm": 20.362922155821412, + "kl": 0.09130859375, + "learning_rate": 7.098300332924478e-07, + "loss": 0.0364, + "reward": 1.796497106552124, + "reward_std": 0.16053104400634766, + "rewards/accuracy_reward_stage2": 0.7964969873428345, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1657 + }, + { + "completion_length": 8.546875, + "epoch": 0.2905204135272472, + "grad_norm": 16.360212652099975, + "kl": 0.0908203125, + "learning_rate": 7.096548098826003e-07, + "loss": 0.0363, + "reward": 1.8485822677612305, + "reward_std": 0.17348849773406982, + "rewards/accuracy_reward_stage2": 0.8485823273658752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1658 + }, + { + "completion_length": 9.859375, + "epoch": 0.29069563693709477, + "grad_norm": 29.315179459450466, + "kl": 0.044921875, + "learning_rate": 7.094795864727527e-07, + "loss": 0.0179, + "reward": 1.810223937034607, + "reward_std": 0.21215862035751343, + "rewards/accuracy_reward_stage2": 0.8102238774299622, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1659 + }, + { + "completion_length": 7.203125, + "epoch": 0.2908708603469424, + "grad_norm": 17.962626858687273, + "kl": 0.111328125, + "learning_rate": 7.093043630629052e-07, + "loss": -0.0129, + "reward": 1.331225872039795, + "reward_std": 0.21827349066734314, + "rewards/accuracy_reward_stage2": 0.48747581243515015, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1660 + }, + { + "completion_length": 8.25, + "epoch": 0.2910460837567899, + "grad_norm": 16.764163271747165, + "kl": 0.115234375, + "learning_rate": 7.091291396530577e-07, + "loss": 0.0182, + "reward": 1.455185055732727, + "reward_std": 0.1520254909992218, + "rewards/accuracy_reward_stage2": 0.595810055732727, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1661 + }, + { + "completion_length": 17.5, + "epoch": 0.29122130716663747, + "grad_norm": 19.947587748612907, + "kl": 0.2578125, + "learning_rate": 7.089539162432101e-07, + "loss": -0.0365, + "reward": 1.4499015808105469, + "reward_std": 0.32634487748146057, + "rewards/accuracy_reward_stage2": 0.7624015808105469, + "rewards/format_reward_stage1_pointerpad": 0.6875, + "scores/accuracy_reward_stage2": 0.6875, + "step": 1662 + }, + { + "completion_length": 12.78125, + "epoch": 0.291396530576485, + "grad_norm": 24.9274809883867, + "kl": 0.40234375, + "learning_rate": 7.087786928333626e-07, + "loss": 0.1166, + "reward": 1.428887963294983, + "reward_std": 0.12139017879962921, + "rewards/accuracy_reward_stage2": 0.6945129632949829, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1663 + }, + { + "completion_length": 10.203125, + "epoch": 0.29157175398633256, + "grad_norm": 18.81552695245245, + "kl": 0.1279296875, + "learning_rate": 7.086034694235148e-07, + "loss": 0.051, + "reward": 1.5833933353424072, + "reward_std": 0.2008472979068756, + "rewards/accuracy_reward_stage2": 0.7083932161331177, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1664 + }, + { + "completion_length": 16.46875, + "epoch": 0.2917469773961801, + "grad_norm": 19.36111770410725, + "kl": 0.314453125, + "learning_rate": 7.084282460136673e-07, + "loss": 0.1258, + "reward": 1.2535412311553955, + "reward_std": 0.12364614009857178, + "rewards/accuracy_reward_stage2": 0.5035412311553955, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1665 + }, + { + "completion_length": 9.9375, + "epoch": 0.2919222008060277, + "grad_norm": 21.804938787404357, + "kl": 0.140625, + "learning_rate": 7.082530226038198e-07, + "loss": 0.0118, + "reward": 1.6458333730697632, + "reward_std": 0.1695856750011444, + "rewards/accuracy_reward_stage2": 0.6614583730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1666 + }, + { + "completion_length": 8.9375, + "epoch": 0.29209742421587526, + "grad_norm": 15.49891530709726, + "kl": 0.08544921875, + "learning_rate": 7.080777991939722e-07, + "loss": -0.0101, + "reward": 1.6766133308410645, + "reward_std": 0.19475360214710236, + "rewards/accuracy_reward_stage2": 0.8172383904457092, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1667 + }, + { + "completion_length": 15.125, + "epoch": 0.2922726476257228, + "grad_norm": 16.128066127735444, + "kl": 0.0113525390625, + "learning_rate": 7.079025757841247e-07, + "loss": 0.0046, + "reward": 1.5276246070861816, + "reward_std": 0.09870946407318115, + "rewards/accuracy_reward_stage2": 0.5276245474815369, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1668 + }, + { + "completion_length": 7.46875, + "epoch": 0.29244787103557035, + "grad_norm": 23.841239880152038, + "kl": 0.08447265625, + "learning_rate": 7.077273523742772e-07, + "loss": -0.0079, + "reward": 1.5087754726409912, + "reward_std": 0.15740589797496796, + "rewards/accuracy_reward_stage2": 0.5244004726409912, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1669 + }, + { + "completion_length": 10.21875, + "epoch": 0.2926230944454179, + "grad_norm": 15.687482528874513, + "kl": 0.10986328125, + "learning_rate": 7.075521289644296e-07, + "loss": 0.0439, + "reward": 1.417892575263977, + "reward_std": 0.06578870862722397, + "rewards/accuracy_reward_stage2": 0.667892575263977, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1670 + }, + { + "completion_length": 12.421875, + "epoch": 0.29279831785526544, + "grad_norm": 18.25270654941957, + "kl": 0.12255859375, + "learning_rate": 7.073769055545821e-07, + "loss": 0.0489, + "reward": 1.4978927373886108, + "reward_std": 0.14950445294380188, + "rewards/accuracy_reward_stage2": 0.6228927373886108, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1671 + }, + { + "completion_length": 20.21875, + "epoch": 0.29297354126511305, + "grad_norm": 96.1894213436714, + "kl": 0.5625, + "learning_rate": 7.072016821447345e-07, + "loss": 0.1609, + "reward": 1.3571423292160034, + "reward_std": 0.20367828011512756, + "rewards/accuracy_reward_stage2": 0.5133923292160034, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1672 + }, + { + "completion_length": 6.625, + "epoch": 0.2931487646749606, + "grad_norm": 20.343287663887203, + "kl": 0.06005859375, + "learning_rate": 7.07026458734887e-07, + "loss": 0.0241, + "reward": 1.7696726322174072, + "reward_std": 0.2620747983455658, + "rewards/accuracy_reward_stage2": 0.7696726322174072, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1673 + }, + { + "completion_length": 17.421875, + "epoch": 0.29332398808480814, + "grad_norm": 30.667524608470384, + "kl": 0.1796875, + "learning_rate": 7.068512353250395e-07, + "loss": 0.0493, + "reward": 1.5008697509765625, + "reward_std": 0.11915256083011627, + "rewards/accuracy_reward_stage2": 0.6258696913719177, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1674 + }, + { + "completion_length": 9.671875, + "epoch": 0.2934992114946557, + "grad_norm": 21.4828702837734, + "kl": 0.07373046875, + "learning_rate": 7.066760119151918e-07, + "loss": 0.0293, + "reward": 1.3638755083084106, + "reward_std": 0.1987563669681549, + "rewards/accuracy_reward_stage2": 0.36387550830841064, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1675 + }, + { + "completion_length": 11.5625, + "epoch": 0.29367443490450323, + "grad_norm": 21.590194337992045, + "kl": 0.13671875, + "learning_rate": 7.065007885053443e-07, + "loss": 0.0924, + "reward": 1.173264741897583, + "reward_std": 0.16812384128570557, + "rewards/accuracy_reward_stage2": 0.29826462268829346, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1676 + }, + { + "completion_length": 14.65625, + "epoch": 0.2938496583143508, + "grad_norm": 23.505825274296527, + "kl": 0.10791015625, + "learning_rate": 7.063255650954967e-07, + "loss": 0.0431, + "reward": 1.5011882781982422, + "reward_std": 0.2176738977432251, + "rewards/accuracy_reward_stage2": 0.6261882185935974, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1677 + }, + { + "completion_length": 4.390625, + "epoch": 0.29402488172419833, + "grad_norm": 8.510244036546899, + "kl": 0.0693359375, + "learning_rate": 7.061503416856491e-07, + "loss": -0.0165, + "reward": 1.8720643520355225, + "reward_std": 0.09669148921966553, + "rewards/accuracy_reward_stage2": 0.8876894116401672, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1678 + }, + { + "completion_length": 9.265625, + "epoch": 0.29420010513404593, + "grad_norm": 25.477170919588477, + "kl": 0.1328125, + "learning_rate": 7.059751182758016e-07, + "loss": -0.0351, + "reward": 1.716698169708252, + "reward_std": 0.3310700058937073, + "rewards/accuracy_reward_stage2": 0.7479482293128967, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1679 + }, + { + "completion_length": 9.46875, + "epoch": 0.2943753285438935, + "grad_norm": 24.03928992881974, + "kl": 0.0615234375, + "learning_rate": 7.05799894865954e-07, + "loss": 0.0247, + "reward": 1.6673147678375244, + "reward_std": 0.166721373796463, + "rewards/accuracy_reward_stage2": 0.6673146486282349, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1680 + }, + { + "completion_length": 7.6875, + "epoch": 0.294550551953741, + "grad_norm": 16.872918992455006, + "kl": 0.09423828125, + "learning_rate": 7.056246714561065e-07, + "loss": -0.0064, + "reward": 1.4720426797866821, + "reward_std": 0.2335229367017746, + "rewards/accuracy_reward_stage2": 0.48766764998435974, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1681 + }, + { + "completion_length": 11.65625, + "epoch": 0.29472577536358857, + "grad_norm": 21.510050629437227, + "kl": 0.10107421875, + "learning_rate": 7.05449448046259e-07, + "loss": -0.014, + "reward": 1.7428269386291504, + "reward_std": 0.2706993818283081, + "rewards/accuracy_reward_stage2": 0.7740768194198608, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1682 + }, + { + "completion_length": 9.0, + "epoch": 0.2949009987734361, + "grad_norm": 18.79166684396821, + "kl": 0.060546875, + "learning_rate": 7.052742246364114e-07, + "loss": -0.0199, + "reward": 1.8116947412490845, + "reward_std": 0.17610837519168854, + "rewards/accuracy_reward_stage2": 0.8273198008537292, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1683 + }, + { + "completion_length": 23.0625, + "epoch": 0.29507622218328367, + "grad_norm": 16.363578535268406, + "kl": 0.1708984375, + "learning_rate": 7.050990012265639e-07, + "loss": 0.0684, + "reward": 1.6377060413360596, + "reward_std": 0.08054365962743759, + "rewards/accuracy_reward_stage2": 0.7627060413360596, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1684 + }, + { + "completion_length": 15.140625, + "epoch": 0.29525144559313127, + "grad_norm": 15.03309143820295, + "kl": 0.06396484375, + "learning_rate": 7.049237778167163e-07, + "loss": 0.0256, + "reward": 1.1309211254119873, + "reward_std": 0.11928550899028778, + "rewards/accuracy_reward_stage2": 0.2559211850166321, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1685 + }, + { + "completion_length": 10.71875, + "epoch": 0.2954266690029788, + "grad_norm": 12.656645795151226, + "kl": 0.0308837890625, + "learning_rate": 7.047485544068687e-07, + "loss": 0.0123, + "reward": 1.7976956367492676, + "reward_std": 0.04943205416202545, + "rewards/accuracy_reward_stage2": 0.7976956367492676, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1686 + }, + { + "completion_length": 11.734375, + "epoch": 0.29560189241282636, + "grad_norm": 24.12905263102765, + "kl": 0.12109375, + "learning_rate": 7.045733309970212e-07, + "loss": -0.0167, + "reward": 1.6340982913970947, + "reward_std": 0.2528322637081146, + "rewards/accuracy_reward_stage2": 0.6653482913970947, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1687 + }, + { + "completion_length": 8.953125, + "epoch": 0.2957771158226739, + "grad_norm": 19.304379581875406, + "kl": 0.1611328125, + "learning_rate": 7.043981075871736e-07, + "loss": -0.013, + "reward": 1.532942295074463, + "reward_std": 0.21373668313026428, + "rewards/accuracy_reward_stage2": 0.6891922950744629, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1688 + }, + { + "completion_length": 25.125, + "epoch": 0.29595233923252146, + "grad_norm": 20.032788690640114, + "kl": 0.06689453125, + "learning_rate": 7.04222884177326e-07, + "loss": 0.0268, + "reward": 1.6118648052215576, + "reward_std": 0.14255890250205994, + "rewards/accuracy_reward_stage2": 0.6118648052215576, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1689 + }, + { + "completion_length": 9.515625, + "epoch": 0.296127562642369, + "grad_norm": 23.07272201883303, + "kl": 0.1396484375, + "learning_rate": 7.040476607674785e-07, + "loss": 0.0285, + "reward": 1.4711434841156006, + "reward_std": 0.2035903036594391, + "rewards/accuracy_reward_stage2": 0.4867684245109558, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1690 + }, + { + "completion_length": 22.375, + "epoch": 0.29630278605221655, + "grad_norm": 91.94085271560532, + "kl": 0.84765625, + "learning_rate": 7.038724373576309e-07, + "loss": 0.3547, + "reward": 1.5097134113311768, + "reward_std": 0.231087327003479, + "rewards/accuracy_reward_stage2": 0.7597134709358215, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1691 + }, + { + "completion_length": 7.796875, + "epoch": 0.29647800946206415, + "grad_norm": 19.91389494985646, + "kl": 0.1533203125, + "learning_rate": 7.036972139477834e-07, + "loss": -0.0047, + "reward": 1.438263177871704, + "reward_std": 0.2388438731431961, + "rewards/accuracy_reward_stage2": 0.5945132374763489, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1692 + }, + { + "completion_length": 12.734375, + "epoch": 0.2966532328719117, + "grad_norm": 18.86150756027041, + "kl": 0.064453125, + "learning_rate": 7.035219905379359e-07, + "loss": 0.0259, + "reward": 1.7015407085418701, + "reward_std": 0.21713702380657196, + "rewards/accuracy_reward_stage2": 0.8265406489372253, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1693 + }, + { + "completion_length": 12.46875, + "epoch": 0.29682845628175925, + "grad_norm": 8.4688183375784, + "kl": 0.2431640625, + "learning_rate": 7.033467671280882e-07, + "loss": 0.053, + "reward": 1.265625, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.40625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1694 + }, + { + "completion_length": 17.296875, + "epoch": 0.2970036796916068, + "grad_norm": 22.47149168439553, + "kl": 0.34375, + "learning_rate": 7.031715437182407e-07, + "loss": 0.1372, + "reward": 1.2965278625488281, + "reward_std": 0.11467509716749191, + "rewards/accuracy_reward_stage2": 0.5465278625488281, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1695 + }, + { + "completion_length": 7.0625, + "epoch": 0.29717890310145434, + "grad_norm": 21.669890151890225, + "kl": 0.13671875, + "learning_rate": 7.029963203083931e-07, + "loss": 0.0549, + "reward": 1.6818768978118896, + "reward_std": 0.14322030544281006, + "rewards/accuracy_reward_stage2": 0.6818768382072449, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1696 + }, + { + "completion_length": 8.3125, + "epoch": 0.2973541265113019, + "grad_norm": 14.261060175777834, + "kl": 0.0279541015625, + "learning_rate": 7.028210968985456e-07, + "loss": -0.0217, + "reward": 1.7881548404693604, + "reward_std": 0.16872760653495789, + "rewards/accuracy_reward_stage2": 0.8037798404693604, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1697 + }, + { + "completion_length": 9.59375, + "epoch": 0.2975293499211495, + "grad_norm": 17.725080767100014, + "kl": 0.072265625, + "learning_rate": 7.026458734886981e-07, + "loss": -0.0152, + "reward": 1.4075841903686523, + "reward_std": 0.1767999678850174, + "rewards/accuracy_reward_stage2": 0.5482091903686523, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1698 + }, + { + "completion_length": 12.6875, + "epoch": 0.29770457333099704, + "grad_norm": 23.378929824865686, + "kl": 0.26953125, + "learning_rate": 7.024706500788505e-07, + "loss": -0.0099, + "reward": 1.4340919256210327, + "reward_std": 0.3488747477531433, + "rewards/accuracy_reward_stage2": 0.6059669256210327, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1699 + }, + { + "completion_length": 12.5, + "epoch": 0.2978797967408446, + "grad_norm": 24.027023608537906, + "kl": 0.126953125, + "learning_rate": 7.02295426669003e-07, + "loss": -0.0246, + "reward": 1.2346971035003662, + "reward_std": 0.4185022711753845, + "rewards/accuracy_reward_stage2": 0.390947163105011, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1700 + }, + { + "completion_length": 12.75, + "epoch": 0.29805502015069213, + "grad_norm": 21.65595446347219, + "kl": 0.125, + "learning_rate": 7.021202032591555e-07, + "loss": -0.0053, + "reward": 1.5966134071350098, + "reward_std": 0.3082408905029297, + "rewards/accuracy_reward_stage2": 0.6278634667396545, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1701 + }, + { + "completion_length": 11.078125, + "epoch": 0.2982302435605397, + "grad_norm": 23.075324456091533, + "kl": 0.125, + "learning_rate": 7.019449798493078e-07, + "loss": 0.05, + "reward": 1.47487473487854, + "reward_std": 0.21936720609664917, + "rewards/accuracy_reward_stage2": 0.5998746752738953, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1702 + }, + { + "completion_length": 12.8125, + "epoch": 0.2984054669703872, + "grad_norm": 23.632321001409643, + "kl": 0.291015625, + "learning_rate": 7.017697564394603e-07, + "loss": 0.1002, + "reward": 1.6295795440673828, + "reward_std": 0.34378618001937866, + "rewards/accuracy_reward_stage2": 0.7545795440673828, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1703 + }, + { + "completion_length": 12.359375, + "epoch": 0.2985806903802348, + "grad_norm": 26.175977719061464, + "kl": 0.369140625, + "learning_rate": 7.015945330296126e-07, + "loss": 0.126, + "reward": 1.4780054092407227, + "reward_std": 0.2588563561439514, + "rewards/accuracy_reward_stage2": 0.6186305284500122, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1704 + }, + { + "completion_length": 10.3125, + "epoch": 0.2987559137900824, + "grad_norm": 22.18689402119106, + "kl": 0.15234375, + "learning_rate": 7.014193096197651e-07, + "loss": -0.0272, + "reward": 1.6308624744415283, + "reward_std": 0.24226665496826172, + "rewards/accuracy_reward_stage2": 0.6621125340461731, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1705 + }, + { + "completion_length": 8.71875, + "epoch": 0.2989311371999299, + "grad_norm": 12.44901374803875, + "kl": 0.0634765625, + "learning_rate": 7.012440862099176e-07, + "loss": -0.0629, + "reward": 1.558675765991211, + "reward_std": 0.15706884860992432, + "rewards/accuracy_reward_stage2": 0.7149257063865662, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1706 + }, + { + "completion_length": 8.9375, + "epoch": 0.29910636060977747, + "grad_norm": 18.19321123021985, + "kl": 0.14453125, + "learning_rate": 7.0106886280007e-07, + "loss": 0.0263, + "reward": 1.467024326324463, + "reward_std": 0.23365125060081482, + "rewards/accuracy_reward_stage2": 0.48264938592910767, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1707 + }, + { + "completion_length": 9.96875, + "epoch": 0.299281584019625, + "grad_norm": 23.959214635034463, + "kl": 0.138671875, + "learning_rate": 7.008936393902225e-07, + "loss": 0.0553, + "reward": 1.6607258319854736, + "reward_std": 0.3005909323692322, + "rewards/accuracy_reward_stage2": 0.6607259511947632, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1708 + }, + { + "completion_length": 12.984375, + "epoch": 0.29945680742947256, + "grad_norm": 23.925526061688014, + "kl": 0.07958984375, + "learning_rate": 7.00718415980375e-07, + "loss": 0.0318, + "reward": 1.6628730297088623, + "reward_std": 0.16425105929374695, + "rewards/accuracy_reward_stage2": 0.6628729701042175, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1709 + }, + { + "completion_length": 29.578125, + "epoch": 0.2996320308393201, + "grad_norm": 25.19467130350549, + "kl": 0.19140625, + "learning_rate": 7.005431925705274e-07, + "loss": 0.0827, + "reward": 1.296067714691162, + "reward_std": 0.17308276891708374, + "rewards/accuracy_reward_stage2": 0.4366927146911621, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1710 + }, + { + "completion_length": 7.5, + "epoch": 0.2998072542491677, + "grad_norm": 19.493516475996053, + "kl": 0.1123046875, + "learning_rate": 7.003679691606799e-07, + "loss": 0.0007, + "reward": 1.6939597129821777, + "reward_std": 0.2620150148868561, + "rewards/accuracy_reward_stage2": 0.8345847129821777, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1711 + }, + { + "completion_length": 9.25, + "epoch": 0.29998247765901526, + "grad_norm": 19.677259863326938, + "kl": 0.04052734375, + "learning_rate": 7.001927457508323e-07, + "loss": -0.028, + "reward": 1.6832020282745361, + "reward_std": 0.19173334538936615, + "rewards/accuracy_reward_stage2": 0.6988270282745361, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1712 + }, + { + "completion_length": 10.46875, + "epoch": 0.3001577010688628, + "grad_norm": 21.43673387663666, + "kl": 0.1669921875, + "learning_rate": 7.000175223409848e-07, + "loss": -0.0295, + "reward": 1.6994589567184448, + "reward_std": 0.2502681314945221, + "rewards/accuracy_reward_stage2": 0.7463339567184448, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1713 + }, + { + "completion_length": 10.484375, + "epoch": 0.30033292447871035, + "grad_norm": 19.202934091736996, + "kl": 0.1337890625, + "learning_rate": 6.998422989311373e-07, + "loss": 0.0535, + "reward": 1.3423008918762207, + "reward_std": 0.27192068099975586, + "rewards/accuracy_reward_stage2": 0.3423008322715759, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1714 + }, + { + "completion_length": 18.484375, + "epoch": 0.3005081478885579, + "grad_norm": 21.518890464068733, + "kl": 0.06298828125, + "learning_rate": 6.996670755212895e-07, + "loss": 0.0253, + "reward": 1.4944093227386475, + "reward_std": 0.17470549046993256, + "rewards/accuracy_reward_stage2": 0.49440932273864746, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1715 + }, + { + "completion_length": 17.796875, + "epoch": 0.30068337129840544, + "grad_norm": 48.596822529684296, + "kl": 0.482421875, + "learning_rate": 6.99491852111442e-07, + "loss": 0.0966, + "reward": 1.3591495752334595, + "reward_std": 0.17734628915786743, + "rewards/accuracy_reward_stage2": 0.5310245752334595, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1716 + }, + { + "completion_length": 16.953125, + "epoch": 0.30085859470825305, + "grad_norm": 23.366423040036004, + "kl": 0.040283203125, + "learning_rate": 6.993166287015945e-07, + "loss": -0.007, + "reward": 1.3081918954849243, + "reward_std": 0.19678080081939697, + "rewards/accuracy_reward_stage2": 0.3238169252872467, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1717 + }, + { + "completion_length": 15.875, + "epoch": 0.3010338181181006, + "grad_norm": 15.664885734367475, + "kl": 0.057861328125, + "learning_rate": 6.991414052917469e-07, + "loss": -0.0651, + "reward": 1.781704306602478, + "reward_std": 0.19567933678627014, + "rewards/accuracy_reward_stage2": 0.8129542469978333, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1718 + }, + { + "completion_length": 7.90625, + "epoch": 0.30120904152794814, + "grad_norm": 18.809252110481758, + "kl": 0.0830078125, + "learning_rate": 6.989661818818994e-07, + "loss": 0.012, + "reward": 1.4171596765518188, + "reward_std": 0.21570885181427002, + "rewards/accuracy_reward_stage2": 0.6827847957611084, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1719 + }, + { + "completion_length": 9.4375, + "epoch": 0.3013842649377957, + "grad_norm": 22.51829370190563, + "kl": 0.052978515625, + "learning_rate": 6.987909584720518e-07, + "loss": 0.0212, + "reward": 1.703669548034668, + "reward_std": 0.2621934711933136, + "rewards/accuracy_reward_stage2": 0.7036696076393127, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1720 + }, + { + "completion_length": 8.71875, + "epoch": 0.30155948834764323, + "grad_norm": 21.091726299660664, + "kl": 0.052734375, + "learning_rate": 6.986157350622043e-07, + "loss": 0.0211, + "reward": 1.749462604522705, + "reward_std": 0.1858448088169098, + "rewards/accuracy_reward_stage2": 0.7494626045227051, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1721 + }, + { + "completion_length": 8.734375, + "epoch": 0.3017347117574908, + "grad_norm": 25.56784738107091, + "kl": 0.1962890625, + "learning_rate": 6.984405116523568e-07, + "loss": -0.065, + "reward": 1.4051895141601562, + "reward_std": 0.35617536306381226, + "rewards/accuracy_reward_stage2": 0.4676896333694458, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1722 + }, + { + "completion_length": 22.90625, + "epoch": 0.3019099351673384, + "grad_norm": 24.982174793589298, + "kl": 0.12255859375, + "learning_rate": 6.982652882425092e-07, + "loss": -0.0369, + "reward": 1.7698495388031006, + "reward_std": 0.23711535334587097, + "rewards/accuracy_reward_stage2": 0.8010995984077454, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1723 + }, + { + "completion_length": 9.96875, + "epoch": 0.30208515857718593, + "grad_norm": 21.534866823854347, + "kl": 0.0673828125, + "learning_rate": 6.980900648326617e-07, + "loss": -0.0121, + "reward": 1.7227420806884766, + "reward_std": 0.24864692986011505, + "rewards/accuracy_reward_stage2": 0.8633670806884766, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1724 + }, + { + "completion_length": 9.0625, + "epoch": 0.3022603819870335, + "grad_norm": 15.481663429370426, + "kl": 0.169921875, + "learning_rate": 6.979148414228141e-07, + "loss": -0.0576, + "reward": 1.8736064434051514, + "reward_std": 0.20491771399974823, + "rewards/accuracy_reward_stage2": 0.9204814434051514, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1725 + }, + { + "completion_length": 12.21875, + "epoch": 0.302435605396881, + "grad_norm": 22.575733117566205, + "kl": 0.1474609375, + "learning_rate": 6.977396180129665e-07, + "loss": -0.0086, + "reward": 1.4143965244293213, + "reward_std": 0.29561150074005127, + "rewards/accuracy_reward_stage2": 0.44564658403396606, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1726 + }, + { + "completion_length": 7.46875, + "epoch": 0.30261082880672857, + "grad_norm": 17.265000582793025, + "kl": 0.068359375, + "learning_rate": 6.97564394603119e-07, + "loss": -0.017, + "reward": 1.6568676233291626, + "reward_std": 0.2403629571199417, + "rewards/accuracy_reward_stage2": 0.6724926233291626, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1727 + }, + { + "completion_length": 10.421875, + "epoch": 0.3027860522165761, + "grad_norm": 13.370376075249549, + "kl": 0.0966796875, + "learning_rate": 6.973891711932713e-07, + "loss": 0.0387, + "reward": 1.4338374137878418, + "reward_std": 0.12409239262342453, + "rewards/accuracy_reward_stage2": 0.5588374733924866, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1728 + }, + { + "completion_length": 10.03125, + "epoch": 0.30296127562642367, + "grad_norm": 16.232930281897875, + "kl": 0.1044921875, + "learning_rate": 6.972139477834238e-07, + "loss": -0.0017, + "reward": 1.6806886196136475, + "reward_std": 0.19015967845916748, + "rewards/accuracy_reward_stage2": 0.6963136792182922, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1729 + }, + { + "completion_length": 8.046875, + "epoch": 0.30313649903627127, + "grad_norm": 17.384646746654873, + "kl": 0.146484375, + "learning_rate": 6.970387243735763e-07, + "loss": 0.0146, + "reward": 1.5590732097625732, + "reward_std": 0.2061610072851181, + "rewards/accuracy_reward_stage2": 0.5746980905532837, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1730 + }, + { + "completion_length": 11.1875, + "epoch": 0.3033117224461188, + "grad_norm": 25.681642519445006, + "kl": 0.07275390625, + "learning_rate": 6.968635009637287e-07, + "loss": -0.0114, + "reward": 1.6075778007507324, + "reward_std": 0.30036935210227966, + "rewards/accuracy_reward_stage2": 0.6232027411460876, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1731 + }, + { + "completion_length": 12.234375, + "epoch": 0.30348694585596636, + "grad_norm": 15.472091349424671, + "kl": 0.11669921875, + "learning_rate": 6.966882775538812e-07, + "loss": 0.0465, + "reward": 1.7527433633804321, + "reward_std": 0.10957083106040955, + "rewards/accuracy_reward_stage2": 0.8777433633804321, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1732 + }, + { + "completion_length": 7.1875, + "epoch": 0.3036621692658139, + "grad_norm": 12.604748146353854, + "kl": 0.08056640625, + "learning_rate": 6.965130541440337e-07, + "loss": 0.0047, + "reward": 1.7152559757232666, + "reward_std": 0.05833900347352028, + "rewards/accuracy_reward_stage2": 0.7308810949325562, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1733 + }, + { + "completion_length": 9.78125, + "epoch": 0.30383739267566146, + "grad_norm": 23.64900274193855, + "kl": 0.1513671875, + "learning_rate": 6.96337830734186e-07, + "loss": 0.0301, + "reward": 1.58614182472229, + "reward_std": 0.31377267837524414, + "rewards/accuracy_reward_stage2": 0.60176682472229, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1734 + }, + { + "completion_length": 11.09375, + "epoch": 0.304012616085509, + "grad_norm": 14.200156242246816, + "kl": 0.07470703125, + "learning_rate": 6.961626073243385e-07, + "loss": -0.0142, + "reward": 1.3241374492645264, + "reward_std": 0.0708392933011055, + "rewards/accuracy_reward_stage2": 0.46476244926452637, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1735 + }, + { + "completion_length": 8.46875, + "epoch": 0.3041878394953566, + "grad_norm": 18.37673948165755, + "kl": 0.0927734375, + "learning_rate": 6.959873839144909e-07, + "loss": 0.0371, + "reward": 1.5182172060012817, + "reward_std": 0.2147284299135208, + "rewards/accuracy_reward_stage2": 0.5182172060012817, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1736 + }, + { + "completion_length": 8.9375, + "epoch": 0.30436306290520415, + "grad_norm": 53.08502332734311, + "kl": 0.357421875, + "learning_rate": 6.958121605046434e-07, + "loss": 0.119, + "reward": 1.6498416662216187, + "reward_std": 0.2479400783777237, + "rewards/accuracy_reward_stage2": 0.7904666066169739, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1737 + }, + { + "completion_length": 9.625, + "epoch": 0.3045382863150517, + "grad_norm": 21.390723516615594, + "kl": 0.15625, + "learning_rate": 6.956369370947959e-07, + "loss": 0.0043, + "reward": 1.3170366287231445, + "reward_std": 0.2997969686985016, + "rewards/accuracy_reward_stage2": 0.4732866883277893, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1738 + }, + { + "completion_length": 8.359375, + "epoch": 0.30471350972489925, + "grad_norm": 20.05931961782236, + "kl": 0.1337890625, + "learning_rate": 6.954617136849483e-07, + "loss": 0.0094, + "reward": 1.5125038623809814, + "reward_std": 0.16261497139930725, + "rewards/accuracy_reward_stage2": 0.6531288623809814, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1739 + }, + { + "completion_length": 19.984375, + "epoch": 0.3048887331347468, + "grad_norm": 20.79173114096201, + "kl": 0.287109375, + "learning_rate": 6.952864902751007e-07, + "loss": 0.0703, + "reward": 1.5382153987884521, + "reward_std": 0.21911008656024933, + "rewards/accuracy_reward_stage2": 0.6788403391838074, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1740 + }, + { + "completion_length": 10.828125, + "epoch": 0.30506395654459434, + "grad_norm": 17.715087000545765, + "kl": 0.1396484375, + "learning_rate": 6.951112668652531e-07, + "loss": 0.0119, + "reward": 1.2132325172424316, + "reward_std": 0.2066960632801056, + "rewards/accuracy_reward_stage2": 0.35385745763778687, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1741 + }, + { + "completion_length": 13.984375, + "epoch": 0.3052391799544419, + "grad_norm": 30.009838675491842, + "kl": 0.1513671875, + "learning_rate": 6.949360434554056e-07, + "loss": -0.0027, + "reward": 1.3477928638458252, + "reward_std": 0.23385745286941528, + "rewards/accuracy_reward_stage2": 0.3790428638458252, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1742 + }, + { + "completion_length": 9.71875, + "epoch": 0.3054144033642895, + "grad_norm": 22.287655232885186, + "kl": 0.1650390625, + "learning_rate": 6.947608200455581e-07, + "loss": -0.0358, + "reward": 1.2880135774612427, + "reward_std": 0.39539164304733276, + "rewards/accuracy_reward_stage2": 0.33488860726356506, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1743 + }, + { + "completion_length": 9.953125, + "epoch": 0.30558962677413704, + "grad_norm": 20.939609903793155, + "kl": 0.08251953125, + "learning_rate": 6.945855966357104e-07, + "loss": 0.0329, + "reward": 1.4582154750823975, + "reward_std": 0.24663110077381134, + "rewards/accuracy_reward_stage2": 0.45821553468704224, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1744 + }, + { + "completion_length": 14.140625, + "epoch": 0.3057648501839846, + "grad_norm": 30.250789103938825, + "kl": 0.5390625, + "learning_rate": 6.944103732258629e-07, + "loss": 0.1747, + "reward": 1.0708760023117065, + "reward_std": 0.27258533239364624, + "rewards/accuracy_reward_stage2": 0.5865009427070618, + "rewards/format_reward_stage1_pointerpad": 0.484375, + "scores/accuracy_reward_stage2": 0.484375, + "step": 1745 + }, + { + "completion_length": 9.21875, + "epoch": 0.30594007359383213, + "grad_norm": 21.60245725560257, + "kl": 0.0966796875, + "learning_rate": 6.942351498160154e-07, + "loss": 0.0066, + "reward": 1.5730595588684082, + "reward_std": 0.2877153158187866, + "rewards/accuracy_reward_stage2": 0.5886844992637634, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1746 + }, + { + "completion_length": 12.8125, + "epoch": 0.3061152970036797, + "grad_norm": 23.293060975908556, + "kl": 0.103515625, + "learning_rate": 6.940599264061678e-07, + "loss": 0.0415, + "reward": 1.6763148307800293, + "reward_std": 0.24937281012535095, + "rewards/accuracy_reward_stage2": 0.6763148307800293, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1747 + }, + { + "completion_length": 9.625, + "epoch": 0.3062905204135272, + "grad_norm": 20.31014330344514, + "kl": 0.06787109375, + "learning_rate": 6.938847029963203e-07, + "loss": -0.0171, + "reward": 1.2781528234481812, + "reward_std": 0.18712200224399567, + "rewards/accuracy_reward_stage2": 0.41877782344818115, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1748 + }, + { + "completion_length": 12.0625, + "epoch": 0.3064657438233748, + "grad_norm": 21.356448724215582, + "kl": 0.07470703125, + "learning_rate": 6.937094795864727e-07, + "loss": 0.0298, + "reward": 1.7344744205474854, + "reward_std": 0.22010810673236847, + "rewards/accuracy_reward_stage2": 0.7344744801521301, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1749 + }, + { + "completion_length": 6.34375, + "epoch": 0.3066409672332224, + "grad_norm": 16.5876889581935, + "kl": 0.0654296875, + "learning_rate": 6.935342561766252e-07, + "loss": -0.0179, + "reward": 1.6628472805023193, + "reward_std": 0.19510656595230103, + "rewards/accuracy_reward_stage2": 0.6784722805023193, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1750 + }, + { + "completion_length": 11.359375, + "epoch": 0.3068161906430699, + "grad_norm": 19.02986286446497, + "kl": 0.0191650390625, + "learning_rate": 6.933590327667777e-07, + "loss": 0.0076, + "reward": 1.5158599615097046, + "reward_std": 0.11624392867088318, + "rewards/accuracy_reward_stage2": 0.5158599615097046, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1751 + }, + { + "completion_length": 7.796875, + "epoch": 0.30699141405291747, + "grad_norm": 19.201117488136774, + "kl": 0.1806640625, + "learning_rate": 6.931838093569301e-07, + "loss": 0.0173, + "reward": 1.5672911405563354, + "reward_std": 0.26074376702308655, + "rewards/accuracy_reward_stage2": 0.5985411405563354, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1752 + }, + { + "completion_length": 11.828125, + "epoch": 0.307166637462765, + "grad_norm": 27.713417077776867, + "kl": 0.06787109375, + "learning_rate": 6.930085859470825e-07, + "loss": -0.017, + "reward": 1.5677083730697632, + "reward_std": 0.3722786009311676, + "rewards/accuracy_reward_stage2": 0.5833333730697632, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1753 + }, + { + "completion_length": 9.328125, + "epoch": 0.30734186087261256, + "grad_norm": 17.40100673490896, + "kl": 0.05126953125, + "learning_rate": 6.928333625372349e-07, + "loss": 0.0205, + "reward": 1.8788065910339355, + "reward_std": 0.13032446801662445, + "rewards/accuracy_reward_stage2": 0.8788067102432251, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1754 + }, + { + "completion_length": 10.015625, + "epoch": 0.30751708428246016, + "grad_norm": 19.086859227578596, + "kl": 0.06005859375, + "learning_rate": 6.926581391273873e-07, + "loss": 0.0241, + "reward": 1.5123202800750732, + "reward_std": 0.11857327073812485, + "rewards/accuracy_reward_stage2": 0.6373202800750732, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1755 + }, + { + "completion_length": 8.90625, + "epoch": 0.3076923076923077, + "grad_norm": 17.686528099809827, + "kl": 0.0240478515625, + "learning_rate": 6.924829157175398e-07, + "loss": 0.0096, + "reward": 1.5551719665527344, + "reward_std": 0.18451553583145142, + "rewards/accuracy_reward_stage2": 0.5551718473434448, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1756 + }, + { + "completion_length": 11.96875, + "epoch": 0.30786753110215526, + "grad_norm": 19.352114644863942, + "kl": 0.0966796875, + "learning_rate": 6.923076923076922e-07, + "loss": 0.0385, + "reward": 1.438650131225586, + "reward_std": 0.24392160773277283, + "rewards/accuracy_reward_stage2": 0.5636501312255859, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1757 + }, + { + "completion_length": 8.859375, + "epoch": 0.3080427545120028, + "grad_norm": 27.21748250589423, + "kl": 0.1591796875, + "learning_rate": 6.921324688978447e-07, + "loss": 0.0637, + "reward": 1.498934030532837, + "reward_std": 0.22403478622436523, + "rewards/accuracy_reward_stage2": 0.49893397092819214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1758 + }, + { + "completion_length": 9.140625, + "epoch": 0.30821797792185035, + "grad_norm": 16.378049138983172, + "kl": 0.01336669921875, + "learning_rate": 6.919572454879972e-07, + "loss": 0.0053, + "reward": 1.7135417461395264, + "reward_std": 0.16204531490802765, + "rewards/accuracy_reward_stage2": 0.7135416865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1759 + }, + { + "completion_length": 8.734375, + "epoch": 0.3083932013316979, + "grad_norm": 24.21610767057917, + "kl": 0.2578125, + "learning_rate": 6.917820220781496e-07, + "loss": 0.0634, + "reward": 1.5016014575958252, + "reward_std": 0.2916114926338196, + "rewards/accuracy_reward_stage2": 0.5172264575958252, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1760 + }, + { + "completion_length": 15.46875, + "epoch": 0.30856842474154544, + "grad_norm": 17.971966916719687, + "kl": 0.05029296875, + "learning_rate": 6.916067986683021e-07, + "loss": 0.0202, + "reward": 1.7312650680541992, + "reward_std": 0.16479554772377014, + "rewards/accuracy_reward_stage2": 0.731265127658844, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1761 + }, + { + "completion_length": 12.296875, + "epoch": 0.30874364815139305, + "grad_norm": 13.864875853871668, + "kl": 0.03564453125, + "learning_rate": 6.914315752584546e-07, + "loss": 0.0142, + "reward": 1.4874417781829834, + "reward_std": 0.09884752333164215, + "rewards/accuracy_reward_stage2": 0.4874417185783386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1762 + }, + { + "completion_length": 14.15625, + "epoch": 0.3089188715612406, + "grad_norm": 45.83784041709486, + "kl": 0.302734375, + "learning_rate": 6.91256351848607e-07, + "loss": 0.1213, + "reward": 1.5485448837280273, + "reward_std": 0.26276490092277527, + "rewards/accuracy_reward_stage2": 0.6735448837280273, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1763 + }, + { + "completion_length": 10.3125, + "epoch": 0.30909409497108814, + "grad_norm": 14.837251793084045, + "kl": 0.04443359375, + "learning_rate": 6.910811284387594e-07, + "loss": 0.0178, + "reward": 1.5204423666000366, + "reward_std": 0.1353163868188858, + "rewards/accuracy_reward_stage2": 0.5204423666000366, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1764 + }, + { + "completion_length": 15.171875, + "epoch": 0.3092693183809357, + "grad_norm": 11.342377410991487, + "kl": 0.019287109375, + "learning_rate": 6.909059050289118e-07, + "loss": 0.0077, + "reward": 1.2142300605773926, + "reward_std": 0.031139397993683815, + "rewards/accuracy_reward_stage2": 0.5892300009727478, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 1765 + }, + { + "completion_length": 9.84375, + "epoch": 0.30944454179078323, + "grad_norm": 16.764132909546117, + "kl": 0.09765625, + "learning_rate": 6.907306816190642e-07, + "loss": 0.0391, + "reward": 1.5185444355010986, + "reward_std": 0.09002818167209625, + "rewards/accuracy_reward_stage2": 0.6435444355010986, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1766 + }, + { + "completion_length": 11.25, + "epoch": 0.3096197652006308, + "grad_norm": 23.46573423877756, + "kl": 0.04052734375, + "learning_rate": 6.905554582092167e-07, + "loss": 0.0162, + "reward": 1.546875, + "reward_std": 0.20872823894023895, + "rewards/accuracy_reward_stage2": 0.546875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1767 + }, + { + "completion_length": 9.734375, + "epoch": 0.3097949886104784, + "grad_norm": 22.381252833486272, + "kl": 0.08251953125, + "learning_rate": 6.903802347993691e-07, + "loss": 0.033, + "reward": 1.6614583730697632, + "reward_std": 0.30925020575523376, + "rewards/accuracy_reward_stage2": 0.6614583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1768 + }, + { + "completion_length": 11.6875, + "epoch": 0.30997021202032593, + "grad_norm": 20.636808608361164, + "kl": 0.126953125, + "learning_rate": 6.902050113895216e-07, + "loss": 0.0063, + "reward": 1.4054598808288574, + "reward_std": 0.2945351302623749, + "rewards/accuracy_reward_stage2": 0.4210848808288574, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1769 + }, + { + "completion_length": 7.5, + "epoch": 0.3101454354301735, + "grad_norm": 15.579460620975455, + "kl": 0.039306640625, + "learning_rate": 6.900297879796741e-07, + "loss": 0.0157, + "reward": 1.7460291385650635, + "reward_std": 0.13293343782424927, + "rewards/accuracy_reward_stage2": 0.7460291385650635, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1770 + }, + { + "completion_length": 7.578125, + "epoch": 0.310320658840021, + "grad_norm": 17.058832854839068, + "kl": 0.044677734375, + "learning_rate": 6.898545645698265e-07, + "loss": 0.0178, + "reward": 1.8352556228637695, + "reward_std": 0.16354835033416748, + "rewards/accuracy_reward_stage2": 0.83525550365448, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1771 + }, + { + "completion_length": 14.390625, + "epoch": 0.31049588224986857, + "grad_norm": 17.598162532968843, + "kl": 0.08203125, + "learning_rate": 6.89679341159979e-07, + "loss": -0.0556, + "reward": 1.3854167461395264, + "reward_std": 0.16781337559223175, + "rewards/accuracy_reward_stage2": 0.4166666865348816, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1772 + }, + { + "completion_length": 23.28125, + "epoch": 0.3106711056597161, + "grad_norm": 24.96201886620127, + "kl": 0.482421875, + "learning_rate": 6.895041177501314e-07, + "loss": 0.1934, + "reward": 1.4418706893920898, + "reward_std": 0.10315918177366257, + "rewards/accuracy_reward_stage2": 0.6918706297874451, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1773 + }, + { + "completion_length": 13.234375, + "epoch": 0.3108463290695637, + "grad_norm": 26.336116245880884, + "kl": 0.146484375, + "learning_rate": 6.893288943402838e-07, + "loss": 0.0587, + "reward": 1.6287994384765625, + "reward_std": 0.2064562737941742, + "rewards/accuracy_reward_stage2": 0.6287994384765625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1774 + }, + { + "completion_length": 14.375, + "epoch": 0.31102155247941127, + "grad_norm": 17.15853837132984, + "kl": 0.037109375, + "learning_rate": 6.891536709304363e-07, + "loss": -0.0293, + "reward": 1.4953597784042358, + "reward_std": 0.26825428009033203, + "rewards/accuracy_reward_stage2": 0.5109847784042358, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1775 + }, + { + "completion_length": 11.6875, + "epoch": 0.3111967758892588, + "grad_norm": 19.782067804864628, + "kl": 0.1259765625, + "learning_rate": 6.889784475205887e-07, + "loss": -0.032, + "reward": 1.452484130859375, + "reward_std": 0.21816937625408173, + "rewards/accuracy_reward_stage2": 0.608734130859375, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1776 + }, + { + "completion_length": 14.125, + "epoch": 0.31137199929910636, + "grad_norm": 28.4017838573517, + "kl": 0.134765625, + "learning_rate": 6.888032241107412e-07, + "loss": 0.0537, + "reward": 1.4646635055541992, + "reward_std": 0.1783100664615631, + "rewards/accuracy_reward_stage2": 0.7146634459495544, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1777 + }, + { + "completion_length": 17.140625, + "epoch": 0.3115472227089539, + "grad_norm": 22.533784478535114, + "kl": 0.12255859375, + "learning_rate": 6.886280007008937e-07, + "loss": 0.0175, + "reward": 1.6235495805740356, + "reward_std": 0.32233843207359314, + "rewards/accuracy_reward_stage2": 0.6391745805740356, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1778 + }, + { + "completion_length": 11.015625, + "epoch": 0.31172244611880146, + "grad_norm": 15.745599310481818, + "kl": 0.0169677734375, + "learning_rate": 6.88452777291046e-07, + "loss": 0.0068, + "reward": 1.8678183555603027, + "reward_std": 0.1711842119693756, + "rewards/accuracy_reward_stage2": 0.867818295955658, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1779 + }, + { + "completion_length": 22.296875, + "epoch": 0.311897669528649, + "grad_norm": 20.326031505125602, + "kl": 0.0654296875, + "learning_rate": 6.882775538811985e-07, + "loss": 0.0261, + "reward": 1.4402329921722412, + "reward_std": 0.2539198100566864, + "rewards/accuracy_reward_stage2": 0.440233051776886, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1780 + }, + { + "completion_length": 42.359375, + "epoch": 0.3120728929384966, + "grad_norm": 21.174740109176046, + "kl": 0.052978515625, + "learning_rate": 6.881023304713509e-07, + "loss": 0.0213, + "reward": 1.5869065523147583, + "reward_std": 0.10317087918519974, + "rewards/accuracy_reward_stage2": 0.5869066119194031, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1781 + }, + { + "completion_length": 11.90625, + "epoch": 0.31224811634834415, + "grad_norm": 18.904937260082143, + "kl": 0.04345703125, + "learning_rate": 6.879271070615034e-07, + "loss": -0.0257, + "reward": 1.6459707021713257, + "reward_std": 0.18680721521377563, + "rewards/accuracy_reward_stage2": 0.6615957021713257, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1782 + }, + { + "completion_length": 8.046875, + "epoch": 0.3124233397581917, + "grad_norm": 25.21570094160528, + "kl": 0.1044921875, + "learning_rate": 6.877518836516559e-07, + "loss": 0.0203, + "reward": 1.656537413597107, + "reward_std": 0.3188740313053131, + "rewards/accuracy_reward_stage2": 0.6721623539924622, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1783 + }, + { + "completion_length": 18.328125, + "epoch": 0.31259856316803925, + "grad_norm": 12.622433851895309, + "kl": 0.010498046875, + "learning_rate": 6.875766602418082e-07, + "loss": 0.0042, + "reward": 1.5705180168151855, + "reward_std": 0.11025116592645645, + "rewards/accuracy_reward_stage2": 0.570517897605896, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1784 + }, + { + "completion_length": 11.21875, + "epoch": 0.3127737865778868, + "grad_norm": 19.055355689325225, + "kl": 0.053955078125, + "learning_rate": 6.874014368319607e-07, + "loss": 0.0215, + "reward": 1.5969265699386597, + "reward_std": 0.23442596197128296, + "rewards/accuracy_reward_stage2": 0.7219265699386597, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1785 + }, + { + "completion_length": 15.578125, + "epoch": 0.31294900998773434, + "grad_norm": 19.587873470697733, + "kl": 0.08349609375, + "learning_rate": 6.872262134221132e-07, + "loss": 0.0335, + "reward": 1.5746169090270996, + "reward_std": 0.141147643327713, + "rewards/accuracy_reward_stage2": 0.5746169090270996, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1786 + }, + { + "completion_length": 9.34375, + "epoch": 0.31312423339758194, + "grad_norm": 14.958775392170649, + "kl": 0.087890625, + "learning_rate": 6.870509900122656e-07, + "loss": 0.0351, + "reward": 1.5703849792480469, + "reward_std": 0.11295461654663086, + "rewards/accuracy_reward_stage2": 0.5703849792480469, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1787 + }, + { + "completion_length": 13.03125, + "epoch": 0.3132994568074295, + "grad_norm": 22.341246226878546, + "kl": 0.099609375, + "learning_rate": 6.868757666024181e-07, + "loss": 0.0527, + "reward": 1.4932540655136108, + "reward_std": 0.2555721402168274, + "rewards/accuracy_reward_stage2": 0.6182540655136108, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1788 + }, + { + "completion_length": 15.65625, + "epoch": 0.31347468021727704, + "grad_norm": 17.995892472886116, + "kl": 0.181640625, + "learning_rate": 6.867005431925705e-07, + "loss": 0.0725, + "reward": 1.3227570056915283, + "reward_std": 0.10920379310846329, + "rewards/accuracy_reward_stage2": 0.4477570056915283, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1789 + }, + { + "completion_length": 12.96875, + "epoch": 0.3136499036271246, + "grad_norm": 24.844965959313072, + "kl": 0.055908203125, + "learning_rate": 6.86525319782723e-07, + "loss": -0.0218, + "reward": 1.794481635093689, + "reward_std": 0.24829509854316711, + "rewards/accuracy_reward_stage2": 0.8101067543029785, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1790 + }, + { + "completion_length": 12.953125, + "epoch": 0.31382512703697213, + "grad_norm": 16.769842743412386, + "kl": 0.056884765625, + "learning_rate": 6.863500963728754e-07, + "loss": -0.0214, + "reward": 1.5633602142333984, + "reward_std": 0.16764867305755615, + "rewards/accuracy_reward_stage2": 0.5789852142333984, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1791 + }, + { + "completion_length": 5.625, + "epoch": 0.3140003504468197, + "grad_norm": 17.682650240172475, + "kl": 0.0908203125, + "learning_rate": 6.861748729630278e-07, + "loss": 0.001, + "reward": 1.6108324527740479, + "reward_std": 0.20219948887825012, + "rewards/accuracy_reward_stage2": 0.6264575719833374, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1792 + }, + { + "completion_length": 12.359375, + "epoch": 0.3141755738566673, + "grad_norm": 18.742977868192597, + "kl": 0.0986328125, + "learning_rate": 6.859996495531802e-07, + "loss": -0.0046, + "reward": 1.536928415298462, + "reward_std": 0.2393271028995514, + "rewards/accuracy_reward_stage2": 0.5525534152984619, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1793 + }, + { + "completion_length": 17.125, + "epoch": 0.3143507972665148, + "grad_norm": 2587.4131228674446, + "kl": 9.4375, + "learning_rate": 6.858244261433327e-07, + "loss": 3.7916, + "reward": 1.308718204498291, + "reward_std": 0.2237774282693863, + "rewards/accuracy_reward_stage2": 0.433718204498291, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1794 + }, + { + "completion_length": 9.21875, + "epoch": 0.3145260206763624, + "grad_norm": 20.517464844163342, + "kl": 0.1181640625, + "learning_rate": 6.856492027334851e-07, + "loss": 0.0474, + "reward": 1.5761194229125977, + "reward_std": 0.17638246715068817, + "rewards/accuracy_reward_stage2": 0.5761193633079529, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1795 + }, + { + "completion_length": 12.65625, + "epoch": 0.3147012440862099, + "grad_norm": 15.253128507606846, + "kl": 0.052734375, + "learning_rate": 6.854739793236376e-07, + "loss": -0.0121, + "reward": 1.2813804149627686, + "reward_std": 0.16135841608047485, + "rewards/accuracy_reward_stage2": 0.4220053553581238, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1796 + }, + { + "completion_length": 17.984375, + "epoch": 0.31487646749605747, + "grad_norm": 18.547588567577, + "kl": 0.130859375, + "learning_rate": 6.8529875591379e-07, + "loss": -0.0184, + "reward": 1.5960500240325928, + "reward_std": 0.2250652313232422, + "rewards/accuracy_reward_stage2": 0.6273000836372375, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1797 + }, + { + "completion_length": 8.578125, + "epoch": 0.315051690905905, + "grad_norm": 16.926765708714097, + "kl": 0.10595703125, + "learning_rate": 6.851235325039425e-07, + "loss": 0.0425, + "reward": 1.3718523979187012, + "reward_std": 0.14817330241203308, + "rewards/accuracy_reward_stage2": 0.4968523681163788, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1798 + }, + { + "completion_length": 8.15625, + "epoch": 0.31522691431575256, + "grad_norm": 18.962333571474712, + "kl": 0.208984375, + "learning_rate": 6.84948309094095e-07, + "loss": 0.0397, + "reward": 1.7007365226745605, + "reward_std": 0.24715575575828552, + "rewards/accuracy_reward_stage2": 0.7163615226745605, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1799 + }, + { + "completion_length": 9.1875, + "epoch": 0.31540213772560016, + "grad_norm": 12.197948562184031, + "kl": 0.0703125, + "learning_rate": 6.847730856842474e-07, + "loss": 0.0282, + "reward": 1.671875, + "reward_std": 0.14489679038524628, + "rewards/accuracy_reward_stage2": 0.671875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1800 + }, + { + "completion_length": 7.28125, + "epoch": 0.3155773611354477, + "grad_norm": 14.937120670245951, + "kl": 0.0703125, + "learning_rate": 6.845978622743999e-07, + "loss": -0.0059, + "reward": 1.7304699420928955, + "reward_std": 0.19275791943073273, + "rewards/accuracy_reward_stage2": 0.7460950016975403, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1801 + }, + { + "completion_length": 10.078125, + "epoch": 0.31575258454529526, + "grad_norm": 18.702303439522616, + "kl": 0.07421875, + "learning_rate": 6.844226388645524e-07, + "loss": 0.0296, + "reward": 1.6846437454223633, + "reward_std": 0.1779603809118271, + "rewards/accuracy_reward_stage2": 0.6846436858177185, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1802 + }, + { + "completion_length": 10.28125, + "epoch": 0.3159278079551428, + "grad_norm": 24.060191532518267, + "kl": 0.028564453125, + "learning_rate": 6.842474154547048e-07, + "loss": -0.0053, + "reward": 1.7212055921554565, + "reward_std": 0.16443204879760742, + "rewards/accuracy_reward_stage2": 0.7368306517601013, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1803 + }, + { + "completion_length": 13.234375, + "epoch": 0.31610303136499035, + "grad_norm": 15.439669746278195, + "kl": 0.0625, + "learning_rate": 6.840721920448571e-07, + "loss": 0.025, + "reward": 1.3805962800979614, + "reward_std": 0.16425207257270813, + "rewards/accuracy_reward_stage2": 0.5055962800979614, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1804 + }, + { + "completion_length": 15.484375, + "epoch": 0.3162782547748379, + "grad_norm": 22.17714169358847, + "kl": 0.1318359375, + "learning_rate": 6.838969686350095e-07, + "loss": 0.0723, + "reward": 1.2215710878372192, + "reward_std": 0.161886066198349, + "rewards/accuracy_reward_stage2": 0.34657102823257446, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1805 + }, + { + "completion_length": 8.3125, + "epoch": 0.3164534781846855, + "grad_norm": 18.199861306414956, + "kl": 0.05126953125, + "learning_rate": 6.83721745225162e-07, + "loss": -0.0144, + "reward": 1.4537413120269775, + "reward_std": 0.2938691973686218, + "rewards/accuracy_reward_stage2": 0.46936625242233276, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1806 + }, + { + "completion_length": 9.3125, + "epoch": 0.31662870159453305, + "grad_norm": 16.691593883218008, + "kl": 0.22265625, + "learning_rate": 6.835465218153145e-07, + "loss": 0.0604, + "reward": 1.3344494104385376, + "reward_std": 0.16383177042007446, + "rewards/accuracy_reward_stage2": 0.5844494104385376, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1807 + }, + { + "completion_length": 8.921875, + "epoch": 0.3168039250043806, + "grad_norm": 21.852852343190598, + "kl": 0.11181640625, + "learning_rate": 6.833712984054669e-07, + "loss": 0.0204, + "reward": 1.7614967823028564, + "reward_std": 0.23907579481601715, + "rewards/accuracy_reward_stage2": 0.7771217823028564, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1808 + }, + { + "completion_length": 11.09375, + "epoch": 0.31697914841422814, + "grad_norm": 21.97589200147923, + "kl": 0.2470703125, + "learning_rate": 6.831960749956194e-07, + "loss": 0.0891, + "reward": 1.1384527683258057, + "reward_std": 0.20597995817661285, + "rewards/accuracy_reward_stage2": 0.40407782793045044, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1809 + }, + { + "completion_length": 16.96875, + "epoch": 0.3171543718240757, + "grad_norm": 27.96568751730448, + "kl": 0.265625, + "learning_rate": 6.830208515857718e-07, + "loss": 0.1061, + "reward": 1.6964399814605713, + "reward_std": 0.17561517655849457, + "rewards/accuracy_reward_stage2": 0.8214400410652161, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1810 + }, + { + "completion_length": 10.5625, + "epoch": 0.31732959523392323, + "grad_norm": 21.330821947226656, + "kl": 0.1865234375, + "learning_rate": 6.828456281759243e-07, + "loss": 0.0744, + "reward": 1.559272050857544, + "reward_std": 0.21070796251296997, + "rewards/accuracy_reward_stage2": 0.8092721104621887, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1811 + }, + { + "completion_length": 6.90625, + "epoch": 0.3175048186437708, + "grad_norm": 16.990363875822908, + "kl": 0.05517578125, + "learning_rate": 6.826704047660768e-07, + "loss": 0.0221, + "reward": 1.7029221057891846, + "reward_std": 0.2239094078540802, + "rewards/accuracy_reward_stage2": 0.7029222249984741, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1812 + }, + { + "completion_length": 9.875, + "epoch": 0.3176800420536184, + "grad_norm": 22.556586253233327, + "kl": 0.0419921875, + "learning_rate": 6.824951813562291e-07, + "loss": 0.0169, + "reward": 1.5033842325210571, + "reward_std": 0.17745548486709595, + "rewards/accuracy_reward_stage2": 0.5033841729164124, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1813 + }, + { + "completion_length": 13.59375, + "epoch": 0.31785526546346593, + "grad_norm": 16.64785646842737, + "kl": 0.0673828125, + "learning_rate": 6.823199579463816e-07, + "loss": 0.0269, + "reward": 1.3829011917114258, + "reward_std": 0.1356073021888733, + "rewards/accuracy_reward_stage2": 0.3829011917114258, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1814 + }, + { + "completion_length": 9.0625, + "epoch": 0.3180304888733135, + "grad_norm": 20.360701507122354, + "kl": 0.06982421875, + "learning_rate": 6.821447345365341e-07, + "loss": 0.028, + "reward": 1.6769695281982422, + "reward_std": 0.16446499526500702, + "rewards/accuracy_reward_stage2": 0.6769695281982422, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1815 + }, + { + "completion_length": 13.28125, + "epoch": 0.318205712283161, + "grad_norm": 26.064078900170728, + "kl": 0.11572265625, + "learning_rate": 6.819695111266865e-07, + "loss": 0.0463, + "reward": 1.6434980630874634, + "reward_std": 0.2011084407567978, + "rewards/accuracy_reward_stage2": 0.7684980630874634, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1816 + }, + { + "completion_length": 13.765625, + "epoch": 0.31838093569300857, + "grad_norm": 80.66486305198394, + "kl": 0.32421875, + "learning_rate": 6.817942877168389e-07, + "loss": 0.1009, + "reward": 1.4909992218017578, + "reward_std": 0.2135014533996582, + "rewards/accuracy_reward_stage2": 0.6316242218017578, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1817 + }, + { + "completion_length": 9.09375, + "epoch": 0.3185561591028561, + "grad_norm": 20.126958301707184, + "kl": 0.08154296875, + "learning_rate": 6.816190643069913e-07, + "loss": 0.0326, + "reward": 1.7232518196105957, + "reward_std": 0.17581014335155487, + "rewards/accuracy_reward_stage2": 0.7232518196105957, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1818 + }, + { + "completion_length": 9.03125, + "epoch": 0.3187313825127037, + "grad_norm": 30.36696329002241, + "kl": 0.1337890625, + "learning_rate": 6.814438408971438e-07, + "loss": 0.0093, + "reward": 1.4545722007751465, + "reward_std": 0.3125390410423279, + "rewards/accuracy_reward_stage2": 0.4701971113681793, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1819 + }, + { + "completion_length": 10.140625, + "epoch": 0.31890660592255127, + "grad_norm": 25.695729695133302, + "kl": 0.150390625, + "learning_rate": 6.812686174872963e-07, + "loss": 0.0031, + "reward": 1.7126063108444214, + "reward_std": 0.21297934651374817, + "rewards/accuracy_reward_stage2": 0.7438563108444214, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1820 + }, + { + "completion_length": 5.9375, + "epoch": 0.3190818293323988, + "grad_norm": 10.358628847465996, + "kl": 0.0250244140625, + "learning_rate": 6.810933940774487e-07, + "loss": 0.01, + "reward": 1.6214256286621094, + "reward_std": 0.0782785713672638, + "rewards/accuracy_reward_stage2": 0.6214256286621094, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1821 + }, + { + "completion_length": 5.5625, + "epoch": 0.31925705274224636, + "grad_norm": 13.398588020960577, + "kl": 0.016357421875, + "learning_rate": 6.809181706676012e-07, + "loss": 0.0065, + "reward": 1.6663763523101807, + "reward_std": 0.0802459791302681, + "rewards/accuracy_reward_stage2": 0.6663764119148254, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1822 + }, + { + "completion_length": 9.453125, + "epoch": 0.3194322761520939, + "grad_norm": 10.403578788426847, + "kl": 0.06982421875, + "learning_rate": 6.807429472577537e-07, + "loss": -0.006, + "reward": 1.1771003007888794, + "reward_std": 0.07623656839132309, + "rewards/accuracy_reward_stage2": 0.192725270986557, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1823 + }, + { + "completion_length": 11.140625, + "epoch": 0.31960749956194145, + "grad_norm": 17.66257215236959, + "kl": 0.09326171875, + "learning_rate": 6.80567723847906e-07, + "loss": 0.0374, + "reward": 1.367735743522644, + "reward_std": 0.20256279408931732, + "rewards/accuracy_reward_stage2": 0.49273571372032166, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1824 + }, + { + "completion_length": 10.84375, + "epoch": 0.31978272297178906, + "grad_norm": 21.69167316332861, + "kl": 0.130859375, + "learning_rate": 6.803925004380585e-07, + "loss": -0.121, + "reward": 1.7133700847625732, + "reward_std": 0.3308318555355072, + "rewards/accuracy_reward_stage2": 0.7758700847625732, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1825 + }, + { + "completion_length": 17.40625, + "epoch": 0.3199579463816366, + "grad_norm": 187.1766272653558, + "kl": 1.421875, + "learning_rate": 6.802172770282109e-07, + "loss": 0.5675, + "reward": 1.5635792016983032, + "reward_std": 0.08996030688285828, + "rewards/accuracy_reward_stage2": 0.6885791420936584, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1826 + }, + { + "completion_length": 9.796875, + "epoch": 0.32013316979148415, + "grad_norm": 21.355418912809615, + "kl": 0.10205078125, + "learning_rate": 6.800420536183634e-07, + "loss": -0.0034, + "reward": 1.9030907154083252, + "reward_std": 0.2621467709541321, + "rewards/accuracy_reward_stage2": 0.9187155961990356, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1827 + }, + { + "completion_length": 11.875, + "epoch": 0.3203083932013317, + "grad_norm": 359.33581112946854, + "kl": 0.8203125, + "learning_rate": 6.798668302085159e-07, + "loss": 0.3285, + "reward": 1.2450122833251953, + "reward_std": 0.19816677272319794, + "rewards/accuracy_reward_stage2": 0.37001216411590576, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1828 + }, + { + "completion_length": 18.609375, + "epoch": 0.32048361661117925, + "grad_norm": 48.93425362715092, + "kl": 0.4140625, + "learning_rate": 6.796916067986683e-07, + "loss": 0.1657, + "reward": 1.510254144668579, + "reward_std": 0.24771133065223694, + "rewards/accuracy_reward_stage2": 0.6352540850639343, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1829 + }, + { + "completion_length": 12.90625, + "epoch": 0.3206588400210268, + "grad_norm": 20.2390037277772, + "kl": 0.10498046875, + "learning_rate": 6.795163833888207e-07, + "loss": 0.013, + "reward": 1.727494478225708, + "reward_std": 0.18558049201965332, + "rewards/accuracy_reward_stage2": 0.7431195378303528, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1830 + }, + { + "completion_length": 9.75, + "epoch": 0.32083406343087434, + "grad_norm": 25.06970663743917, + "kl": 0.2421875, + "learning_rate": 6.793411599789732e-07, + "loss": 0.0968, + "reward": 1.4367856979370117, + "reward_std": 0.2500126361846924, + "rewards/accuracy_reward_stage2": 0.5617856979370117, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1831 + }, + { + "completion_length": 11.5625, + "epoch": 0.32100928684072194, + "grad_norm": 19.475838526195844, + "kl": 0.01373291015625, + "learning_rate": 6.791659365691256e-07, + "loss": 0.0055, + "reward": 1.711681604385376, + "reward_std": 0.1387220323085785, + "rewards/accuracy_reward_stage2": 0.711681604385376, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1832 + }, + { + "completion_length": 9.3125, + "epoch": 0.3211845102505695, + "grad_norm": 15.38598416242657, + "kl": 0.08349609375, + "learning_rate": 6.78990713159278e-07, + "loss": 0.0001, + "reward": 1.4822373390197754, + "reward_std": 0.24437138438224792, + "rewards/accuracy_reward_stage2": 0.49786239862442017, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1833 + }, + { + "completion_length": 16.46875, + "epoch": 0.32135973366041704, + "grad_norm": 26.15721137803418, + "kl": 0.265625, + "learning_rate": 6.788154897494304e-07, + "loss": 0.0644, + "reward": 1.4538758993148804, + "reward_std": 0.28248101472854614, + "rewards/accuracy_reward_stage2": 0.5945007801055908, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1834 + }, + { + "completion_length": 16.109375, + "epoch": 0.3215349570702646, + "grad_norm": 22.7187542979085, + "kl": 0.375, + "learning_rate": 6.786402663395829e-07, + "loss": 0.1497, + "reward": 1.4750638008117676, + "reward_std": 0.2322225123643875, + "rewards/accuracy_reward_stage2": 0.725063681602478, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1835 + }, + { + "completion_length": 20.046875, + "epoch": 0.32171018048011213, + "grad_norm": 19.070056528167537, + "kl": 0.484375, + "learning_rate": 6.784650429297354e-07, + "loss": 0.2005, + "reward": 1.3314766883850098, + "reward_std": 0.22183364629745483, + "rewards/accuracy_reward_stage2": 0.5814766883850098, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1836 + }, + { + "completion_length": 8.125, + "epoch": 0.3218854038899597, + "grad_norm": 14.357050005656363, + "kl": 0.07080078125, + "learning_rate": 6.782898195198878e-07, + "loss": 0.0283, + "reward": 1.2363032102584839, + "reward_std": 0.13740503787994385, + "rewards/accuracy_reward_stage2": 0.3613032400608063, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1837 + }, + { + "completion_length": 16.015625, + "epoch": 0.3220606272998073, + "grad_norm": 18.629287627422684, + "kl": 0.06396484375, + "learning_rate": 6.781145961100403e-07, + "loss": 0.0255, + "reward": 1.4411256313323975, + "reward_std": 0.14058303833007812, + "rewards/accuracy_reward_stage2": 0.5661256313323975, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1838 + }, + { + "completion_length": 10.0, + "epoch": 0.3222358507096548, + "grad_norm": 19.43622528331183, + "kl": 0.06396484375, + "learning_rate": 6.779393727001928e-07, + "loss": -0.0035, + "reward": 1.5037338733673096, + "reward_std": 0.2288183569908142, + "rewards/accuracy_reward_stage2": 0.5193589925765991, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1839 + }, + { + "completion_length": 13.40625, + "epoch": 0.32241107411950237, + "grad_norm": 21.843282347266314, + "kl": 0.1728515625, + "learning_rate": 6.777641492903452e-07, + "loss": 0.0693, + "reward": 1.3334496021270752, + "reward_std": 0.11595918238162994, + "rewards/accuracy_reward_stage2": 0.45844966173171997, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1840 + }, + { + "completion_length": 10.6875, + "epoch": 0.3225862975293499, + "grad_norm": 16.798007544070753, + "kl": 0.05859375, + "learning_rate": 6.775889258804977e-07, + "loss": -0.0713, + "reward": 1.5361828804016113, + "reward_std": 0.19550885260105133, + "rewards/accuracy_reward_stage2": 0.5830577611923218, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1841 + }, + { + "completion_length": 10.125, + "epoch": 0.32276152093919747, + "grad_norm": 14.468662236027644, + "kl": 0.0189208984375, + "learning_rate": 6.7741370247065e-07, + "loss": 0.0076, + "reward": 1.5052210092544556, + "reward_std": 0.10745733976364136, + "rewards/accuracy_reward_stage2": 0.5052210092544556, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1842 + }, + { + "completion_length": 31.875, + "epoch": 0.322936744349045, + "grad_norm": 19.206921000127394, + "kl": 0.053466796875, + "learning_rate": 6.772384790608024e-07, + "loss": 0.0213, + "reward": 1.525323510169983, + "reward_std": 0.15850859880447388, + "rewards/accuracy_reward_stage2": 0.5253235101699829, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1843 + }, + { + "completion_length": 18.328125, + "epoch": 0.3231119677588926, + "grad_norm": 15.084336274366542, + "kl": 0.234375, + "learning_rate": 6.770632556509549e-07, + "loss": 0.0647, + "reward": 1.266427993774414, + "reward_std": 0.14675506949424744, + "rewards/accuracy_reward_stage2": 0.4070529043674469, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1844 + }, + { + "completion_length": 8.65625, + "epoch": 0.32328719116874016, + "grad_norm": 14.951960898593283, + "kl": 0.0400390625, + "learning_rate": 6.768880322411073e-07, + "loss": 0.0161, + "reward": 1.4418463706970215, + "reward_std": 0.08619339764118195, + "rewards/accuracy_reward_stage2": 0.6918463110923767, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1845 + }, + { + "completion_length": 10.921875, + "epoch": 0.3234624145785877, + "grad_norm": 34.66320880041889, + "kl": 0.11328125, + "learning_rate": 6.767128088312598e-07, + "loss": 0.0453, + "reward": 1.4650702476501465, + "reward_std": 0.17414847016334534, + "rewards/accuracy_reward_stage2": 0.5900702476501465, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1846 + }, + { + "completion_length": 8.46875, + "epoch": 0.32363763798843526, + "grad_norm": 20.237955441663978, + "kl": 0.2080078125, + "learning_rate": 6.765375854214123e-07, + "loss": -0.0051, + "reward": 1.6920424699783325, + "reward_std": 0.34711211919784546, + "rewards/accuracy_reward_stage2": 0.8482924699783325, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1847 + }, + { + "completion_length": 11.859375, + "epoch": 0.3238128613982828, + "grad_norm": 24.731047006662198, + "kl": 0.1640625, + "learning_rate": 6.763623620115647e-07, + "loss": -0.0182, + "reward": 1.280834674835205, + "reward_std": 0.28956809639930725, + "rewards/accuracy_reward_stage2": 0.4370846748352051, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1848 + }, + { + "completion_length": 10.8125, + "epoch": 0.32398808480813035, + "grad_norm": 26.723550080573006, + "kl": 0.279296875, + "learning_rate": 6.761871386017172e-07, + "loss": 0.1113, + "reward": 1.5930095911026, + "reward_std": 0.22630318999290466, + "rewards/accuracy_reward_stage2": 0.7180095911026001, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1849 + }, + { + "completion_length": 13.390625, + "epoch": 0.3241633082179779, + "grad_norm": 22.345652650672584, + "kl": 0.115234375, + "learning_rate": 6.760119151918696e-07, + "loss": 0.046, + "reward": 1.3461437225341797, + "reward_std": 0.28940150141716003, + "rewards/accuracy_reward_stage2": 0.4711437523365021, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1850 + }, + { + "completion_length": 11.828125, + "epoch": 0.3243385316278255, + "grad_norm": 24.827188975257773, + "kl": 0.01324462890625, + "learning_rate": 6.758366917820221e-07, + "loss": 0.0053, + "reward": 1.484375, + "reward_std": 0.25217998027801514, + "rewards/accuracy_reward_stage2": 0.484375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1851 + }, + { + "completion_length": 11.953125, + "epoch": 0.32451375503767305, + "grad_norm": 17.497428306046615, + "kl": 0.13671875, + "learning_rate": 6.756614683721746e-07, + "loss": -0.0133, + "reward": 1.5360865592956543, + "reward_std": 0.23445501923561096, + "rewards/accuracy_reward_stage2": 0.5673364996910095, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1852 + }, + { + "completion_length": 10.5625, + "epoch": 0.3246889784475206, + "grad_norm": 22.98841053109054, + "kl": 0.052734375, + "learning_rate": 6.754862449623269e-07, + "loss": 0.0211, + "reward": 1.4700126647949219, + "reward_std": 0.3278173804283142, + "rewards/accuracy_reward_stage2": 0.4700126349925995, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1853 + }, + { + "completion_length": 10.109375, + "epoch": 0.32486420185736814, + "grad_norm": 23.18911960312045, + "kl": 0.14453125, + "learning_rate": 6.753110215524794e-07, + "loss": 0.0169, + "reward": 1.4488449096679688, + "reward_std": 0.17043456435203552, + "rewards/accuracy_reward_stage2": 0.4644698202610016, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1854 + }, + { + "completion_length": 7.875, + "epoch": 0.3250394252672157, + "grad_norm": 19.61804996855809, + "kl": 0.10791015625, + "learning_rate": 6.751357981426318e-07, + "loss": 0.0431, + "reward": 1.716990351676941, + "reward_std": 0.22781193256378174, + "rewards/accuracy_reward_stage2": 0.7169903516769409, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1855 + }, + { + "completion_length": 9.65625, + "epoch": 0.32521464867706323, + "grad_norm": 14.670859725635287, + "kl": 0.283203125, + "learning_rate": 6.749605747327842e-07, + "loss": 0.025, + "reward": 1.4233312606811523, + "reward_std": 0.11870677769184113, + "rewards/accuracy_reward_stage2": 0.5795813202857971, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1856 + }, + { + "completion_length": 4.6875, + "epoch": 0.32538987208691084, + "grad_norm": 14.784253008631724, + "kl": 0.0859375, + "learning_rate": 6.747853513229367e-07, + "loss": -0.0098, + "reward": 1.6528429985046387, + "reward_std": 0.19694890081882477, + "rewards/accuracy_reward_stage2": 0.6684680581092834, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1857 + }, + { + "completion_length": 11.484375, + "epoch": 0.3255650954967584, + "grad_norm": 17.366655675185815, + "kl": 0.09716796875, + "learning_rate": 6.746101279130891e-07, + "loss": 0.0047, + "reward": 1.3349708318710327, + "reward_std": 0.18092112243175507, + "rewards/accuracy_reward_stage2": 0.3505958318710327, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1858 + }, + { + "completion_length": 10.015625, + "epoch": 0.32574031890660593, + "grad_norm": 20.455291845170937, + "kl": 0.1025390625, + "learning_rate": 6.744349045032416e-07, + "loss": -0.0135, + "reward": 1.5648746490478516, + "reward_std": 0.3064984083175659, + "rewards/accuracy_reward_stage2": 0.7054996490478516, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1859 + }, + { + "completion_length": 9.1875, + "epoch": 0.3259155423164535, + "grad_norm": 21.613500327855913, + "kl": 0.10009765625, + "learning_rate": 6.742596810933941e-07, + "loss": -0.0377, + "reward": 1.8078569173812866, + "reward_std": 0.2591524124145508, + "rewards/accuracy_reward_stage2": 0.8391069769859314, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1860 + }, + { + "completion_length": 9.578125, + "epoch": 0.326090765726301, + "grad_norm": 26.2316907230029, + "kl": 0.17578125, + "learning_rate": 6.740844576835465e-07, + "loss": 0.0702, + "reward": 1.3697917461395264, + "reward_std": 0.3007485270500183, + "rewards/accuracy_reward_stage2": 0.6197916269302368, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1861 + }, + { + "completion_length": 14.3125, + "epoch": 0.32626598913614857, + "grad_norm": 245.91376195572363, + "kl": 1.3046875, + "learning_rate": 6.73909234273699e-07, + "loss": 0.4788, + "reward": 1.1469907760620117, + "reward_std": 0.25098198652267456, + "rewards/accuracy_reward_stage2": 0.5376157164573669, + "rewards/format_reward_stage1_pointerpad": 0.609375, + "scores/accuracy_reward_stage2": 0.609375, + "step": 1862 + }, + { + "completion_length": 6.859375, + "epoch": 0.3264412125459961, + "grad_norm": 46.635079623868656, + "kl": 0.3984375, + "learning_rate": 6.737340108638514e-07, + "loss": 0.0704, + "reward": 1.6059027910232544, + "reward_std": 0.29173195362091064, + "rewards/accuracy_reward_stage2": 0.6371527910232544, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1863 + }, + { + "completion_length": 7.90625, + "epoch": 0.3266164359558437, + "grad_norm": 13.745160872355433, + "kl": 0.0751953125, + "learning_rate": 6.735587874540038e-07, + "loss": 0.03, + "reward": 1.6368508338928223, + "reward_std": 0.0824931263923645, + "rewards/accuracy_reward_stage2": 0.6368508338928223, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1864 + }, + { + "completion_length": 18.859375, + "epoch": 0.32679165936569127, + "grad_norm": 43.0947117605456, + "kl": 0.326171875, + "learning_rate": 6.733835640441563e-07, + "loss": 0.0918, + "reward": 1.6280841827392578, + "reward_std": 0.30339083075523376, + "rewards/accuracy_reward_stage2": 0.768709123134613, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1865 + }, + { + "completion_length": 11.203125, + "epoch": 0.3269668827755388, + "grad_norm": 40.6742173575688, + "kl": 0.349609375, + "learning_rate": 6.732083406343087e-07, + "loss": 0.1224, + "reward": 1.433029294013977, + "reward_std": 0.21674448251724243, + "rewards/accuracy_reward_stage2": 0.683029294013977, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1866 + }, + { + "completion_length": 13.203125, + "epoch": 0.32714210618538636, + "grad_norm": 16.46817567207021, + "kl": 0.046142578125, + "learning_rate": 6.730331172244612e-07, + "loss": -0.0105, + "reward": 1.75, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.765625, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1867 + }, + { + "completion_length": 16.546875, + "epoch": 0.3273173295952339, + "grad_norm": 22.167670621631846, + "kl": 0.287109375, + "learning_rate": 6.728578938146136e-07, + "loss": 0.1147, + "reward": 1.1454861164093018, + "reward_std": 0.17306698858737946, + "rewards/accuracy_reward_stage2": 0.39548611640930176, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1868 + }, + { + "completion_length": 10.6875, + "epoch": 0.32749255300508145, + "grad_norm": 19.296797288283493, + "kl": 0.09033203125, + "learning_rate": 6.72682670404766e-07, + "loss": 0.0362, + "reward": 1.3370780944824219, + "reward_std": 0.15424805879592896, + "rewards/accuracy_reward_stage2": 0.33707812428474426, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1869 + }, + { + "completion_length": 9.53125, + "epoch": 0.32766777641492906, + "grad_norm": 13.949661990547954, + "kl": 0.0218505859375, + "learning_rate": 6.725074469949185e-07, + "loss": 0.0088, + "reward": 1.613518476486206, + "reward_std": 0.09896919131278992, + "rewards/accuracy_reward_stage2": 0.613518476486206, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1870 + }, + { + "completion_length": 10.03125, + "epoch": 0.3278429998247766, + "grad_norm": 19.47370807396806, + "kl": 0.1328125, + "learning_rate": 6.72332223585071e-07, + "loss": 0.009, + "reward": 1.728609561920166, + "reward_std": 0.16377633810043335, + "rewards/accuracy_reward_stage2": 0.744234561920166, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1871 + }, + { + "completion_length": 11.234375, + "epoch": 0.32801822323462415, + "grad_norm": 21.55942535287268, + "kl": 0.1005859375, + "learning_rate": 6.721570001752234e-07, + "loss": 0.0403, + "reward": 1.6306660175323486, + "reward_std": 0.21363964676856995, + "rewards/accuracy_reward_stage2": 0.6306659579277039, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1872 + }, + { + "completion_length": 18.453125, + "epoch": 0.3281934466444717, + "grad_norm": 16.540119657055413, + "kl": 0.166015625, + "learning_rate": 6.719817767653758e-07, + "loss": 0.0664, + "reward": 1.2170138359069824, + "reward_std": 0.1963312327861786, + "rewards/accuracy_reward_stage2": 0.3420138955116272, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1873 + }, + { + "completion_length": 9.828125, + "epoch": 0.32836867005431924, + "grad_norm": 18.35374074903453, + "kl": 0.09326171875, + "learning_rate": 6.718065533555282e-07, + "loss": 0.0374, + "reward": 1.5539817810058594, + "reward_std": 0.16566669940948486, + "rewards/accuracy_reward_stage2": 0.5539816617965698, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1874 + }, + { + "completion_length": 6.484375, + "epoch": 0.3285438934641668, + "grad_norm": 14.844442858283866, + "kl": 0.025634765625, + "learning_rate": 6.716313299456807e-07, + "loss": 0.0102, + "reward": 1.744091510772705, + "reward_std": 0.09293541312217712, + "rewards/accuracy_reward_stage2": 0.7440915107727051, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1875 + }, + { + "completion_length": 9.125, + "epoch": 0.3287191168740144, + "grad_norm": 25.254916645541858, + "kl": 0.1181640625, + "learning_rate": 6.714561065358332e-07, + "loss": 0.0377, + "reward": 1.3898807764053345, + "reward_std": 0.2201821655035019, + "rewards/accuracy_reward_stage2": 0.5305057764053345, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1876 + }, + { + "completion_length": 9.609375, + "epoch": 0.32889434028386194, + "grad_norm": 25.69400947159281, + "kl": 0.126953125, + "learning_rate": 6.712808831259856e-07, + "loss": -0.0167, + "reward": 1.633712649345398, + "reward_std": 0.3901214599609375, + "rewards/accuracy_reward_stage2": 0.6649625301361084, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1877 + }, + { + "completion_length": 11.625, + "epoch": 0.3290695636937095, + "grad_norm": 16.977959285591556, + "kl": 0.2265625, + "learning_rate": 6.711056597161381e-07, + "loss": -0.0361, + "reward": 1.560457706451416, + "reward_std": 0.28971582651138306, + "rewards/accuracy_reward_stage2": 0.732332706451416, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1878 + }, + { + "completion_length": 12.390625, + "epoch": 0.32924478710355704, + "grad_norm": 18.18831099517817, + "kl": 0.0615234375, + "learning_rate": 6.709304363062906e-07, + "loss": 0.0246, + "reward": 1.5500532388687134, + "reward_std": 0.13439956307411194, + "rewards/accuracy_reward_stage2": 0.5500532388687134, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1879 + }, + { + "completion_length": 14.375, + "epoch": 0.3294200105134046, + "grad_norm": 18.965881019041337, + "kl": 0.44921875, + "learning_rate": 6.707552128964429e-07, + "loss": 0.1797, + "reward": 1.4239583015441895, + "reward_std": 0.23425593972206116, + "rewards/accuracy_reward_stage2": 0.6739581823348999, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1880 + }, + { + "completion_length": 14.625, + "epoch": 0.32959523392325213, + "grad_norm": 19.55683458826524, + "kl": 0.103515625, + "learning_rate": 6.705799894865954e-07, + "loss": 0.0413, + "reward": 1.6188348531723022, + "reward_std": 0.22359533607959747, + "rewards/accuracy_reward_stage2": 0.6188348531723022, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1881 + }, + { + "completion_length": 12.59375, + "epoch": 0.3297704573330997, + "grad_norm": 18.088377876534306, + "kl": 0.040283203125, + "learning_rate": 6.704047660767477e-07, + "loss": 0.0161, + "reward": 1.8368244171142578, + "reward_std": 0.14354784786701202, + "rewards/accuracy_reward_stage2": 0.8368244171142578, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1882 + }, + { + "completion_length": 33.171875, + "epoch": 0.3299456807429473, + "grad_norm": 16.8912919381152, + "kl": 0.12060546875, + "learning_rate": 6.702295426669002e-07, + "loss": 0.0044, + "reward": 1.1973037719726562, + "reward_std": 0.18049047887325287, + "rewards/accuracy_reward_stage2": 0.33792880177497864, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1883 + }, + { + "completion_length": 10.1875, + "epoch": 0.3301209041527948, + "grad_norm": 18.89381295924757, + "kl": 0.038818359375, + "learning_rate": 6.700543192570527e-07, + "loss": 0.0155, + "reward": 1.4166667461395264, + "reward_std": 0.21836219727993011, + "rewards/accuracy_reward_stage2": 0.6666666269302368, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1884 + }, + { + "completion_length": 7.984375, + "epoch": 0.33029612756264237, + "grad_norm": 20.448251291802112, + "kl": 0.049560546875, + "learning_rate": 6.698790958472051e-07, + "loss": 0.0198, + "reward": 1.7605003118515015, + "reward_std": 0.3169490694999695, + "rewards/accuracy_reward_stage2": 0.7605003118515015, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1885 + }, + { + "completion_length": 10.890625, + "epoch": 0.3304713509724899, + "grad_norm": 22.309793657882366, + "kl": 0.201171875, + "learning_rate": 6.697038724373576e-07, + "loss": 0.0363, + "reward": 1.2537977695465088, + "reward_std": 0.2588205635547638, + "rewards/accuracy_reward_stage2": 0.3944226801395416, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1886 + }, + { + "completion_length": 12.78125, + "epoch": 0.33064657438233747, + "grad_norm": 18.848369498488996, + "kl": 0.10791015625, + "learning_rate": 6.6952864902751e-07, + "loss": -0.0452, + "reward": 1.6480014324188232, + "reward_std": 0.2525930404663086, + "rewards/accuracy_reward_stage2": 0.6792514324188232, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1887 + }, + { + "completion_length": 12.8125, + "epoch": 0.330821797792185, + "grad_norm": 18.613152918954547, + "kl": 0.028564453125, + "learning_rate": 6.693534256176625e-07, + "loss": 0.0114, + "reward": 1.9396920204162598, + "reward_std": 0.07260610163211823, + "rewards/accuracy_reward_stage2": 0.9396920204162598, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1888 + }, + { + "completion_length": 7.15625, + "epoch": 0.3309970212020326, + "grad_norm": 14.881380789241637, + "kl": 0.08447265625, + "learning_rate": 6.69178202207815e-07, + "loss": -0.048, + "reward": 1.6165692806243896, + "reward_std": 0.22685596346855164, + "rewards/accuracy_reward_stage2": 0.6478191614151001, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1889 + }, + { + "completion_length": 21.734375, + "epoch": 0.33117224461188016, + "grad_norm": 19.699195856672503, + "kl": 0.12255859375, + "learning_rate": 6.690029787979674e-07, + "loss": 0.0249, + "reward": 1.533193588256836, + "reward_std": 0.2367192953824997, + "rewards/accuracy_reward_stage2": 0.5488186478614807, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1890 + }, + { + "completion_length": 11.078125, + "epoch": 0.3313474680217277, + "grad_norm": 21.620235405578864, + "kl": 0.099609375, + "learning_rate": 6.688277553881199e-07, + "loss": -0.0043, + "reward": 1.7323949337005615, + "reward_std": 0.19700127840042114, + "rewards/accuracy_reward_stage2": 0.7480199337005615, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1891 + }, + { + "completion_length": 15.875, + "epoch": 0.33152269143157526, + "grad_norm": 69.12943130337275, + "kl": 0.79296875, + "learning_rate": 6.686525319782724e-07, + "loss": 0.2731, + "reward": 1.488948106765747, + "reward_std": 0.253986656665802, + "rewards/accuracy_reward_stage2": 0.6295732259750366, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1892 + }, + { + "completion_length": 18.109375, + "epoch": 0.3316979148414228, + "grad_norm": 17.98704485559557, + "kl": 0.01385498046875, + "learning_rate": 6.684773085684246e-07, + "loss": 0.0055, + "reward": 1.6691596508026123, + "reward_std": 0.14532330632209778, + "rewards/accuracy_reward_stage2": 0.6691597104072571, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1893 + }, + { + "completion_length": 12.921875, + "epoch": 0.33187313825127035, + "grad_norm": 53.84006846321401, + "kl": 0.310546875, + "learning_rate": 6.683020851585771e-07, + "loss": -0.0064, + "reward": 1.3225611448287964, + "reward_std": 0.3454180955886841, + "rewards/accuracy_reward_stage2": 0.3850611746311188, + "rewards/format_reward_stage1_pointerpad": 0.9375, + "scores/accuracy_reward_stage2": 0.9375, + "step": 1894 + }, + { + "completion_length": 13.15625, + "epoch": 0.33204836166111795, + "grad_norm": 22.44899572206889, + "kl": 0.51171875, + "learning_rate": 6.681268617487295e-07, + "loss": 0.16, + "reward": 1.7277836799621582, + "reward_std": 0.1523258537054062, + "rewards/accuracy_reward_stage2": 0.8684086799621582, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1895 + }, + { + "completion_length": 13.125, + "epoch": 0.3322235850709655, + "grad_norm": 17.53173188046345, + "kl": 0.1181640625, + "learning_rate": 6.67951638338882e-07, + "loss": 0.0257, + "reward": 1.5292876958847046, + "reward_std": 0.13309209048748016, + "rewards/accuracy_reward_stage2": 0.5449126958847046, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1896 + }, + { + "completion_length": 8.453125, + "epoch": 0.33239880848081305, + "grad_norm": 20.38947336089091, + "kl": 0.1484375, + "learning_rate": 6.677764149290345e-07, + "loss": 0.0592, + "reward": 1.761195421218872, + "reward_std": 0.23012542724609375, + "rewards/accuracy_reward_stage2": 0.7611954212188721, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1897 + }, + { + "completion_length": 6.8125, + "epoch": 0.3325740318906606, + "grad_norm": 20.896132445671658, + "kl": 0.1962890625, + "learning_rate": 6.676011915191869e-07, + "loss": 0.0785, + "reward": 1.5751183032989502, + "reward_std": 0.16306456923484802, + "rewards/accuracy_reward_stage2": 0.700118362903595, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1898 + }, + { + "completion_length": 16.625, + "epoch": 0.33274925530050814, + "grad_norm": 19.194750271365315, + "kl": 0.095703125, + "learning_rate": 6.674259681093394e-07, + "loss": 0.0095, + "reward": 1.3477399349212646, + "reward_std": 0.16524702310562134, + "rewards/accuracy_reward_stage2": 0.4883649945259094, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1899 + }, + { + "completion_length": 11.015625, + "epoch": 0.3329244787103557, + "grad_norm": 22.31802873960122, + "kl": 0.1669921875, + "learning_rate": 6.672507446994919e-07, + "loss": 0.044, + "reward": 1.4311261177062988, + "reward_std": 0.2616199254989624, + "rewards/accuracy_reward_stage2": 0.4623761773109436, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1900 + }, + { + "completion_length": 10.5625, + "epoch": 0.33309970212020323, + "grad_norm": 18.039528111074194, + "kl": 0.07666015625, + "learning_rate": 6.670755212896443e-07, + "loss": 0.0306, + "reward": 1.7664124965667725, + "reward_std": 0.18786652386188507, + "rewards/accuracy_reward_stage2": 0.7664124369621277, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1901 + }, + { + "completion_length": 17.609375, + "epoch": 0.33327492553005084, + "grad_norm": 19.913840997681163, + "kl": 0.1298828125, + "learning_rate": 6.669002978797968e-07, + "loss": -0.0146, + "reward": 1.6314010620117188, + "reward_std": 0.21882987022399902, + "rewards/accuracy_reward_stage2": 0.6626511216163635, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1902 + }, + { + "completion_length": 14.453125, + "epoch": 0.3334501489398984, + "grad_norm": 23.040483980646762, + "kl": 0.08056640625, + "learning_rate": 6.667250744699491e-07, + "loss": 0.0004, + "reward": 1.421668529510498, + "reward_std": 0.29481422901153564, + "rewards/accuracy_reward_stage2": 0.43729349970817566, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1903 + }, + { + "completion_length": 8.484375, + "epoch": 0.33362537234974593, + "grad_norm": 15.772263098762213, + "kl": 0.107421875, + "learning_rate": 6.665498510601016e-07, + "loss": -0.0453, + "reward": 1.7050449848175049, + "reward_std": 0.2197214812040329, + "rewards/accuracy_reward_stage2": 0.7362948656082153, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1904 + }, + { + "completion_length": 11.421875, + "epoch": 0.3338005957595935, + "grad_norm": 16.85209853349615, + "kl": 0.006591796875, + "learning_rate": 6.663746276502541e-07, + "loss": 0.0026, + "reward": 1.4322917461395264, + "reward_std": 0.16098348796367645, + "rewards/accuracy_reward_stage2": 0.5572916269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1905 + }, + { + "completion_length": 22.5625, + "epoch": 0.333975819169441, + "grad_norm": 19.93099087602353, + "kl": 0.2890625, + "learning_rate": 6.661994042404064e-07, + "loss": 0.1219, + "reward": 1.402681827545166, + "reward_std": 0.13913270831108093, + "rewards/accuracy_reward_stage2": 0.6526818871498108, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1906 + }, + { + "completion_length": 10.109375, + "epoch": 0.33415104257928857, + "grad_norm": 21.492901823484114, + "kl": 0.054443359375, + "learning_rate": 6.660241808305589e-07, + "loss": 0.0218, + "reward": 1.5858439207077026, + "reward_std": 0.23922909796237946, + "rewards/accuracy_reward_stage2": 0.5858439207077026, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1907 + }, + { + "completion_length": 8.34375, + "epoch": 0.3343262659891362, + "grad_norm": 18.817009949809382, + "kl": 0.076171875, + "learning_rate": 6.658489574207114e-07, + "loss": 0.0304, + "reward": 1.353365421295166, + "reward_std": 0.15156733989715576, + "rewards/accuracy_reward_stage2": 0.35336539149284363, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1908 + }, + { + "completion_length": 10.25, + "epoch": 0.3345014893989837, + "grad_norm": 20.460697523371316, + "kl": 0.52734375, + "learning_rate": 6.656737340108638e-07, + "loss": 0.1231, + "reward": 1.758762240409851, + "reward_std": 0.20783132314682007, + "rewards/accuracy_reward_stage2": 0.8993872404098511, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1909 + }, + { + "completion_length": 14.6875, + "epoch": 0.33467671280883127, + "grad_norm": 23.673039520549196, + "kl": 0.04638671875, + "learning_rate": 6.654985106010163e-07, + "loss": -0.0256, + "reward": 1.6718825101852417, + "reward_std": 0.23551476001739502, + "rewards/accuracy_reward_stage2": 0.6875075697898865, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1910 + }, + { + "completion_length": 15.40625, + "epoch": 0.3348519362186788, + "grad_norm": 23.44966109378244, + "kl": 0.3984375, + "learning_rate": 6.653232871911687e-07, + "loss": 0.1594, + "reward": 1.3732054233551025, + "reward_std": 0.16537074744701385, + "rewards/accuracy_reward_stage2": 0.6232053637504578, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1911 + }, + { + "completion_length": 11.5, + "epoch": 0.33502715962852636, + "grad_norm": 25.39244699869401, + "kl": 0.21875, + "learning_rate": 6.651480637813211e-07, + "loss": 0.0779, + "reward": 1.6530308723449707, + "reward_std": 0.18545380234718323, + "rewards/accuracy_reward_stage2": 0.7936557531356812, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1912 + }, + { + "completion_length": 32.09375, + "epoch": 0.3352023830383739, + "grad_norm": 20.433934641372062, + "kl": 0.19921875, + "learning_rate": 6.649728403714736e-07, + "loss": 0.0457, + "reward": 1.4659879207611084, + "reward_std": 0.24502934515476227, + "rewards/accuracy_reward_stage2": 0.6066129207611084, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1913 + }, + { + "completion_length": 12.484375, + "epoch": 0.33537760644822145, + "grad_norm": 17.940106585468374, + "kl": 0.08935546875, + "learning_rate": 6.64797616961626e-07, + "loss": 0.0024, + "reward": 1.429773211479187, + "reward_std": 0.20611415803432465, + "rewards/accuracy_reward_stage2": 0.4453982710838318, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1914 + }, + { + "completion_length": 11.734375, + "epoch": 0.33555282985806906, + "grad_norm": 21.91538716583864, + "kl": 0.11181640625, + "learning_rate": 6.646223935517785e-07, + "loss": 0.0554, + "reward": 1.651969075202942, + "reward_std": 0.1828499734401703, + "rewards/accuracy_reward_stage2": 0.7769691348075867, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1915 + }, + { + "completion_length": 10.984375, + "epoch": 0.3357280532679166, + "grad_norm": 18.154773046052462, + "kl": 0.12109375, + "learning_rate": 6.64447170141931e-07, + "loss": 0.0041, + "reward": 1.3040469884872437, + "reward_std": 0.19486962258815765, + "rewards/accuracy_reward_stage2": 0.44467195868492126, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1916 + }, + { + "completion_length": 20.984375, + "epoch": 0.33590327667776415, + "grad_norm": 23.685376062521655, + "kl": 0.12353515625, + "learning_rate": 6.642719467320834e-07, + "loss": 0.0495, + "reward": 1.4644455909729004, + "reward_std": 0.17394208908081055, + "rewards/accuracy_reward_stage2": 0.5894454717636108, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1917 + }, + { + "completion_length": 14.09375, + "epoch": 0.3360785000876117, + "grad_norm": 20.132935090916234, + "kl": 0.046875, + "learning_rate": 6.640967233222359e-07, + "loss": 0.0187, + "reward": 1.7096850872039795, + "reward_std": 0.11627823114395142, + "rewards/accuracy_reward_stage2": 0.7096851468086243, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1918 + }, + { + "completion_length": 11.171875, + "epoch": 0.33625372349745924, + "grad_norm": 21.71008176430557, + "kl": 0.11962890625, + "learning_rate": 6.639214999123882e-07, + "loss": 0.0478, + "reward": 1.7054026126861572, + "reward_std": 0.14449666440486908, + "rewards/accuracy_reward_stage2": 0.705402672290802, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1919 + }, + { + "completion_length": 12.3125, + "epoch": 0.3364289469073068, + "grad_norm": 23.566831941745605, + "kl": 0.1005859375, + "learning_rate": 6.637462765025407e-07, + "loss": 0.0089, + "reward": 1.4168455600738525, + "reward_std": 0.25963038206100464, + "rewards/accuracy_reward_stage2": 0.432470440864563, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1920 + }, + { + "completion_length": 30.28125, + "epoch": 0.3366041703171544, + "grad_norm": 21.944029208656, + "kl": 0.455078125, + "learning_rate": 6.635710530926932e-07, + "loss": 0.1379, + "reward": 1.127016305923462, + "reward_std": 0.2312091737985611, + "rewards/accuracy_reward_stage2": 0.3926413357257843, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1921 + }, + { + "completion_length": 5.828125, + "epoch": 0.33677939372700194, + "grad_norm": 17.257118272901188, + "kl": 0.09375, + "learning_rate": 6.633958296828455e-07, + "loss": 0.0374, + "reward": 1.875, + "reward_std": 0.2130674123764038, + "rewards/accuracy_reward_stage2": 0.875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1922 + }, + { + "completion_length": 8.171875, + "epoch": 0.3369546171368495, + "grad_norm": 23.521572641848383, + "kl": 0.2421875, + "learning_rate": 6.63220606272998e-07, + "loss": 0.0362, + "reward": 1.216859221458435, + "reward_std": 0.3034880757331848, + "rewards/accuracy_reward_stage2": 0.37310922145843506, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1923 + }, + { + "completion_length": 10.203125, + "epoch": 0.33712984054669703, + "grad_norm": 20.521336853814198, + "kl": 0.09912109375, + "learning_rate": 6.630453828631505e-07, + "loss": 0.0398, + "reward": 1.4524263143539429, + "reward_std": 0.15699222683906555, + "rewards/accuracy_reward_stage2": 0.45242631435394287, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1924 + }, + { + "completion_length": 6.6875, + "epoch": 0.3373050639565446, + "grad_norm": 19.291231715911866, + "kl": 0.09619140625, + "learning_rate": 6.628701594533029e-07, + "loss": 0.0383, + "reward": 1.8933196067810059, + "reward_std": 0.15603157877922058, + "rewards/accuracy_reward_stage2": 0.8933195471763611, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1925 + }, + { + "completion_length": 10.890625, + "epoch": 0.33748028736639213, + "grad_norm": 19.784772606319674, + "kl": 0.09033203125, + "learning_rate": 6.626949360434554e-07, + "loss": 0.0362, + "reward": 1.6539945602416992, + "reward_std": 0.1568455845117569, + "rewards/accuracy_reward_stage2": 0.6539945602416992, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1926 + }, + { + "completion_length": 9.9375, + "epoch": 0.33765551077623973, + "grad_norm": 24.480945312649215, + "kl": 0.07275390625, + "learning_rate": 6.625197126336078e-07, + "loss": 0.0075, + "reward": 1.6549501419067383, + "reward_std": 0.1914709359407425, + "rewards/accuracy_reward_stage2": 0.6705750823020935, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1927 + }, + { + "completion_length": 9.234375, + "epoch": 0.3378307341860873, + "grad_norm": 24.791875823236783, + "kl": 0.1142578125, + "learning_rate": 6.623444892237603e-07, + "loss": 0.0457, + "reward": 1.515584945678711, + "reward_std": 0.3138820230960846, + "rewards/accuracy_reward_stage2": 0.6405848264694214, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1928 + }, + { + "completion_length": 9.28125, + "epoch": 0.3380059575959348, + "grad_norm": 18.311535553437952, + "kl": 0.1259765625, + "learning_rate": 6.621692658139128e-07, + "loss": 0.0506, + "reward": 1.6065380573272705, + "reward_std": 0.2451055496931076, + "rewards/accuracy_reward_stage2": 0.606537938117981, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1929 + }, + { + "completion_length": 9.21875, + "epoch": 0.33818118100578237, + "grad_norm": 21.444373945493457, + "kl": 0.140625, + "learning_rate": 6.619940424040652e-07, + "loss": 0.012, + "reward": 1.619673728942871, + "reward_std": 0.22012469172477722, + "rewards/accuracy_reward_stage2": 0.6352988481521606, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1930 + }, + { + "completion_length": 12.78125, + "epoch": 0.3383564044156299, + "grad_norm": 25.47223293939637, + "kl": 0.466796875, + "learning_rate": 6.618188189942176e-07, + "loss": 0.1302, + "reward": 1.3783800601959229, + "reward_std": 0.23717038333415985, + "rewards/accuracy_reward_stage2": 0.5346300005912781, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1931 + }, + { + "completion_length": 13.234375, + "epoch": 0.33853162782547747, + "grad_norm": 23.782077737361416, + "kl": 0.061767578125, + "learning_rate": 6.6164359558437e-07, + "loss": -0.0195, + "reward": 1.6181929111480713, + "reward_std": 0.202871173620224, + "rewards/accuracy_reward_stage2": 0.6338179111480713, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1932 + }, + { + "completion_length": 8.15625, + "epoch": 0.338706851235325, + "grad_norm": 16.914909837285762, + "kl": 0.09326171875, + "learning_rate": 6.614683721745224e-07, + "loss": 0.0372, + "reward": 1.4998173713684082, + "reward_std": 0.11330730468034744, + "rewards/accuracy_reward_stage2": 0.749817430973053, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 1933 + }, + { + "completion_length": 11.15625, + "epoch": 0.3388820746451726, + "grad_norm": 18.585863567630085, + "kl": 0.046630859375, + "learning_rate": 6.612931487646749e-07, + "loss": 0.0186, + "reward": 1.6778485774993896, + "reward_std": 0.18502506613731384, + "rewards/accuracy_reward_stage2": 0.6778485774993896, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1934 + }, + { + "completion_length": 9.90625, + "epoch": 0.33905729805502016, + "grad_norm": 19.867172195955273, + "kl": 0.14453125, + "learning_rate": 6.611179253548273e-07, + "loss": -0.0022, + "reward": 1.5232343673706055, + "reward_std": 0.33503860235214233, + "rewards/accuracy_reward_stage2": 0.554484486579895, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1935 + }, + { + "completion_length": 9.46875, + "epoch": 0.3392325214648677, + "grad_norm": 20.729060743505748, + "kl": 0.134765625, + "learning_rate": 6.609427019449798e-07, + "loss": 0.0323, + "reward": 1.7128856182098389, + "reward_std": 0.24648074805736542, + "rewards/accuracy_reward_stage2": 0.7285105586051941, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1936 + }, + { + "completion_length": 9.734375, + "epoch": 0.33940774487471526, + "grad_norm": 16.46205351952286, + "kl": 0.08203125, + "learning_rate": 6.607674785351323e-07, + "loss": 0.0328, + "reward": 1.7479907274246216, + "reward_std": 0.10182836651802063, + "rewards/accuracy_reward_stage2": 0.7479907274246216, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1937 + }, + { + "completion_length": 8.078125, + "epoch": 0.3395829682845628, + "grad_norm": 12.568074846483034, + "kl": 0.0625, + "learning_rate": 6.605922551252847e-07, + "loss": -0.0192, + "reward": 1.6138134002685547, + "reward_std": 0.1896343231201172, + "rewards/accuracy_reward_stage2": 0.6294383406639099, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1938 + }, + { + "completion_length": 6.703125, + "epoch": 0.33975819169441035, + "grad_norm": 25.873183430795024, + "kl": 0.0966796875, + "learning_rate": 6.604170317154372e-07, + "loss": 0.0387, + "reward": 1.7632300853729248, + "reward_std": 0.2887365520000458, + "rewards/accuracy_reward_stage2": 0.7632301449775696, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1939 + }, + { + "completion_length": 9.296875, + "epoch": 0.33993341510425795, + "grad_norm": 15.34207546952785, + "kl": 0.056396484375, + "learning_rate": 6.602418083055897e-07, + "loss": -0.0064, + "reward": 1.7274775505065918, + "reward_std": 0.12433339655399323, + "rewards/accuracy_reward_stage2": 0.7431026101112366, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1940 + }, + { + "completion_length": 10.890625, + "epoch": 0.3401086385141055, + "grad_norm": 24.915169950161932, + "kl": 0.1796875, + "learning_rate": 6.600665848957421e-07, + "loss": 0.0279, + "reward": 1.5926790237426758, + "reward_std": 0.31510692834854126, + "rewards/accuracy_reward_stage2": 0.733303964138031, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1941 + }, + { + "completion_length": 12.625, + "epoch": 0.34028386192395305, + "grad_norm": 19.149575995205236, + "kl": 0.11474609375, + "learning_rate": 6.598913614858945e-07, + "loss": 0.0459, + "reward": 1.5549436807632446, + "reward_std": 0.2327888011932373, + "rewards/accuracy_reward_stage2": 0.5549436211585999, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1942 + }, + { + "completion_length": 9.453125, + "epoch": 0.3404590853338006, + "grad_norm": 22.51574542315056, + "kl": 0.1376953125, + "learning_rate": 6.597161380760469e-07, + "loss": 0.0108, + "reward": 1.5218536853790283, + "reward_std": 0.2579251527786255, + "rewards/accuracy_reward_stage2": 0.5374786853790283, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1943 + }, + { + "completion_length": 12.578125, + "epoch": 0.34063430874364814, + "grad_norm": 27.877815424585087, + "kl": 0.10400390625, + "learning_rate": 6.595409146661993e-07, + "loss": -0.0122, + "reward": 1.3764312267303467, + "reward_std": 0.36347371339797974, + "rewards/accuracy_reward_stage2": 0.5326813459396362, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1944 + }, + { + "completion_length": 9.84375, + "epoch": 0.3408095321534957, + "grad_norm": 18.795007676878427, + "kl": 0.09033203125, + "learning_rate": 6.593656912563518e-07, + "loss": -0.0838, + "reward": 1.686873197555542, + "reward_std": 0.23738035559654236, + "rewards/accuracy_reward_stage2": 0.733748197555542, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1945 + }, + { + "completion_length": 16.25, + "epoch": 0.3409847555633433, + "grad_norm": 30.824319726042305, + "kl": 0.1259765625, + "learning_rate": 6.591904678465042e-07, + "loss": 0.0203, + "reward": 1.3881264925003052, + "reward_std": 0.24407415091991425, + "rewards/accuracy_reward_stage2": 0.40375152230262756, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1946 + }, + { + "completion_length": 9.609375, + "epoch": 0.34115997897319084, + "grad_norm": 19.57452275526612, + "kl": 0.05859375, + "learning_rate": 6.590152444366567e-07, + "loss": 0.0019, + "reward": 1.3171931505203247, + "reward_std": 0.26795125007629395, + "rewards/accuracy_reward_stage2": 0.3328181207180023, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1947 + }, + { + "completion_length": 10.3125, + "epoch": 0.3413352023830384, + "grad_norm": 16.885076206570435, + "kl": 0.09326171875, + "learning_rate": 6.588400210268091e-07, + "loss": -0.0069, + "reward": 1.3556816577911377, + "reward_std": 0.22130826115608215, + "rewards/accuracy_reward_stage2": 0.6213066577911377, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1948 + }, + { + "completion_length": 9.765625, + "epoch": 0.34151042579288593, + "grad_norm": 17.591580716772768, + "kl": 0.1484375, + "learning_rate": 6.586647976169616e-07, + "loss": 0.015, + "reward": 1.7992362976074219, + "reward_std": 0.22967034578323364, + "rewards/accuracy_reward_stage2": 0.8148613572120667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1949 + }, + { + "completion_length": 11.25, + "epoch": 0.3416856492027335, + "grad_norm": 22.07411498807725, + "kl": 0.05859375, + "learning_rate": 6.584895742071141e-07, + "loss": -0.0145, + "reward": 1.4841227531433105, + "reward_std": 0.2506176233291626, + "rewards/accuracy_reward_stage2": 0.6247477531433105, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1950 + }, + { + "completion_length": 13.671875, + "epoch": 0.341860872612581, + "grad_norm": 19.24929849784772, + "kl": 0.036376953125, + "learning_rate": 6.583143507972665e-07, + "loss": 0.0146, + "reward": 1.4395618438720703, + "reward_std": 0.21534651517868042, + "rewards/accuracy_reward_stage2": 0.4395618140697479, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1951 + }, + { + "completion_length": 10.953125, + "epoch": 0.34203609602242857, + "grad_norm": 20.779179875174233, + "kl": 0.234375, + "learning_rate": 6.581391273874189e-07, + "loss": 0.024, + "reward": 1.5351721048355103, + "reward_std": 0.23412840068340302, + "rewards/accuracy_reward_stage2": 0.7070470452308655, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1952 + }, + { + "completion_length": 12.8125, + "epoch": 0.3422113194322762, + "grad_norm": 22.998124530730706, + "kl": 0.203125, + "learning_rate": 6.579639039775714e-07, + "loss": -0.0068, + "reward": 1.3460776805877686, + "reward_std": 0.3017346262931824, + "rewards/accuracy_reward_stage2": 0.5023276805877686, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1953 + }, + { + "completion_length": 7.359375, + "epoch": 0.3423865428421237, + "grad_norm": 15.53066341719061, + "kl": 0.1328125, + "learning_rate": 6.577886805677238e-07, + "loss": -0.0584, + "reward": 1.7026607990264893, + "reward_std": 0.23679913580417633, + "rewards/accuracy_reward_stage2": 0.7495357990264893, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 1954 + }, + { + "completion_length": 9.453125, + "epoch": 0.34256176625197127, + "grad_norm": 18.46816749799684, + "kl": 0.08544921875, + "learning_rate": 6.576134571578763e-07, + "loss": 0.034, + "reward": 1.403747320175171, + "reward_std": 0.22259891033172607, + "rewards/accuracy_reward_stage2": 0.5287472605705261, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1955 + }, + { + "completion_length": 18.25, + "epoch": 0.3427369896618188, + "grad_norm": 18.38930419158117, + "kl": 0.068359375, + "learning_rate": 6.574382337480288e-07, + "loss": -0.0167, + "reward": 1.649796962738037, + "reward_std": 0.19626963138580322, + "rewards/accuracy_reward_stage2": 0.6654220223426819, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1956 + }, + { + "completion_length": 11.015625, + "epoch": 0.34291221307166636, + "grad_norm": 18.69463465641868, + "kl": 0.16015625, + "learning_rate": 6.572630103381811e-07, + "loss": -0.0242, + "reward": 1.4900152683258057, + "reward_std": 0.23229768872261047, + "rewards/accuracy_reward_stage2": 0.5212653279304504, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1957 + }, + { + "completion_length": 8.96875, + "epoch": 0.3430874364815139, + "grad_norm": 16.994028473834337, + "kl": 0.1416015625, + "learning_rate": 6.570877869283336e-07, + "loss": 0.0123, + "reward": 1.7630869150161743, + "reward_std": 0.23732198774814606, + "rewards/accuracy_reward_stage2": 0.7787119150161743, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1958 + }, + { + "completion_length": 12.921875, + "epoch": 0.3432626598913615, + "grad_norm": 48.14582209676718, + "kl": 0.19140625, + "learning_rate": 6.56912563518486e-07, + "loss": -0.0354, + "reward": 1.400420904159546, + "reward_std": 0.29775795340538025, + "rewards/accuracy_reward_stage2": 0.5722959041595459, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 1959 + }, + { + "completion_length": 9.828125, + "epoch": 0.34343788330120906, + "grad_norm": 21.817718975240545, + "kl": 0.1328125, + "learning_rate": 6.567373401086385e-07, + "loss": 0.0532, + "reward": 1.6833243370056152, + "reward_std": 0.22717389464378357, + "rewards/accuracy_reward_stage2": 0.6833243370056152, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1960 + }, + { + "completion_length": 8.609375, + "epoch": 0.3436131067110566, + "grad_norm": 10.814326403229101, + "kl": 0.0162353515625, + "learning_rate": 6.56562116698791e-07, + "loss": 0.0065, + "reward": 1.532088279724121, + "reward_std": 0.08120846003293991, + "rewards/accuracy_reward_stage2": 0.6570882797241211, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1961 + }, + { + "completion_length": 14.84375, + "epoch": 0.34378833012090415, + "grad_norm": 36.49634156631595, + "kl": 0.09716796875, + "learning_rate": 6.563868932889433e-07, + "loss": 0.001, + "reward": 1.3071386814117432, + "reward_std": 0.2545177638530731, + "rewards/accuracy_reward_stage2": 0.32276368141174316, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1962 + }, + { + "completion_length": 13.1875, + "epoch": 0.3439635535307517, + "grad_norm": 16.061314507901063, + "kl": 0.076171875, + "learning_rate": 6.562116698790958e-07, + "loss": 0.0062, + "reward": 1.3792309761047363, + "reward_std": 0.1264325976371765, + "rewards/accuracy_reward_stage2": 0.39485591650009155, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1963 + }, + { + "completion_length": 12.078125, + "epoch": 0.34413877694059924, + "grad_norm": 23.750220020402494, + "kl": 0.1357421875, + "learning_rate": 6.560364464692482e-07, + "loss": 0.01, + "reward": 1.652053713798523, + "reward_std": 0.29191863536834717, + "rewards/accuracy_reward_stage2": 0.667678713798523, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1964 + }, + { + "completion_length": 11.96875, + "epoch": 0.34431400035044685, + "grad_norm": 33.41616899169439, + "kl": 0.12890625, + "learning_rate": 6.558612230594007e-07, + "loss": 0.0241, + "reward": 1.4442522525787354, + "reward_std": 0.23655974864959717, + "rewards/accuracy_reward_stage2": 0.45987722277641296, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1965 + }, + { + "completion_length": 6.90625, + "epoch": 0.3444892237602944, + "grad_norm": 21.254282550862037, + "kl": 0.07470703125, + "learning_rate": 6.556859996495532e-07, + "loss": 0.0009, + "reward": 1.7194864749908447, + "reward_std": 0.23831090331077576, + "rewards/accuracy_reward_stage2": 0.7351114749908447, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1966 + }, + { + "completion_length": 9.4375, + "epoch": 0.34466444717014194, + "grad_norm": 15.179379706752174, + "kl": 0.1240234375, + "learning_rate": 6.555107762397056e-07, + "loss": -0.0343, + "reward": 1.696099877357483, + "reward_std": 0.13108253479003906, + "rewards/accuracy_reward_stage2": 0.8523498773574829, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1967 + }, + { + "completion_length": 9.265625, + "epoch": 0.3448396705799895, + "grad_norm": 30.543324244072327, + "kl": 0.181640625, + "learning_rate": 6.553355528298581e-07, + "loss": 0.0285, + "reward": 1.5463353395462036, + "reward_std": 0.2797982692718506, + "rewards/accuracy_reward_stage2": 0.5619603395462036, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1968 + }, + { + "completion_length": 20.390625, + "epoch": 0.34501489398983703, + "grad_norm": 16.82444448290589, + "kl": 0.130859375, + "learning_rate": 6.551603294200106e-07, + "loss": 0.011, + "reward": 1.4820425510406494, + "reward_std": 0.1993536353111267, + "rewards/accuracy_reward_stage2": 0.7476676106452942, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1969 + }, + { + "completion_length": 12.984375, + "epoch": 0.3451901173996846, + "grad_norm": 11.0144221558119, + "kl": 0.0498046875, + "learning_rate": 6.549851060101629e-07, + "loss": 0.0198, + "reward": 1.7429325580596924, + "reward_std": 0.07267377525568008, + "rewards/accuracy_reward_stage2": 0.7429325580596924, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1970 + }, + { + "completion_length": 14.0625, + "epoch": 0.34536534080953213, + "grad_norm": 17.55120295527536, + "kl": 0.08447265625, + "learning_rate": 6.548098826003154e-07, + "loss": -0.0103, + "reward": 1.5645519495010376, + "reward_std": 0.2282322496175766, + "rewards/accuracy_reward_stage2": 0.5801768898963928, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1971 + }, + { + "completion_length": 13.8125, + "epoch": 0.34554056421937973, + "grad_norm": 16.79040991776876, + "kl": 0.05859375, + "learning_rate": 6.546346591904677e-07, + "loss": -0.0208, + "reward": 1.3323101997375488, + "reward_std": 0.2069927155971527, + "rewards/accuracy_reward_stage2": 0.47293511033058167, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1972 + }, + { + "completion_length": 13.140625, + "epoch": 0.3457157876292273, + "grad_norm": 19.771904263657262, + "kl": 0.2431640625, + "learning_rate": 6.544594357806202e-07, + "loss": 0.0972, + "reward": 1.4394879341125488, + "reward_std": 0.153305783867836, + "rewards/accuracy_reward_stage2": 0.564487874507904, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1973 + }, + { + "completion_length": 9.703125, + "epoch": 0.3458910110390748, + "grad_norm": 25.667897310449245, + "kl": 0.09130859375, + "learning_rate": 6.542842123707727e-07, + "loss": -0.0014, + "reward": 1.5841963291168213, + "reward_std": 0.2587115168571472, + "rewards/accuracy_reward_stage2": 0.7248212695121765, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1974 + }, + { + "completion_length": 7.484375, + "epoch": 0.34606623444892237, + "grad_norm": 22.24575147622979, + "kl": 0.25390625, + "learning_rate": 6.541089889609251e-07, + "loss": 0.0632, + "reward": 1.5621519088745117, + "reward_std": 0.335077166557312, + "rewards/accuracy_reward_stage2": 0.7027768492698669, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1975 + }, + { + "completion_length": 10.453125, + "epoch": 0.3462414578587699, + "grad_norm": 24.44092705807922, + "kl": 0.04736328125, + "learning_rate": 6.539337655510776e-07, + "loss": -0.014, + "reward": 1.655135154724121, + "reward_std": 0.24762673676013947, + "rewards/accuracy_reward_stage2": 0.6707600951194763, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1976 + }, + { + "completion_length": 11.25, + "epoch": 0.34641668126861747, + "grad_norm": 23.469688804357215, + "kl": 0.25, + "learning_rate": 6.537585421412301e-07, + "loss": 0.0116, + "reward": 1.7019398212432861, + "reward_std": 0.2970326542854309, + "rewards/accuracy_reward_stage2": 0.8581898808479309, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 1977 + }, + { + "completion_length": 9.21875, + "epoch": 0.34659190467846507, + "grad_norm": 18.475356845854026, + "kl": 0.0595703125, + "learning_rate": 6.535833187313825e-07, + "loss": 0.0239, + "reward": 1.7104763984680176, + "reward_std": 0.14170876145362854, + "rewards/accuracy_reward_stage2": 0.7104763984680176, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1978 + }, + { + "completion_length": 9.484375, + "epoch": 0.3467671280883126, + "grad_norm": 12.945405309371132, + "kl": 0.08935546875, + "learning_rate": 6.53408095321535e-07, + "loss": 0.0357, + "reward": 1.639056921005249, + "reward_std": 0.06409046053886414, + "rewards/accuracy_reward_stage2": 0.6390569806098938, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1979 + }, + { + "completion_length": 14.03125, + "epoch": 0.34694235149816016, + "grad_norm": 18.91283009725862, + "kl": 0.232421875, + "learning_rate": 6.532328719116874e-07, + "loss": 0.0489, + "reward": 1.1489617824554443, + "reward_std": 0.1828662008047104, + "rewards/accuracy_reward_stage2": 0.4145868420600891, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 1980 + }, + { + "completion_length": 11.875, + "epoch": 0.3471175749080077, + "grad_norm": 25.379163351883836, + "kl": 0.130859375, + "learning_rate": 6.530576485018399e-07, + "loss": 0.0522, + "reward": 1.517045259475708, + "reward_std": 0.17672033607959747, + "rewards/accuracy_reward_stage2": 0.517045259475708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1981 + }, + { + "completion_length": 10.5625, + "epoch": 0.34729279831785526, + "grad_norm": 24.248118802036352, + "kl": 0.2412109375, + "learning_rate": 6.528824250919922e-07, + "loss": -0.0996, + "reward": 1.5262937545776367, + "reward_std": 0.3614034652709961, + "rewards/accuracy_reward_stage2": 0.6200437545776367, + "rewards/format_reward_stage1_pointerpad": 0.90625, + "scores/accuracy_reward_stage2": 0.90625, + "step": 1982 + }, + { + "completion_length": 14.109375, + "epoch": 0.3474680217277028, + "grad_norm": 22.904190961498777, + "kl": 0.10400390625, + "learning_rate": 6.527072016821446e-07, + "loss": 0.0126, + "reward": 1.594224214553833, + "reward_std": 0.2510741651058197, + "rewards/accuracy_reward_stage2": 0.6098491549491882, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1983 + }, + { + "completion_length": 9.6875, + "epoch": 0.34764324513755035, + "grad_norm": 21.44313358526625, + "kl": 0.1826171875, + "learning_rate": 6.525319782722971e-07, + "loss": 0.0729, + "reward": 1.6885744333267212, + "reward_std": 0.2376718968153, + "rewards/accuracy_reward_stage2": 0.6885744333267212, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1984 + }, + { + "completion_length": 8.34375, + "epoch": 0.34781846854739795, + "grad_norm": 22.402962231988234, + "kl": 0.09228515625, + "learning_rate": 6.523567548624496e-07, + "loss": 0.0054, + "reward": 1.8162912130355835, + "reward_std": 0.28567641973495483, + "rewards/accuracy_reward_stage2": 0.8319162130355835, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1985 + }, + { + "completion_length": 34.921875, + "epoch": 0.3479936919572455, + "grad_norm": 153.3247594937022, + "kl": 1.28125, + "learning_rate": 6.52181531452602e-07, + "loss": 0.5154, + "reward": 1.6300783157348633, + "reward_std": 0.06343966722488403, + "rewards/accuracy_reward_stage2": 0.7550783753395081, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 1986 + }, + { + "completion_length": 9.296875, + "epoch": 0.34816891536709305, + "grad_norm": 23.518916407176633, + "kl": 0.11474609375, + "learning_rate": 6.520063080427545e-07, + "loss": -0.0186, + "reward": 1.42463219165802, + "reward_std": 0.3672451972961426, + "rewards/accuracy_reward_stage2": 0.4558822214603424, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1987 + }, + { + "completion_length": 8.921875, + "epoch": 0.3483441387769406, + "grad_norm": 19.948904765481284, + "kl": 0.1728515625, + "learning_rate": 6.518310846329069e-07, + "loss": -0.0049, + "reward": 1.6624736785888672, + "reward_std": 0.17071276903152466, + "rewards/accuracy_reward_stage2": 0.6937235593795776, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1988 + }, + { + "completion_length": 8.796875, + "epoch": 0.34851936218678814, + "grad_norm": 26.28195016419193, + "kl": 0.0908203125, + "learning_rate": 6.516558612230594e-07, + "loss": 0.0362, + "reward": 1.5121607780456543, + "reward_std": 0.19263972342014313, + "rewards/accuracy_reward_stage2": 0.5121607184410095, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1989 + }, + { + "completion_length": 8.796875, + "epoch": 0.3486945855966357, + "grad_norm": 20.032310841077322, + "kl": 0.0908203125, + "learning_rate": 6.514806378132119e-07, + "loss": -0.0245, + "reward": 1.4932280778884888, + "reward_std": 0.24793201684951782, + "rewards/accuracy_reward_stage2": 0.5244780778884888, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 1990 + }, + { + "completion_length": 10.40625, + "epoch": 0.3488698090064833, + "grad_norm": 14.738256721826753, + "kl": 0.050537109375, + "learning_rate": 6.513054144033643e-07, + "loss": -0.0239, + "reward": 1.6377990245819092, + "reward_std": 0.10622736066579819, + "rewards/accuracy_reward_stage2": 0.6534240245819092, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1991 + }, + { + "completion_length": 17.265625, + "epoch": 0.34904503241633084, + "grad_norm": 20.695338367248944, + "kl": 0.0791015625, + "learning_rate": 6.511301909935167e-07, + "loss": -0.0121, + "reward": 1.6008673906326294, + "reward_std": 0.2019246220588684, + "rewards/accuracy_reward_stage2": 0.6164922714233398, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1992 + }, + { + "completion_length": 9.921875, + "epoch": 0.3492202558261784, + "grad_norm": 17.42855634410852, + "kl": 0.099609375, + "learning_rate": 6.509549675836692e-07, + "loss": -0.0043, + "reward": 1.5164021253585815, + "reward_std": 0.18167875707149506, + "rewards/accuracy_reward_stage2": 0.6570271253585815, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1993 + }, + { + "completion_length": 11.921875, + "epoch": 0.34939547923602593, + "grad_norm": 16.76881427087333, + "kl": 0.061767578125, + "learning_rate": 6.507797441738216e-07, + "loss": 0.0008, + "reward": 1.6663645505905151, + "reward_std": 0.11451227962970734, + "rewards/accuracy_reward_stage2": 0.6819895505905151, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1994 + }, + { + "completion_length": 9.796875, + "epoch": 0.3495707026458735, + "grad_norm": 20.48561906353898, + "kl": 0.154296875, + "learning_rate": 6.50604520763974e-07, + "loss": 0.0176, + "reward": 1.487541913986206, + "reward_std": 0.21980035305023193, + "rewards/accuracy_reward_stage2": 0.5031670331954956, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1995 + }, + { + "completion_length": 7.8125, + "epoch": 0.349745926055721, + "grad_norm": 16.811939673960403, + "kl": 0.1767578125, + "learning_rate": 6.504292973541264e-07, + "loss": -0.1069, + "reward": 1.3587661981582642, + "reward_std": 0.2528875470161438, + "rewards/accuracy_reward_stage2": 0.43689125776290894, + "rewards/format_reward_stage1_pointerpad": 0.921875, + "scores/accuracy_reward_stage2": 0.921875, + "step": 1996 + }, + { + "completion_length": 10.5625, + "epoch": 0.3499211494655686, + "grad_norm": 19.56031623481252, + "kl": 0.04931640625, + "learning_rate": 6.502540739442789e-07, + "loss": 0.0197, + "reward": 1.7224457263946533, + "reward_std": 0.22622840106487274, + "rewards/accuracy_reward_stage2": 0.7224457263946533, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 1997 + }, + { + "completion_length": 15.6875, + "epoch": 0.3500963728754162, + "grad_norm": 10.344808632978406, + "kl": 0.0294189453125, + "learning_rate": 6.500788505344314e-07, + "loss": -0.0323, + "reward": 1.6654764413833618, + "reward_std": 0.1032496765255928, + "rewards/accuracy_reward_stage2": 0.6811015009880066, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 1998 + }, + { + "completion_length": 13.90625, + "epoch": 0.3502715962852637, + "grad_norm": 16.99481523909834, + "kl": 0.24609375, + "learning_rate": 6.499036271245838e-07, + "loss": 0.0583, + "reward": 1.5105576515197754, + "reward_std": 0.16287124156951904, + "rewards/accuracy_reward_stage2": 0.6511826515197754, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 1999 + }, + { + "completion_length": 8.515625, + "epoch": 0.35044681969511127, + "grad_norm": 14.072725639459303, + "kl": 0.08740234375, + "learning_rate": 6.497284037147363e-07, + "loss": 0.035, + "reward": 1.741347074508667, + "reward_std": 0.11879969388246536, + "rewards/accuracy_reward_stage2": 0.7413470149040222, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2000 + }, + { + "completion_length": 10.984375, + "epoch": 0.3506220431049588, + "grad_norm": 21.66948159691174, + "kl": 0.11767578125, + "learning_rate": 6.495531803048888e-07, + "loss": 0.0061, + "reward": 1.5786539316177368, + "reward_std": 0.2800479829311371, + "rewards/accuracy_reward_stage2": 0.5942789316177368, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2001 + }, + { + "completion_length": 11.078125, + "epoch": 0.35079726651480636, + "grad_norm": 32.400792294857474, + "kl": 0.0810546875, + "learning_rate": 6.493779568950411e-07, + "loss": 0.0325, + "reward": 1.5315885543823242, + "reward_std": 0.27468162775039673, + "rewards/accuracy_reward_stage2": 0.656588613986969, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2002 + }, + { + "completion_length": 10.875, + "epoch": 0.3509724899246539, + "grad_norm": 23.578523403709887, + "kl": 0.25390625, + "learning_rate": 6.492027334851936e-07, + "loss": 0.0848, + "reward": 1.4257640838623047, + "reward_std": 0.25517600774765015, + "rewards/accuracy_reward_stage2": 0.5663890838623047, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2003 + }, + { + "completion_length": 19.703125, + "epoch": 0.3511477133345015, + "grad_norm": 20.596326020016303, + "kl": 0.083984375, + "learning_rate": 6.49027510075346e-07, + "loss": 0.0336, + "reward": 1.6645708084106445, + "reward_std": 0.2337852567434311, + "rewards/accuracy_reward_stage2": 0.6645709276199341, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2004 + }, + { + "completion_length": 16.765625, + "epoch": 0.35132293674434906, + "grad_norm": 22.441290506296923, + "kl": 0.337890625, + "learning_rate": 6.488522866654985e-07, + "loss": 0.0101, + "reward": 1.3200299739837646, + "reward_std": 0.30100393295288086, + "rewards/accuracy_reward_stage2": 0.4919048845767975, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2005 + }, + { + "completion_length": 15.25, + "epoch": 0.3514981601541966, + "grad_norm": 27.76775951792323, + "kl": 0.051025390625, + "learning_rate": 6.48677063255651e-07, + "loss": 0.0205, + "reward": 1.3646464347839355, + "reward_std": 0.25721046328544617, + "rewards/accuracy_reward_stage2": 0.6146464943885803, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2006 + }, + { + "completion_length": 10.484375, + "epoch": 0.35167338356404415, + "grad_norm": 17.07704587908255, + "kl": 0.0712890625, + "learning_rate": 6.485018398458034e-07, + "loss": -0.0116, + "reward": 1.518690824508667, + "reward_std": 0.1659468710422516, + "rewards/accuracy_reward_stage2": 0.534315824508667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2007 + }, + { + "completion_length": 9.5625, + "epoch": 0.3518486069738917, + "grad_norm": 14.103071172763505, + "kl": 0.07666015625, + "learning_rate": 6.483266164359558e-07, + "loss": 0.0306, + "reward": 1.5295997858047485, + "reward_std": 0.07649820297956467, + "rewards/accuracy_reward_stage2": 0.5295997262001038, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2008 + }, + { + "completion_length": 9.625, + "epoch": 0.35202383038373924, + "grad_norm": 18.747318004607546, + "kl": 0.0673828125, + "learning_rate": 6.481513930261083e-07, + "loss": 0.0271, + "reward": 1.6228842735290527, + "reward_std": 0.16448625922203064, + "rewards/accuracy_reward_stage2": 0.622884213924408, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2009 + }, + { + "completion_length": 11.3125, + "epoch": 0.35219905379358685, + "grad_norm": 17.426671028984867, + "kl": 0.1357421875, + "learning_rate": 6.479761696162607e-07, + "loss": 0.0544, + "reward": 1.7684483528137207, + "reward_std": 0.1293027698993683, + "rewards/accuracy_reward_stage2": 0.7684484720230103, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2010 + }, + { + "completion_length": 8.8125, + "epoch": 0.3523742772034344, + "grad_norm": 19.074783419869533, + "kl": 0.1328125, + "learning_rate": 6.478009462064131e-07, + "loss": 0.0089, + "reward": 1.3788700103759766, + "reward_std": 0.24218647181987762, + "rewards/accuracy_reward_stage2": 0.39449506998062134, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2011 + }, + { + "completion_length": 17.234375, + "epoch": 0.35254950061328194, + "grad_norm": 19.175166347014933, + "kl": 0.010498046875, + "learning_rate": 6.476257227965655e-07, + "loss": 0.0042, + "reward": 1.7504546642303467, + "reward_std": 0.12284188717603683, + "rewards/accuracy_reward_stage2": 0.7504546046257019, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2012 + }, + { + "completion_length": 10.0, + "epoch": 0.3527247240231295, + "grad_norm": 21.749683884591207, + "kl": 0.134765625, + "learning_rate": 6.47450499386718e-07, + "loss": 0.02, + "reward": 1.6623674631118774, + "reward_std": 0.28688478469848633, + "rewards/accuracy_reward_stage2": 0.6779924631118774, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2013 + }, + { + "completion_length": 5.421875, + "epoch": 0.35289994743297703, + "grad_norm": 44.18443001251126, + "kl": 0.52734375, + "learning_rate": 6.472752759768705e-07, + "loss": 0.122, + "reward": 1.183675765991211, + "reward_std": 0.20090191066265106, + "rewards/accuracy_reward_stage2": 0.48055073618888855, + "rewards/format_reward_stage1_pointerpad": 0.703125, + "scores/accuracy_reward_stage2": 0.703125, + "step": 2014 + }, + { + "completion_length": 6.328125, + "epoch": 0.3530751708428246, + "grad_norm": 22.254088766217205, + "kl": 0.06201171875, + "learning_rate": 6.471000525670229e-07, + "loss": 0.0248, + "reward": 1.7993628978729248, + "reward_std": 0.1816439926624298, + "rewards/accuracy_reward_stage2": 0.7993630170822144, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2015 + }, + { + "completion_length": 7.828125, + "epoch": 0.3532503942526722, + "grad_norm": 22.0690082720116, + "kl": 0.04296875, + "learning_rate": 6.469248291571754e-07, + "loss": 0.0172, + "reward": 1.7569329738616943, + "reward_std": 0.15919101238250732, + "rewards/accuracy_reward_stage2": 0.7569329142570496, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2016 + }, + { + "completion_length": 12.5, + "epoch": 0.35342561766251973, + "grad_norm": 257.73596936766705, + "kl": 1.484375, + "learning_rate": 6.467496057473279e-07, + "loss": 0.5931, + "reward": 1.65625, + "reward_std": 0.1735912710428238, + "rewards/accuracy_reward_stage2": 0.90625, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2017 + }, + { + "completion_length": 16.328125, + "epoch": 0.3536008410723673, + "grad_norm": 22.55185782932836, + "kl": 0.12890625, + "learning_rate": 6.465743823374803e-07, + "loss": 0.0516, + "reward": 1.6566040515899658, + "reward_std": 0.2475588023662567, + "rewards/accuracy_reward_stage2": 0.656603991985321, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2018 + }, + { + "completion_length": 11.921875, + "epoch": 0.3537760644822148, + "grad_norm": 18.32946455298761, + "kl": 0.11181640625, + "learning_rate": 6.463991589276328e-07, + "loss": -0.0436, + "reward": 1.503852367401123, + "reward_std": 0.3397676944732666, + "rewards/accuracy_reward_stage2": 0.5351022481918335, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2019 + }, + { + "completion_length": 20.140625, + "epoch": 0.35395128789206237, + "grad_norm": 23.591232229481246, + "kl": 0.18359375, + "learning_rate": 6.462239355177852e-07, + "loss": 0.0734, + "reward": 1.4747546911239624, + "reward_std": 0.3044975996017456, + "rewards/accuracy_reward_stage2": 0.5997546911239624, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2020 + }, + { + "completion_length": 10.953125, + "epoch": 0.3541265113019099, + "grad_norm": 49.864613932302575, + "kl": 0.1708984375, + "learning_rate": 6.460487121079375e-07, + "loss": 0.0685, + "reward": 1.5495915412902832, + "reward_std": 0.35907381772994995, + "rewards/accuracy_reward_stage2": 0.549591600894928, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2021 + }, + { + "completion_length": 5.390625, + "epoch": 0.35430173471175747, + "grad_norm": 24.78394195461497, + "kl": 0.037841796875, + "learning_rate": 6.4587348869809e-07, + "loss": 0.0151, + "reward": 1.6354167461395264, + "reward_std": 0.303213894367218, + "rewards/accuracy_reward_stage2": 0.6354166865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2022 + }, + { + "completion_length": 17.65625, + "epoch": 0.35447695812160507, + "grad_norm": 15.952665146989885, + "kl": 0.042724609375, + "learning_rate": 6.456982652882424e-07, + "loss": -0.0504, + "reward": 1.4603793621063232, + "reward_std": 0.10579296946525574, + "rewards/accuracy_reward_stage2": 0.49162930250167847, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2023 + }, + { + "completion_length": 9.015625, + "epoch": 0.3546521815314526, + "grad_norm": 23.73446125497201, + "kl": 0.08642578125, + "learning_rate": 6.455230418783949e-07, + "loss": -0.0072, + "reward": 1.5714523792266846, + "reward_std": 0.2881009578704834, + "rewards/accuracy_reward_stage2": 0.5870773196220398, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2024 + }, + { + "completion_length": 11.75, + "epoch": 0.35482740494130016, + "grad_norm": 19.071701378032504, + "kl": 0.0228271484375, + "learning_rate": 6.453478184685473e-07, + "loss": 0.0091, + "reward": 1.512235164642334, + "reward_std": 0.1992071270942688, + "rewards/accuracy_reward_stage2": 0.5122351050376892, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2025 + }, + { + "completion_length": 5.6875, + "epoch": 0.3550026283511477, + "grad_norm": 13.827528845383378, + "kl": 0.1357421875, + "learning_rate": 6.451725950586998e-07, + "loss": 0.0541, + "reward": 1.8923611640930176, + "reward_std": 0.1601068675518036, + "rewards/accuracy_reward_stage2": 0.8923611640930176, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2026 + }, + { + "completion_length": 8.21875, + "epoch": 0.35517785176099526, + "grad_norm": 21.877932825400908, + "kl": 0.125, + "learning_rate": 6.449973716488523e-07, + "loss": -0.0149, + "reward": 1.328125, + "reward_std": 0.308285653591156, + "rewards/accuracy_reward_stage2": 0.359375, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2027 + }, + { + "completion_length": 9.0625, + "epoch": 0.3553530751708428, + "grad_norm": 21.868545535759246, + "kl": 0.1044921875, + "learning_rate": 6.448221482390047e-07, + "loss": -0.0024, + "reward": 1.6145799160003662, + "reward_std": 0.2504516839981079, + "rewards/accuracy_reward_stage2": 0.6302049160003662, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2028 + }, + { + "completion_length": 8.5, + "epoch": 0.3555282985806904, + "grad_norm": 34.47660911860323, + "kl": 0.2890625, + "learning_rate": 6.446469248291572e-07, + "loss": 0.1158, + "reward": 1.6426146030426025, + "reward_std": 0.18083617091178894, + "rewards/accuracy_reward_stage2": 0.7676145434379578, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2029 + }, + { + "completion_length": 12.140625, + "epoch": 0.35570352199053795, + "grad_norm": 28.346024278896905, + "kl": 0.09130859375, + "learning_rate": 6.444717014193097e-07, + "loss": -0.0078, + "reward": 1.441169023513794, + "reward_std": 0.23020276427268982, + "rewards/accuracy_reward_stage2": 0.45679396390914917, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2030 + }, + { + "completion_length": 10.78125, + "epoch": 0.3558787454003855, + "grad_norm": 19.90543622562572, + "kl": 0.1591796875, + "learning_rate": 6.44296478009462e-07, + "loss": -0.0228, + "reward": 1.4047634601593018, + "reward_std": 0.3020269274711609, + "rewards/accuracy_reward_stage2": 0.5610134601593018, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2031 + }, + { + "completion_length": 7.28125, + "epoch": 0.35605396881023305, + "grad_norm": 29.985663372852073, + "kl": 0.19140625, + "learning_rate": 6.441212545996145e-07, + "loss": 0.0768, + "reward": 1.436646819114685, + "reward_std": 0.1707455813884735, + "rewards/accuracy_reward_stage2": 0.43664681911468506, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2032 + }, + { + "completion_length": 9.734375, + "epoch": 0.3562291922200806, + "grad_norm": 41.73996181885468, + "kl": 0.2109375, + "learning_rate": 6.439460311897668e-07, + "loss": 0.0843, + "reward": 1.631887674331665, + "reward_std": 0.20773212611675262, + "rewards/accuracy_reward_stage2": 0.631887674331665, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2033 + }, + { + "completion_length": 10.828125, + "epoch": 0.35640441562992814, + "grad_norm": 16.95317734365735, + "kl": 0.0859375, + "learning_rate": 6.437708077799193e-07, + "loss": -0.0796, + "reward": 1.5492008924484253, + "reward_std": 0.2339085042476654, + "rewards/accuracy_reward_stage2": 0.5960758924484253, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2034 + }, + { + "completion_length": 17.359375, + "epoch": 0.3565796390397757, + "grad_norm": 24.50093893215538, + "kl": 0.1943359375, + "learning_rate": 6.435955843700718e-07, + "loss": 0.0248, + "reward": 1.3570654392242432, + "reward_std": 0.1519032120704651, + "rewards/accuracy_reward_stage2": 0.5133154392242432, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2035 + }, + { + "completion_length": 10.984375, + "epoch": 0.3567548624496233, + "grad_norm": 22.809359510094048, + "kl": 0.052490234375, + "learning_rate": 6.434203609602242e-07, + "loss": 0.021, + "reward": 1.6798628568649292, + "reward_std": 0.17061826586723328, + "rewards/accuracy_reward_stage2": 0.679862916469574, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2036 + }, + { + "completion_length": 8.921875, + "epoch": 0.35693008585947084, + "grad_norm": 14.822723100907085, + "kl": 0.0732421875, + "learning_rate": 6.432451375503767e-07, + "loss": 0.0293, + "reward": 1.671637773513794, + "reward_std": 0.1526038646697998, + "rewards/accuracy_reward_stage2": 0.671637773513794, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2037 + }, + { + "completion_length": 10.640625, + "epoch": 0.3571053092693184, + "grad_norm": 17.16508925671016, + "kl": 0.0830078125, + "learning_rate": 6.430699141405292e-07, + "loss": 0.0015, + "reward": 1.493004322052002, + "reward_std": 0.2090732753276825, + "rewards/accuracy_reward_stage2": 0.633629322052002, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2038 + }, + { + "completion_length": 9.0, + "epoch": 0.35728053267916593, + "grad_norm": 22.10614423154803, + "kl": 0.275390625, + "learning_rate": 6.428946907306816e-07, + "loss": 0.068, + "reward": 1.3667113780975342, + "reward_std": 0.2098403126001358, + "rewards/accuracy_reward_stage2": 0.6479613780975342, + "rewards/format_reward_stage1_pointerpad": 0.71875, + "scores/accuracy_reward_stage2": 0.71875, + "step": 2039 + }, + { + "completion_length": 13.5625, + "epoch": 0.3574557560890135, + "grad_norm": 29.74656037361351, + "kl": 0.2041015625, + "learning_rate": 6.427194673208341e-07, + "loss": 0.0818, + "reward": 1.560437560081482, + "reward_std": 0.19831281900405884, + "rewards/accuracy_reward_stage2": 0.6854374408721924, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2040 + }, + { + "completion_length": 6.9375, + "epoch": 0.357630979498861, + "grad_norm": 17.874500673445397, + "kl": 0.048583984375, + "learning_rate": 6.425442439109864e-07, + "loss": 0.0195, + "reward": 1.8301217555999756, + "reward_std": 0.16129888594150543, + "rewards/accuracy_reward_stage2": 0.8301217555999756, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2041 + }, + { + "completion_length": 9.578125, + "epoch": 0.3578062029087086, + "grad_norm": 20.93377670278759, + "kl": 0.10693359375, + "learning_rate": 6.423690205011389e-07, + "loss": -0.0112, + "reward": 1.4852190017700195, + "reward_std": 0.18758749961853027, + "rewards/accuracy_reward_stage2": 0.5164690613746643, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2042 + }, + { + "completion_length": 6.03125, + "epoch": 0.3579814263185562, + "grad_norm": 14.282698452510397, + "kl": 0.1591796875, + "learning_rate": 6.421937970912914e-07, + "loss": -0.0633, + "reward": 1.5824334621429443, + "reward_std": 0.24984443187713623, + "rewards/accuracy_reward_stage2": 0.6293083429336548, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2043 + }, + { + "completion_length": 13.140625, + "epoch": 0.3581566497284037, + "grad_norm": 19.279786705660666, + "kl": 0.062255859375, + "learning_rate": 6.420185736814438e-07, + "loss": -0.0426, + "reward": 1.7357683181762695, + "reward_std": 0.21315570175647736, + "rewards/accuracy_reward_stage2": 0.7670182585716248, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2044 + }, + { + "completion_length": 7.828125, + "epoch": 0.35833187313825127, + "grad_norm": 24.18269457328421, + "kl": 0.1611328125, + "learning_rate": 6.418433502715963e-07, + "loss": 0.0551, + "reward": 1.3229691982269287, + "reward_std": 0.2945018708705902, + "rewards/accuracy_reward_stage2": 0.4635942578315735, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2045 + }, + { + "completion_length": 11.609375, + "epoch": 0.3585070965480988, + "grad_norm": 20.038499784926273, + "kl": 0.1669921875, + "learning_rate": 6.416681268617487e-07, + "loss": 0.0451, + "reward": 1.503136396408081, + "reward_std": 0.2722621560096741, + "rewards/accuracy_reward_stage2": 0.5187614560127258, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2046 + }, + { + "completion_length": 8.796875, + "epoch": 0.35868231995794636, + "grad_norm": 28.963244713081757, + "kl": 0.048583984375, + "learning_rate": 6.414929034519011e-07, + "loss": 0.0194, + "reward": 1.3736588954925537, + "reward_std": 0.31448879837989807, + "rewards/accuracy_reward_stage2": 0.49865883588790894, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2047 + }, + { + "completion_length": 7.09375, + "epoch": 0.35885754336779396, + "grad_norm": 17.347047787010148, + "kl": 0.0869140625, + "learning_rate": 6.413176800420536e-07, + "loss": -0.0536, + "reward": 1.2793877124786377, + "reward_std": 0.25073882937431335, + "rewards/accuracy_reward_stage2": 0.31063777208328247, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2048 + }, + { + "completion_length": 9.765625, + "epoch": 0.3590327667776415, + "grad_norm": 54.80440220993308, + "kl": 0.365234375, + "learning_rate": 6.41142456632206e-07, + "loss": 0.0837, + "reward": 1.6083563566207886, + "reward_std": 0.2655033469200134, + "rewards/accuracy_reward_stage2": 0.6396063566207886, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2049 + }, + { + "completion_length": 9.375, + "epoch": 0.35920799018748906, + "grad_norm": 20.464226476682267, + "kl": 0.050537109375, + "learning_rate": 6.409672332223585e-07, + "loss": 0.0073, + "reward": 1.6855225563049316, + "reward_std": 0.21928083896636963, + "rewards/accuracy_reward_stage2": 0.8261474370956421, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2050 + }, + { + "completion_length": 11.78125, + "epoch": 0.3593832135973366, + "grad_norm": 21.28322369967594, + "kl": 0.056640625, + "learning_rate": 6.407920098125109e-07, + "loss": -0.0611, + "reward": 1.612959384918213, + "reward_std": 0.29035934805870056, + "rewards/accuracy_reward_stage2": 0.6598344445228577, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2051 + }, + { + "completion_length": 32.5, + "epoch": 0.35955843700718415, + "grad_norm": 22.26627309796276, + "kl": 0.07763671875, + "learning_rate": 6.406167864026633e-07, + "loss": -0.0091, + "reward": 1.3275949954986572, + "reward_std": 0.2904391586780548, + "rewards/accuracy_reward_stage2": 0.34321993589401245, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2052 + }, + { + "completion_length": 8.21875, + "epoch": 0.3597336604170317, + "grad_norm": 21.821006377049365, + "kl": 0.10400390625, + "learning_rate": 6.404415629928158e-07, + "loss": 0.0415, + "reward": 1.5978240966796875, + "reward_std": 0.3226596415042877, + "rewards/accuracy_reward_stage2": 0.5978240966796875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2053 + }, + { + "completion_length": 7.203125, + "epoch": 0.35990888382687924, + "grad_norm": 19.665977379258152, + "kl": 0.0712890625, + "learning_rate": 6.402663395829683e-07, + "loss": 0.0285, + "reward": 1.5003751516342163, + "reward_std": 0.20897655189037323, + "rewards/accuracy_reward_stage2": 0.5003751516342163, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2054 + }, + { + "completion_length": 15.40625, + "epoch": 0.36008410723672685, + "grad_norm": 15.361039973728369, + "kl": 0.0947265625, + "learning_rate": 6.400911161731207e-07, + "loss": -0.0506, + "reward": 1.9023044109344482, + "reward_std": 0.2073962688446045, + "rewards/accuracy_reward_stage2": 0.9335543513298035, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2055 + }, + { + "completion_length": 8.0, + "epoch": 0.3602593306465744, + "grad_norm": 22.679126144790594, + "kl": 0.1494140625, + "learning_rate": 6.399158927632732e-07, + "loss": 0.0599, + "reward": 1.4556570053100586, + "reward_std": 0.24635502696037292, + "rewards/accuracy_reward_stage2": 0.4556569457054138, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2056 + }, + { + "completion_length": 14.71875, + "epoch": 0.36043455405642194, + "grad_norm": 21.89217055198326, + "kl": 0.07763671875, + "learning_rate": 6.397406693534256e-07, + "loss": 0.0311, + "reward": 1.569153070449829, + "reward_std": 0.21444562077522278, + "rewards/accuracy_reward_stage2": 0.5691530704498291, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2057 + }, + { + "completion_length": 14.8125, + "epoch": 0.3606097774662695, + "grad_norm": 64.99758626128994, + "kl": 0.34375, + "learning_rate": 6.395654459435781e-07, + "loss": 0.0927, + "reward": 1.2257872819900513, + "reward_std": 0.29136985540390015, + "rewards/accuracy_reward_stage2": 0.36641234159469604, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2058 + }, + { + "completion_length": 8.5, + "epoch": 0.36078500087611703, + "grad_norm": 11.101316283181784, + "kl": 0.0162353515625, + "learning_rate": 6.393902225337305e-07, + "loss": 0.0065, + "reward": 1.7482197284698486, + "reward_std": 0.1722995638847351, + "rewards/accuracy_reward_stage2": 0.7482197284698486, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2059 + }, + { + "completion_length": 23.21875, + "epoch": 0.3609602242859646, + "grad_norm": 21.513832405825056, + "kl": 0.11279296875, + "learning_rate": 6.392149991238828e-07, + "loss": 0.0451, + "reward": 1.5280723571777344, + "reward_std": 0.28505265712738037, + "rewards/accuracy_reward_stage2": 0.6530723571777344, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2060 + }, + { + "completion_length": 31.703125, + "epoch": 0.3611354476958122, + "grad_norm": 109.66779035733514, + "kl": 0.828125, + "learning_rate": 6.390397757140353e-07, + "loss": 0.2906, + "reward": 1.1904970407485962, + "reward_std": 0.07266878336668015, + "rewards/accuracy_reward_stage2": 0.7061220407485962, + "rewards/format_reward_stage1_pointerpad": 0.484375, + "scores/accuracy_reward_stage2": 0.484375, + "step": 2061 + }, + { + "completion_length": 7.3125, + "epoch": 0.36131067110565973, + "grad_norm": 22.10805459794911, + "kl": 0.126953125, + "learning_rate": 6.388645523041878e-07, + "loss": 0.0119, + "reward": 1.5646607875823975, + "reward_std": 0.32735133171081543, + "rewards/accuracy_reward_stage2": 0.5802856683731079, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2062 + }, + { + "completion_length": 13.421875, + "epoch": 0.3614858945155073, + "grad_norm": 15.694383901469001, + "kl": 0.1015625, + "learning_rate": 6.386893288943402e-07, + "loss": 0.0189, + "reward": 1.6609668731689453, + "reward_std": 0.134820356965065, + "rewards/accuracy_reward_stage2": 0.6765917539596558, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2063 + }, + { + "completion_length": 11.171875, + "epoch": 0.3616611179253548, + "grad_norm": 17.573652360450506, + "kl": 0.0791015625, + "learning_rate": 6.385141054844927e-07, + "loss": 0.0317, + "reward": 1.6683963537216187, + "reward_std": 0.09525197744369507, + "rewards/accuracy_reward_stage2": 0.6683963537216187, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2064 + }, + { + "completion_length": 8.765625, + "epoch": 0.36183634133520237, + "grad_norm": 8.684531722027758, + "kl": 0.01708984375, + "learning_rate": 6.383388820746451e-07, + "loss": 0.0069, + "reward": 1.521653175354004, + "reward_std": 0.03974734991788864, + "rewards/accuracy_reward_stage2": 0.5216532349586487, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2065 + }, + { + "completion_length": 10.953125, + "epoch": 0.3620115647450499, + "grad_norm": 20.975044338571003, + "kl": 0.1201171875, + "learning_rate": 6.381636586647976e-07, + "loss": 0.0152, + "reward": 1.4553329944610596, + "reward_std": 0.2626515030860901, + "rewards/accuracy_reward_stage2": 0.47095784544944763, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2066 + }, + { + "completion_length": 12.5, + "epoch": 0.3621867881548975, + "grad_norm": 15.954595962342632, + "kl": 0.06005859375, + "learning_rate": 6.379884352549501e-07, + "loss": -0.0074, + "reward": 1.5274360179901123, + "reward_std": 0.19874641299247742, + "rewards/accuracy_reward_stage2": 0.5430610179901123, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2067 + }, + { + "completion_length": 10.21875, + "epoch": 0.36236201156474507, + "grad_norm": 24.982193204846315, + "kl": 0.0927734375, + "learning_rate": 6.378132118451025e-07, + "loss": -0.0041, + "reward": 1.5683791637420654, + "reward_std": 0.25179433822631836, + "rewards/accuracy_reward_stage2": 0.5840041637420654, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2068 + }, + { + "completion_length": 10.546875, + "epoch": 0.3625372349745926, + "grad_norm": 14.140044137439842, + "kl": 0.0751953125, + "learning_rate": 6.37637988435255e-07, + "loss": 0.0301, + "reward": 1.5177831649780273, + "reward_std": 0.13460445404052734, + "rewards/accuracy_reward_stage2": 0.5177832245826721, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2069 + }, + { + "completion_length": 11.296875, + "epoch": 0.36271245838444016, + "grad_norm": 21.19048020714746, + "kl": 0.087890625, + "learning_rate": 6.374627650254075e-07, + "loss": -0.0514, + "reward": 1.389535665512085, + "reward_std": 0.2172594964504242, + "rewards/accuracy_reward_stage2": 0.42078569531440735, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2070 + }, + { + "completion_length": 11.71875, + "epoch": 0.3628876817942877, + "grad_norm": 25.686707689715494, + "kl": 0.1533203125, + "learning_rate": 6.372875416155598e-07, + "loss": 0.0611, + "reward": 1.4730861186981201, + "reward_std": 0.22391179203987122, + "rewards/accuracy_reward_stage2": 0.5980860590934753, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2071 + }, + { + "completion_length": 16.390625, + "epoch": 0.36306290520413526, + "grad_norm": 16.942657071815216, + "kl": 0.06396484375, + "learning_rate": 6.371123182057122e-07, + "loss": 0.0257, + "reward": 1.5159791707992554, + "reward_std": 0.18147556483745575, + "rewards/accuracy_reward_stage2": 0.5159791707992554, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2072 + }, + { + "completion_length": 12.84375, + "epoch": 0.3632381286139828, + "grad_norm": 15.66545943705507, + "kl": 0.126953125, + "learning_rate": 6.369370947958646e-07, + "loss": 0.0508, + "reward": 1.5079923868179321, + "reward_std": 0.167510986328125, + "rewards/accuracy_reward_stage2": 0.6329923868179321, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2073 + }, + { + "completion_length": 9.265625, + "epoch": 0.3634133520238304, + "grad_norm": 18.989514067325118, + "kl": 0.07568359375, + "learning_rate": 6.367618713860171e-07, + "loss": -0.0139, + "reward": 1.6523466110229492, + "reward_std": 0.2200305163860321, + "rewards/accuracy_reward_stage2": 0.667971670627594, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2074 + }, + { + "completion_length": 10.25, + "epoch": 0.36358857543367795, + "grad_norm": 17.02760984123236, + "kl": 0.047119140625, + "learning_rate": 6.365866479761696e-07, + "loss": 0.0189, + "reward": 1.6422874927520752, + "reward_std": 0.0827837586402893, + "rewards/accuracy_reward_stage2": 0.6422874927520752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2075 + }, + { + "completion_length": 11.25, + "epoch": 0.3637637988435255, + "grad_norm": 24.889511070713368, + "kl": 0.1376953125, + "learning_rate": 6.36411424566322e-07, + "loss": 0.055, + "reward": 1.5129289627075195, + "reward_std": 0.2492441087961197, + "rewards/accuracy_reward_stage2": 0.63792884349823, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2076 + }, + { + "completion_length": 7.875, + "epoch": 0.36393902225337305, + "grad_norm": 19.01981301756976, + "kl": 0.09619140625, + "learning_rate": 6.362362011564745e-07, + "loss": 0.0384, + "reward": 1.56331467628479, + "reward_std": 0.19022265076637268, + "rewards/accuracy_reward_stage2": 0.5633147954940796, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2077 + }, + { + "completion_length": 6.234375, + "epoch": 0.3641142456632206, + "grad_norm": 22.173410743694834, + "kl": 0.12451171875, + "learning_rate": 6.36060977746627e-07, + "loss": 0.022, + "reward": 1.4536991119384766, + "reward_std": 0.2785697281360626, + "rewards/accuracy_reward_stage2": 0.5943241119384766, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2078 + }, + { + "completion_length": 12.8125, + "epoch": 0.36428946907306814, + "grad_norm": 10.779453254625155, + "kl": 0.039306640625, + "learning_rate": 6.358857543367794e-07, + "loss": 0.0157, + "reward": 1.6614583730697632, + "reward_std": 0.08398155868053436, + "rewards/accuracy_reward_stage2": 0.7864583730697632, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2079 + }, + { + "completion_length": 10.921875, + "epoch": 0.36446469248291574, + "grad_norm": 17.875036599921554, + "kl": 0.08935546875, + "learning_rate": 6.357105309269319e-07, + "loss": -0.0084, + "reward": 1.8185806274414062, + "reward_std": 0.20812779664993286, + "rewards/accuracy_reward_stage2": 0.8342055082321167, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2080 + }, + { + "completion_length": 10.140625, + "epoch": 0.3646399158927633, + "grad_norm": 19.66613709519755, + "kl": 0.2119140625, + "learning_rate": 6.355353075170842e-07, + "loss": 0.0532, + "reward": 1.3610090017318726, + "reward_std": 0.2860793471336365, + "rewards/accuracy_reward_stage2": 0.5016340017318726, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2081 + }, + { + "completion_length": 11.328125, + "epoch": 0.36481513930261084, + "grad_norm": 14.665167537059556, + "kl": 0.1142578125, + "learning_rate": 6.353600841072367e-07, + "loss": -0.0802, + "reward": 1.7025963068008423, + "reward_std": 0.1936970353126526, + "rewards/accuracy_reward_stage2": 0.7494713068008423, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2082 + }, + { + "completion_length": 10.453125, + "epoch": 0.3649903627124584, + "grad_norm": 13.06306517458348, + "kl": 0.03125, + "learning_rate": 6.351848606973892e-07, + "loss": -0.0737, + "reward": 1.8571877479553223, + "reward_std": 0.1453656405210495, + "rewards/accuracy_reward_stage2": 0.8884376883506775, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2083 + }, + { + "completion_length": 7.96875, + "epoch": 0.36516558612230593, + "grad_norm": 36.78432158866736, + "kl": 0.0291748046875, + "learning_rate": 6.350096372875415e-07, + "loss": 0.0117, + "reward": 1.804444670677185, + "reward_std": 0.17355109751224518, + "rewards/accuracy_reward_stage2": 0.8044446706771851, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2084 + }, + { + "completion_length": 9.984375, + "epoch": 0.3653408095321535, + "grad_norm": 15.207981324473641, + "kl": 0.130859375, + "learning_rate": 6.34834413877694e-07, + "loss": -0.0248, + "reward": 1.522946834564209, + "reward_std": 0.19198226928710938, + "rewards/accuracy_reward_stage2": 0.6791968941688538, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2085 + }, + { + "completion_length": 12.609375, + "epoch": 0.3655160329420011, + "grad_norm": 16.909722121647306, + "kl": 0.0859375, + "learning_rate": 6.346591904678465e-07, + "loss": 0.0344, + "reward": 1.6806175708770752, + "reward_std": 0.13519400358200073, + "rewards/accuracy_reward_stage2": 0.6806175708770752, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2086 + }, + { + "completion_length": 11.03125, + "epoch": 0.3656912563518486, + "grad_norm": 20.227633210535608, + "kl": 0.1796875, + "learning_rate": 6.344839670579989e-07, + "loss": 0.0445, + "reward": 1.688694953918457, + "reward_std": 0.10501417517662048, + "rewards/accuracy_reward_stage2": 0.829319953918457, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2087 + }, + { + "completion_length": 17.25, + "epoch": 0.3658664797616962, + "grad_norm": 15.82704135197256, + "kl": 0.0546875, + "learning_rate": 6.343087436481514e-07, + "loss": 0.0002, + "reward": 1.5985554456710815, + "reward_std": 0.22206175327301025, + "rewards/accuracy_reward_stage2": 0.6141804456710815, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2088 + }, + { + "completion_length": 15.765625, + "epoch": 0.3660417031715437, + "grad_norm": 18.337294972423447, + "kl": 0.0869140625, + "learning_rate": 6.341335202383038e-07, + "loss": 0.0348, + "reward": 1.4946866035461426, + "reward_std": 0.1953589916229248, + "rewards/accuracy_reward_stage2": 0.4946865439414978, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2089 + }, + { + "completion_length": 9.328125, + "epoch": 0.36621692658139127, + "grad_norm": 13.987616141282922, + "kl": 0.0303955078125, + "learning_rate": 6.339582968284563e-07, + "loss": -0.0212, + "reward": 1.7124465703964233, + "reward_std": 0.15865039825439453, + "rewards/accuracy_reward_stage2": 0.7280715703964233, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2090 + }, + { + "completion_length": 8.640625, + "epoch": 0.3663921499912388, + "grad_norm": 20.614469832693583, + "kl": 0.267578125, + "learning_rate": 6.337830734186087e-07, + "loss": 0.1075, + "reward": 1.4114258289337158, + "reward_std": 0.24925842881202698, + "rewards/accuracy_reward_stage2": 0.5364257097244263, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2091 + }, + { + "completion_length": 7.015625, + "epoch": 0.36656737340108636, + "grad_norm": 9.815510426958314, + "kl": 0.033203125, + "learning_rate": 6.336078500087611e-07, + "loss": 0.0132, + "reward": 1.65625, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.65625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2092 + }, + { + "completion_length": 7.671875, + "epoch": 0.36674259681093396, + "grad_norm": 27.466884223323603, + "kl": 0.12353515625, + "learning_rate": 6.334326265989136e-07, + "loss": 0.0064, + "reward": 1.5330727100372314, + "reward_std": 0.2515067160129547, + "rewards/accuracy_reward_stage2": 0.5486976504325867, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2093 + }, + { + "completion_length": 12.328125, + "epoch": 0.3669178202207815, + "grad_norm": 22.04138803322858, + "kl": 0.1376953125, + "learning_rate": 6.332574031890661e-07, + "loss": 0.011, + "reward": 1.7145648002624512, + "reward_std": 0.18631255626678467, + "rewards/accuracy_reward_stage2": 0.7301896214485168, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2094 + }, + { + "completion_length": 23.1875, + "epoch": 0.36709304363062906, + "grad_norm": 165.0439351929352, + "kl": 1.03125, + "learning_rate": 6.330821797792185e-07, + "loss": 0.4126, + "reward": 1.3562812805175781, + "reward_std": 0.16295039653778076, + "rewards/accuracy_reward_stage2": 0.4812812805175781, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2095 + }, + { + "completion_length": 10.28125, + "epoch": 0.3672682670404766, + "grad_norm": 22.879246711638288, + "kl": 0.130859375, + "learning_rate": 6.32906956369371e-07, + "loss": -0.0129, + "reward": 1.5343456268310547, + "reward_std": 0.19512513279914856, + "rewards/accuracy_reward_stage2": 0.6905956268310547, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2096 + }, + { + "completion_length": 8.671875, + "epoch": 0.36744349045032415, + "grad_norm": 21.393376018256088, + "kl": 0.0712890625, + "learning_rate": 6.327317329595233e-07, + "loss": -0.0156, + "reward": 1.657727837562561, + "reward_std": 0.1709435135126114, + "rewards/accuracy_reward_stage2": 0.6733528971672058, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2097 + }, + { + "completion_length": 7.46875, + "epoch": 0.3676187138601717, + "grad_norm": 7.7193000878386275, + "kl": 0.01904296875, + "learning_rate": 6.325565095496758e-07, + "loss": 0.0076, + "reward": 1.6041667461395264, + "reward_std": 0.044543541967868805, + "rewards/accuracy_reward_stage2": 0.6041666865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2098 + }, + { + "completion_length": 11.015625, + "epoch": 0.3677939372700193, + "grad_norm": 17.218948024547018, + "kl": 0.1875, + "learning_rate": 6.323812861398283e-07, + "loss": 0.0753, + "reward": 1.522045612335205, + "reward_std": 0.1089363694190979, + "rewards/accuracy_reward_stage2": 0.6470456719398499, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2099 + }, + { + "completion_length": 12.765625, + "epoch": 0.36796916067986685, + "grad_norm": 20.258775312947893, + "kl": 0.01092529296875, + "learning_rate": 6.322060627299806e-07, + "loss": 0.0044, + "reward": 1.6412173509597778, + "reward_std": 0.2584494352340698, + "rewards/accuracy_reward_stage2": 0.6412172913551331, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2100 + }, + { + "completion_length": 15.90625, + "epoch": 0.3681443840897144, + "grad_norm": 20.08498932619288, + "kl": 0.30859375, + "learning_rate": 6.320308393201331e-07, + "loss": 0.1236, + "reward": 1.3641042709350586, + "reward_std": 0.2118104249238968, + "rewards/accuracy_reward_stage2": 0.4891042113304138, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2101 + }, + { + "completion_length": 5.09375, + "epoch": 0.36831960749956194, + "grad_norm": 13.095916114622412, + "kl": 0.11767578125, + "learning_rate": 6.318556159102855e-07, + "loss": -0.038, + "reward": 1.7996301651000977, + "reward_std": 0.12616249918937683, + "rewards/accuracy_reward_stage2": 0.8308802843093872, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2102 + }, + { + "completion_length": 23.140625, + "epoch": 0.3684948309094095, + "grad_norm": 17.25267888432899, + "kl": 0.0380859375, + "learning_rate": 6.31680392500438e-07, + "loss": 0.0152, + "reward": 1.6559020280838013, + "reward_std": 0.13044781982898712, + "rewards/accuracy_reward_stage2": 0.6559020280838013, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2103 + }, + { + "completion_length": 15.453125, + "epoch": 0.36867005431925703, + "grad_norm": 27.42187132545378, + "kl": 0.38671875, + "learning_rate": 6.315051690905905e-07, + "loss": 0.1546, + "reward": 1.5863591432571411, + "reward_std": 0.21733585000038147, + "rewards/accuracy_reward_stage2": 0.7113592028617859, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2104 + }, + { + "completion_length": 11.09375, + "epoch": 0.3688452777291046, + "grad_norm": 17.045225521328376, + "kl": 0.072265625, + "learning_rate": 6.313299456807429e-07, + "loss": 0.0291, + "reward": 1.1704258918762207, + "reward_std": 0.17644906044006348, + "rewards/accuracy_reward_stage2": 0.2954259514808655, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2105 + }, + { + "completion_length": 9.265625, + "epoch": 0.3690205011389522, + "grad_norm": 20.790848387475037, + "kl": 0.0732421875, + "learning_rate": 6.311547222708954e-07, + "loss": 0.0292, + "reward": 1.6465411186218262, + "reward_std": 0.18840546905994415, + "rewards/accuracy_reward_stage2": 0.6465411186218262, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2106 + }, + { + "completion_length": 9.796875, + "epoch": 0.36919572454879973, + "grad_norm": 15.59715427953365, + "kl": 0.109375, + "learning_rate": 6.309794988610479e-07, + "loss": -0.0006, + "reward": 1.696101427078247, + "reward_std": 0.21237659454345703, + "rewards/accuracy_reward_stage2": 0.7117264270782471, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2107 + }, + { + "completion_length": 11.359375, + "epoch": 0.3693709479586473, + "grad_norm": 36.65876864188198, + "kl": 0.0556640625, + "learning_rate": 6.308042754512003e-07, + "loss": -0.0219, + "reward": 1.7240824699401855, + "reward_std": 0.2328636646270752, + "rewards/accuracy_reward_stage2": 0.7397074699401855, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2108 + }, + { + "completion_length": 16.671875, + "epoch": 0.3695461713684948, + "grad_norm": 26.18128665190158, + "kl": 0.384765625, + "learning_rate": 6.306290520413528e-07, + "loss": 0.1252, + "reward": 1.4608935117721558, + "reward_std": 0.20166131854057312, + "rewards/accuracy_reward_stage2": 0.601518452167511, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2109 + }, + { + "completion_length": 9.609375, + "epoch": 0.36972139477834237, + "grad_norm": 18.71367999781447, + "kl": 0.1376953125, + "learning_rate": 6.30453828631505e-07, + "loss": 0.0239, + "reward": 1.3404827117919922, + "reward_std": 0.22724974155426025, + "rewards/accuracy_reward_stage2": 0.4811077415943146, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2110 + }, + { + "completion_length": 8.96875, + "epoch": 0.3698966181881899, + "grad_norm": 25.878878293086668, + "kl": 0.27734375, + "learning_rate": 6.302786052216575e-07, + "loss": 0.0349, + "reward": 1.4678795337677002, + "reward_std": 0.37333595752716064, + "rewards/accuracy_reward_stage2": 0.6241295337677002, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2111 + }, + { + "completion_length": 11.765625, + "epoch": 0.3700718415980375, + "grad_norm": 13.73746502004517, + "kl": 0.07275390625, + "learning_rate": 6.3010338181181e-07, + "loss": -0.0573, + "reward": 1.8010525703430176, + "reward_std": 0.1689380407333374, + "rewards/accuracy_reward_stage2": 0.8323025703430176, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2112 + }, + { + "completion_length": 10.234375, + "epoch": 0.37024706500788507, + "grad_norm": 17.41833234174483, + "kl": 0.2236328125, + "learning_rate": 6.299281584019624e-07, + "loss": 0.1181, + "reward": 1.477597713470459, + "reward_std": 0.1575087606906891, + "rewards/accuracy_reward_stage2": 0.602597713470459, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2113 + }, + { + "completion_length": 24.203125, + "epoch": 0.3704222884177326, + "grad_norm": 17.130019289837158, + "kl": 0.064453125, + "learning_rate": 6.297529349921149e-07, + "loss": -0.0723, + "reward": 1.571899652481079, + "reward_std": 0.21543410420417786, + "rewards/accuracy_reward_stage2": 0.6187746524810791, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2114 + }, + { + "completion_length": 10.484375, + "epoch": 0.37059751182758016, + "grad_norm": 16.148528845551034, + "kl": 0.109375, + "learning_rate": 6.295777115822674e-07, + "loss": -0.0333, + "reward": 1.4840588569641113, + "reward_std": 0.225162535905838, + "rewards/accuracy_reward_stage2": 0.5153088569641113, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2115 + }, + { + "completion_length": 8.390625, + "epoch": 0.3707727352374277, + "grad_norm": 19.73466678979311, + "kl": 0.0791015625, + "learning_rate": 6.294024881724198e-07, + "loss": -0.0125, + "reward": 1.5833333730697632, + "reward_std": 0.17150771617889404, + "rewards/accuracy_reward_stage2": 0.5989583134651184, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2116 + }, + { + "completion_length": 15.515625, + "epoch": 0.37094795864727526, + "grad_norm": 23.316172543129646, + "kl": 0.1845703125, + "learning_rate": 6.292272647625723e-07, + "loss": 0.0518, + "reward": 1.400850772857666, + "reward_std": 0.25529760122299194, + "rewards/accuracy_reward_stage2": 0.541475772857666, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2117 + }, + { + "completion_length": 9.734375, + "epoch": 0.37112318205712286, + "grad_norm": 17.983517471441573, + "kl": 0.171875, + "learning_rate": 6.290520413527247e-07, + "loss": 0.0688, + "reward": 1.432976245880127, + "reward_std": 0.27066361904144287, + "rewards/accuracy_reward_stage2": 0.557976245880127, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2118 + }, + { + "completion_length": 8.640625, + "epoch": 0.3712984054669704, + "grad_norm": 20.06146237815626, + "kl": 0.10791015625, + "learning_rate": 6.288768179428772e-07, + "loss": 0.0432, + "reward": 1.6736295223236084, + "reward_std": 0.20630821585655212, + "rewards/accuracy_reward_stage2": 0.6736295223236084, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2119 + }, + { + "completion_length": 15.515625, + "epoch": 0.37147362887681795, + "grad_norm": 33.925749094582095, + "kl": 0.169921875, + "learning_rate": 6.287015945330297e-07, + "loss": 0.0677, + "reward": 1.5644185543060303, + "reward_std": 0.2227061688899994, + "rewards/accuracy_reward_stage2": 0.6894185543060303, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2120 + }, + { + "completion_length": 13.875, + "epoch": 0.3716488522866655, + "grad_norm": 21.495517940812118, + "kl": 0.07470703125, + "learning_rate": 6.28526371123182e-07, + "loss": 0.03, + "reward": 1.561547875404358, + "reward_std": 0.2311323881149292, + "rewards/accuracy_reward_stage2": 0.6865478754043579, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2121 + }, + { + "completion_length": 13.3125, + "epoch": 0.37182407569651305, + "grad_norm": 28.243106658760585, + "kl": 0.07568359375, + "learning_rate": 6.283511477133345e-07, + "loss": 0.0304, + "reward": 1.183201551437378, + "reward_std": 0.24948707222938538, + "rewards/accuracy_reward_stage2": 0.30820155143737793, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2122 + }, + { + "completion_length": 14.71875, + "epoch": 0.3719992991063606, + "grad_norm": 18.79970480525015, + "kl": 0.0869140625, + "learning_rate": 6.281759243034869e-07, + "loss": 0.0347, + "reward": 1.5993150472640991, + "reward_std": 0.14652368426322937, + "rewards/accuracy_reward_stage2": 0.5993151068687439, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2123 + }, + { + "completion_length": 7.484375, + "epoch": 0.37217452251620814, + "grad_norm": 22.273172047889105, + "kl": 0.1162109375, + "learning_rate": 6.280007008936393e-07, + "loss": 0.0464, + "reward": 1.5664750337600708, + "reward_std": 0.3070983588695526, + "rewards/accuracy_reward_stage2": 0.5664750337600708, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2124 + }, + { + "completion_length": 8.171875, + "epoch": 0.37234974592605574, + "grad_norm": 18.925406299644248, + "kl": 0.0341796875, + "learning_rate": 6.278254774837918e-07, + "loss": 0.0137, + "reward": 1.5416319370269775, + "reward_std": 0.18046918511390686, + "rewards/accuracy_reward_stage2": 0.5416319966316223, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2125 + }, + { + "completion_length": 13.1875, + "epoch": 0.3725249693359033, + "grad_norm": 21.017869463214364, + "kl": 0.107421875, + "learning_rate": 6.276502540739442e-07, + "loss": -0.0684, + "reward": 1.4259710311889648, + "reward_std": 0.351542592048645, + "rewards/accuracy_reward_stage2": 0.4728460907936096, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2126 + }, + { + "completion_length": 12.234375, + "epoch": 0.37270019274575084, + "grad_norm": 24.82390537095879, + "kl": 0.040771484375, + "learning_rate": 6.274750306640967e-07, + "loss": 0.0163, + "reward": 1.64809250831604, + "reward_std": 0.20966197550296783, + "rewards/accuracy_reward_stage2": 0.64809250831604, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2127 + }, + { + "completion_length": 9.875, + "epoch": 0.3728754161555984, + "grad_norm": 23.547973224097863, + "kl": 0.09619140625, + "learning_rate": 6.272998072542492e-07, + "loss": 0.0386, + "reward": 1.5378363132476807, + "reward_std": 0.20070700347423553, + "rewards/accuracy_reward_stage2": 0.5378363132476807, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2128 + }, + { + "completion_length": 9.609375, + "epoch": 0.37305063956544593, + "grad_norm": 27.303363055608237, + "kl": 0.2197265625, + "learning_rate": 6.271245838444016e-07, + "loss": -0.034, + "reward": 1.6875, + "reward_std": 0.2845909595489502, + "rewards/accuracy_reward_stage2": 0.734375, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2129 + }, + { + "completion_length": 12.828125, + "epoch": 0.3732258629752935, + "grad_norm": 22.24181250018731, + "kl": 0.171875, + "learning_rate": 6.26949360434554e-07, + "loss": 0.0259, + "reward": 1.425432562828064, + "reward_std": 0.3704448342323303, + "rewards/accuracy_reward_stage2": 0.4410575032234192, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2130 + }, + { + "completion_length": 12.03125, + "epoch": 0.3734010863851411, + "grad_norm": 45.756356155634414, + "kl": 0.396484375, + "learning_rate": 6.267741370247065e-07, + "loss": 0.1586, + "reward": 1.5892225503921509, + "reward_std": 0.21309590339660645, + "rewards/accuracy_reward_stage2": 0.7142226099967957, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2131 + }, + { + "completion_length": 14.859375, + "epoch": 0.3735763097949886, + "grad_norm": 19.61871851111529, + "kl": 0.1943359375, + "learning_rate": 6.265989136148589e-07, + "loss": -0.0106, + "reward": 1.5703403949737549, + "reward_std": 0.2058640867471695, + "rewards/accuracy_reward_stage2": 0.7265903353691101, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2132 + }, + { + "completion_length": 9.640625, + "epoch": 0.3737515332048362, + "grad_norm": 22.685600796758123, + "kl": 0.126953125, + "learning_rate": 6.264236902050114e-07, + "loss": -0.0376, + "reward": 1.619655728340149, + "reward_std": 0.31002217531204224, + "rewards/accuracy_reward_stage2": 0.6509058475494385, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2133 + }, + { + "completion_length": 8.859375, + "epoch": 0.3739267566146837, + "grad_norm": 22.704349832902558, + "kl": 0.255859375, + "learning_rate": 6.262484667951638e-07, + "loss": 0.0141, + "reward": 1.5410445928573608, + "reward_std": 0.40798452496528625, + "rewards/accuracy_reward_stage2": 0.5722945928573608, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2134 + }, + { + "completion_length": 12.015625, + "epoch": 0.37410198002453127, + "grad_norm": 17.017736183879197, + "kl": 0.11328125, + "learning_rate": 6.260732433853162e-07, + "loss": 0.0453, + "reward": 1.5101354122161865, + "reward_std": 0.1195073202252388, + "rewards/accuracy_reward_stage2": 0.5101353526115417, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2135 + }, + { + "completion_length": 8.078125, + "epoch": 0.3742772034343788, + "grad_norm": 18.307863070101405, + "kl": 0.0634765625, + "learning_rate": 6.258980199754687e-07, + "loss": 0.0253, + "reward": 1.782165288925171, + "reward_std": 0.15776385366916656, + "rewards/accuracy_reward_stage2": 0.7821652889251709, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2136 + }, + { + "completion_length": 10.375, + "epoch": 0.3744524268442264, + "grad_norm": 19.604165483296928, + "kl": 0.0869140625, + "learning_rate": 6.257227965656211e-07, + "loss": -0.0095, + "reward": 1.1285523176193237, + "reward_std": 0.14808019995689392, + "rewards/accuracy_reward_stage2": 0.14417734742164612, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2137 + }, + { + "completion_length": 9.234375, + "epoch": 0.37462765025407396, + "grad_norm": 15.117178098316627, + "kl": 0.10546875, + "learning_rate": 6.255475731557736e-07, + "loss": -0.002, + "reward": 1.509828805923462, + "reward_std": 0.15732884407043457, + "rewards/accuracy_reward_stage2": 0.5254538059234619, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2138 + }, + { + "completion_length": 12.59375, + "epoch": 0.3748028736639215, + "grad_norm": 16.940128197482103, + "kl": 0.0673828125, + "learning_rate": 6.253723497459261e-07, + "loss": -0.0019, + "reward": 1.7207591533660889, + "reward_std": 0.17644330859184265, + "rewards/accuracy_reward_stage2": 0.7363842129707336, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2139 + }, + { + "completion_length": 11.890625, + "epoch": 0.37497809707376906, + "grad_norm": 26.874987309380455, + "kl": 0.09619140625, + "learning_rate": 6.251971263360784e-07, + "loss": 0.0385, + "reward": 1.5684523582458496, + "reward_std": 0.23804005980491638, + "rewards/accuracy_reward_stage2": 0.5684524178504944, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2140 + }, + { + "completion_length": 27.5, + "epoch": 0.3751533204836166, + "grad_norm": 26.3035483971051, + "kl": 0.287109375, + "learning_rate": 6.250219029262309e-07, + "loss": 0.0831, + "reward": 1.4032840728759766, + "reward_std": 0.2784123420715332, + "rewards/accuracy_reward_stage2": 0.5439091324806213, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2141 + }, + { + "completion_length": 13.390625, + "epoch": 0.37532854389346415, + "grad_norm": 22.92592263951425, + "kl": 0.11962890625, + "learning_rate": 6.248466795163833e-07, + "loss": 0.0478, + "reward": 1.3103749752044678, + "reward_std": 0.23864029347896576, + "rewards/accuracy_reward_stage2": 0.4353749752044678, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2142 + }, + { + "completion_length": 11.765625, + "epoch": 0.3755037673033117, + "grad_norm": 21.834870516210604, + "kl": 0.09375, + "learning_rate": 6.246714561065358e-07, + "loss": -0.0713, + "reward": 1.3206520080566406, + "reward_std": 0.339372843503952, + "rewards/accuracy_reward_stage2": 0.3675270974636078, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2143 + }, + { + "completion_length": 12.078125, + "epoch": 0.3756789907131593, + "grad_norm": 24.03979986543318, + "kl": 0.25, + "learning_rate": 6.244962326966883e-07, + "loss": 0.0715, + "reward": 1.7012312412261963, + "reward_std": 0.2878972291946411, + "rewards/accuracy_reward_stage2": 0.8418562412261963, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2144 + }, + { + "completion_length": 12.46875, + "epoch": 0.37585421412300685, + "grad_norm": 16.544610076218923, + "kl": 0.06689453125, + "learning_rate": 6.243210092868407e-07, + "loss": -0.0043, + "reward": 1.2745712995529175, + "reward_std": 0.1314728707075119, + "rewards/accuracy_reward_stage2": 0.4151962697505951, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2145 + }, + { + "completion_length": 22.15625, + "epoch": 0.3760294375328544, + "grad_norm": 22.459144576557698, + "kl": 0.107421875, + "learning_rate": 6.241457858769932e-07, + "loss": 0.0214, + "reward": 1.423964023590088, + "reward_std": 0.27767157554626465, + "rewards/accuracy_reward_stage2": 0.5645890235900879, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2146 + }, + { + "completion_length": 8.765625, + "epoch": 0.37620466094270194, + "grad_norm": 13.373006338465403, + "kl": 0.05029296875, + "learning_rate": 6.239705624671457e-07, + "loss": 0.0202, + "reward": 1.7165720462799072, + "reward_std": 0.10273611545562744, + "rewards/accuracy_reward_stage2": 0.841572105884552, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2147 + }, + { + "completion_length": 15.84375, + "epoch": 0.3763798843525495, + "grad_norm": 11.776072260362834, + "kl": 0.0184326171875, + "learning_rate": 6.23795339057298e-07, + "loss": 0.0074, + "reward": 1.5905694961547852, + "reward_std": 0.08757132291793823, + "rewards/accuracy_reward_stage2": 0.5905694961547852, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2148 + }, + { + "completion_length": 13.359375, + "epoch": 0.37655510776239703, + "grad_norm": 20.45580036732035, + "kl": 0.059326171875, + "learning_rate": 6.236201156474505e-07, + "loss": 0.0237, + "reward": 1.6051902770996094, + "reward_std": 0.14381514489650726, + "rewards/accuracy_reward_stage2": 0.6051902174949646, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2149 + }, + { + "completion_length": 14.34375, + "epoch": 0.37673033117224464, + "grad_norm": 20.69266694732591, + "kl": 0.048583984375, + "learning_rate": 6.234448922376028e-07, + "loss": 0.0195, + "reward": 1.640191674232483, + "reward_std": 0.23035646975040436, + "rewards/accuracy_reward_stage2": 0.6401916742324829, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2150 + }, + { + "completion_length": 12.609375, + "epoch": 0.3769055545820922, + "grad_norm": 20.93672786852049, + "kl": 0.083984375, + "learning_rate": 6.232696688277553e-07, + "loss": -0.0108, + "reward": 1.727813720703125, + "reward_std": 0.2194058895111084, + "rewards/accuracy_reward_stage2": 0.7434388399124146, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2151 + }, + { + "completion_length": 6.578125, + "epoch": 0.37708077799193973, + "grad_norm": 23.85461277636439, + "kl": 0.2578125, + "learning_rate": 6.230944454179078e-07, + "loss": 0.0705, + "reward": 1.5679218769073486, + "reward_std": 0.31683841347694397, + "rewards/accuracy_reward_stage2": 0.7085468769073486, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2152 + }, + { + "completion_length": 13.140625, + "epoch": 0.3772560014017873, + "grad_norm": 16.468654634909875, + "kl": 0.046875, + "learning_rate": 6.229192220080602e-07, + "loss": -0.0201, + "reward": 1.5747102499008179, + "reward_std": 0.08664512634277344, + "rewards/accuracy_reward_stage2": 0.5903353095054626, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2153 + }, + { + "completion_length": 6.765625, + "epoch": 0.3774312248116348, + "grad_norm": 23.244001860146184, + "kl": 0.119140625, + "learning_rate": 6.227439985982127e-07, + "loss": 0.0477, + "reward": 1.6278541088104248, + "reward_std": 0.2148430198431015, + "rewards/accuracy_reward_stage2": 0.6278541088104248, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2154 + }, + { + "completion_length": 11.5, + "epoch": 0.37760644822148237, + "grad_norm": 21.616613381013284, + "kl": 0.09619140625, + "learning_rate": 6.225687751883652e-07, + "loss": -0.0058, + "reward": 1.8042510747909546, + "reward_std": 0.19430895149707794, + "rewards/accuracy_reward_stage2": 0.8198760747909546, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2155 + }, + { + "completion_length": 10.953125, + "epoch": 0.3777816716313299, + "grad_norm": 22.314447656053556, + "kl": 0.09521484375, + "learning_rate": 6.223935517785176e-07, + "loss": -0.048, + "reward": 1.4477014541625977, + "reward_std": 0.2694235146045685, + "rewards/accuracy_reward_stage2": 0.47895151376724243, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2156 + }, + { + "completion_length": 9.953125, + "epoch": 0.3779568950411775, + "grad_norm": 5.851002174857388, + "kl": 0.03125, + "learning_rate": 6.222183283686701e-07, + "loss": 0.0125, + "reward": 1.828125, + "reward_std": 0.0646936446428299, + "rewards/accuracy_reward_stage2": 0.828125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2157 + }, + { + "completion_length": 10.515625, + "epoch": 0.37813211845102507, + "grad_norm": 17.6330337529898, + "kl": 0.058837890625, + "learning_rate": 6.220431049588225e-07, + "loss": -0.0207, + "reward": 1.7743115425109863, + "reward_std": 0.2329869270324707, + "rewards/accuracy_reward_stage2": 0.7899366021156311, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2158 + }, + { + "completion_length": 11.390625, + "epoch": 0.3783073418608726, + "grad_norm": 23.5113310408841, + "kl": 0.0859375, + "learning_rate": 6.21867881548975e-07, + "loss": 0.025, + "reward": 1.512028455734253, + "reward_std": 0.19187329709529877, + "rewards/accuracy_reward_stage2": 0.5276533961296082, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2159 + }, + { + "completion_length": 12.625, + "epoch": 0.37848256527072016, + "grad_norm": 26.345417785246763, + "kl": 0.27734375, + "learning_rate": 6.216926581391274e-07, + "loss": 0.0693, + "reward": 1.250259280204773, + "reward_std": 0.2544091045856476, + "rewards/accuracy_reward_stage2": 0.515884280204773, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2160 + }, + { + "completion_length": 8.203125, + "epoch": 0.3786577886805677, + "grad_norm": 21.203083672580384, + "kl": 0.07861328125, + "learning_rate": 6.215174347292797e-07, + "loss": 0.0315, + "reward": 1.674128532409668, + "reward_std": 0.2373107671737671, + "rewards/accuracy_reward_stage2": 0.6741284728050232, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2161 + }, + { + "completion_length": 10.8125, + "epoch": 0.37883301209041526, + "grad_norm": 32.004422207854546, + "kl": 0.220703125, + "learning_rate": 6.213422113194322e-07, + "loss": 0.0879, + "reward": 1.3069201707839966, + "reward_std": 0.24062323570251465, + "rewards/accuracy_reward_stage2": 0.43192020058631897, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2162 + }, + { + "completion_length": 15.796875, + "epoch": 0.37900823550026286, + "grad_norm": 22.743167065050343, + "kl": 0.2734375, + "learning_rate": 6.211669879095846e-07, + "loss": 0.0652, + "reward": 1.62326979637146, + "reward_std": 0.2634640336036682, + "rewards/accuracy_reward_stage2": 0.76389479637146, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2163 + }, + { + "completion_length": 7.09375, + "epoch": 0.3791834589101104, + "grad_norm": 15.754559332097065, + "kl": 0.099609375, + "learning_rate": 6.209917644997371e-07, + "loss": 0.0399, + "reward": 1.9393177032470703, + "reward_std": 0.12810847163200378, + "rewards/accuracy_reward_stage2": 0.9393176436424255, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2164 + }, + { + "completion_length": 6.15625, + "epoch": 0.37935868231995795, + "grad_norm": 22.90820730699427, + "kl": 0.2041015625, + "learning_rate": 6.208165410898896e-07, + "loss": -0.0068, + "reward": 1.776153564453125, + "reward_std": 0.2677950859069824, + "rewards/accuracy_reward_stage2": 0.8074035048484802, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2165 + }, + { + "completion_length": 8.71875, + "epoch": 0.3795339057298055, + "grad_norm": 22.416892182157884, + "kl": 0.0791015625, + "learning_rate": 6.20641317680042e-07, + "loss": 0.0093, + "reward": 1.6350996494293213, + "reward_std": 0.20906037092208862, + "rewards/accuracy_reward_stage2": 0.6507246494293213, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2166 + }, + { + "completion_length": 18.546875, + "epoch": 0.37970912913965305, + "grad_norm": 20.848643017035872, + "kl": 0.044677734375, + "learning_rate": 6.204660942701945e-07, + "loss": 0.0179, + "reward": 1.5497921705245972, + "reward_std": 0.191371887922287, + "rewards/accuracy_reward_stage2": 0.5497921705245972, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2167 + }, + { + "completion_length": 9.15625, + "epoch": 0.3798843525495006, + "grad_norm": 16.90072768619376, + "kl": 0.19921875, + "learning_rate": 6.20290870860347e-07, + "loss": 0.0353, + "reward": 1.5379623174667358, + "reward_std": 0.1680925339460373, + "rewards/accuracy_reward_stage2": 0.5535872578620911, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2168 + }, + { + "completion_length": 8.15625, + "epoch": 0.3800595759593482, + "grad_norm": 16.751612988843327, + "kl": 0.1708984375, + "learning_rate": 6.201156474504994e-07, + "loss": 0.0684, + "reward": 1.4254521131515503, + "reward_std": 0.16224229335784912, + "rewards/accuracy_reward_stage2": 0.4254521429538727, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2169 + }, + { + "completion_length": 12.09375, + "epoch": 0.38023479936919574, + "grad_norm": 17.533718907309815, + "kl": 0.123046875, + "learning_rate": 6.199404240406518e-07, + "loss": 0.0162, + "reward": 1.5870461463928223, + "reward_std": 0.25187405943870544, + "rewards/accuracy_reward_stage2": 0.602671205997467, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2170 + }, + { + "completion_length": 9.375, + "epoch": 0.3804100227790433, + "grad_norm": 17.396942548872524, + "kl": 0.1787109375, + "learning_rate": 6.197652006308043e-07, + "loss": 0.0271, + "reward": 1.4302774667739868, + "reward_std": 0.2149101197719574, + "rewards/accuracy_reward_stage2": 0.5709024667739868, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2171 + }, + { + "completion_length": 7.875, + "epoch": 0.38058524618889084, + "grad_norm": 15.929708549328153, + "kl": 0.0703125, + "learning_rate": 6.195899772209567e-07, + "loss": 0.0281, + "reward": 1.8126511573791504, + "reward_std": 0.17158563435077667, + "rewards/accuracy_reward_stage2": 0.8126511573791504, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2172 + }, + { + "completion_length": 21.15625, + "epoch": 0.3807604695987384, + "grad_norm": 19.271770329974064, + "kl": 0.154296875, + "learning_rate": 6.194147538111091e-07, + "loss": 0.0175, + "reward": 1.1675336360931396, + "reward_std": 0.16348037123680115, + "rewards/accuracy_reward_stage2": 0.18315869569778442, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2173 + }, + { + "completion_length": 9.671875, + "epoch": 0.38093569300858593, + "grad_norm": 16.931454968051227, + "kl": 0.1201171875, + "learning_rate": 6.192395304012615e-07, + "loss": 0.0038, + "reward": 1.4680428504943848, + "reward_std": 0.24045197665691376, + "rewards/accuracy_reward_stage2": 0.48366779088974, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2174 + }, + { + "completion_length": 15.328125, + "epoch": 0.3811109164184335, + "grad_norm": 18.821037310256308, + "kl": 0.06689453125, + "learning_rate": 6.19064306991414e-07, + "loss": 0.0268, + "reward": 1.465679407119751, + "reward_std": 0.17180000245571136, + "rewards/accuracy_reward_stage2": 0.46567946672439575, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2175 + }, + { + "completion_length": 10.4375, + "epoch": 0.3812861398282811, + "grad_norm": 18.791259747994744, + "kl": 0.140625, + "learning_rate": 6.188890835815665e-07, + "loss": 0.056, + "reward": 1.6078298091888428, + "reward_std": 0.10767532885074615, + "rewards/accuracy_reward_stage2": 0.6078298687934875, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2176 + }, + { + "completion_length": 7.90625, + "epoch": 0.3814613632381286, + "grad_norm": 26.132132368079272, + "kl": 0.0247802734375, + "learning_rate": 6.187138601717189e-07, + "loss": 0.0099, + "reward": 1.703125, + "reward_std": 0.35612428188323975, + "rewards/accuracy_reward_stage2": 0.703125, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2177 + }, + { + "completion_length": 5.046875, + "epoch": 0.3816365866479762, + "grad_norm": 11.699367635387564, + "kl": 0.083984375, + "learning_rate": 6.185386367618714e-07, + "loss": -0.0105, + "reward": 1.6875, + "reward_std": 0.1462521106004715, + "rewards/accuracy_reward_stage2": 0.828125, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2178 + }, + { + "completion_length": 14.09375, + "epoch": 0.3818118100578237, + "grad_norm": 24.398667267083667, + "kl": 0.310546875, + "learning_rate": 6.183634133520237e-07, + "loss": 0.095, + "reward": 1.657011866569519, + "reward_std": 0.2222549319267273, + "rewards/accuracy_reward_stage2": 0.7976367473602295, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2179 + }, + { + "completion_length": 11.03125, + "epoch": 0.38198703346767127, + "grad_norm": 28.560326997317556, + "kl": 0.236328125, + "learning_rate": 6.181881899421762e-07, + "loss": 0.0944, + "reward": 1.7426857948303223, + "reward_std": 0.17225751280784607, + "rewards/accuracy_reward_stage2": 0.8676857948303223, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2180 + }, + { + "completion_length": 10.75, + "epoch": 0.3821622568775188, + "grad_norm": 11.066768967836019, + "kl": 0.044921875, + "learning_rate": 6.180129665323287e-07, + "loss": 0.018, + "reward": 1.738937258720398, + "reward_std": 0.083560511469841, + "rewards/accuracy_reward_stage2": 0.738937258720398, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2181 + }, + { + "completion_length": 9.078125, + "epoch": 0.3823374802873664, + "grad_norm": 20.679233919704437, + "kl": 0.1328125, + "learning_rate": 6.178377431224811e-07, + "loss": -0.0125, + "reward": 1.6758754253387451, + "reward_std": 0.24189935624599457, + "rewards/accuracy_reward_stage2": 0.7071253657341003, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2182 + }, + { + "completion_length": 10.40625, + "epoch": 0.38251270369721396, + "grad_norm": 17.54733809618765, + "kl": 0.0849609375, + "learning_rate": 6.176625197126336e-07, + "loss": -0.023, + "reward": 1.4875990152359009, + "reward_std": 0.17271263897418976, + "rewards/accuracy_reward_stage2": 0.5188490748405457, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2183 + }, + { + "completion_length": 9.390625, + "epoch": 0.3826879271070615, + "grad_norm": 24.9013565917632, + "kl": 0.1337890625, + "learning_rate": 6.174872963027861e-07, + "loss": 0.0538, + "reward": 1.5268363952636719, + "reward_std": 0.11797383427619934, + "rewards/accuracy_reward_stage2": 0.5268364548683167, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2184 + }, + { + "completion_length": 9.734375, + "epoch": 0.38286315051690906, + "grad_norm": 19.620060637076595, + "kl": 0.09326171875, + "learning_rate": 6.173120728929385e-07, + "loss": 0.0374, + "reward": 1.8419290781021118, + "reward_std": 0.20093566179275513, + "rewards/accuracy_reward_stage2": 0.8419290781021118, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2185 + }, + { + "completion_length": 7.765625, + "epoch": 0.3830383739267566, + "grad_norm": 17.378286477591992, + "kl": 0.1240234375, + "learning_rate": 6.171368494830909e-07, + "loss": 0.0497, + "reward": 1.5082812309265137, + "reward_std": 0.13430514931678772, + "rewards/accuracy_reward_stage2": 0.6332812309265137, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2186 + }, + { + "completion_length": 10.078125, + "epoch": 0.38321359733660415, + "grad_norm": 23.07091267330619, + "kl": 0.04052734375, + "learning_rate": 6.169616260732433e-07, + "loss": -0.0279, + "reward": 1.455744743347168, + "reward_std": 0.17548725008964539, + "rewards/accuracy_reward_stage2": 0.596369743347168, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2187 + }, + { + "completion_length": 15.171875, + "epoch": 0.38338882074645175, + "grad_norm": 15.279125531805924, + "kl": 0.05419921875, + "learning_rate": 6.167864026633958e-07, + "loss": 0.0216, + "reward": 1.5746581554412842, + "reward_std": 0.08711699396371841, + "rewards/accuracy_reward_stage2": 0.6996581554412842, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2188 + }, + { + "completion_length": 15.109375, + "epoch": 0.3835640441562993, + "grad_norm": 21.233700522460875, + "kl": 0.203125, + "learning_rate": 6.166111792535483e-07, + "loss": 0.0809, + "reward": 1.4553985595703125, + "reward_std": 0.14188659191131592, + "rewards/accuracy_reward_stage2": 0.5803984999656677, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2189 + }, + { + "completion_length": 11.78125, + "epoch": 0.38373926756614685, + "grad_norm": 13.064192613447082, + "kl": 0.052490234375, + "learning_rate": 6.164359558437006e-07, + "loss": 0.021, + "reward": 1.6517360210418701, + "reward_std": 0.090608611702919, + "rewards/accuracy_reward_stage2": 0.6517360210418701, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2190 + }, + { + "completion_length": 8.171875, + "epoch": 0.3839144909759944, + "grad_norm": 29.885606654441183, + "kl": 0.1923828125, + "learning_rate": 6.162607324338531e-07, + "loss": 0.0094, + "reward": 1.6037235260009766, + "reward_std": 0.35476142168045044, + "rewards/accuracy_reward_stage2": 0.6349735260009766, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2191 + }, + { + "completion_length": 10.6875, + "epoch": 0.38408971438584194, + "grad_norm": 28.59603690547388, + "kl": 0.1533203125, + "learning_rate": 6.160855090240056e-07, + "loss": 0.0056, + "reward": 1.557417631149292, + "reward_std": 0.3235635757446289, + "rewards/accuracy_reward_stage2": 0.588667631149292, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2192 + }, + { + "completion_length": 9.8125, + "epoch": 0.3842649377956895, + "grad_norm": 23.595167727606896, + "kl": 0.06103515625, + "learning_rate": 6.15910285614158e-07, + "loss": 0.0245, + "reward": 1.6596101522445679, + "reward_std": 0.17235329747200012, + "rewards/accuracy_reward_stage2": 0.6596100926399231, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2193 + }, + { + "completion_length": 9.296875, + "epoch": 0.38444016120553703, + "grad_norm": 25.677820508864002, + "kl": 0.12109375, + "learning_rate": 6.157350622043105e-07, + "loss": 0.0485, + "reward": 1.5163013935089111, + "reward_std": 0.21374960243701935, + "rewards/accuracy_reward_stage2": 0.5163014531135559, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2194 + }, + { + "completion_length": 9.1875, + "epoch": 0.38461538461538464, + "grad_norm": 19.62805940423102, + "kl": 0.051513671875, + "learning_rate": 6.155598387944629e-07, + "loss": -0.0235, + "reward": 1.8182024955749512, + "reward_std": 0.2139115333557129, + "rewards/accuracy_reward_stage2": 0.8338274955749512, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2195 + }, + { + "completion_length": 7.65625, + "epoch": 0.3847906080252322, + "grad_norm": 20.08895842095188, + "kl": 0.07470703125, + "learning_rate": 6.153846153846154e-07, + "loss": -0.0064, + "reward": 1.3504976034164429, + "reward_std": 0.2288835346698761, + "rewards/accuracy_reward_stage2": 0.36612263321876526, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2196 + }, + { + "completion_length": 9.3125, + "epoch": 0.38496583143507973, + "grad_norm": 10.903634089390751, + "kl": 0.0126953125, + "learning_rate": 6.152093919747679e-07, + "loss": 0.0051, + "reward": 1.890625, + "reward_std": 0.10205793380737305, + "rewards/accuracy_reward_stage2": 0.890625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2197 + }, + { + "completion_length": 6.59375, + "epoch": 0.3851410548449273, + "grad_norm": 23.287103677945193, + "kl": 0.1171875, + "learning_rate": 6.150341685649203e-07, + "loss": -0.0092, + "reward": 1.7211157083511353, + "reward_std": 0.2485455870628357, + "rewards/accuracy_reward_stage2": 0.7523657083511353, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2198 + }, + { + "completion_length": 15.265625, + "epoch": 0.3853162782547748, + "grad_norm": 21.500939887887792, + "kl": 0.09423828125, + "learning_rate": 6.148589451550726e-07, + "loss": 0.0377, + "reward": 1.4081530570983887, + "reward_std": 0.07311158627271652, + "rewards/accuracy_reward_stage2": 0.5331530570983887, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2199 + }, + { + "completion_length": 13.0625, + "epoch": 0.38549150166462237, + "grad_norm": 34.72416274037816, + "kl": 0.357421875, + "learning_rate": 6.146837217452251e-07, + "loss": 0.0341, + "reward": 1.206498146057129, + "reward_std": 0.3895317316055298, + "rewards/accuracy_reward_stage2": 0.37837323546409607, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2200 + }, + { + "completion_length": 11.578125, + "epoch": 0.38566672507447, + "grad_norm": 22.262971245552823, + "kl": 0.10546875, + "learning_rate": 6.145084983353775e-07, + "loss": -0.0006, + "reward": 1.291999101638794, + "reward_std": 0.2678248882293701, + "rewards/accuracy_reward_stage2": 0.43262407183647156, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2201 + }, + { + "completion_length": 7.421875, + "epoch": 0.3858419484843175, + "grad_norm": 17.792911564404413, + "kl": 0.181640625, + "learning_rate": 6.1433327492553e-07, + "loss": 0.0726, + "reward": 1.6500575542449951, + "reward_std": 0.11898770928382874, + "rewards/accuracy_reward_stage2": 0.6500574946403503, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2202 + }, + { + "completion_length": 10.203125, + "epoch": 0.38601717189416507, + "grad_norm": 20.116358935868064, + "kl": 0.1357421875, + "learning_rate": 6.141580515156824e-07, + "loss": -0.0391, + "reward": 1.5253784656524658, + "reward_std": 0.38462263345718384, + "rewards/accuracy_reward_stage2": 0.5722534656524658, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2203 + }, + { + "completion_length": 13.1875, + "epoch": 0.3861923953040126, + "grad_norm": 17.99177828565341, + "kl": 0.045654296875, + "learning_rate": 6.139828281058349e-07, + "loss": 0.0183, + "reward": 1.65625, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.78125, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2204 + }, + { + "completion_length": 11.640625, + "epoch": 0.38636761871386016, + "grad_norm": 22.035441172696054, + "kl": 0.05224609375, + "learning_rate": 6.138076046959874e-07, + "loss": 0.021, + "reward": 1.440962314605713, + "reward_std": 0.24247995018959045, + "rewards/accuracy_reward_stage2": 0.4409623146057129, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2205 + }, + { + "completion_length": 10.015625, + "epoch": 0.3865428421237077, + "grad_norm": 20.296250736825957, + "kl": 0.10205078125, + "learning_rate": 6.136323812861398e-07, + "loss": -0.0034, + "reward": 1.5719988346099854, + "reward_std": 0.24222303926944733, + "rewards/accuracy_reward_stage2": 0.5876238942146301, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2206 + }, + { + "completion_length": 8.46875, + "epoch": 0.38671806553355526, + "grad_norm": 16.497923845983223, + "kl": 0.0849609375, + "learning_rate": 6.134571578762923e-07, + "loss": 0.0341, + "reward": 1.5625, + "reward_std": 0.16675157845020294, + "rewards/accuracy_reward_stage2": 0.5625, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2207 + }, + { + "completion_length": 11.0625, + "epoch": 0.38689328894340286, + "grad_norm": 18.295192741683362, + "kl": 0.1884765625, + "learning_rate": 6.132819344664448e-07, + "loss": -0.0002, + "reward": 1.644460916519165, + "reward_std": 0.19249743223190308, + "rewards/accuracy_reward_stage2": 0.675710916519165, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2208 + }, + { + "completion_length": 7.125, + "epoch": 0.3870685123532504, + "grad_norm": 19.963442006676203, + "kl": 0.087890625, + "learning_rate": 6.131067110565971e-07, + "loss": 0.0353, + "reward": 1.5324900150299072, + "reward_std": 0.2595484256744385, + "rewards/accuracy_reward_stage2": 0.532490074634552, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2209 + }, + { + "completion_length": 13.90625, + "epoch": 0.38724373576309795, + "grad_norm": 26.61436541495198, + "kl": 0.08056640625, + "learning_rate": 6.129314876467496e-07, + "loss": -0.0095, + "reward": 1.7203000783920288, + "reward_std": 0.1948830783367157, + "rewards/accuracy_reward_stage2": 0.7359250783920288, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2210 + }, + { + "completion_length": 8.078125, + "epoch": 0.3874189591729455, + "grad_norm": 15.538624785207695, + "kl": 0.10009765625, + "learning_rate": 6.12756264236902e-07, + "loss": 0.0186, + "reward": 1.723802089691162, + "reward_std": 0.13896231353282928, + "rewards/accuracy_reward_stage2": 0.7394270896911621, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2211 + }, + { + "completion_length": 12.71875, + "epoch": 0.38759418258279305, + "grad_norm": 16.671196293015015, + "kl": 0.034423828125, + "learning_rate": 6.125810408270544e-07, + "loss": 0.0138, + "reward": 1.2645647525787354, + "reward_std": 0.044438742101192474, + "rewards/accuracy_reward_stage2": 0.2645648419857025, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2212 + }, + { + "completion_length": 11.609375, + "epoch": 0.3877694059926406, + "grad_norm": 17.30957237048228, + "kl": 0.05322265625, + "learning_rate": 6.124058174172069e-07, + "loss": -0.0106, + "reward": 1.6624021530151367, + "reward_std": 0.16287538409233093, + "rewards/accuracy_reward_stage2": 0.8030271530151367, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2213 + }, + { + "completion_length": 12.8125, + "epoch": 0.3879446294024882, + "grad_norm": 19.486549989967347, + "kl": 0.11083984375, + "learning_rate": 6.122305940073593e-07, + "loss": 0.0443, + "reward": 1.536430835723877, + "reward_std": 0.25739842653274536, + "rewards/accuracy_reward_stage2": 0.5364308953285217, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2214 + }, + { + "completion_length": 9.40625, + "epoch": 0.38811985281233574, + "grad_norm": 23.428496736016186, + "kl": 0.1171875, + "learning_rate": 6.120553705975118e-07, + "loss": 0.0027, + "reward": 1.723336100578308, + "reward_std": 0.21733993291854858, + "rewards/accuracy_reward_stage2": 0.7389611005783081, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2215 + }, + { + "completion_length": 10.046875, + "epoch": 0.3882950762221833, + "grad_norm": 23.7601496680345, + "kl": 0.1337890625, + "learning_rate": 6.118801471876643e-07, + "loss": 0.0228, + "reward": 1.320266604423523, + "reward_std": 0.19674022495746613, + "rewards/accuracy_reward_stage2": 0.5858915448188782, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2216 + }, + { + "completion_length": 9.296875, + "epoch": 0.38847029963203084, + "grad_norm": 20.72527082536805, + "kl": 0.1982421875, + "learning_rate": 6.117049237778167e-07, + "loss": 0.0265, + "reward": 1.462338924407959, + "reward_std": 0.2675231099128723, + "rewards/accuracy_reward_stage2": 0.6185888051986694, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2217 + }, + { + "completion_length": 11.3125, + "epoch": 0.3886455230418784, + "grad_norm": 17.420874740089047, + "kl": 0.09619140625, + "learning_rate": 6.115297003679692e-07, + "loss": 0.0385, + "reward": 1.4358340501785278, + "reward_std": 0.13778458535671234, + "rewards/accuracy_reward_stage2": 0.5608340501785278, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2218 + }, + { + "completion_length": 16.125, + "epoch": 0.38882074645172593, + "grad_norm": 20.444965241840418, + "kl": 0.2412109375, + "learning_rate": 6.113544769581215e-07, + "loss": 0.0619, + "reward": 1.2542166709899902, + "reward_std": 0.1983487606048584, + "rewards/accuracy_reward_stage2": 0.39484167098999023, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2219 + }, + { + "completion_length": 9.15625, + "epoch": 0.38899596986157353, + "grad_norm": 24.584740447360808, + "kl": 0.2138671875, + "learning_rate": 6.11179253548274e-07, + "loss": 0.0066, + "reward": 1.4592695236206055, + "reward_std": 0.2801477909088135, + "rewards/accuracy_reward_stage2": 0.49051961302757263, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2220 + }, + { + "completion_length": 18.75, + "epoch": 0.3891711932714211, + "grad_norm": 21.180945773655164, + "kl": 0.25, + "learning_rate": 6.110040301384265e-07, + "loss": 0.1003, + "reward": 1.3165578842163086, + "reward_std": 0.23822440207004547, + "rewards/accuracy_reward_stage2": 0.5665579438209534, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2221 + }, + { + "completion_length": 17.46875, + "epoch": 0.3893464166812686, + "grad_norm": 19.289222933177903, + "kl": 0.06787109375, + "learning_rate": 6.108288067285789e-07, + "loss": 0.0272, + "reward": 1.7069063186645508, + "reward_std": 0.14726392924785614, + "rewards/accuracy_reward_stage2": 0.7069063782691956, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2222 + }, + { + "completion_length": 26.859375, + "epoch": 0.3895216400911162, + "grad_norm": 18.85145918214092, + "kl": 0.0947265625, + "learning_rate": 6.106535833187314e-07, + "loss": -0.0632, + "reward": 1.716736078262329, + "reward_std": 0.26862505078315735, + "rewards/accuracy_reward_stage2": 0.7636110782623291, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2223 + }, + { + "completion_length": 15.46875, + "epoch": 0.3896968635009637, + "grad_norm": 19.158419110512753, + "kl": 0.11474609375, + "learning_rate": 6.104783599088838e-07, + "loss": 0.0458, + "reward": 1.3567907810211182, + "reward_std": 0.20545431971549988, + "rewards/accuracy_reward_stage2": 0.356790691614151, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2224 + }, + { + "completion_length": 9.578125, + "epoch": 0.38987208691081127, + "grad_norm": 27.94855443881219, + "kl": 0.146484375, + "learning_rate": 6.103031364990362e-07, + "loss": 0.0584, + "reward": 1.5437999963760376, + "reward_std": 0.16191495954990387, + "rewards/accuracy_reward_stage2": 0.5437999367713928, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2225 + }, + { + "completion_length": 12.84375, + "epoch": 0.3900473103206588, + "grad_norm": 21.47202574922326, + "kl": 0.08154296875, + "learning_rate": 6.101279130891887e-07, + "loss": 0.0326, + "reward": 1.4971905946731567, + "reward_std": 0.20354795455932617, + "rewards/accuracy_reward_stage2": 0.622190535068512, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2226 + }, + { + "completion_length": 15.53125, + "epoch": 0.3902225337305064, + "grad_norm": 3199.861242021072, + "kl": 16.5, + "learning_rate": 6.099526896793411e-07, + "loss": 6.5665, + "reward": 1.61332368850708, + "reward_std": 0.14749747514724731, + "rewards/accuracy_reward_stage2": 0.7383236885070801, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2227 + }, + { + "completion_length": 12.71875, + "epoch": 0.39039775714035396, + "grad_norm": 23.940551008858936, + "kl": 0.2265625, + "learning_rate": 6.097774662694936e-07, + "loss": 0.0139, + "reward": 1.565973162651062, + "reward_std": 0.3491760492324829, + "rewards/accuracy_reward_stage2": 0.5972232222557068, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2228 + }, + { + "completion_length": 11.234375, + "epoch": 0.3905729805502015, + "grad_norm": 14.555579503011074, + "kl": 0.08984375, + "learning_rate": 6.09602242859646e-07, + "loss": 0.036, + "reward": 1.7619819641113281, + "reward_std": 0.12717638909816742, + "rewards/accuracy_reward_stage2": 0.7619818449020386, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2229 + }, + { + "completion_length": 6.1875, + "epoch": 0.39074820396004906, + "grad_norm": 21.25334499700258, + "kl": 0.16015625, + "learning_rate": 6.094270194497984e-07, + "loss": 0.0551, + "reward": 1.6436500549316406, + "reward_std": 0.27374887466430664, + "rewards/accuracy_reward_stage2": 0.7686500549316406, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2230 + }, + { + "completion_length": 13.078125, + "epoch": 0.3909234273698966, + "grad_norm": 20.036000201879755, + "kl": 0.14453125, + "learning_rate": 6.092517960399509e-07, + "loss": 0.0287, + "reward": 1.3101893663406372, + "reward_std": 0.2083069086074829, + "rewards/accuracy_reward_stage2": 0.3258143961429596, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2231 + }, + { + "completion_length": 10.859375, + "epoch": 0.39109865077974415, + "grad_norm": 21.01595135113345, + "kl": 0.11474609375, + "learning_rate": 6.090765726301034e-07, + "loss": -0.0198, + "reward": 1.6504713296890259, + "reward_std": 0.2545785903930664, + "rewards/accuracy_reward_stage2": 0.6817213296890259, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2232 + }, + { + "completion_length": 21.90625, + "epoch": 0.39127387418959175, + "grad_norm": 13.266211129359702, + "kl": 0.04052734375, + "learning_rate": 6.089013492202558e-07, + "loss": -0.028, + "reward": 1.5483436584472656, + "reward_std": 0.0761621966958046, + "rewards/accuracy_reward_stage2": 0.5639687180519104, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2233 + }, + { + "completion_length": 10.96875, + "epoch": 0.3914490975994393, + "grad_norm": 23.78450389226553, + "kl": 0.1162109375, + "learning_rate": 6.087261258104083e-07, + "loss": 0.0023, + "reward": 1.8500159978866577, + "reward_std": 0.21330755949020386, + "rewards/accuracy_reward_stage2": 0.8656409382820129, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2234 + }, + { + "completion_length": 11.3125, + "epoch": 0.39162432100928685, + "grad_norm": 15.860994793149466, + "kl": 0.08984375, + "learning_rate": 6.085509024005607e-07, + "loss": -0.0012, + "reward": 1.834155797958374, + "reward_std": 0.12957896292209625, + "rewards/accuracy_reward_stage2": 0.849780797958374, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2235 + }, + { + "completion_length": 10.53125, + "epoch": 0.3917995444191344, + "grad_norm": 21.031488729143145, + "kl": 0.146484375, + "learning_rate": 6.083756789907132e-07, + "loss": 0.023, + "reward": 1.7553956508636475, + "reward_std": 0.27307990193367004, + "rewards/accuracy_reward_stage2": 0.7710205912590027, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2236 + }, + { + "completion_length": 12.3125, + "epoch": 0.39197476782898194, + "grad_norm": 45.038068276481724, + "kl": 0.052001953125, + "learning_rate": 6.082004555808656e-07, + "loss": -0.0234, + "reward": 1.4747748374938965, + "reward_std": 0.21759963035583496, + "rewards/accuracy_reward_stage2": 0.6153998374938965, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2237 + }, + { + "completion_length": 8.59375, + "epoch": 0.3921499912388295, + "grad_norm": 21.49837229610135, + "kl": 0.054443359375, + "learning_rate": 6.08025232171018e-07, + "loss": 0.0218, + "reward": 1.4371988773345947, + "reward_std": 0.18646962940692902, + "rewards/accuracy_reward_stage2": 0.4371989369392395, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2238 + }, + { + "completion_length": 15.15625, + "epoch": 0.3923252146486771, + "grad_norm": 22.48222287881226, + "kl": 0.1796875, + "learning_rate": 6.078500087611704e-07, + "loss": -0.0154, + "reward": 1.4610028266906738, + "reward_std": 0.27222740650177, + "rewards/accuracy_reward_stage2": 0.6172528862953186, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2239 + }, + { + "completion_length": 16.328125, + "epoch": 0.39250043805852464, + "grad_norm": 19.858045637418105, + "kl": 0.11572265625, + "learning_rate": 6.076747853513228e-07, + "loss": 0.0123, + "reward": 1.6191458702087402, + "reward_std": 0.33029234409332275, + "rewards/accuracy_reward_stage2": 0.6347708702087402, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2240 + }, + { + "completion_length": 8.703125, + "epoch": 0.3926756614683722, + "grad_norm": 15.88710029637002, + "kl": 0.1083984375, + "learning_rate": 6.074995619414753e-07, + "loss": -0.0384, + "reward": 1.7707568407058716, + "reward_std": 0.16039703786373138, + "rewards/accuracy_reward_stage2": 0.8020068407058716, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2241 + }, + { + "completion_length": 15.0, + "epoch": 0.39285088487821973, + "grad_norm": 24.84449475106468, + "kl": 0.03662109375, + "learning_rate": 6.073243385316278e-07, + "loss": 0.0147, + "reward": 1.3003172874450684, + "reward_std": 0.2628664970397949, + "rewards/accuracy_reward_stage2": 0.4253171682357788, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2242 + }, + { + "completion_length": 28.109375, + "epoch": 0.3930261082880673, + "grad_norm": 20.842704108597864, + "kl": 0.1025390625, + "learning_rate": 6.071491151217802e-07, + "loss": 0.041, + "reward": 1.4179072380065918, + "reward_std": 0.1976032704114914, + "rewards/accuracy_reward_stage2": 0.5429072380065918, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2243 + }, + { + "completion_length": 9.0625, + "epoch": 0.3932013316979148, + "grad_norm": 22.012758121864316, + "kl": 0.08203125, + "learning_rate": 6.069738917119327e-07, + "loss": 0.0329, + "reward": 1.6247165203094482, + "reward_std": 0.14634189009666443, + "rewards/accuracy_reward_stage2": 0.6247165203094482, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2244 + }, + { + "completion_length": 14.953125, + "epoch": 0.39337655510776237, + "grad_norm": 24.990969384381177, + "kl": 0.3125, + "learning_rate": 6.067986683020852e-07, + "loss": 0.1155, + "reward": 1.501061201095581, + "reward_std": 0.2306504249572754, + "rewards/accuracy_reward_stage2": 0.641686201095581, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2245 + }, + { + "completion_length": 9.765625, + "epoch": 0.39355177851761, + "grad_norm": 23.167817382707934, + "kl": 0.052734375, + "learning_rate": 6.066234448922376e-07, + "loss": 0.0211, + "reward": 1.5593750476837158, + "reward_std": 0.2636833190917969, + "rewards/accuracy_reward_stage2": 0.684374988079071, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2246 + }, + { + "completion_length": 11.28125, + "epoch": 0.3937270019274575, + "grad_norm": 17.03462934298418, + "kl": 0.0751953125, + "learning_rate": 6.064482214823901e-07, + "loss": 0.03, + "reward": 1.4761418104171753, + "reward_std": 0.2149282991886139, + "rewards/accuracy_reward_stage2": 0.6011418104171753, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2247 + }, + { + "completion_length": 10.0625, + "epoch": 0.39390222533730507, + "grad_norm": 23.729261446581663, + "kl": 0.162109375, + "learning_rate": 6.062729980725426e-07, + "loss": -0.0235, + "reward": 1.747470498085022, + "reward_std": 0.27710628509521484, + "rewards/accuracy_reward_stage2": 0.778720498085022, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2248 + }, + { + "completion_length": 12.90625, + "epoch": 0.3940774487471526, + "grad_norm": 17.009665912805705, + "kl": 0.076171875, + "learning_rate": 6.060977746626949e-07, + "loss": -0.0578, + "reward": 1.5438098907470703, + "reward_std": 0.2387259602546692, + "rewards/accuracy_reward_stage2": 0.5750599503517151, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2249 + }, + { + "completion_length": 11.484375, + "epoch": 0.39425267215700016, + "grad_norm": 17.876369069783316, + "kl": 0.0257568359375, + "learning_rate": 6.059225512528473e-07, + "loss": 0.0103, + "reward": 1.800662875175476, + "reward_std": 0.18343248963356018, + "rewards/accuracy_reward_stage2": 0.8006628751754761, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2250 + }, + { + "completion_length": 13.65625, + "epoch": 0.3944278955668477, + "grad_norm": 17.8567731672947, + "kl": 0.0166015625, + "learning_rate": 6.057473278429997e-07, + "loss": 0.0066, + "reward": 1.554578423500061, + "reward_std": 0.2657412886619568, + "rewards/accuracy_reward_stage2": 0.679578423500061, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2251 + }, + { + "completion_length": 7.46875, + "epoch": 0.3946031189766953, + "grad_norm": 21.025622163803355, + "kl": 0.1298828125, + "learning_rate": 6.055721044331522e-07, + "loss": 0.0519, + "reward": 1.5240048170089722, + "reward_std": 0.25898078083992004, + "rewards/accuracy_reward_stage2": 0.7740048766136169, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2252 + }, + { + "completion_length": 14.46875, + "epoch": 0.39477834238654286, + "grad_norm": 14.991669037359548, + "kl": 0.1845703125, + "learning_rate": 6.053968810233047e-07, + "loss": 0.0736, + "reward": 1.6325411796569824, + "reward_std": 0.10772719979286194, + "rewards/accuracy_reward_stage2": 0.7575411200523376, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2253 + }, + { + "completion_length": 9.609375, + "epoch": 0.3949535657963904, + "grad_norm": 20.17179782436066, + "kl": 0.193359375, + "learning_rate": 6.052216576134571e-07, + "loss": 0.0773, + "reward": 1.7336421012878418, + "reward_std": 0.2646172046661377, + "rewards/accuracy_reward_stage2": 0.7336422204971313, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2254 + }, + { + "completion_length": 9.421875, + "epoch": 0.39512878920623795, + "grad_norm": 22.18340436662211, + "kl": 0.0859375, + "learning_rate": 6.050464342036096e-07, + "loss": -0.0032, + "reward": 1.788942575454712, + "reward_std": 0.17048048973083496, + "rewards/accuracy_reward_stage2": 0.8045675158500671, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2255 + }, + { + "completion_length": 10.53125, + "epoch": 0.3953040126160855, + "grad_norm": 21.908798401167978, + "kl": 0.07763671875, + "learning_rate": 6.04871210793762e-07, + "loss": 0.0022, + "reward": 1.2886775732040405, + "reward_std": 0.23748990893363953, + "rewards/accuracy_reward_stage2": 0.3043026328086853, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2256 + }, + { + "completion_length": 9.015625, + "epoch": 0.39547923602593305, + "grad_norm": 14.891452971280476, + "kl": 0.1376953125, + "learning_rate": 6.046959873839145e-07, + "loss": 0.055, + "reward": 1.643110990524292, + "reward_std": 0.08917830139398575, + "rewards/accuracy_reward_stage2": 0.768110990524292, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2257 + }, + { + "completion_length": 24.640625, + "epoch": 0.39565445943578065, + "grad_norm": 18.532267801307796, + "kl": 0.06689453125, + "learning_rate": 6.04520763974067e-07, + "loss": -0.0174, + "reward": 1.3986705541610718, + "reward_std": 0.2709914743900299, + "rewards/accuracy_reward_stage2": 0.5392955541610718, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2258 + }, + { + "completion_length": 7.75, + "epoch": 0.3958296828456282, + "grad_norm": 19.705860118413078, + "kl": 0.14453125, + "learning_rate": 6.043455405642193e-07, + "loss": 0.0578, + "reward": 1.7622469663619995, + "reward_std": 0.16544394195079803, + "rewards/accuracy_reward_stage2": 0.7622469663619995, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2259 + }, + { + "completion_length": 20.859375, + "epoch": 0.39600490625547574, + "grad_norm": 17.54658164701785, + "kl": 0.0478515625, + "learning_rate": 6.041703171543718e-07, + "loss": 0.0191, + "reward": 1.5218894481658936, + "reward_std": 0.16388002038002014, + "rewards/accuracy_reward_stage2": 0.6468895673751831, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2260 + }, + { + "completion_length": 9.71875, + "epoch": 0.3961801296653233, + "grad_norm": 15.360252803195012, + "kl": 0.0556640625, + "learning_rate": 6.039950937445243e-07, + "loss": 0.0222, + "reward": 1.729975938796997, + "reward_std": 0.08503744006156921, + "rewards/accuracy_reward_stage2": 0.8549759387969971, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2261 + }, + { + "completion_length": 7.609375, + "epoch": 0.39635535307517084, + "grad_norm": 21.248695797842434, + "kl": 0.0908203125, + "learning_rate": 6.038198703346767e-07, + "loss": 0.0363, + "reward": 1.5866072177886963, + "reward_std": 0.25131991505622864, + "rewards/accuracy_reward_stage2": 0.7116071581840515, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2262 + }, + { + "completion_length": 12.5625, + "epoch": 0.3965305764850184, + "grad_norm": 25.841012280803945, + "kl": 0.10498046875, + "learning_rate": 6.036446469248291e-07, + "loss": 0.0421, + "reward": 1.6510810852050781, + "reward_std": 0.21648067235946655, + "rewards/accuracy_reward_stage2": 0.6510810852050781, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2263 + }, + { + "completion_length": 10.796875, + "epoch": 0.39670579989486593, + "grad_norm": 26.90376670754385, + "kl": 0.28515625, + "learning_rate": 6.034694235149815e-07, + "loss": 0.1143, + "reward": 1.5416667461395264, + "reward_std": 0.26196783781051636, + "rewards/accuracy_reward_stage2": 0.6666666865348816, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2264 + }, + { + "completion_length": 14.5, + "epoch": 0.39688102330471353, + "grad_norm": 17.25363604070593, + "kl": 0.1435546875, + "learning_rate": 6.03294200105134e-07, + "loss": 0.0575, + "reward": 1.4917428493499756, + "reward_std": 0.14269062876701355, + "rewards/accuracy_reward_stage2": 0.616742730140686, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2265 + }, + { + "completion_length": 7.953125, + "epoch": 0.3970562467145611, + "grad_norm": 24.496076146044928, + "kl": 0.1142578125, + "learning_rate": 6.031189766952865e-07, + "loss": 0.0456, + "reward": 1.4624096155166626, + "reward_std": 0.2695634663105011, + "rewards/accuracy_reward_stage2": 0.5874096751213074, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2266 + }, + { + "completion_length": 9.890625, + "epoch": 0.3972314701244086, + "grad_norm": 20.884483065722286, + "kl": 0.10107421875, + "learning_rate": 6.029437532854389e-07, + "loss": 0.0405, + "reward": 1.7499773502349854, + "reward_std": 0.29926127195358276, + "rewards/accuracy_reward_stage2": 0.7499772310256958, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2267 + }, + { + "completion_length": 8.53125, + "epoch": 0.3974066935342562, + "grad_norm": 11.781013559961279, + "kl": 0.0162353515625, + "learning_rate": 6.027685298755914e-07, + "loss": 0.0065, + "reward": 1.59375, + "reward_std": 0.10888782143592834, + "rewards/accuracy_reward_stage2": 0.59375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2268 + }, + { + "completion_length": 7.828125, + "epoch": 0.3975819169441037, + "grad_norm": 21.6861325693261, + "kl": 0.07421875, + "learning_rate": 6.025933064657438e-07, + "loss": -0.002, + "reward": 1.6320500373840332, + "reward_std": 0.21844886243343353, + "rewards/accuracy_reward_stage2": 0.6476749777793884, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2269 + }, + { + "completion_length": 10.78125, + "epoch": 0.39775714035395127, + "grad_norm": 23.714687842214964, + "kl": 0.1748046875, + "learning_rate": 6.024180830558962e-07, + "loss": 0.0698, + "reward": 1.3585705757141113, + "reward_std": 0.2857874035835266, + "rewards/accuracy_reward_stage2": 0.3585706651210785, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2270 + }, + { + "completion_length": 10.28125, + "epoch": 0.39793236376379887, + "grad_norm": 14.626456891793655, + "kl": 0.041015625, + "learning_rate": 6.022428596460487e-07, + "loss": 0.0164, + "reward": 1.2995471954345703, + "reward_std": 0.20875424146652222, + "rewards/accuracy_reward_stage2": 0.5495471954345703, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2271 + }, + { + "completion_length": 8.046875, + "epoch": 0.3981075871736464, + "grad_norm": 17.670543866931045, + "kl": 0.12109375, + "learning_rate": 6.020676362362011e-07, + "loss": 0.0484, + "reward": 1.6284711360931396, + "reward_std": 0.2129313349723816, + "rewards/accuracy_reward_stage2": 0.6284710764884949, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2272 + }, + { + "completion_length": 7.96875, + "epoch": 0.39828281058349396, + "grad_norm": 15.718648701678724, + "kl": 0.09619140625, + "learning_rate": 6.018924128263536e-07, + "loss": -0.0058, + "reward": 1.484375, + "reward_std": 0.22673699259757996, + "rewards/accuracy_reward_stage2": 0.5, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2273 + }, + { + "completion_length": 8.921875, + "epoch": 0.3984580339933415, + "grad_norm": 16.76238405847528, + "kl": 0.15234375, + "learning_rate": 6.017171894165061e-07, + "loss": 0.0169, + "reward": 1.5914230346679688, + "reward_std": 0.10464628040790558, + "rewards/accuracy_reward_stage2": 0.6070479154586792, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2274 + }, + { + "completion_length": 14.640625, + "epoch": 0.39863325740318906, + "grad_norm": 19.55876690771695, + "kl": 0.08642578125, + "learning_rate": 6.015419660066584e-07, + "loss": 0.0345, + "reward": 1.808529257774353, + "reward_std": 0.10393321514129639, + "rewards/accuracy_reward_stage2": 0.808529257774353, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2275 + }, + { + "completion_length": 14.078125, + "epoch": 0.3988084808130366, + "grad_norm": 25.976134613056775, + "kl": 0.330078125, + "learning_rate": 6.013667425968109e-07, + "loss": 0.0878, + "reward": 1.2277390956878662, + "reward_std": 0.2750273048877716, + "rewards/accuracy_reward_stage2": 0.49336421489715576, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2276 + }, + { + "completion_length": 10.359375, + "epoch": 0.39898370422288415, + "grad_norm": 25.334821129538327, + "kl": 0.1015625, + "learning_rate": 6.011915191869634e-07, + "loss": 0.0408, + "reward": 1.575644850730896, + "reward_std": 0.2729370892047882, + "rewards/accuracy_reward_stage2": 0.575644850730896, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2277 + }, + { + "completion_length": 17.734375, + "epoch": 0.39915892763273175, + "grad_norm": 21.873042456168918, + "kl": 0.224609375, + "learning_rate": 6.010162957771157e-07, + "loss": 0.09, + "reward": 1.3560564517974854, + "reward_std": 0.22519102692604065, + "rewards/accuracy_reward_stage2": 0.48105645179748535, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2278 + }, + { + "completion_length": 13.09375, + "epoch": 0.3993341510425793, + "grad_norm": 27.114162602146166, + "kl": 0.16796875, + "learning_rate": 6.008410723672682e-07, + "loss": 0.0675, + "reward": 1.3410875797271729, + "reward_std": 0.1357228308916092, + "rewards/accuracy_reward_stage2": 0.4660876393318176, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2279 + }, + { + "completion_length": 10.71875, + "epoch": 0.39950937445242685, + "grad_norm": 16.282857035884422, + "kl": 0.12353515625, + "learning_rate": 6.006658489574206e-07, + "loss": 0.0495, + "reward": 1.3618073463439941, + "reward_std": 0.17284999787807465, + "rewards/accuracy_reward_stage2": 0.4868074059486389, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2280 + }, + { + "completion_length": 9.21875, + "epoch": 0.3996845978622744, + "grad_norm": 22.962387664587848, + "kl": 0.1708984375, + "learning_rate": 6.004906255475731e-07, + "loss": 0.0684, + "reward": 1.4452757835388184, + "reward_std": 0.22587110102176666, + "rewards/accuracy_reward_stage2": 0.44527584314346313, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2281 + }, + { + "completion_length": 6.796875, + "epoch": 0.39985982127212194, + "grad_norm": 11.122280502077027, + "kl": 0.107421875, + "learning_rate": 6.003154021377256e-07, + "loss": 0.0058, + "reward": 1.4873359203338623, + "reward_std": 0.08620868623256683, + "rewards/accuracy_reward_stage2": 0.6279608607292175, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2282 + }, + { + "completion_length": 9.828125, + "epoch": 0.4000350446819695, + "grad_norm": 18.48475644218214, + "kl": 0.06982421875, + "learning_rate": 6.00140178727878e-07, + "loss": -0.037, + "reward": 1.5395770072937012, + "reward_std": 0.3086152970790863, + "rewards/accuracy_reward_stage2": 0.5708270072937012, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2283 + }, + { + "completion_length": 12.078125, + "epoch": 0.4002102680918171, + "grad_norm": 20.75370415639769, + "kl": 0.2001953125, + "learning_rate": 5.999649553180305e-07, + "loss": 0.0359, + "reward": 1.367240309715271, + "reward_std": 0.2471608817577362, + "rewards/accuracy_reward_stage2": 0.507865309715271, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2284 + }, + { + "completion_length": 8.75, + "epoch": 0.40038549150166464, + "grad_norm": 20.710375361431648, + "kl": 0.09619140625, + "learning_rate": 5.99789731908183e-07, + "loss": -0.0057, + "reward": 1.6774652004241943, + "reward_std": 0.2268453687429428, + "rewards/accuracy_reward_stage2": 0.6930902004241943, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2285 + }, + { + "completion_length": 8.09375, + "epoch": 0.4005607149115122, + "grad_norm": 18.298710329293545, + "kl": 0.04296875, + "learning_rate": 5.996145084983354e-07, + "loss": -0.0038, + "reward": 1.4890004396438599, + "reward_std": 0.1191844642162323, + "rewards/accuracy_reward_stage2": 0.5046254992485046, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2286 + }, + { + "completion_length": 9.78125, + "epoch": 0.40073593832135973, + "grad_norm": 16.98439905746384, + "kl": 0.048583984375, + "learning_rate": 5.994392850884879e-07, + "loss": 0.0194, + "reward": 1.4807779788970947, + "reward_std": 0.18450888991355896, + "rewards/accuracy_reward_stage2": 0.4807780086994171, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2287 + }, + { + "completion_length": 16.03125, + "epoch": 0.4009111617312073, + "grad_norm": 12.240341016748312, + "kl": 0.052001953125, + "learning_rate": 5.992640616786401e-07, + "loss": -0.0126, + "reward": 1.375, + "reward_std": 0.1552036553621292, + "rewards/accuracy_reward_stage2": 0.515625, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2288 + }, + { + "completion_length": 7.1875, + "epoch": 0.4010863851410548, + "grad_norm": 19.206907098055197, + "kl": 0.1474609375, + "learning_rate": 5.990888382687926e-07, + "loss": 0.0146, + "reward": 1.5250904560089111, + "reward_std": 0.2802599370479584, + "rewards/accuracy_reward_stage2": 0.5407153964042664, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2289 + }, + { + "completion_length": 10.21875, + "epoch": 0.4012616085509024, + "grad_norm": 16.23780482715037, + "kl": 0.1015625, + "learning_rate": 5.989136148589451e-07, + "loss": -0.0369, + "reward": 1.2621527910232544, + "reward_std": 0.25270047783851624, + "rewards/accuracy_reward_stage2": 0.4027777910232544, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2290 + }, + { + "completion_length": 11.046875, + "epoch": 0.40143683196075, + "grad_norm": 39.776298461562874, + "kl": 0.1328125, + "learning_rate": 5.987383914490975e-07, + "loss": 0.0088, + "reward": 1.7533235549926758, + "reward_std": 0.19562682509422302, + "rewards/accuracy_reward_stage2": 0.8939485549926758, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2291 + }, + { + "completion_length": 27.515625, + "epoch": 0.4016120553705975, + "grad_norm": 22.133452025655327, + "kl": 0.0908203125, + "learning_rate": 5.9856316803925e-07, + "loss": 0.0364, + "reward": 1.5148825645446777, + "reward_std": 0.26382148265838623, + "rewards/accuracy_reward_stage2": 0.5148825645446777, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2292 + }, + { + "completion_length": 9.96875, + "epoch": 0.40178727878044507, + "grad_norm": 14.090229255752924, + "kl": 0.057373046875, + "learning_rate": 5.983879446294025e-07, + "loss": 0.0229, + "reward": 1.6614583730697632, + "reward_std": 0.12609022855758667, + "rewards/accuracy_reward_stage2": 0.6614583134651184, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2293 + }, + { + "completion_length": 16.15625, + "epoch": 0.4019625021902926, + "grad_norm": 24.598171444560585, + "kl": 0.0810546875, + "learning_rate": 5.982127212195549e-07, + "loss": 0.0324, + "reward": 1.4326512813568115, + "reward_std": 0.24987728893756866, + "rewards/accuracy_reward_stage2": 0.43265122175216675, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2294 + }, + { + "completion_length": 8.765625, + "epoch": 0.40213772560014016, + "grad_norm": 23.768059758477552, + "kl": 0.193359375, + "learning_rate": 5.980374978097074e-07, + "loss": 0.0021, + "reward": 1.7757015228271484, + "reward_std": 0.31451430916786194, + "rewards/accuracy_reward_stage2": 0.8069514036178589, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2295 + }, + { + "completion_length": 12.859375, + "epoch": 0.4023129490099877, + "grad_norm": 38.013862095694, + "kl": 0.130859375, + "learning_rate": 5.978622743998598e-07, + "loss": 0.0117, + "reward": 1.4523301124572754, + "reward_std": 0.23838970065116882, + "rewards/accuracy_reward_stage2": 0.7179551124572754, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2296 + }, + { + "completion_length": 13.703125, + "epoch": 0.4024881724198353, + "grad_norm": 1750.6093752011525, + "kl": 7.34375, + "learning_rate": 5.976870509900123e-07, + "loss": 2.9339, + "reward": 1.464247703552246, + "reward_std": 0.19597555696964264, + "rewards/accuracy_reward_stage2": 0.5892475843429565, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2297 + }, + { + "completion_length": 7.96875, + "epoch": 0.40266339582968286, + "grad_norm": 17.351812722344764, + "kl": 0.123046875, + "learning_rate": 5.975118275801648e-07, + "loss": 0.0051, + "reward": 1.6678106784820557, + "reward_std": 0.18729940056800842, + "rewards/accuracy_reward_stage2": 0.6834356784820557, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2298 + }, + { + "completion_length": 19.296875, + "epoch": 0.4028386192395304, + "grad_norm": 16.46383858884921, + "kl": 0.04150390625, + "learning_rate": 5.973366041703171e-07, + "loss": -0.0025, + "reward": 1.6293494701385498, + "reward_std": 0.12097081542015076, + "rewards/accuracy_reward_stage2": 0.6449744701385498, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2299 + }, + { + "completion_length": 5.984375, + "epoch": 0.40301384264937795, + "grad_norm": 44.639775379090395, + "kl": 0.154296875, + "learning_rate": 5.971613807604696e-07, + "loss": 0.062, + "reward": 1.7036259174346924, + "reward_std": 0.2740706503391266, + "rewards/accuracy_reward_stage2": 0.7036257982254028, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2300 + }, + { + "completion_length": 10.21875, + "epoch": 0.4031890660592255, + "grad_norm": 16.01770704561227, + "kl": 0.044189453125, + "learning_rate": 5.969861573506219e-07, + "loss": 0.0177, + "reward": 1.8585901260375977, + "reward_std": 0.14846956729888916, + "rewards/accuracy_reward_stage2": 0.8585900068283081, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2301 + }, + { + "completion_length": 7.4375, + "epoch": 0.40336428946907305, + "grad_norm": 21.29640137549983, + "kl": 0.1064453125, + "learning_rate": 5.968109339407744e-07, + "loss": 0.0426, + "reward": 1.6757261753082275, + "reward_std": 0.28504133224487305, + "rewards/accuracy_reward_stage2": 0.6757262349128723, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2302 + }, + { + "completion_length": 10.34375, + "epoch": 0.40353951287892065, + "grad_norm": 20.686258071687735, + "kl": 0.076171875, + "learning_rate": 5.966357105309269e-07, + "loss": 0.0025, + "reward": 1.31874418258667, + "reward_std": 0.239344522356987, + "rewards/accuracy_reward_stage2": 0.4593692123889923, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2303 + }, + { + "completion_length": 12.9375, + "epoch": 0.4037147362887682, + "grad_norm": 27.775159769904608, + "kl": 0.2314453125, + "learning_rate": 5.964604871210793e-07, + "loss": 0.0925, + "reward": 1.5497610569000244, + "reward_std": 0.1822834312915802, + "rewards/accuracy_reward_stage2": 0.6747609972953796, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2304 + }, + { + "completion_length": 12.25, + "epoch": 0.40388995969861574, + "grad_norm": 17.129685183594717, + "kl": 0.080078125, + "learning_rate": 5.962852637112318e-07, + "loss": -0.0226, + "reward": 1.566606044769287, + "reward_std": 0.20607224106788635, + "rewards/accuracy_reward_stage2": 0.5978560447692871, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2305 + }, + { + "completion_length": 24.8125, + "epoch": 0.4040651831084633, + "grad_norm": 626.6167581526231, + "kl": 5.28125, + "learning_rate": 5.961100403013843e-07, + "loss": 2.0717, + "reward": 1.5107133388519287, + "reward_std": 0.1535727083683014, + "rewards/accuracy_reward_stage2": 0.6513383984565735, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2306 + }, + { + "completion_length": 19.75, + "epoch": 0.40424040651831084, + "grad_norm": 58.60220982847609, + "kl": 0.455078125, + "learning_rate": 5.959348168915367e-07, + "loss": 0.1376, + "reward": 1.3191068172454834, + "reward_std": 0.22959591448307037, + "rewards/accuracy_reward_stage2": 0.5847317576408386, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2307 + }, + { + "completion_length": 10.140625, + "epoch": 0.4044156299281584, + "grad_norm": 24.806034726052975, + "kl": 0.31640625, + "learning_rate": 5.957595934816891e-07, + "loss": 0.0931, + "reward": 1.4665522575378418, + "reward_std": 0.3047543168067932, + "rewards/accuracy_reward_stage2": 0.607177197933197, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2308 + }, + { + "completion_length": 7.09375, + "epoch": 0.404590853338006, + "grad_norm": 16.157756302348197, + "kl": 0.08642578125, + "learning_rate": 5.955843700718416e-07, + "loss": 0.0346, + "reward": 1.6599851846694946, + "reward_std": 0.08853545039892197, + "rewards/accuracy_reward_stage2": 0.6599851250648499, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2309 + }, + { + "completion_length": 13.234375, + "epoch": 0.40476607674785353, + "grad_norm": 50.421575985750216, + "kl": 0.1484375, + "learning_rate": 5.95409146661994e-07, + "loss": 0.0594, + "reward": 1.6158902645111084, + "reward_std": 0.39847201108932495, + "rewards/accuracy_reward_stage2": 0.6158903241157532, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2310 + }, + { + "completion_length": 9.171875, + "epoch": 0.4049413001577011, + "grad_norm": 21.215574229033695, + "kl": 0.109375, + "learning_rate": 5.952339232521465e-07, + "loss": 0.0089, + "reward": 1.5476425886154175, + "reward_std": 0.2785415053367615, + "rewards/accuracy_reward_stage2": 0.563267707824707, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2311 + }, + { + "completion_length": 6.34375, + "epoch": 0.4051165235675486, + "grad_norm": 19.026066267211252, + "kl": 0.185546875, + "learning_rate": 5.950586998422989e-07, + "loss": 0.0298, + "reward": 1.668402910232544, + "reward_std": 0.25595739483833313, + "rewards/accuracy_reward_stage2": 0.6840278506278992, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2312 + }, + { + "completion_length": 9.15625, + "epoch": 0.4052917469773962, + "grad_norm": 18.9975286170903, + "kl": 0.12255859375, + "learning_rate": 5.948834764324514e-07, + "loss": 0.0489, + "reward": 1.6058623790740967, + "reward_std": 0.17858222126960754, + "rewards/accuracy_reward_stage2": 0.6058623790740967, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2313 + }, + { + "completion_length": 7.109375, + "epoch": 0.4054669703872437, + "grad_norm": 21.660857721388656, + "kl": 0.208984375, + "learning_rate": 5.947082530226038e-07, + "loss": -0.0046, + "reward": 1.3151360750198364, + "reward_std": 0.3293163776397705, + "rewards/accuracy_reward_stage2": 0.3463861048221588, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2314 + }, + { + "completion_length": 9.234375, + "epoch": 0.40564219379709127, + "grad_norm": 25.92194524304459, + "kl": 0.259765625, + "learning_rate": 5.945330296127562e-07, + "loss": 0.1021, + "reward": 1.5383646488189697, + "reward_std": 0.32094958424568176, + "rewards/accuracy_reward_stage2": 0.6633646488189697, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2315 + }, + { + "completion_length": 13.359375, + "epoch": 0.40581741720693887, + "grad_norm": 134.6202242774508, + "kl": 0.37890625, + "learning_rate": 5.943578062029087e-07, + "loss": 0.1511, + "reward": 1.5572917461395264, + "reward_std": 0.22779878973960876, + "rewards/accuracy_reward_stage2": 0.6822916269302368, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2316 + }, + { + "completion_length": 10.6875, + "epoch": 0.4059926406167864, + "grad_norm": 17.41673028501159, + "kl": 0.0576171875, + "learning_rate": 5.941825827930611e-07, + "loss": 0.0231, + "reward": 1.4323480129241943, + "reward_std": 0.2165302187204361, + "rewards/accuracy_reward_stage2": 0.43234801292419434, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2317 + }, + { + "completion_length": 10.5, + "epoch": 0.40616786402663396, + "grad_norm": 19.564501287263905, + "kl": 0.1201171875, + "learning_rate": 5.940073593832135e-07, + "loss": 0.0483, + "reward": 1.4573872089385986, + "reward_std": 0.2566508650779724, + "rewards/accuracy_reward_stage2": 0.5823871493339539, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2318 + }, + { + "completion_length": 8.015625, + "epoch": 0.4063430874364815, + "grad_norm": 13.005218537193727, + "kl": 0.1376953125, + "learning_rate": 5.93832135973366e-07, + "loss": -0.0747, + "reward": 1.7554097175598145, + "reward_std": 0.21100175380706787, + "rewards/accuracy_reward_stage2": 0.8022847771644592, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2319 + }, + { + "completion_length": 9.484375, + "epoch": 0.40651831084632906, + "grad_norm": 19.67616530990649, + "kl": 0.064453125, + "learning_rate": 5.936569125635184e-07, + "loss": 0.0259, + "reward": 1.3368675708770752, + "reward_std": 0.3198486864566803, + "rewards/accuracy_reward_stage2": 0.5868675708770752, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2320 + }, + { + "completion_length": 15.015625, + "epoch": 0.4066935342561766, + "grad_norm": 15.25698057189163, + "kl": 0.1123046875, + "learning_rate": 5.934816891536709e-07, + "loss": 0.0017, + "reward": 1.3410911560058594, + "reward_std": 0.07204613089561462, + "rewards/accuracy_reward_stage2": 0.6067162752151489, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2321 + }, + { + "completion_length": 9.078125, + "epoch": 0.4068687576660242, + "grad_norm": 19.99227162306004, + "kl": 0.10546875, + "learning_rate": 5.933064657438234e-07, + "loss": 0.0133, + "reward": 1.8546524047851562, + "reward_std": 0.1757163405418396, + "rewards/accuracy_reward_stage2": 0.8702772855758667, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2322 + }, + { + "completion_length": 9.171875, + "epoch": 0.40704398107587175, + "grad_norm": 22.055503664853624, + "kl": 0.095703125, + "learning_rate": 5.931312423339758e-07, + "loss": -0.0058, + "reward": 1.2581727504730225, + "reward_std": 0.18284207582473755, + "rewards/accuracy_reward_stage2": 0.39879778027534485, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2323 + }, + { + "completion_length": 10.296875, + "epoch": 0.4072192044857193, + "grad_norm": 20.67006780550727, + "kl": 0.07763671875, + "learning_rate": 5.929560189241283e-07, + "loss": -0.0129, + "reward": 1.6703296899795532, + "reward_std": 0.3147152066230774, + "rewards/accuracy_reward_stage2": 0.6859546899795532, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2324 + }, + { + "completion_length": 8.609375, + "epoch": 0.40739442789556685, + "grad_norm": 22.610598881736344, + "kl": 0.1533203125, + "learning_rate": 5.927807955142807e-07, + "loss": 0.0005, + "reward": 1.388228416442871, + "reward_std": 0.27816927433013916, + "rewards/accuracy_reward_stage2": 0.5444784164428711, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2325 + }, + { + "completion_length": 9.5625, + "epoch": 0.4075696513054144, + "grad_norm": 15.881553175034364, + "kl": 0.09912109375, + "learning_rate": 5.926055721044331e-07, + "loss": 0.0023, + "reward": 1.7211300134658813, + "reward_std": 0.22270438075065613, + "rewards/accuracy_reward_stage2": 0.7367550134658813, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2326 + }, + { + "completion_length": 7.59375, + "epoch": 0.40774487471526194, + "grad_norm": 15.311621032584444, + "kl": 0.051513671875, + "learning_rate": 5.924303486945856e-07, + "loss": 0.0206, + "reward": 1.537257194519043, + "reward_std": 0.19294340908527374, + "rewards/accuracy_reward_stage2": 0.537257194519043, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2327 + }, + { + "completion_length": 9.390625, + "epoch": 0.4079200981251095, + "grad_norm": 17.162111824963418, + "kl": 0.12890625, + "learning_rate": 5.922551252847379e-07, + "loss": 0.0073, + "reward": 1.4032111167907715, + "reward_std": 0.16666026413440704, + "rewards/accuracy_reward_stage2": 0.4188360273838043, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2328 + }, + { + "completion_length": 9.03125, + "epoch": 0.4080953215349571, + "grad_norm": 21.17231097602088, + "kl": 0.2021484375, + "learning_rate": 5.920799018748904e-07, + "loss": 0.0426, + "reward": 1.358152151107788, + "reward_std": 0.20485907793045044, + "rewards/accuracy_reward_stage2": 0.3737771511077881, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2329 + }, + { + "completion_length": 12.265625, + "epoch": 0.40827054494480464, + "grad_norm": 30.753156072640614, + "kl": 0.25, + "learning_rate": 5.919046784650429e-07, + "loss": 0.1002, + "reward": 1.5117536783218384, + "reward_std": 0.21533477306365967, + "rewards/accuracy_reward_stage2": 0.7617536783218384, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2330 + }, + { + "completion_length": 16.671875, + "epoch": 0.4084457683546522, + "grad_norm": 16.282888838198275, + "kl": 0.031005859375, + "learning_rate": 5.917294550551953e-07, + "loss": 0.0124, + "reward": 1.5380942821502686, + "reward_std": 0.24841409921646118, + "rewards/accuracy_reward_stage2": 0.5380942821502686, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2331 + }, + { + "completion_length": 17.609375, + "epoch": 0.40862099176449973, + "grad_norm": 24.394382354395756, + "kl": 0.052001953125, + "learning_rate": 5.915542316453478e-07, + "loss": 0.0208, + "reward": 1.5208165645599365, + "reward_std": 0.16024133563041687, + "rewards/accuracy_reward_stage2": 0.5208166241645813, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2332 + }, + { + "completion_length": 8.875, + "epoch": 0.4087962151743473, + "grad_norm": 18.123673916162268, + "kl": 0.10107421875, + "learning_rate": 5.913790082355002e-07, + "loss": 0.0404, + "reward": 1.6102049350738525, + "reward_std": 0.22374418377876282, + "rewards/accuracy_reward_stage2": 0.7352049350738525, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2333 + }, + { + "completion_length": 7.75, + "epoch": 0.4089714385841948, + "grad_norm": 18.41739347695288, + "kl": 0.0859375, + "learning_rate": 5.912037848256527e-07, + "loss": 0.0343, + "reward": 1.2719743251800537, + "reward_std": 0.19079017639160156, + "rewards/accuracy_reward_stage2": 0.39697423577308655, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2334 + }, + { + "completion_length": 9.609375, + "epoch": 0.4091466619940424, + "grad_norm": 21.074570496495934, + "kl": 0.049072265625, + "learning_rate": 5.910285614158052e-07, + "loss": 0.0196, + "reward": 1.3504436016082764, + "reward_std": 0.2122042328119278, + "rewards/accuracy_reward_stage2": 0.47544366121292114, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2335 + }, + { + "completion_length": 22.625, + "epoch": 0.40932188540389, + "grad_norm": 19.930927351670906, + "kl": 0.1630859375, + "learning_rate": 5.908533380059576e-07, + "loss": 0.0238, + "reward": 1.6350904703140259, + "reward_std": 0.2252962738275528, + "rewards/accuracy_reward_stage2": 0.6507154703140259, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2336 + }, + { + "completion_length": 6.921875, + "epoch": 0.4094971088137375, + "grad_norm": 13.86422551326349, + "kl": 0.1845703125, + "learning_rate": 5.906781145961101e-07, + "loss": -0.0588, + "reward": 1.7604167461395264, + "reward_std": 0.19533005356788635, + "rewards/accuracy_reward_stage2": 0.8072916865348816, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2337 + }, + { + "completion_length": 8.78125, + "epoch": 0.40967233222358507, + "grad_norm": 18.223599790270328, + "kl": 0.0693359375, + "learning_rate": 5.905028911862626e-07, + "loss": -0.0035, + "reward": 1.4242560863494873, + "reward_std": 0.21478287875652313, + "rewards/accuracy_reward_stage2": 0.43988117575645447, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2338 + }, + { + "completion_length": 14.75, + "epoch": 0.4098475556334326, + "grad_norm": 20.60659264277177, + "kl": 0.04150390625, + "learning_rate": 5.903276677764148e-07, + "loss": 0.0166, + "reward": 1.7482521533966064, + "reward_std": 0.23158448934555054, + "rewards/accuracy_reward_stage2": 0.7482522130012512, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2339 + }, + { + "completion_length": 8.03125, + "epoch": 0.41002277904328016, + "grad_norm": 17.999935466107047, + "kl": 0.046142578125, + "learning_rate": 5.901524443665673e-07, + "loss": 0.0184, + "reward": 1.4947917461395264, + "reward_std": 0.2120075523853302, + "rewards/accuracy_reward_stage2": 0.4947916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2340 + }, + { + "completion_length": 13.9375, + "epoch": 0.41019800245312776, + "grad_norm": 12.194046839176925, + "kl": 0.134765625, + "learning_rate": 5.899772209567197e-07, + "loss": 0.0121, + "reward": 1.36506986618042, + "reward_std": 0.1311139315366745, + "rewards/accuracy_reward_stage2": 0.6306947469711304, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2341 + }, + { + "completion_length": 9.859375, + "epoch": 0.4103732258629753, + "grad_norm": 19.060395869898677, + "kl": 0.0927734375, + "learning_rate": 5.898019975468722e-07, + "loss": 0.0371, + "reward": 1.7756450176239014, + "reward_std": 0.15166711807250977, + "rewards/accuracy_reward_stage2": 0.7756450176239014, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2342 + }, + { + "completion_length": 10.359375, + "epoch": 0.41054844927282286, + "grad_norm": 19.750944712631785, + "kl": 0.1142578125, + "learning_rate": 5.896267741370247e-07, + "loss": 0.0457, + "reward": 1.4291049242019653, + "reward_std": 0.158258855342865, + "rewards/accuracy_reward_stage2": 0.5541049242019653, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2343 + }, + { + "completion_length": 9.390625, + "epoch": 0.4107236726826704, + "grad_norm": 20.40117083117346, + "kl": 0.1357421875, + "learning_rate": 5.894515507271771e-07, + "loss": 0.0255, + "reward": 1.6197458505630493, + "reward_std": 0.26166221499443054, + "rewards/accuracy_reward_stage2": 0.6353708505630493, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2344 + }, + { + "completion_length": 9.390625, + "epoch": 0.41089889609251795, + "grad_norm": 13.735586804906264, + "kl": 0.0576171875, + "learning_rate": 5.892763273173296e-07, + "loss": -0.0124, + "reward": 1.5586662292480469, + "reward_std": 0.187656432390213, + "rewards/accuracy_reward_stage2": 0.5742912888526917, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2345 + }, + { + "completion_length": 18.46875, + "epoch": 0.4110741195023655, + "grad_norm": 15.581619516175259, + "kl": 0.392578125, + "learning_rate": 5.891011039074821e-07, + "loss": 0.1548, + "reward": 1.738080620765686, + "reward_std": 0.13184432685375214, + "rewards/accuracy_reward_stage2": 0.863080620765686, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2346 + }, + { + "completion_length": 19.359375, + "epoch": 0.41124934291221305, + "grad_norm": 19.30749538544802, + "kl": 0.060546875, + "learning_rate": 5.889258804976345e-07, + "loss": -0.0199, + "reward": 1.4864542484283447, + "reward_std": 0.16072504222393036, + "rewards/accuracy_reward_stage2": 0.6270792484283447, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2347 + }, + { + "completion_length": 15.796875, + "epoch": 0.41142456632206065, + "grad_norm": 14.788056029744874, + "kl": 0.06689453125, + "learning_rate": 5.887506570877869e-07, + "loss": 0.0268, + "reward": 1.5268596410751343, + "reward_std": 0.1403314173221588, + "rewards/accuracy_reward_stage2": 0.5268596410751343, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2348 + }, + { + "completion_length": 10.921875, + "epoch": 0.4115997897319082, + "grad_norm": 19.221941814879607, + "kl": 0.24609375, + "learning_rate": 5.885754336779393e-07, + "loss": -0.0237, + "reward": 1.5493875741958618, + "reward_std": 0.2811095118522644, + "rewards/accuracy_reward_stage2": 0.5962625741958618, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2349 + }, + { + "completion_length": 16.515625, + "epoch": 0.41177501314175574, + "grad_norm": 8.794078311705457, + "kl": 0.03125, + "learning_rate": 5.884002102680918e-07, + "loss": 0.0125, + "reward": 1.5402517318725586, + "reward_std": 0.015763016417622566, + "rewards/accuracy_reward_stage2": 0.5402517318725586, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2350 + }, + { + "completion_length": 13.828125, + "epoch": 0.4119502365516033, + "grad_norm": 17.140128237746744, + "kl": 0.2236328125, + "learning_rate": 5.882249868582443e-07, + "loss": 0.0453, + "reward": 1.393404245376587, + "reward_std": 0.19191637635231018, + "rewards/accuracy_reward_stage2": 0.6590292453765869, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2351 + }, + { + "completion_length": 13.1875, + "epoch": 0.41212545996145084, + "grad_norm": 23.406708522628044, + "kl": 0.2119140625, + "learning_rate": 5.880497634483966e-07, + "loss": -0.006, + "reward": 1.6162077188491821, + "reward_std": 0.24657636880874634, + "rewards/accuracy_reward_stage2": 0.6630827188491821, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2352 + }, + { + "completion_length": 10.125, + "epoch": 0.4123006833712984, + "grad_norm": 15.536105603528492, + "kl": 0.095703125, + "learning_rate": 5.878745400385491e-07, + "loss": 0.0383, + "reward": 1.2230710983276367, + "reward_std": 0.16474489867687225, + "rewards/accuracy_reward_stage2": 0.3480711877346039, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2353 + }, + { + "completion_length": 17.515625, + "epoch": 0.412475906781146, + "grad_norm": 27.055096838392842, + "kl": 0.08203125, + "learning_rate": 5.876993166287016e-07, + "loss": 0.033, + "reward": 1.4038621187210083, + "reward_std": 0.20572970807552338, + "rewards/accuracy_reward_stage2": 0.5288619995117188, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2354 + }, + { + "completion_length": 9.1875, + "epoch": 0.41265113019099353, + "grad_norm": 19.439410924795908, + "kl": 0.1611328125, + "learning_rate": 5.87524093218854e-07, + "loss": 0.0247, + "reward": 1.529209852218628, + "reward_std": 0.2930196523666382, + "rewards/accuracy_reward_stage2": 0.5448348522186279, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2355 + }, + { + "completion_length": 9.890625, + "epoch": 0.4128263536008411, + "grad_norm": 19.132398622575774, + "kl": 0.2080078125, + "learning_rate": 5.873488698090065e-07, + "loss": 0.0829, + "reward": 1.3353736400604248, + "reward_std": 0.12006399035453796, + "rewards/accuracy_reward_stage2": 0.46037358045578003, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2356 + }, + { + "completion_length": 8.625, + "epoch": 0.4130015770106886, + "grad_norm": 18.508285146546704, + "kl": 0.099609375, + "learning_rate": 5.871736463991589e-07, + "loss": -0.0044, + "reward": 1.7954076528549194, + "reward_std": 0.16407202184200287, + "rewards/accuracy_reward_stage2": 0.8110326528549194, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2357 + }, + { + "completion_length": 7.703125, + "epoch": 0.41317680042053617, + "grad_norm": 13.905171688530661, + "kl": 0.04296875, + "learning_rate": 5.869984229893113e-07, + "loss": 0.0172, + "reward": 1.6086777448654175, + "reward_std": 0.10335144400596619, + "rewards/accuracy_reward_stage2": 0.6086777448654175, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2358 + }, + { + "completion_length": 7.5, + "epoch": 0.4133520238303837, + "grad_norm": 19.268524405558992, + "kl": 0.1005859375, + "learning_rate": 5.868231995794638e-07, + "loss": -0.0039, + "reward": 1.7492554187774658, + "reward_std": 0.24843813478946686, + "rewards/accuracy_reward_stage2": 0.7648804187774658, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2359 + }, + { + "completion_length": 6.796875, + "epoch": 0.4135272472402313, + "grad_norm": 19.160492785539688, + "kl": 0.189453125, + "learning_rate": 5.866479761696162e-07, + "loss": 0.0756, + "reward": 1.198094129562378, + "reward_std": 0.22037328779697418, + "rewards/accuracy_reward_stage2": 0.3230942189693451, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2360 + }, + { + "completion_length": 8.96875, + "epoch": 0.41370247065007887, + "grad_norm": 16.825110820402383, + "kl": 0.09228515625, + "learning_rate": 5.864727527597687e-07, + "loss": 0.0089, + "reward": 1.6092510223388672, + "reward_std": 0.12170203030109406, + "rewards/accuracy_reward_stage2": 0.6248759031295776, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2361 + }, + { + "completion_length": 8.5, + "epoch": 0.4138776940599264, + "grad_norm": 7.999499917781706, + "kl": 0.05126953125, + "learning_rate": 5.862975293499212e-07, + "loss": 0.0205, + "reward": 1.359375, + "reward_std": 0.04419417306780815, + "rewards/accuracy_reward_stage2": 0.359375, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2362 + }, + { + "completion_length": 13.21875, + "epoch": 0.41405291746977396, + "grad_norm": 21.393899908468846, + "kl": 0.052001953125, + "learning_rate": 5.861223059400736e-07, + "loss": 0.0209, + "reward": 1.5607662200927734, + "reward_std": 0.2627609670162201, + "rewards/accuracy_reward_stage2": 0.5607661008834839, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2363 + }, + { + "completion_length": 11.78125, + "epoch": 0.4142281408796215, + "grad_norm": 22.023127107995297, + "kl": 0.05908203125, + "learning_rate": 5.859470825302261e-07, + "loss": 0.0236, + "reward": 1.6148502826690674, + "reward_std": 0.16442753374576569, + "rewards/accuracy_reward_stage2": 0.6148503422737122, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2364 + }, + { + "completion_length": 9.03125, + "epoch": 0.41440336428946906, + "grad_norm": 14.362750051760493, + "kl": 0.06787109375, + "learning_rate": 5.857718591203784e-07, + "loss": 0.0272, + "reward": 1.6884300708770752, + "reward_std": 0.07231159508228302, + "rewards/accuracy_reward_stage2": 0.6884300112724304, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2365 + }, + { + "completion_length": 22.828125, + "epoch": 0.4145785876993166, + "grad_norm": 23.613349877555066, + "kl": 0.37890625, + "learning_rate": 5.855966357105309e-07, + "loss": 0.1514, + "reward": 1.411705493927002, + "reward_std": 0.31298691034317017, + "rewards/accuracy_reward_stage2": 0.5367054343223572, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2366 + }, + { + "completion_length": 6.6875, + "epoch": 0.4147538111091642, + "grad_norm": 20.00550996319013, + "kl": 0.09423828125, + "learning_rate": 5.854214123006834e-07, + "loss": 0.0377, + "reward": 1.5901319980621338, + "reward_std": 0.18717949092388153, + "rewards/accuracy_reward_stage2": 0.7151321172714233, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2367 + }, + { + "completion_length": 12.265625, + "epoch": 0.41492903451901175, + "grad_norm": 17.797273669738534, + "kl": 0.05126953125, + "learning_rate": 5.852461888908357e-07, + "loss": 0.0205, + "reward": 1.3920097351074219, + "reward_std": 0.1308312863111496, + "rewards/accuracy_reward_stage2": 0.3920097351074219, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2368 + }, + { + "completion_length": 11.859375, + "epoch": 0.4151042579288593, + "grad_norm": 26.299178173381538, + "kl": 0.166015625, + "learning_rate": 5.850709654809882e-07, + "loss": 0.0593, + "reward": 1.4359591007232666, + "reward_std": 0.17541763186454773, + "rewards/accuracy_reward_stage2": 0.685958981513977, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2369 + }, + { + "completion_length": 17.78125, + "epoch": 0.41527948133870685, + "grad_norm": 14.485488856701455, + "kl": 0.06640625, + "learning_rate": 5.848957420711407e-07, + "loss": -0.0014, + "reward": 1.775770664215088, + "reward_std": 0.10899923741817474, + "rewards/accuracy_reward_stage2": 0.7913956046104431, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2370 + }, + { + "completion_length": 27.265625, + "epoch": 0.4154547047485544, + "grad_norm": 17.881757409429884, + "kl": 0.341796875, + "learning_rate": 5.847205186612931e-07, + "loss": 0.1376, + "reward": 1.4621247053146362, + "reward_std": 0.07601737231016159, + "rewards/accuracy_reward_stage2": 0.5871245861053467, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2371 + }, + { + "completion_length": 8.390625, + "epoch": 0.41562992815840194, + "grad_norm": 25.055742932493168, + "kl": 0.0986328125, + "learning_rate": 5.845452952514456e-07, + "loss": 0.0395, + "reward": 1.6540504693984985, + "reward_std": 0.20673657953739166, + "rewards/accuracy_reward_stage2": 0.6540504693984985, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2372 + }, + { + "completion_length": 11.375, + "epoch": 0.41580515156824954, + "grad_norm": 16.521109231983434, + "kl": 0.10546875, + "learning_rate": 5.84370071841598e-07, + "loss": 0.0421, + "reward": 1.4801661968231201, + "reward_std": 0.12700024247169495, + "rewards/accuracy_reward_stage2": 0.4801662564277649, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2373 + }, + { + "completion_length": 11.171875, + "epoch": 0.4159803749780971, + "grad_norm": 20.12090258507575, + "kl": 0.07568359375, + "learning_rate": 5.841948484317505e-07, + "loss": 0.0302, + "reward": 1.670166015625, + "reward_std": 0.10102277249097824, + "rewards/accuracy_reward_stage2": 0.6701659560203552, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2374 + }, + { + "completion_length": 10.09375, + "epoch": 0.41615559838794464, + "grad_norm": 16.442767290434187, + "kl": 0.12451171875, + "learning_rate": 5.84019625021903e-07, + "loss": 0.0499, + "reward": 1.6291148662567139, + "reward_std": 0.06639102101325989, + "rewards/accuracy_reward_stage2": 0.7541148662567139, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2375 + }, + { + "completion_length": 8.296875, + "epoch": 0.4163308217977922, + "grad_norm": 21.97629019067572, + "kl": 0.1572265625, + "learning_rate": 5.838444016120554e-07, + "loss": 0.0202, + "reward": 1.7240822315216064, + "reward_std": 0.2035062313079834, + "rewards/accuracy_reward_stage2": 0.7397072315216064, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2376 + }, + { + "completion_length": 14.984375, + "epoch": 0.41650604520763973, + "grad_norm": 30.896256657541, + "kl": 0.12158203125, + "learning_rate": 5.836691782022077e-07, + "loss": 0.0487, + "reward": 1.5382190942764282, + "reward_std": 0.3024444580078125, + "rewards/accuracy_reward_stage2": 0.5382190942764282, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2377 + }, + { + "completion_length": 10.125, + "epoch": 0.4166812686174873, + "grad_norm": 20.28747836466928, + "kl": 0.11572265625, + "learning_rate": 5.834939547923601e-07, + "loss": 0.0462, + "reward": 1.73995041847229, + "reward_std": 0.20338395237922668, + "rewards/accuracy_reward_stage2": 0.7399503588676453, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2378 + }, + { + "completion_length": 10.609375, + "epoch": 0.4168564920273349, + "grad_norm": 13.97976660621444, + "kl": 0.0196533203125, + "learning_rate": 5.833187313825126e-07, + "loss": 0.0079, + "reward": 1.5437802076339722, + "reward_std": 0.11976291984319687, + "rewards/accuracy_reward_stage2": 0.6687802076339722, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2379 + }, + { + "completion_length": 16.25, + "epoch": 0.4170317154371824, + "grad_norm": 16.29885843641377, + "kl": 0.033447265625, + "learning_rate": 5.831435079726651e-07, + "loss": 0.0134, + "reward": 1.239460825920105, + "reward_std": 0.17725922167301178, + "rewards/accuracy_reward_stage2": 0.2394607961177826, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2380 + }, + { + "completion_length": 9.4375, + "epoch": 0.41720693884703, + "grad_norm": 17.520922343117828, + "kl": 0.177734375, + "learning_rate": 5.829682845628175e-07, + "loss": 0.071, + "reward": 1.414095163345337, + "reward_std": 0.1704052984714508, + "rewards/accuracy_reward_stage2": 0.41409510374069214, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2381 + }, + { + "completion_length": 21.375, + "epoch": 0.4173821622568775, + "grad_norm": 17.758151692325338, + "kl": 0.07666015625, + "learning_rate": 5.8279306115297e-07, + "loss": -0.0132, + "reward": 1.573897123336792, + "reward_std": 0.15478408336639404, + "rewards/accuracy_reward_stage2": 0.589522123336792, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2382 + }, + { + "completion_length": 10.6875, + "epoch": 0.41755738566672507, + "grad_norm": 21.532408390881507, + "kl": 0.1416015625, + "learning_rate": 5.826178377431225e-07, + "loss": 0.0567, + "reward": 1.5427581071853638, + "reward_std": 0.1347476989030838, + "rewards/accuracy_reward_stage2": 0.6677581071853638, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2383 + }, + { + "completion_length": 13.84375, + "epoch": 0.4177326090765726, + "grad_norm": 16.64175548823734, + "kl": 0.07080078125, + "learning_rate": 5.824426143332749e-07, + "loss": 0.0282, + "reward": 1.6984035968780518, + "reward_std": 0.1400546133518219, + "rewards/accuracy_reward_stage2": 0.6984036564826965, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2384 + }, + { + "completion_length": 10.765625, + "epoch": 0.41790783248642016, + "grad_norm": 16.949582968543215, + "kl": 0.08544921875, + "learning_rate": 5.822673909234274e-07, + "loss": -0.0414, + "reward": 1.7102024555206299, + "reward_std": 0.28166982531547546, + "rewards/accuracy_reward_stage2": 0.7414524555206299, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2385 + }, + { + "completion_length": 11.203125, + "epoch": 0.41808305589626776, + "grad_norm": 12.914250997053147, + "kl": 0.1337890625, + "learning_rate": 5.820921675135799e-07, + "loss": -0.0193, + "reward": 1.5751521587371826, + "reward_std": 0.10636032372713089, + "rewards/accuracy_reward_stage2": 0.6064021587371826, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2386 + }, + { + "completion_length": 12.765625, + "epoch": 0.4182582793061153, + "grad_norm": 17.62833798934218, + "kl": 0.16796875, + "learning_rate": 5.819169441037323e-07, + "loss": 0.0231, + "reward": 1.8179621696472168, + "reward_std": 0.18188484013080597, + "rewards/accuracy_reward_stage2": 0.8335871696472168, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2387 + }, + { + "completion_length": 12.96875, + "epoch": 0.41843350271596286, + "grad_norm": 18.760679579906835, + "kl": 0.09765625, + "learning_rate": 5.817417206938847e-07, + "loss": 0.0392, + "reward": 1.4825525283813477, + "reward_std": 0.19686242938041687, + "rewards/accuracy_reward_stage2": 0.6075524687767029, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2388 + }, + { + "completion_length": 9.125, + "epoch": 0.4186087261258104, + "grad_norm": 16.455958478420655, + "kl": 0.0208740234375, + "learning_rate": 5.815664972840371e-07, + "loss": 0.0083, + "reward": 1.5572917461395264, + "reward_std": 0.062747523188591, + "rewards/accuracy_reward_stage2": 0.5572916865348816, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2389 + }, + { + "completion_length": 17.234375, + "epoch": 0.41878394953565795, + "grad_norm": 21.767371412439168, + "kl": 0.177734375, + "learning_rate": 5.813912738741895e-07, + "loss": -0.022, + "reward": 1.750274419784546, + "reward_std": 0.2638322710990906, + "rewards/accuracy_reward_stage2": 0.7971494793891907, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2390 + }, + { + "completion_length": 7.15625, + "epoch": 0.4189591729455055, + "grad_norm": 19.398944510953225, + "kl": 0.09716796875, + "learning_rate": 5.81216050464342e-07, + "loss": 0.0388, + "reward": 1.7229228019714355, + "reward_std": 0.1554725468158722, + "rewards/accuracy_reward_stage2": 0.7229227423667908, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2391 + }, + { + "completion_length": 11.984375, + "epoch": 0.4191343963553531, + "grad_norm": 25.80667769908971, + "kl": 0.1220703125, + "learning_rate": 5.810408270544944e-07, + "loss": 0.0489, + "reward": 1.493762493133545, + "reward_std": 0.32909145951271057, + "rewards/accuracy_reward_stage2": 0.6187624335289001, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2392 + }, + { + "completion_length": 9.6875, + "epoch": 0.41930961976520065, + "grad_norm": 24.204346190531403, + "kl": 0.2890625, + "learning_rate": 5.808656036446469e-07, + "loss": 0.1156, + "reward": 1.220840334892273, + "reward_std": 0.378046452999115, + "rewards/accuracy_reward_stage2": 0.595840334892273, + "rewards/format_reward_stage1_pointerpad": 0.625, + "scores/accuracy_reward_stage2": 0.625, + "step": 2393 + }, + { + "completion_length": 20.84375, + "epoch": 0.4194848431750482, + "grad_norm": 27.14247331797663, + "kl": 0.12353515625, + "learning_rate": 5.806903802347993e-07, + "loss": 0.0053, + "reward": 1.4899253845214844, + "reward_std": 0.243827223777771, + "rewards/accuracy_reward_stage2": 0.6305502653121948, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2394 + }, + { + "completion_length": 11.125, + "epoch": 0.41966006658489574, + "grad_norm": 21.81038085160431, + "kl": 0.2138671875, + "learning_rate": 5.805151568249518e-07, + "loss": 0.0856, + "reward": 1.4024500846862793, + "reward_std": 0.35116198658943176, + "rewards/accuracy_reward_stage2": 0.5274500846862793, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2395 + }, + { + "completion_length": 17.34375, + "epoch": 0.4198352899947433, + "grad_norm": 24.3348961842837, + "kl": 0.027099609375, + "learning_rate": 5.803399334151043e-07, + "loss": -0.0334, + "reward": 1.5152560472488403, + "reward_std": 0.20777665078639984, + "rewards/accuracy_reward_stage2": 0.6558809876441956, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2396 + }, + { + "completion_length": 8.40625, + "epoch": 0.42001051340459084, + "grad_norm": 22.865852095520676, + "kl": 0.06640625, + "learning_rate": 5.801647100052566e-07, + "loss": -0.007, + "reward": 1.7035300731658936, + "reward_std": 0.2735273838043213, + "rewards/accuracy_reward_stage2": 0.7191551327705383, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2397 + }, + { + "completion_length": 7.6875, + "epoch": 0.4201857368144384, + "grad_norm": 12.864278552382983, + "kl": 0.0267333984375, + "learning_rate": 5.799894865954091e-07, + "loss": 0.0107, + "reward": 1.5834779739379883, + "reward_std": 0.10687437653541565, + "rewards/accuracy_reward_stage2": 0.5834779739379883, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2398 + }, + { + "completion_length": 10.234375, + "epoch": 0.420360960224286, + "grad_norm": 14.545711269406413, + "kl": 0.031494140625, + "learning_rate": 5.798142631855616e-07, + "loss": 0.0126, + "reward": 1.5031486749649048, + "reward_std": 0.15417417883872986, + "rewards/accuracy_reward_stage2": 0.6281486749649048, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2399 + }, + { + "completion_length": 11.40625, + "epoch": 0.42053618363413353, + "grad_norm": 21.409633551926984, + "kl": 0.0791015625, + "learning_rate": 5.79639039775714e-07, + "loss": 0.0028, + "reward": 1.5188215970993042, + "reward_std": 0.26314669847488403, + "rewards/accuracy_reward_stage2": 0.5344465970993042, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2400 + }, + { + "completion_length": 9.671875, + "epoch": 0.4207114070439811, + "grad_norm": 19.715486362642697, + "kl": 0.06591796875, + "learning_rate": 5.794638163658665e-07, + "loss": 0.0264, + "reward": 1.4238401651382446, + "reward_std": 0.18752865493297577, + "rewards/accuracy_reward_stage2": 0.42384013533592224, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2401 + }, + { + "completion_length": 8.671875, + "epoch": 0.4208866304538286, + "grad_norm": 26.864964821679248, + "kl": 0.2158203125, + "learning_rate": 5.792885929560189e-07, + "loss": 0.0476, + "reward": 1.5920884609222412, + "reward_std": 0.20743471384048462, + "rewards/accuracy_reward_stage2": 0.607713520526886, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2402 + }, + { + "completion_length": 7.1875, + "epoch": 0.42106185386367617, + "grad_norm": 19.490899661332257, + "kl": 0.111328125, + "learning_rate": 5.791133695461713e-07, + "loss": 0.0059, + "reward": 1.6936674118041992, + "reward_std": 0.1947699785232544, + "rewards/accuracy_reward_stage2": 0.7092924118041992, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2403 + }, + { + "completion_length": 8.515625, + "epoch": 0.4212370772735237, + "grad_norm": 18.196450672215533, + "kl": 0.138671875, + "learning_rate": 5.789381461363238e-07, + "loss": -0.0328, + "reward": 1.6714731454849243, + "reward_std": 0.287492960691452, + "rewards/accuracy_reward_stage2": 0.7027231454849243, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2404 + }, + { + "completion_length": 14.765625, + "epoch": 0.4214123006833713, + "grad_norm": 12.377034503915139, + "kl": 0.01177978515625, + "learning_rate": 5.787629227264762e-07, + "loss": 0.0047, + "reward": 1.6268939971923828, + "reward_std": 0.11237125098705292, + "rewards/accuracy_reward_stage2": 0.751893937587738, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2405 + }, + { + "completion_length": 11.703125, + "epoch": 0.42158752409321887, + "grad_norm": 22.596957861242434, + "kl": 0.1015625, + "learning_rate": 5.785876993166287e-07, + "loss": 0.0407, + "reward": 1.673307180404663, + "reward_std": 0.26322025060653687, + "rewards/accuracy_reward_stage2": 0.6733071804046631, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2406 + }, + { + "completion_length": 10.765625, + "epoch": 0.4217627475030664, + "grad_norm": 10.012152757261067, + "kl": 0.06640625, + "learning_rate": 5.784124759067811e-07, + "loss": 0.0266, + "reward": 1.777231216430664, + "reward_std": 0.07235102355480194, + "rewards/accuracy_reward_stage2": 0.7772312760353088, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2407 + }, + { + "completion_length": 33.0625, + "epoch": 0.42193797091291396, + "grad_norm": 22.549033708865686, + "kl": 0.06396484375, + "learning_rate": 5.782372524969335e-07, + "loss": -0.0232, + "reward": 1.5575335025787354, + "reward_std": 0.25364547967910767, + "rewards/accuracy_reward_stage2": 0.5887835025787354, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2408 + }, + { + "completion_length": 10.1875, + "epoch": 0.4221131943227615, + "grad_norm": 16.69878136588533, + "kl": 0.11328125, + "learning_rate": 5.78062029087086e-07, + "loss": 0.0012, + "reward": 1.7619128227233887, + "reward_std": 0.23174157738685608, + "rewards/accuracy_reward_stage2": 0.7775378227233887, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2409 + }, + { + "completion_length": 8.5625, + "epoch": 0.42228841773260906, + "grad_norm": 19.92237217185688, + "kl": 0.1728515625, + "learning_rate": 5.778868056772384e-07, + "loss": 0.025, + "reward": 1.4769458770751953, + "reward_std": 0.22557707130908966, + "rewards/accuracy_reward_stage2": 0.4925709366798401, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2410 + }, + { + "completion_length": 10.265625, + "epoch": 0.42246364114245666, + "grad_norm": 21.942952623213802, + "kl": 0.056884765625, + "learning_rate": 5.777115822673909e-07, + "loss": 0.0227, + "reward": 1.838803768157959, + "reward_std": 0.1688080132007599, + "rewards/accuracy_reward_stage2": 0.838803768157959, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2411 + }, + { + "completion_length": 11.25, + "epoch": 0.4226388645523042, + "grad_norm": 16.68553099657329, + "kl": 0.07958984375, + "learning_rate": 5.775363588575434e-07, + "loss": -0.0122, + "reward": 1.596681833267212, + "reward_std": 0.20045030117034912, + "rewards/accuracy_reward_stage2": 0.6123068332672119, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2412 + }, + { + "completion_length": 9.546875, + "epoch": 0.42281408796215175, + "grad_norm": 19.458085509509655, + "kl": 0.2001953125, + "learning_rate": 5.773611354476958e-07, + "loss": 0.0078, + "reward": 1.3699358701705933, + "reward_std": 0.256511926651001, + "rewards/accuracy_reward_stage2": 0.4011858403682709, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2413 + }, + { + "completion_length": 13.25, + "epoch": 0.4229893113719993, + "grad_norm": 21.411986512608394, + "kl": 0.2060546875, + "learning_rate": 5.771859120378483e-07, + "loss": 0.0826, + "reward": 1.5075629949569702, + "reward_std": 0.20391272008419037, + "rewards/accuracy_reward_stage2": 0.632563054561615, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2414 + }, + { + "completion_length": 17.734375, + "epoch": 0.42316453478184685, + "grad_norm": 30.774292690055958, + "kl": 0.1767578125, + "learning_rate": 5.770106886280008e-07, + "loss": 0.0706, + "reward": 1.3399405479431152, + "reward_std": 0.2806154489517212, + "rewards/accuracy_reward_stage2": 0.46494060754776, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2415 + }, + { + "completion_length": 7.5, + "epoch": 0.4233397581916944, + "grad_norm": 14.927647667623011, + "kl": 0.0595703125, + "learning_rate": 5.768354652181531e-07, + "loss": -0.0204, + "reward": 1.621319055557251, + "reward_std": 0.12363035976886749, + "rewards/accuracy_reward_stage2": 0.636944055557251, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2416 + }, + { + "completion_length": 10.6875, + "epoch": 0.42351498160154194, + "grad_norm": 20.65694451978839, + "kl": 0.087890625, + "learning_rate": 5.766602418083055e-07, + "loss": 0.0352, + "reward": 1.674845814704895, + "reward_std": 0.1950603872537613, + "rewards/accuracy_reward_stage2": 0.6748457551002502, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2417 + }, + { + "completion_length": 10.671875, + "epoch": 0.42369020501138954, + "grad_norm": 19.60176983232829, + "kl": 0.0908203125, + "learning_rate": 5.764850183984579e-07, + "loss": 0.0234, + "reward": 1.410792350769043, + "reward_std": 0.20295007526874542, + "rewards/accuracy_reward_stage2": 0.4264172911643982, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2418 + }, + { + "completion_length": 17.28125, + "epoch": 0.4238654284212371, + "grad_norm": 15.447747709824291, + "kl": 0.029296875, + "learning_rate": 5.763097949886104e-07, + "loss": 0.0117, + "reward": 1.539503812789917, + "reward_std": 0.120786651968956, + "rewards/accuracy_reward_stage2": 0.539503812789917, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2419 + }, + { + "completion_length": 12.53125, + "epoch": 0.42404065183108464, + "grad_norm": 26.228768769751692, + "kl": 0.1552734375, + "learning_rate": 5.761345715787629e-07, + "loss": 0.0285, + "reward": 1.535796880722046, + "reward_std": 0.2801082730293274, + "rewards/accuracy_reward_stage2": 0.5514217615127563, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2420 + }, + { + "completion_length": 29.546875, + "epoch": 0.4242158752409322, + "grad_norm": 19.516174382115324, + "kl": 0.12890625, + "learning_rate": 5.759593481689153e-07, + "loss": 0.0075, + "reward": 1.733359456062317, + "reward_std": 0.17085593938827515, + "rewards/accuracy_reward_stage2": 0.7489844560623169, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2421 + }, + { + "completion_length": 14.4375, + "epoch": 0.42439109865077973, + "grad_norm": 19.17537638480682, + "kl": 0.091796875, + "learning_rate": 5.757841247590678e-07, + "loss": 0.0368, + "reward": 1.246988296508789, + "reward_std": 0.1498282104730606, + "rewards/accuracy_reward_stage2": 0.49698832631111145, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2422 + }, + { + "completion_length": 11.703125, + "epoch": 0.4245663220606273, + "grad_norm": 23.20436750248409, + "kl": 0.0751953125, + "learning_rate": 5.756089013492203e-07, + "loss": -0.0139, + "reward": 1.7541325092315674, + "reward_std": 0.22958284616470337, + "rewards/accuracy_reward_stage2": 0.7697575092315674, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2423 + }, + { + "completion_length": 8.96875, + "epoch": 0.4247415454704749, + "grad_norm": 21.20783789239163, + "kl": 0.1357421875, + "learning_rate": 5.754336779393727e-07, + "loss": 0.0542, + "reward": 1.7458889484405518, + "reward_std": 0.29691436886787415, + "rewards/accuracy_reward_stage2": 0.7458890080451965, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2424 + }, + { + "completion_length": 16.796875, + "epoch": 0.4249167688803224, + "grad_norm": 23.664822906611626, + "kl": 0.24609375, + "learning_rate": 5.752584545295252e-07, + "loss": 0.0985, + "reward": 1.4297152757644653, + "reward_std": 0.28032106161117554, + "rewards/accuracy_reward_stage2": 0.5547152161598206, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2425 + }, + { + "completion_length": 17.390625, + "epoch": 0.42509199229017, + "grad_norm": 15.369191055475348, + "kl": 0.037109375, + "learning_rate": 5.750832311196776e-07, + "loss": 0.0148, + "reward": 1.5607839822769165, + "reward_std": 0.09787797927856445, + "rewards/accuracy_reward_stage2": 0.6857839822769165, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2426 + }, + { + "completion_length": 19.46875, + "epoch": 0.4252672157000175, + "grad_norm": 24.312044808358525, + "kl": 0.1904296875, + "learning_rate": 5.7490800770983e-07, + "loss": 0.076, + "reward": 1.4541585445404053, + "reward_std": 0.1744290292263031, + "rewards/accuracy_reward_stage2": 0.7041586637496948, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2427 + }, + { + "completion_length": 17.9375, + "epoch": 0.42544243910986507, + "grad_norm": 23.50930204251329, + "kl": 0.1923828125, + "learning_rate": 5.747327842999824e-07, + "loss": 0.0771, + "reward": 1.4003291130065918, + "reward_std": 0.22130905091762543, + "rewards/accuracy_reward_stage2": 0.6503292322158813, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2428 + }, + { + "completion_length": 9.015625, + "epoch": 0.4256176625197126, + "grad_norm": 22.13947088990283, + "kl": 0.1806640625, + "learning_rate": 5.745575608901348e-07, + "loss": 0.0161, + "reward": 1.6461684703826904, + "reward_std": 0.2596883475780487, + "rewards/accuracy_reward_stage2": 0.6774183511734009, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2429 + }, + { + "completion_length": 11.3125, + "epoch": 0.4257928859295602, + "grad_norm": 22.035671063081804, + "kl": 0.1474609375, + "learning_rate": 5.743823374802873e-07, + "loss": 0.015, + "reward": 1.395911693572998, + "reward_std": 0.24767053127288818, + "rewards/accuracy_reward_stage2": 0.41153672337532043, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2430 + }, + { + "completion_length": 21.15625, + "epoch": 0.42596810933940776, + "grad_norm": 20.96133435505049, + "kl": 0.1572265625, + "learning_rate": 5.742071140704398e-07, + "loss": 0.0628, + "reward": 1.5663808584213257, + "reward_std": 0.1595151424407959, + "rewards/accuracy_reward_stage2": 0.6913807988166809, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2431 + }, + { + "completion_length": 7.875, + "epoch": 0.4261433327492553, + "grad_norm": 28.340403516253645, + "kl": 0.0498046875, + "learning_rate": 5.740318906605922e-07, + "loss": -0.0241, + "reward": 1.5094510316848755, + "reward_std": 0.2710912227630615, + "rewards/accuracy_reward_stage2": 0.5250759720802307, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2432 + }, + { + "completion_length": 13.765625, + "epoch": 0.42631855615910286, + "grad_norm": 19.890579026197372, + "kl": 0.039306640625, + "learning_rate": 5.738566672507447e-07, + "loss": 0.0158, + "reward": 1.6429111957550049, + "reward_std": 0.16243639588356018, + "rewards/accuracy_reward_stage2": 0.6429111957550049, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2433 + }, + { + "completion_length": 12.59375, + "epoch": 0.4264937795689504, + "grad_norm": 22.00464367213492, + "kl": 0.1494140625, + "learning_rate": 5.736814438408971e-07, + "loss": -0.0047, + "reward": 1.474339246749878, + "reward_std": 0.2699277997016907, + "rewards/accuracy_reward_stage2": 0.5055892467498779, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2434 + }, + { + "completion_length": 11.609375, + "epoch": 0.42666900297879795, + "grad_norm": 32.41247879005575, + "kl": 0.279296875, + "learning_rate": 5.735062204310496e-07, + "loss": 0.0677, + "reward": 1.5541329383850098, + "reward_std": 0.13045634329319, + "rewards/accuracy_reward_stage2": 0.6947579383850098, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2435 + }, + { + "completion_length": 10.5, + "epoch": 0.4268442263886455, + "grad_norm": 19.876951523107635, + "kl": 0.046630859375, + "learning_rate": 5.733309970212021e-07, + "loss": -0.0256, + "reward": 1.3027304410934448, + "reward_std": 0.21071302890777588, + "rewards/accuracy_reward_stage2": 0.3183554708957672, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2436 + }, + { + "completion_length": 9.25, + "epoch": 0.4270194497984931, + "grad_norm": 18.20379230867892, + "kl": 0.01953125, + "learning_rate": 5.731557736113544e-07, + "loss": 0.0078, + "reward": 1.5104167461395264, + "reward_std": 0.25927814841270447, + "rewards/accuracy_reward_stage2": 0.5104166269302368, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2437 + }, + { + "completion_length": 5.984375, + "epoch": 0.42719467320834065, + "grad_norm": 16.66665150309482, + "kl": 0.0673828125, + "learning_rate": 5.729805502015069e-07, + "loss": -0.002, + "reward": 1.6153621673583984, + "reward_std": 0.2000160962343216, + "rewards/accuracy_reward_stage2": 0.6309871673583984, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2438 + }, + { + "completion_length": 8.515625, + "epoch": 0.4273698966181882, + "grad_norm": 15.293604925338169, + "kl": 0.0712890625, + "learning_rate": 5.728053267916594e-07, + "loss": 0.0286, + "reward": 1.5463223457336426, + "reward_std": 0.12554559111595154, + "rewards/accuracy_reward_stage2": 0.5463222861289978, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2439 + }, + { + "completion_length": 7.78125, + "epoch": 0.42754512002803574, + "grad_norm": 12.185368006882296, + "kl": 0.053955078125, + "learning_rate": 5.726301033818118e-07, + "loss": 0.0216, + "reward": 1.6412497758865356, + "reward_std": 0.06281961500644684, + "rewards/accuracy_reward_stage2": 0.6412497758865356, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2440 + }, + { + "completion_length": 8.6875, + "epoch": 0.4277203434378833, + "grad_norm": 15.494160366052823, + "kl": 0.125, + "learning_rate": 5.724548799719642e-07, + "loss": 0.05, + "reward": 1.5815420150756836, + "reward_std": 0.10027365386486053, + "rewards/accuracy_reward_stage2": 0.5815420150756836, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2441 + }, + { + "completion_length": 10.921875, + "epoch": 0.42789556684773083, + "grad_norm": 18.107070394943033, + "kl": 0.07080078125, + "learning_rate": 5.722796565621166e-07, + "loss": 0.0283, + "reward": 1.3592140674591064, + "reward_std": 0.12985925376415253, + "rewards/accuracy_reward_stage2": 0.4842139780521393, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2442 + }, + { + "completion_length": 9.21875, + "epoch": 0.42807079025757844, + "grad_norm": 26.157579176268243, + "kl": 0.1611328125, + "learning_rate": 5.721044331522691e-07, + "loss": 0.0689, + "reward": 1.3738298416137695, + "reward_std": 0.31853947043418884, + "rewards/accuracy_reward_stage2": 0.63945472240448, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2443 + }, + { + "completion_length": 13.328125, + "epoch": 0.428246013667426, + "grad_norm": 44.37524748602229, + "kl": 0.283203125, + "learning_rate": 5.719292097424216e-07, + "loss": 0.0741, + "reward": 1.4276965856552124, + "reward_std": 0.2782999873161316, + "rewards/accuracy_reward_stage2": 0.5683215260505676, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2444 + }, + { + "completion_length": 11.09375, + "epoch": 0.42842123707727353, + "grad_norm": 23.422889478777957, + "kl": 0.208984375, + "learning_rate": 5.71753986332574e-07, + "loss": 0.0458, + "reward": 1.4923583269119263, + "reward_std": 0.24954932928085327, + "rewards/accuracy_reward_stage2": 0.6329833269119263, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2445 + }, + { + "completion_length": 7.0, + "epoch": 0.4285964604871211, + "grad_norm": 17.028429839600122, + "kl": 0.08251953125, + "learning_rate": 5.715787629227265e-07, + "loss": 0.0041, + "reward": 1.652631163597107, + "reward_std": 0.1518530398607254, + "rewards/accuracy_reward_stage2": 0.6682562232017517, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2446 + }, + { + "completion_length": 11.71875, + "epoch": 0.4287716838969686, + "grad_norm": 18.844946250417486, + "kl": 0.259765625, + "learning_rate": 5.714035395128789e-07, + "loss": -0.0017, + "reward": 1.7519108057022095, + "reward_std": 0.26788169145584106, + "rewards/accuracy_reward_stage2": 0.7987857460975647, + "rewards/format_reward_stage1_pointerpad": 0.953125, + "scores/accuracy_reward_stage2": 0.953125, + "step": 2447 + }, + { + "completion_length": 9.390625, + "epoch": 0.42894690730681617, + "grad_norm": 16.434159640612602, + "kl": 0.11669921875, + "learning_rate": 5.712283161030313e-07, + "loss": -0.0362, + "reward": 1.7319084405899048, + "reward_std": 0.1767083704471588, + "rewards/accuracy_reward_stage2": 0.7631585001945496, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2448 + }, + { + "completion_length": 9.5, + "epoch": 0.4291221307166637, + "grad_norm": 21.436378957860942, + "kl": 0.10205078125, + "learning_rate": 5.710530926931838e-07, + "loss": 0.0407, + "reward": 1.6460349559783936, + "reward_std": 0.09250953048467636, + "rewards/accuracy_reward_stage2": 0.6460349559783936, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2449 + }, + { + "completion_length": 9.921875, + "epoch": 0.4292973541265113, + "grad_norm": 28.61624407275498, + "kl": 0.12158203125, + "learning_rate": 5.708778692833362e-07, + "loss": 0.0487, + "reward": 1.5566973686218262, + "reward_std": 0.3204444646835327, + "rewards/accuracy_reward_stage2": 0.5566972494125366, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2450 + }, + { + "completion_length": 26.046875, + "epoch": 0.42947257753635887, + "grad_norm": 20.448100007493757, + "kl": 0.068359375, + "learning_rate": 5.707026458734887e-07, + "loss": 0.0274, + "reward": 1.34342622756958, + "reward_std": 0.23968136310577393, + "rewards/accuracy_reward_stage2": 0.34342628717422485, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2451 + }, + { + "completion_length": 9.953125, + "epoch": 0.4296478009462064, + "grad_norm": 18.723259657679318, + "kl": 0.1201171875, + "learning_rate": 5.705274224636412e-07, + "loss": 0.048, + "reward": 1.5735918283462524, + "reward_std": 0.2153104841709137, + "rewards/accuracy_reward_stage2": 0.6985918283462524, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2452 + }, + { + "completion_length": 11.15625, + "epoch": 0.42982302435605396, + "grad_norm": 21.985458003635763, + "kl": 0.1123046875, + "learning_rate": 5.703521990537936e-07, + "loss": -0.017, + "reward": 1.6417429447174072, + "reward_std": 0.28753870725631714, + "rewards/accuracy_reward_stage2": 0.6729929447174072, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2453 + }, + { + "completion_length": 9.296875, + "epoch": 0.4299982477659015, + "grad_norm": 20.593440273884532, + "kl": 0.032958984375, + "learning_rate": 5.70176975643946e-07, + "loss": 0.0132, + "reward": 1.4509015083312988, + "reward_std": 0.2628650367259979, + "rewards/accuracy_reward_stage2": 0.45090147852897644, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2454 + }, + { + "completion_length": 11.484375, + "epoch": 0.43017347117574906, + "grad_norm": 18.415737733768058, + "kl": 0.1259765625, + "learning_rate": 5.700017522340984e-07, + "loss": 0.0504, + "reward": 1.5575731992721558, + "reward_std": 0.35987773537635803, + "rewards/accuracy_reward_stage2": 0.6825731992721558, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2455 + }, + { + "completion_length": 8.359375, + "epoch": 0.43034869458559666, + "grad_norm": 14.98305908133991, + "kl": 0.042724609375, + "learning_rate": 5.698265288242509e-07, + "loss": 0.017, + "reward": 1.908919095993042, + "reward_std": 0.11924922466278076, + "rewards/accuracy_reward_stage2": 0.9089190363883972, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2456 + }, + { + "completion_length": 9.015625, + "epoch": 0.4305239179954442, + "grad_norm": 15.713865925408056, + "kl": 0.05908203125, + "learning_rate": 5.696513054144033e-07, + "loss": 0.0237, + "reward": 1.5216166973114014, + "reward_std": 0.14264705777168274, + "rewards/accuracy_reward_stage2": 0.6466167569160461, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2457 + }, + { + "completion_length": 10.5625, + "epoch": 0.43069914140529175, + "grad_norm": 19.880669874137872, + "kl": 0.140625, + "learning_rate": 5.694760820045557e-07, + "loss": 0.0195, + "reward": 1.5184917449951172, + "reward_std": 0.30239593982696533, + "rewards/accuracy_reward_stage2": 0.5341167449951172, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2458 + }, + { + "completion_length": 7.671875, + "epoch": 0.4308743648151393, + "grad_norm": 16.329410163714197, + "kl": 0.044921875, + "learning_rate": 5.693008585947082e-07, + "loss": 0.018, + "reward": 1.787500023841858, + "reward_std": 0.07550577819347382, + "rewards/accuracy_reward_stage2": 0.7874999642372131, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2459 + }, + { + "completion_length": 11.140625, + "epoch": 0.43104958822498685, + "grad_norm": 19.381287915776806, + "kl": 0.12109375, + "learning_rate": 5.691256351848607e-07, + "loss": 0.0482, + "reward": 1.1594098806381226, + "reward_std": 0.1518731415271759, + "rewards/accuracy_reward_stage2": 0.40940988063812256, + "rewards/format_reward_stage1_pointerpad": 0.75, + "scores/accuracy_reward_stage2": 0.75, + "step": 2460 + }, + { + "completion_length": 10.109375, + "epoch": 0.4312248116348344, + "grad_norm": 11.975975628094352, + "kl": 0.12060546875, + "learning_rate": 5.689504117750131e-07, + "loss": 0.004, + "reward": 1.5058554410934448, + "reward_std": 0.10613877326250076, + "rewards/accuracy_reward_stage2": 0.7714804410934448, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2461 + }, + { + "completion_length": 10.25, + "epoch": 0.431400035044682, + "grad_norm": 20.121126160568057, + "kl": 0.091796875, + "learning_rate": 5.687751883651656e-07, + "loss": 0.0367, + "reward": 1.7084333896636963, + "reward_std": 0.22800734639167786, + "rewards/accuracy_reward_stage2": 0.708433210849762, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2462 + }, + { + "completion_length": 9.28125, + "epoch": 0.43157525845452954, + "grad_norm": 22.460435213851525, + "kl": 0.134765625, + "learning_rate": 5.685999649553181e-07, + "loss": -0.0247, + "reward": 1.7333886623382568, + "reward_std": 0.2131943702697754, + "rewards/accuracy_reward_stage2": 0.7646386623382568, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2463 + }, + { + "completion_length": 8.015625, + "epoch": 0.4317504818643771, + "grad_norm": 14.680107507234679, + "kl": 0.10400390625, + "learning_rate": 5.684247415454705e-07, + "loss": 0.0135, + "reward": 1.5010437965393066, + "reward_std": 0.20351773500442505, + "rewards/accuracy_reward_stage2": 0.5166687369346619, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2464 + }, + { + "completion_length": 11.21875, + "epoch": 0.43192570527422464, + "grad_norm": 19.2462032033978, + "kl": 0.05078125, + "learning_rate": 5.68249518135623e-07, + "loss": -0.0113, + "reward": 1.697539210319519, + "reward_std": 0.24802234768867493, + "rewards/accuracy_reward_stage2": 0.713164210319519, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2465 + }, + { + "completion_length": 24.890625, + "epoch": 0.4321009286840722, + "grad_norm": 20.761453619278438, + "kl": 0.058837890625, + "learning_rate": 5.680742947257752e-07, + "loss": 0.0007, + "reward": 1.7051244974136353, + "reward_std": 0.12879568338394165, + "rewards/accuracy_reward_stage2": 0.7207494974136353, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2466 + }, + { + "completion_length": 11.5625, + "epoch": 0.43227615209391973, + "grad_norm": 23.788088443112787, + "kl": 0.060302734375, + "learning_rate": 5.678990713159277e-07, + "loss": -0.02, + "reward": 1.5080852508544922, + "reward_std": 0.24169126152992249, + "rewards/accuracy_reward_stage2": 0.5237102508544922, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2467 + }, + { + "completion_length": 7.53125, + "epoch": 0.4324513755037673, + "grad_norm": 17.387253813940422, + "kl": 0.11328125, + "learning_rate": 5.677238479060802e-07, + "loss": 0.0452, + "reward": 1.4800641536712646, + "reward_std": 0.19356288015842438, + "rewards/accuracy_reward_stage2": 0.6050641536712646, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2468 + }, + { + "completion_length": 13.421875, + "epoch": 0.4326265989136149, + "grad_norm": 325.77192228254404, + "kl": 1.84375, + "learning_rate": 5.675486244962326e-07, + "loss": 0.707, + "reward": 1.5729882717132568, + "reward_std": 0.26776865124702454, + "rewards/accuracy_reward_stage2": 0.7136133313179016, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2469 + }, + { + "completion_length": 10.703125, + "epoch": 0.4328018223234624, + "grad_norm": 16.006993697155597, + "kl": 0.03564453125, + "learning_rate": 5.673734010863851e-07, + "loss": 0.0046, + "reward": 1.266721487045288, + "reward_std": 0.17739826440811157, + "rewards/accuracy_reward_stage2": 0.5323464870452881, + "rewards/format_reward_stage1_pointerpad": 0.734375, + "scores/accuracy_reward_stage2": 0.734375, + "step": 2470 + }, + { + "completion_length": 9.921875, + "epoch": 0.43297704573331, + "grad_norm": 18.505760660623352, + "kl": 0.1484375, + "learning_rate": 5.671981776765375e-07, + "loss": 0.0152, + "reward": 1.5191692113876343, + "reward_std": 0.1488463580608368, + "rewards/accuracy_reward_stage2": 0.5347942113876343, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2471 + }, + { + "completion_length": 6.875, + "epoch": 0.4331522691431575, + "grad_norm": 17.408087882955247, + "kl": 0.1328125, + "learning_rate": 5.6702295426669e-07, + "loss": -0.0542, + "reward": 1.3365036249160767, + "reward_std": 0.3592662811279297, + "rewards/accuracy_reward_stage2": 0.5083786249160767, + "rewards/format_reward_stage1_pointerpad": 0.828125, + "scores/accuracy_reward_stage2": 0.828125, + "step": 2472 + }, + { + "completion_length": 7.1875, + "epoch": 0.43332749255300507, + "grad_norm": 19.829436669177255, + "kl": 0.083984375, + "learning_rate": 5.668477308568425e-07, + "loss": 0.0335, + "reward": 1.5234953165054321, + "reward_std": 0.25521036982536316, + "rewards/accuracy_reward_stage2": 0.6484953165054321, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2473 + }, + { + "completion_length": 8.984375, + "epoch": 0.4335027159628526, + "grad_norm": 19.423133602621352, + "kl": 0.111328125, + "learning_rate": 5.666725074469949e-07, + "loss": 0.0157, + "reward": 1.6337876319885254, + "reward_std": 0.1884712278842926, + "rewards/accuracy_reward_stage2": 0.6494127511978149, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2474 + }, + { + "completion_length": 10.375, + "epoch": 0.4336779393727002, + "grad_norm": 18.298341038325223, + "kl": 0.068359375, + "learning_rate": 5.664972840371474e-07, + "loss": 0.0274, + "reward": 1.5837030410766602, + "reward_std": 0.1801297813653946, + "rewards/accuracy_reward_stage2": 0.7087030410766602, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2475 + }, + { + "completion_length": 8.390625, + "epoch": 0.43385316278254776, + "grad_norm": 15.754352909022117, + "kl": 0.030029296875, + "learning_rate": 5.663220606272999e-07, + "loss": 0.012, + "reward": 1.860360860824585, + "reward_std": 0.1001751571893692, + "rewards/accuracy_reward_stage2": 0.8603609204292297, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2476 + }, + { + "completion_length": 8.671875, + "epoch": 0.4340283861923953, + "grad_norm": 18.118293617702054, + "kl": 0.166015625, + "learning_rate": 5.661468372174522e-07, + "loss": -0.0209, + "reward": 1.7263150215148926, + "reward_std": 0.18398618698120117, + "rewards/accuracy_reward_stage2": 0.7575650215148926, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2477 + }, + { + "completion_length": 10.671875, + "epoch": 0.43420360960224286, + "grad_norm": 12.758581469841406, + "kl": 0.0693359375, + "learning_rate": 5.659716138076047e-07, + "loss": 0.0278, + "reward": 1.5945075750350952, + "reward_std": 0.07799089699983597, + "rewards/accuracy_reward_stage2": 0.5945075750350952, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2478 + }, + { + "completion_length": 7.53125, + "epoch": 0.4343788330120904, + "grad_norm": 24.724876782895745, + "kl": 0.08056640625, + "learning_rate": 5.65796390397757e-07, + "loss": -0.0011, + "reward": 1.3923568725585938, + "reward_std": 0.30109161138534546, + "rewards/accuracy_reward_stage2": 0.4079819321632385, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2479 + }, + { + "completion_length": 17.015625, + "epoch": 0.43455405642193795, + "grad_norm": 21.30851433712103, + "kl": 0.103515625, + "learning_rate": 5.656211669879095e-07, + "loss": 0.0415, + "reward": 1.6417262554168701, + "reward_std": 0.16019636392593384, + "rewards/accuracy_reward_stage2": 0.6417261362075806, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2480 + }, + { + "completion_length": 11.625, + "epoch": 0.43472927983178555, + "grad_norm": 16.069909922504603, + "kl": 0.1123046875, + "learning_rate": 5.65445943578062e-07, + "loss": 0.0449, + "reward": 1.7745733261108398, + "reward_std": 0.10483638942241669, + "rewards/accuracy_reward_stage2": 0.7745733261108398, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2481 + }, + { + "completion_length": 7.78125, + "epoch": 0.4349045032416331, + "grad_norm": 18.202272906284808, + "kl": 0.04052734375, + "learning_rate": 5.652707201682144e-07, + "loss": 0.0162, + "reward": 1.6692678928375244, + "reward_std": 0.23714250326156616, + "rewards/accuracy_reward_stage2": 0.6692679524421692, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2482 + }, + { + "completion_length": 10.40625, + "epoch": 0.43507972665148065, + "grad_norm": 19.22569853451002, + "kl": 0.1455078125, + "learning_rate": 5.650954967583669e-07, + "loss": 0.0584, + "reward": 1.2947709560394287, + "reward_std": 0.11568181961774826, + "rewards/accuracy_reward_stage2": 0.41977089643478394, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2483 + }, + { + "completion_length": 9.59375, + "epoch": 0.4352549500613282, + "grad_norm": 18.60676009657278, + "kl": 0.07470703125, + "learning_rate": 5.649202733485194e-07, + "loss": 0.0298, + "reward": 1.6659901142120361, + "reward_std": 0.21120445430278778, + "rewards/accuracy_reward_stage2": 0.6659901738166809, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2484 + }, + { + "completion_length": 12.375, + "epoch": 0.43543017347117574, + "grad_norm": 17.448368004805243, + "kl": 0.10107421875, + "learning_rate": 5.647450499386718e-07, + "loss": 0.0404, + "reward": 1.3877105712890625, + "reward_std": 0.17023152112960815, + "rewards/accuracy_reward_stage2": 0.5127106308937073, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2485 + }, + { + "completion_length": 7.734375, + "epoch": 0.4356053968810233, + "grad_norm": 27.701798930911647, + "kl": 0.05615234375, + "learning_rate": 5.645698265288243e-07, + "loss": 0.0225, + "reward": 1.686922550201416, + "reward_std": 0.21318362653255463, + "rewards/accuracy_reward_stage2": 0.6869224905967712, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2486 + }, + { + "completion_length": 10.46875, + "epoch": 0.43578062029087083, + "grad_norm": 33.38221389414447, + "kl": 0.298828125, + "learning_rate": 5.643946031189766e-07, + "loss": 0.0468, + "reward": 1.533717393875122, + "reward_std": 0.19223450124263763, + "rewards/accuracy_reward_stage2": 0.5649674534797668, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2487 + }, + { + "completion_length": 8.765625, + "epoch": 0.43595584370071844, + "grad_norm": 15.92423013091103, + "kl": 0.357421875, + "learning_rate": 5.642193797091291e-07, + "loss": 0.1043, + "reward": 1.5287775993347168, + "reward_std": 0.21819572150707245, + "rewards/accuracy_reward_stage2": 0.6694026589393616, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2488 + }, + { + "completion_length": 11.125, + "epoch": 0.436131067110566, + "grad_norm": 24.990874315078415, + "kl": 0.1318359375, + "learning_rate": 5.640441562992816e-07, + "loss": 0.0085, + "reward": 1.7180120944976807, + "reward_std": 0.2930488586425781, + "rewards/accuracy_reward_stage2": 0.7336370944976807, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2489 + }, + { + "completion_length": 14.5, + "epoch": 0.43630629052041353, + "grad_norm": 23.423213369547156, + "kl": 0.09033203125, + "learning_rate": 5.63868932889434e-07, + "loss": 0.0363, + "reward": 1.8398503065109253, + "reward_std": 0.17344442009925842, + "rewards/accuracy_reward_stage2": 0.8398503065109253, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2490 + }, + { + "completion_length": 13.078125, + "epoch": 0.4364815139302611, + "grad_norm": 22.765187899068547, + "kl": 0.06982421875, + "learning_rate": 5.636937094795865e-07, + "loss": -0.0055, + "reward": 1.4840041399002075, + "reward_std": 0.21035104990005493, + "rewards/accuracy_reward_stage2": 0.6246291399002075, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2491 + }, + { + "completion_length": 9.5625, + "epoch": 0.4366567373401086, + "grad_norm": 25.422366034224048, + "kl": 0.205078125, + "learning_rate": 5.635184860697389e-07, + "loss": 0.082, + "reward": 1.4929587841033936, + "reward_std": 0.1953394114971161, + "rewards/accuracy_reward_stage2": 0.6179587841033936, + "rewards/format_reward_stage1_pointerpad": 0.875, + "scores/accuracy_reward_stage2": 0.875, + "step": 2492 + }, + { + "completion_length": 11.84375, + "epoch": 0.43683196074995617, + "grad_norm": 23.39126913379409, + "kl": 0.059326171875, + "learning_rate": 5.633432626598913e-07, + "loss": -0.02, + "reward": 1.788152813911438, + "reward_std": 0.23067131638526917, + "rewards/accuracy_reward_stage2": 0.803777813911438, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2493 + }, + { + "completion_length": 8.296875, + "epoch": 0.4370071841598038, + "grad_norm": 24.067458583341157, + "kl": 0.07373046875, + "learning_rate": 5.631680392500438e-07, + "loss": 0.0295, + "reward": 1.729994773864746, + "reward_std": 0.18137015402317047, + "rewards/accuracy_reward_stage2": 0.7299947142601013, + "rewards/format_reward_stage1_pointerpad": 1.0, + "scores/accuracy_reward_stage2": 1.0, + "step": 2494 + }, + { + "completion_length": 10.78125, + "epoch": 0.4371824075696513, + "grad_norm": 20.6936402781379, + "kl": 0.1025390625, + "learning_rate": 5.629928158401962e-07, + "loss": -0.0031, + "reward": 1.8095977306365967, + "reward_std": 0.2006131410598755, + "rewards/accuracy_reward_stage2": 0.8252226114273071, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2495 + }, + { + "completion_length": 6.9375, + "epoch": 0.43735763097949887, + "grad_norm": 56.06477061667996, + "kl": 0.5703125, + "learning_rate": 5.628175924303486e-07, + "loss": 0.183, + "reward": 1.601702094078064, + "reward_std": 0.1429881602525711, + "rewards/accuracy_reward_stage2": 0.6173270344734192, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2496 + }, + { + "completion_length": 10.421875, + "epoch": 0.4375328543893464, + "grad_norm": 18.460030194723316, + "kl": 0.150390625, + "learning_rate": 5.626423690205011e-07, + "loss": 0.016, + "reward": 1.5749945640563965, + "reward_std": 0.23786082863807678, + "rewards/accuracy_reward_stage2": 0.7156196236610413, + "rewards/format_reward_stage1_pointerpad": 0.859375, + "scores/accuracy_reward_stage2": 0.859375, + "step": 2497 + }, + { + "completion_length": 12.75, + "epoch": 0.43770807779919396, + "grad_norm": 13.524925919219523, + "kl": 0.0751953125, + "learning_rate": 5.624671456106535e-07, + "loss": -0.0475, + "reward": 1.4493248462677002, + "reward_std": 0.1437452733516693, + "rewards/accuracy_reward_stage2": 0.4805747866630554, + "rewards/format_reward_stage1_pointerpad": 0.96875, + "scores/accuracy_reward_stage2": 0.96875, + "step": 2498 + }, + { + "completion_length": 10.703125, + "epoch": 0.4378833012090415, + "grad_norm": 15.790480007846668, + "kl": 0.08349609375, + "learning_rate": 5.62291922200806e-07, + "loss": -0.0109, + "reward": 1.7373511791229248, + "reward_std": 0.15441085398197174, + "rewards/accuracy_reward_stage2": 0.7529761791229248, + "rewards/format_reward_stage1_pointerpad": 0.984375, + "scores/accuracy_reward_stage2": 0.984375, + "step": 2499 + }, + { + "completion_length": 8.3125, + "epoch": 0.4380585246188891, + "grad_norm": 19.997615203454874, + "kl": 0.1591796875, + "learning_rate": 5.621166987909585e-07, + "loss": -0.0242, + "reward": 1.714186668395996, + "reward_std": 0.30740267038345337, + "rewards/accuracy_reward_stage2": 0.8704366087913513, + "rewards/format_reward_stage1_pointerpad": 0.84375, + "scores/accuracy_reward_stage2": 0.84375, + "step": 2500 + } + ], + "logging_steps": 1.0, + "max_steps": 5707, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}