diff --git "a/pi_list/trainer_state.json" "b/pi_list/trainer_state.json" new file mode 100644--- /dev/null +++ "b/pi_list/trainer_state.json" @@ -0,0 +1,11514 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.656, + "eval_steps": 500, + "global_step": 8200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 29.75625, + "epoch": 0.0008, + "grad_norm": 3.57593035697937, + "kl": 0.0005933973778155632, + "learning_rate": 4.5000000000000003e-07, + "loss": -0.0283, + "reward": 0.018564663897268473, + "reward_std": 0.04723395220935345, + "rewards/ddi_reward": -0.009041142754722387, + "rewards/jaccard_reward": 0.020793889660853892, + "rewards/refuse_rate_reward": -0.01114612827077508, + "step": 10 + }, + { + "completion_length": 26.1671875, + "epoch": 0.0016, + "grad_norm": 2.9401144981384277, + "kl": 0.0008395852404646575, + "learning_rate": 9.500000000000001e-07, + "loss": -0.0267, + "reward": 0.022977117728441954, + "reward_std": 0.04329147357493639, + "rewards/ddi_reward": -0.003024790738709271, + "rewards/jaccard_reward": 0.02444243929348886, + "rewards/refuse_rate_reward": -0.007326607190771028, + "step": 20 + }, + { + "completion_length": 29.471875, + "epoch": 0.0024, + "grad_norm": 2.579378604888916, + "kl": 0.0008524633260094561, + "learning_rate": 1.45e-06, + "loss": -0.0399, + "reward": 0.030703018663916736, + "reward_std": 0.0482021345756948, + "rewards/ddi_reward": -0.005479783064220101, + "rewards/jaccard_reward": 0.03204653144348413, + "rewards/refuse_rate_reward": -0.00671756638912484, + "step": 30 + }, + { + "completion_length": 28.353125, + "epoch": 0.0032, + "grad_norm": 2.070500373840332, + "kl": 0.0010065904527436942, + "learning_rate": 1.9500000000000004e-06, + "loss": -0.0227, + "reward": 0.027745838137343527, + "reward_std": 0.04441091660410166, + "rewards/ddi_reward": -0.006494398013455793, + "rewards/jaccard_reward": 0.02955945087596774, + "rewards/refuse_rate_reward": -0.009068071586079895, + "step": 40 + }, + { + "completion_length": 27.5015625, + "epoch": 0.004, + "grad_norm": 1.659399151802063, + "kl": 0.0012799304910004138, + "learning_rate": 2.4500000000000003e-06, + "loss": -0.0181, + "reward": 0.022755683213472367, + "reward_std": 0.0441404651850462, + "rewards/ddi_reward": -0.002945503467344679, + "rewards/jaccard_reward": 0.02501596437068656, + "rewards/refuse_rate_reward": -0.011301402212120592, + "step": 50 + }, + { + "completion_length": 25.7046875, + "epoch": 0.0048, + "grad_norm": 2.3225162029266357, + "kl": 0.002114478437579237, + "learning_rate": 2.95e-06, + "loss": -0.0299, + "reward": 0.024267112114466728, + "reward_std": 0.04239263674244285, + "rewards/ddi_reward": -0.0013403785414993763, + "rewards/jaccard_reward": 0.02499833660549484, + "rewards/refuse_rate_reward": -0.0036561204702593385, + "step": 60 + }, + { + "completion_length": 27.3921875, + "epoch": 0.0056, + "grad_norm": 3.860537052154541, + "kl": 0.002314273775846232, + "learning_rate": 3.45e-06, + "loss": -0.0209, + "reward": 0.01308250955771655, + "reward_std": 0.05337791331112385, + "rewards/ddi_reward": -0.008188269066158682, + "rewards/jaccard_reward": 0.014360785949975252, + "rewards/refuse_rate_reward": -0.006391380605055019, + "step": 70 + }, + { + "completion_length": 26.4078125, + "epoch": 0.0064, + "grad_norm": 2.159525156021118, + "kl": 0.008903224568348378, + "learning_rate": 3.95e-06, + "loss": -0.0335, + "reward": 0.021066215320024638, + "reward_std": 0.04166620392352342, + "rewards/ddi_reward": -0.005652799326344393, + "rewards/jaccard_reward": 0.02238016764167696, + "rewards/refuse_rate_reward": -0.006569760607089847, + "step": 80 + }, + { + "completion_length": 22.590625, + "epoch": 0.0072, + "grad_norm": 5.966856002807617, + "kl": 0.025706733902916314, + "learning_rate": 4.450000000000001e-06, + "loss": -0.0576, + "reward": 0.034208368184044956, + "reward_std": 0.042430201172828676, + "rewards/ddi_reward": -0.006675204797647894, + "rewards/jaccard_reward": 0.035563159768935294, + "rewards/refuse_rate_reward": -0.0067739564809016885, + "step": 90 + }, + { + "completion_length": 27.4953125, + "epoch": 0.008, + "grad_norm": 7.516049861907959, + "kl": 0.06204737676307559, + "learning_rate": 4.95e-06, + "loss": -0.0245, + "reward": 0.026592397317290305, + "reward_std": 0.04189849914982915, + "rewards/ddi_reward": -0.002351547937723808, + "rewards/jaccard_reward": 0.02787044889992103, + "rewards/refuse_rate_reward": -0.006390253233257681, + "step": 100 + }, + { + "completion_length": 26.4234375, + "epoch": 0.0088, + "grad_norm": 6.262258529663086, + "kl": 0.0919895451515913, + "learning_rate": 5.450000000000001e-06, + "loss": -0.0387, + "reward": 0.02892746324650943, + "reward_std": 0.0438289288431406, + "rewards/ddi_reward": -0.0040213783999206495, + "rewards/jaccard_reward": 0.030319836759008467, + "rewards/refuse_rate_reward": -0.006961861974559724, + "step": 110 + }, + { + "completion_length": 23.30625, + "epoch": 0.0096, + "grad_norm": 4.449306011199951, + "kl": 0.1577576667070389, + "learning_rate": 5.950000000000001e-06, + "loss": -0.0301, + "reward": 0.02783071342855692, + "reward_std": 0.042362258024513724, + "rewards/ddi_reward": 0.00039529596688225865, + "rewards/jaccard_reward": 0.02894464529817924, + "rewards/refuse_rate_reward": -0.005569659592583776, + "step": 120 + }, + { + "completion_length": 21.6546875, + "epoch": 0.0104, + "grad_norm": 6.553844451904297, + "kl": 0.2470131117850542, + "learning_rate": 6.450000000000001e-06, + "loss": -0.0548, + "reward": 0.03169132430339232, + "reward_std": 0.04059392884373665, + "rewards/ddi_reward": -0.013428660291538108, + "rewards/jaccard_reward": 0.032550629158504304, + "rewards/refuse_rate_reward": -0.0042965259635820985, + "step": 130 + }, + { + "completion_length": 21.290625, + "epoch": 0.0112, + "grad_norm": 5.354051113128662, + "kl": 0.35831874087452886, + "learning_rate": 6.95e-06, + "loss": -0.0285, + "reward": 0.034617599775083364, + "reward_std": 0.040177999436855315, + "rewards/ddi_reward": -0.00378415469895117, + "rewards/jaccard_reward": 0.03540766581427306, + "rewards/refuse_rate_reward": -0.003950329346116633, + "step": 140 + }, + { + "completion_length": 17.5921875, + "epoch": 0.012, + "grad_norm": 3.7506566047668457, + "kl": 0.7088816434144973, + "learning_rate": 7.450000000000001e-06, + "loss": -0.0111, + "reward": 0.026712132303509863, + "reward_std": 0.03140932293608785, + "rewards/ddi_reward": 0.0037644443829776717, + "rewards/jaccard_reward": 0.027036908804439007, + "rewards/refuse_rate_reward": -0.00162388397147879, + "step": 150 + }, + { + "completion_length": 20.6375, + "epoch": 0.0128, + "grad_norm": 4.5712971687316895, + "kl": 3.2349953673779965, + "learning_rate": 7.950000000000002e-06, + "loss": 0.0592, + "reward": 0.040288155316375196, + "reward_std": 0.03788806954398751, + "rewards/ddi_reward": 0.0009856867080088704, + "rewards/jaccard_reward": 0.04051595712080598, + "rewards/refuse_rate_reward": -0.001139012910425663, + "step": 160 + }, + { + "completion_length": 18.3859375, + "epoch": 0.0136, + "grad_norm": 3.455796241760254, + "kl": 0.5210425373166799, + "learning_rate": 8.45e-06, + "loss": -0.0192, + "reward": 0.03107840521261096, + "reward_std": 0.03484615893103182, + "rewards/ddi_reward": -0.006420145003357902, + "rewards/jaccard_reward": 0.03201072921510786, + "rewards/refuse_rate_reward": -0.004661621595732868, + "step": 170 + }, + { + "completion_length": 20.053125, + "epoch": 0.0144, + "grad_norm": 1.9707624912261963, + "kl": 0.5222390983253717, + "learning_rate": 8.95e-06, + "loss": -0.0269, + "reward": 0.03598052840679884, + "reward_std": 0.039800772350281476, + "rewards/ddi_reward": -0.006107857724418864, + "rewards/jaccard_reward": 0.03732688277959824, + "rewards/refuse_rate_reward": -0.006731770909391343, + "step": 180 + }, + { + "completion_length": 20.234375, + "epoch": 0.0152, + "grad_norm": 8.904123306274414, + "kl": 0.25305485501885416, + "learning_rate": 9.450000000000001e-06, + "loss": -0.0246, + "reward": 0.028522065933793785, + "reward_std": 0.041713623143732545, + "rewards/ddi_reward": -0.00281710255658254, + "rewards/jaccard_reward": 0.029455379478167742, + "rewards/refuse_rate_reward": -0.004666564613580703, + "step": 190 + }, + { + "completion_length": 24.0453125, + "epoch": 0.016, + "grad_norm": 3.3512511253356934, + "kl": 0.31575517654418944, + "learning_rate": 9.950000000000001e-06, + "loss": -0.0385, + "reward": 0.041331886500120166, + "reward_std": 0.03612528294324875, + "rewards/ddi_reward": -0.007218879542779178, + "rewards/jaccard_reward": 0.04338167898822576, + "rewards/refuse_rate_reward": -0.010248957958538085, + "step": 200 + }, + { + "completion_length": 27.5, + "epoch": 0.0168, + "grad_norm": 3.1252987384796143, + "kl": 0.3299813747406006, + "learning_rate": 9.999986789648946e-06, + "loss": -0.0391, + "reward": 0.04938991582021117, + "reward_std": 0.03817319246008992, + "rewards/ddi_reward": -0.004487249068915844, + "rewards/jaccard_reward": 0.05049227708950639, + "rewards/refuse_rate_reward": -0.005511803861008957, + "step": 210 + }, + { + "completion_length": 27.725, + "epoch": 0.0176, + "grad_norm": 1.4563435316085815, + "kl": 0.3123649101704359, + "learning_rate": 9.99994112432751e-06, + "loss": -0.0144, + "reward": 0.042043750087032095, + "reward_std": 0.04060468999668956, + "rewards/ddi_reward": -0.003646275299252011, + "rewards/jaccard_reward": 0.04335545402136631, + "rewards/refuse_rate_reward": -0.006558522453997284, + "step": 220 + }, + { + "completion_length": 28.071875, + "epoch": 0.0184, + "grad_norm": 5.0847272872924805, + "kl": 0.37248593419790266, + "learning_rate": 9.999862841242774e-06, + "loss": -0.0297, + "reward": 0.044795927964150904, + "reward_std": 0.03821419361047447, + "rewards/ddi_reward": -0.009205453272443265, + "rewards/jaccard_reward": 0.046086123841814694, + "rewards/refuse_rate_reward": -0.006450980296358466, + "step": 230 + }, + { + "completion_length": 22.734375, + "epoch": 0.0192, + "grad_norm": 3.2770462036132812, + "kl": 0.40955328196287155, + "learning_rate": 9.999751940905424e-06, + "loss": -0.0348, + "reward": 0.03891500236932188, + "reward_std": 0.03678588718175888, + "rewards/ddi_reward": -0.015162861428689212, + "rewards/jaccard_reward": 0.03997124792076647, + "rewards/refuse_rate_reward": -0.005281221715267748, + "step": 240 + }, + { + "completion_length": 23.0890625, + "epoch": 0.02, + "grad_norm": 1.9124705791473389, + "kl": 0.3929987773299217, + "learning_rate": 9.999608424038936e-06, + "loss": -0.0153, + "reward": 0.034487370029091836, + "reward_std": 0.03610561741515994, + "rewards/ddi_reward": -0.005244850591407157, + "rewards/jaccard_reward": 0.03579178215004504, + "rewards/refuse_rate_reward": -0.006522063235752285, + "step": 250 + }, + { + "completion_length": 22.0359375, + "epoch": 0.0208, + "grad_norm": 4.8138556480407715, + "kl": 0.4457839414477348, + "learning_rate": 9.999432291579561e-06, + "loss": -0.034, + "reward": 0.04566699122078717, + "reward_std": 0.03014615010470152, + "rewards/ddi_reward": -0.0029227966981125062, + "rewards/jaccard_reward": 0.046361638139933346, + "rewards/refuse_rate_reward": -0.003473233920522034, + "step": 260 + }, + { + "completion_length": 20.2484375, + "epoch": 0.0216, + "grad_norm": 4.442445278167725, + "kl": 0.3398831535130739, + "learning_rate": 9.999223544676319e-06, + "loss": -0.0324, + "reward": 0.042069650674238804, + "reward_std": 0.035634635295718906, + "rewards/ddi_reward": -0.009598567127250135, + "rewards/jaccard_reward": 0.04249276523478329, + "rewards/refuse_rate_reward": -0.0021155753755010666, + "step": 270 + }, + { + "completion_length": 19.1921875, + "epoch": 0.0224, + "grad_norm": 3.5465147495269775, + "kl": 0.4090707376599312, + "learning_rate": 9.998982184691e-06, + "loss": -0.0417, + "reward": 0.039975787280127406, + "reward_std": 0.02872827770188451, + "rewards/ddi_reward": -0.0006803600292187185, + "rewards/jaccard_reward": 0.040362992137670514, + "rewards/refuse_rate_reward": -0.001936021144501865, + "step": 280 + }, + { + "completion_length": 25.25, + "epoch": 0.0232, + "grad_norm": 3.443272352218628, + "kl": 0.23907504118978978, + "learning_rate": 9.998708213198147e-06, + "loss": -0.0515, + "reward": 0.05395978004671633, + "reward_std": 0.03723439509049058, + "rewards/ddi_reward": -0.0031805446953512726, + "rewards/jaccard_reward": 0.0554869222920388, + "rewards/refuse_rate_reward": -0.007635710900649429, + "step": 290 + }, + { + "completion_length": 23.53125, + "epoch": 0.024, + "grad_norm": 2.8850784301757812, + "kl": 0.3070978976786137, + "learning_rate": 9.998401631985048e-06, + "loss": -0.0452, + "reward": 0.04603706933557987, + "reward_std": 0.034278560895472765, + "rewards/ddi_reward": -0.017035151337040588, + "rewards/jaccard_reward": 0.046722854883410034, + "rewards/refuse_rate_reward": -0.003428932192036882, + "step": 300 + }, + { + "completion_length": 23.78125, + "epoch": 0.0248, + "grad_norm": 1.601813554763794, + "kl": 0.35034109130501745, + "learning_rate": 9.998062443051724e-06, + "loss": -0.0451, + "reward": 0.05335830757394433, + "reward_std": 0.032229002099484205, + "rewards/ddi_reward": 0.0023071878997143356, + "rewards/jaccard_reward": 0.05443481020629406, + "rewards/refuse_rate_reward": -0.005382525478489697, + "step": 310 + }, + { + "completion_length": 26.73125, + "epoch": 0.0256, + "grad_norm": 2.047910213470459, + "kl": 0.3370140690356493, + "learning_rate": 9.99769064861092e-06, + "loss": -0.0309, + "reward": 0.0432299476582557, + "reward_std": 0.03789084628224373, + "rewards/ddi_reward": -0.006993291876278817, + "rewards/jaccard_reward": 0.04440084579400718, + "rewards/refuse_rate_reward": -0.00585449489299208, + "step": 320 + }, + { + "completion_length": 21.9, + "epoch": 0.0264, + "grad_norm": 4.956886291503906, + "kl": 0.42041261270642283, + "learning_rate": 9.997286251088086e-06, + "loss": -0.0217, + "reward": 0.03427543042926118, + "reward_std": 0.03159530814737081, + "rewards/ddi_reward": -0.0058746677823364735, + "rewards/jaccard_reward": 0.03547563669271767, + "rewards/refuse_rate_reward": -0.006001032469794154, + "step": 330 + }, + { + "completion_length": 24.1703125, + "epoch": 0.0272, + "grad_norm": 2.4637317657470703, + "kl": 0.31109966821968554, + "learning_rate": 9.996849253121357e-06, + "loss": -0.0546, + "reward": 0.048897714260965584, + "reward_std": 0.03442220017313957, + "rewards/ddi_reward": -0.0005171092052478343, + "rewards/jaccard_reward": 0.04954268098808825, + "rewards/refuse_rate_reward": -0.003224825521465391, + "step": 340 + }, + { + "completion_length": 22.984375, + "epoch": 0.028, + "grad_norm": 3.0689408779144287, + "kl": 0.32151837907731534, + "learning_rate": 9.99637965756155e-06, + "loss": -0.0438, + "reward": 0.0470694288611412, + "reward_std": 0.029105472005903722, + "rewards/ddi_reward": -0.0059266215946991, + "rewards/jaccard_reward": 0.04759931615553796, + "rewards/refuse_rate_reward": -0.0026494295918382703, + "step": 350 + }, + { + "completion_length": 19.740625, + "epoch": 0.0288, + "grad_norm": 3.0974810123443604, + "kl": 0.301722414791584, + "learning_rate": 9.995877467472131e-06, + "loss": -0.0797, + "reward": 0.05378229366615415, + "reward_std": 0.02681219670921564, + "rewards/ddi_reward": -0.005175827807397582, + "rewards/jaccard_reward": 0.054024854628369215, + "rewards/refuse_rate_reward": -0.0012127976398915053, + "step": 360 + }, + { + "completion_length": 20.828125, + "epoch": 0.0296, + "grad_norm": 5.218966960906982, + "kl": 0.3375785432755947, + "learning_rate": 9.995342686129202e-06, + "loss": -0.0527, + "reward": 0.049808434676378965, + "reward_std": 0.032354217022657394, + "rewards/ddi_reward": -0.010041095316410065, + "rewards/jaccard_reward": 0.05037608100101352, + "rewards/refuse_rate_reward": -0.0028382317163050176, + "step": 370 + }, + { + "completion_length": 20.2484375, + "epoch": 0.0304, + "grad_norm": 1.4750087261199951, + "kl": 0.2714916460216045, + "learning_rate": 9.994775317021477e-06, + "loss": -0.0562, + "reward": 0.05076656835153699, + "reward_std": 0.034306522272527215, + "rewards/ddi_reward": -0.0044390088529326025, + "rewards/jaccard_reward": 0.051731845049653204, + "rewards/refuse_rate_reward": -0.004826389020308852, + "step": 380 + }, + { + "completion_length": 19.9015625, + "epoch": 0.0312, + "grad_norm": 2.890627861022949, + "kl": 0.24763085264712573, + "learning_rate": 9.994175363850265e-06, + "loss": -0.0456, + "reward": 0.041980014136061075, + "reward_std": 0.030148877948522567, + "rewards/ddi_reward": -0.005520205473294482, + "rewards/jaccard_reward": 0.0425920492503792, + "rewards/refuse_rate_reward": -0.0030601777601987124, + "step": 390 + }, + { + "completion_length": 22.584375, + "epoch": 0.032, + "grad_norm": 2.015913963317871, + "kl": 0.296403331309557, + "learning_rate": 9.993542830529437e-06, + "loss": -0.0406, + "reward": 0.04785801386460662, + "reward_std": 0.031017291406169534, + "rewards/ddi_reward": -0.008567476682947018, + "rewards/jaccard_reward": 0.048105037724599245, + "rewards/refuse_rate_reward": -0.0012351191020570696, + "step": 400 + }, + { + "completion_length": 20.8953125, + "epoch": 0.0328, + "grad_norm": 2.834883451461792, + "kl": 0.31548326648771763, + "learning_rate": 9.992877721185404e-06, + "loss": -0.0514, + "reward": 0.046272925846278665, + "reward_std": 0.03138691792264581, + "rewards/ddi_reward": -0.0004278326407074928, + "rewards/jaccard_reward": 0.04664829783141613, + "rewards/refuse_rate_reward": -0.0018768601352348925, + "step": 410 + }, + { + "completion_length": 23.340625, + "epoch": 0.0336, + "grad_norm": 1.738528847694397, + "kl": 0.3702305771410465, + "learning_rate": 9.992180040157096e-06, + "loss": -0.052, + "reward": 0.05700378031469881, + "reward_std": 0.03129559857770801, + "rewards/ddi_reward": -0.009879603140871041, + "rewards/jaccard_reward": 0.05776566574349999, + "rewards/refuse_rate_reward": -0.00380942850606516, + "step": 420 + }, + { + "completion_length": 22.6140625, + "epoch": 0.0344, + "grad_norm": 3.891157627105713, + "kl": 0.38160739094018936, + "learning_rate": 9.991449791995922e-06, + "loss": -0.0393, + "reward": 0.044818634539842604, + "reward_std": 0.029547323239967227, + "rewards/ddi_reward": -0.007795296925178263, + "rewards/jaccard_reward": 0.04621862107887864, + "rewards/refuse_rate_reward": -0.006999938038643449, + "step": 430 + }, + { + "completion_length": 21.2234375, + "epoch": 0.0352, + "grad_norm": 3.0857412815093994, + "kl": 0.3346212849020958, + "learning_rate": 9.990686981465754e-06, + "loss": -0.0554, + "reward": 0.04660870302468538, + "reward_std": 0.029977358970791103, + "rewards/ddi_reward": -0.01269194005872123, + "rewards/jaccard_reward": 0.047186790686100724, + "rewards/refuse_rate_reward": -0.0028904342732857915, + "step": 440 + }, + { + "completion_length": 23.35625, + "epoch": 0.036, + "grad_norm": 2.780155658721924, + "kl": 0.3206599555909634, + "learning_rate": 9.989891613542884e-06, + "loss": -0.0602, + "reward": 0.05361469909548759, + "reward_std": 0.03060278985649347, + "rewards/ddi_reward": -0.003964237566106021, + "rewards/jaccard_reward": 0.054327632545027885, + "rewards/refuse_rate_reward": -0.0035646645119413733, + "step": 450 + }, + { + "completion_length": 21.1625, + "epoch": 0.0368, + "grad_norm": 2.1100728511810303, + "kl": 0.300174617767334, + "learning_rate": 9.989063693415997e-06, + "loss": -0.0315, + "reward": 0.03685779073275626, + "reward_std": 0.029912237962707876, + "rewards/ddi_reward": -0.0057872725185006855, + "rewards/jaccard_reward": 0.037883336935192344, + "rewards/refuse_rate_reward": -0.005127728299703449, + "step": 460 + }, + { + "completion_length": 20.971875, + "epoch": 0.0376, + "grad_norm": 1.429036021232605, + "kl": 0.3457451120018959, + "learning_rate": 9.988203226486136e-06, + "loss": -0.0536, + "reward": 0.046615922614000736, + "reward_std": 0.026534601347520947, + "rewards/ddi_reward": 0.0017481532238889485, + "rewards/jaccard_reward": 0.046902267343830316, + "rewards/refuse_rate_reward": -0.0014317280263639986, + "step": 470 + }, + { + "completion_length": 20.75, + "epoch": 0.0384, + "grad_norm": 4.881855487823486, + "kl": 0.3963875710964203, + "learning_rate": 9.987310218366672e-06, + "loss": -0.053, + "reward": 0.04532378187868744, + "reward_std": 0.02983957389369607, + "rewards/ddi_reward": -0.00819681838620454, + "rewards/jaccard_reward": 0.04600760589819401, + "rewards/refuse_rate_reward": -0.003419124346692115, + "step": 480 + }, + { + "completion_length": 19.9828125, + "epoch": 0.0392, + "grad_norm": 5.429396629333496, + "kl": 0.5717341557145119, + "learning_rate": 9.986384674883256e-06, + "loss": -0.0454, + "reward": 0.04855356314219535, + "reward_std": 0.02500741188414395, + "rewards/ddi_reward": 0.0016836992785101757, + "rewards/jaccard_reward": 0.04868897959822789, + "rewards/refuse_rate_reward": -0.0006770833511836827, + "step": 490 + }, + { + "completion_length": 22.48125, + "epoch": 0.04, + "grad_norm": 3.2126781940460205, + "kl": 0.361210235953331, + "learning_rate": 9.98542660207379e-06, + "loss": -0.0705, + "reward": 0.061566379852592945, + "reward_std": 0.029183563450351356, + "rewards/ddi_reward": -0.011121974349953235, + "rewards/jaccard_reward": 0.061819675751030444, + "rewards/refuse_rate_reward": -0.0012664784211665393, + "step": 500 + }, + { + "completion_length": 20.5421875, + "epoch": 0.0408, + "grad_norm": 5.29798698425293, + "kl": 0.4320597894489765, + "learning_rate": 9.984436006188391e-06, + "loss": -0.0463, + "reward": 0.05085363816469908, + "reward_std": 0.03146558492444455, + "rewards/ddi_reward": 0.0007944973302073776, + "rewards/jaccard_reward": 0.052563913818448785, + "rewards/refuse_rate_reward": -0.00855137938633561, + "step": 510 + }, + { + "completion_length": 18.184375, + "epoch": 0.0416, + "grad_norm": 17.024568557739258, + "kl": 0.46689253896474836, + "learning_rate": 9.983412893689334e-06, + "loss": -0.0235, + "reward": 0.03828533047344536, + "reward_std": 0.025995548442006113, + "rewards/ddi_reward": -0.0003970509395003319, + "rewards/jaccard_reward": 0.03901237950194627, + "rewards/refuse_rate_reward": -0.0036352469702251255, + "step": 520 + }, + { + "completion_length": 21.73125, + "epoch": 0.0424, + "grad_norm": 1.5381278991699219, + "kl": 0.5329340599477291, + "learning_rate": 9.98235727125103e-06, + "loss": -0.0468, + "reward": 0.04981195591390133, + "reward_std": 0.025486539816483855, + "rewards/ddi_reward": -0.01271656564786099, + "rewards/jaccard_reward": 0.05092452741228044, + "rewards/refuse_rate_reward": -0.005562855198513716, + "step": 530 + }, + { + "completion_length": 21.903125, + "epoch": 0.0432, + "grad_norm": 3.6375911235809326, + "kl": 0.454425735771656, + "learning_rate": 9.981269145759965e-06, + "loss": -0.0645, + "reward": 0.062328731548041104, + "reward_std": 0.031996310921385884, + "rewards/ddi_reward": -6.968335947021842e-06, + "rewards/jaccard_reward": 0.06316696336434688, + "rewards/refuse_rate_reward": -0.004191147419624031, + "step": 540 + }, + { + "completion_length": 22.9703125, + "epoch": 0.044, + "grad_norm": 6.968578815460205, + "kl": 0.4104159250855446, + "learning_rate": 9.980148524314667e-06, + "loss": -0.0233, + "reward": 0.03676078277640045, + "reward_std": 0.029479770781472327, + "rewards/ddi_reward": -0.0032709981518564747, + "rewards/jaccard_reward": 0.03726273284992203, + "rewards/refuse_rate_reward": -0.002509751601610333, + "step": 550 + }, + { + "completion_length": 24.2671875, + "epoch": 0.0448, + "grad_norm": 1.5688012838363647, + "kl": 0.27281809970736504, + "learning_rate": 9.978995414225659e-06, + "loss": -0.0485, + "reward": 0.05043151965364814, + "reward_std": 0.03155448595061898, + "rewards/ddi_reward": -0.0030325407919008286, + "rewards/jaccard_reward": 0.05105662469286472, + "rewards/refuse_rate_reward": -0.0031255247187800704, + "step": 560 + }, + { + "completion_length": 23.0828125, + "epoch": 0.0456, + "grad_norm": 1.210322618484497, + "kl": 0.3081243745982647, + "learning_rate": 9.9778098230154e-06, + "loss": -0.0387, + "reward": 0.04274985915981233, + "reward_std": 0.030610501719638705, + "rewards/ddi_reward": -0.005169421367463655, + "rewards/jaccard_reward": 0.04330378316808492, + "rewards/refuse_rate_reward": -0.0027696218807250262, + "step": 570 + }, + { + "completion_length": 24.0859375, + "epoch": 0.0464, + "grad_norm": 1.4360288381576538, + "kl": 0.3556093379855156, + "learning_rate": 9.97659175841825e-06, + "loss": -0.0385, + "reward": 0.048558197740931064, + "reward_std": 0.032642926834523675, + "rewards/ddi_reward": -0.008350008982233703, + "rewards/jaccard_reward": 0.049462350690737365, + "rewards/refuse_rate_reward": -0.004520765761844814, + "step": 580 + }, + { + "completion_length": 20.125, + "epoch": 0.0472, + "grad_norm": 2.074982166290283, + "kl": 0.35712223425507544, + "learning_rate": 9.975341228380411e-06, + "loss": -0.0441, + "reward": 0.044743565213866535, + "reward_std": 0.0287234156858176, + "rewards/ddi_reward": -0.006666556186974049, + "rewards/jaccard_reward": 0.04576079468242824, + "rewards/refuse_rate_reward": -0.005086146458052099, + "step": 590 + }, + { + "completion_length": 22.5796875, + "epoch": 0.048, + "grad_norm": 3.330571174621582, + "kl": 0.3157158501446247, + "learning_rate": 9.974058241059875e-06, + "loss": -0.0547, + "reward": 0.05382202484179288, + "reward_std": 0.028769116196781398, + "rewards/ddi_reward": 0.0019510206999257206, + "rewards/jaccard_reward": 0.05440819178475067, + "rewards/refuse_rate_reward": -0.002930832258425653, + "step": 600 + }, + { + "completion_length": 24.5578125, + "epoch": 0.0488, + "grad_norm": 1.8943413496017456, + "kl": 0.3085491552948952, + "learning_rate": 9.972742804826378e-06, + "loss": -0.0775, + "reward": 0.06968711065128445, + "reward_std": 0.031548578571528194, + "rewards/ddi_reward": -0.007454930091626011, + "rewards/jaccard_reward": 0.06989302732981742, + "rewards/refuse_rate_reward": -0.0010295758955180645, + "step": 610 + }, + { + "completion_length": 20.746875, + "epoch": 0.0496, + "grad_norm": 1.5784205198287964, + "kl": 0.40930077284574506, + "learning_rate": 9.97139492826134e-06, + "loss": -0.0409, + "reward": 0.04371571261435747, + "reward_std": 0.030272141844034196, + "rewards/ddi_reward": -0.004973273642826825, + "rewards/jaccard_reward": 0.044259982625953855, + "rewards/refuse_rate_reward": -0.002721354167442769, + "step": 620 + }, + { + "completion_length": 25.678125, + "epoch": 0.0504, + "grad_norm": 1.7675102949142456, + "kl": 0.3819395139813423, + "learning_rate": 9.970014620157806e-06, + "loss": -0.0207, + "reward": 0.04680651992093772, + "reward_std": 0.0329294104129076, + "rewards/ddi_reward": -0.005995357368374243, + "rewards/jaccard_reward": 0.04735610418429133, + "rewards/refuse_rate_reward": -0.0027479205396957695, + "step": 630 + }, + { + "completion_length": 22.1640625, + "epoch": 0.0512, + "grad_norm": 3.641401767730713, + "kl": 0.5110413379967212, + "learning_rate": 9.968601889520393e-06, + "loss": -0.054, + "reward": 0.06109658740460873, + "reward_std": 0.03266472462564707, + "rewards/ddi_reward": -0.006597996805794537, + "rewards/jaccard_reward": 0.06164484014734626, + "rewards/refuse_rate_reward": -0.0027412518276832996, + "step": 640 + }, + { + "completion_length": 21.515625, + "epoch": 0.052, + "grad_norm": 4.467803001403809, + "kl": 0.3939688511192799, + "learning_rate": 9.967156745565237e-06, + "loss": -0.0421, + "reward": 0.04133688691072166, + "reward_std": 0.030348078068345784, + "rewards/ddi_reward": -0.016615432454273105, + "rewards/jaccard_reward": 0.04167022081092, + "rewards/refuse_rate_reward": -0.001666666695382446, + "step": 650 + }, + { + "completion_length": 20.0453125, + "epoch": 0.0528, + "grad_norm": 4.960206985473633, + "kl": 0.5730137057602406, + "learning_rate": 9.965679197719919e-06, + "loss": -0.0749, + "reward": 0.06521870912984014, + "reward_std": 0.03519129713531584, + "rewards/ddi_reward": -0.013623316894518211, + "rewards/jaccard_reward": 0.06592339975759387, + "rewards/refuse_rate_reward": -0.0035234572016634045, + "step": 660 + }, + { + "completion_length": 21.85, + "epoch": 0.0536, + "grad_norm": 5.168346881866455, + "kl": 0.35750931352376936, + "learning_rate": 9.964169255623412e-06, + "loss": -0.0472, + "reward": 0.047418897459283474, + "reward_std": 0.028366786427795887, + "rewards/ddi_reward": -0.003983552067074925, + "rewards/jaccard_reward": 0.04789552566362545, + "rewards/refuse_rate_reward": -0.0023831398924812675, + "step": 670 + }, + { + "completion_length": 21.8984375, + "epoch": 0.0544, + "grad_norm": 2.341670274734497, + "kl": 0.3585388883948326, + "learning_rate": 9.962626929126023e-06, + "loss": -0.0685, + "reward": 0.056444522272795436, + "reward_std": 0.027629424538463353, + "rewards/ddi_reward": -0.011138367670355365, + "rewards/jaccard_reward": 0.05693360934965312, + "rewards/refuse_rate_reward": -0.002445436525158584, + "step": 680 + }, + { + "completion_length": 23.8296875, + "epoch": 0.0552, + "grad_norm": 2.75173282623291, + "kl": 7.992466278374195, + "learning_rate": 9.961052228289315e-06, + "loss": 0.2587, + "reward": 0.049881663918495175, + "reward_std": 0.02638693692861125, + "rewards/ddi_reward": -0.006131883361376822, + "rewards/jaccard_reward": 0.051470599754247816, + "rewards/refuse_rate_reward": -0.007944680843502283, + "step": 690 + }, + { + "completion_length": 24.865625, + "epoch": 0.056, + "grad_norm": 3.377664804458618, + "kl": 0.39134550392627715, + "learning_rate": 9.959445163386055e-06, + "loss": -0.0476, + "reward": 0.055098794400691986, + "reward_std": 0.025284902611747383, + "rewards/ddi_reward": -0.010965214512543753, + "rewards/jaccard_reward": 0.05637338142842054, + "rewards/refuse_rate_reward": -0.006372936989646405, + "step": 700 + }, + { + "completion_length": 26.3296875, + "epoch": 0.0568, + "grad_norm": 6.39014196395874, + "kl": 0.33181479200720787, + "learning_rate": 9.957805744900137e-06, + "loss": -0.0316, + "reward": 0.03683172918390483, + "reward_std": 0.03280671900138259, + "rewards/ddi_reward": -0.00559197131806286, + "rewards/jaccard_reward": 0.037682821130147204, + "rewards/refuse_rate_reward": -0.0042554644518531855, + "step": 710 + }, + { + "completion_length": 24.7234375, + "epoch": 0.0576, + "grad_norm": 2.2818443775177, + "kl": 0.43947252482175825, + "learning_rate": 9.956133983526521e-06, + "loss": -0.0575, + "reward": 0.05648327623493969, + "reward_std": 0.027767761470749976, + "rewards/ddi_reward": -0.0030078074167249726, + "rewards/jaccard_reward": 0.057222231896594165, + "rewards/refuse_rate_reward": -0.0036947775399312375, + "step": 720 + }, + { + "completion_length": 27.4953125, + "epoch": 0.0584, + "grad_norm": 5.280501842498779, + "kl": 0.33484707549214365, + "learning_rate": 9.954429890171158e-06, + "loss": -0.0418, + "reward": 0.05522567452862859, + "reward_std": 0.03311186046339572, + "rewards/ddi_reward": 0.008289201220031827, + "rewards/jaccard_reward": 0.05572746265679598, + "rewards/refuse_rate_reward": -0.0025089392089284956, + "step": 730 + }, + { + "completion_length": 26.91875, + "epoch": 0.0592, + "grad_norm": 1.5642400979995728, + "kl": 0.3389181047677994, + "learning_rate": 9.952693475950923e-06, + "loss": -0.0578, + "reward": 0.05578422164544463, + "reward_std": 0.02756371332798153, + "rewards/ddi_reward": -0.003137437082477845, + "rewards/jaccard_reward": 0.056878578616306186, + "rewards/refuse_rate_reward": -0.005471791070885956, + "step": 740 + }, + { + "completion_length": 26.2484375, + "epoch": 0.06, + "grad_norm": 2.591477394104004, + "kl": 0.36594081707298753, + "learning_rate": 9.950924752193538e-06, + "loss": -0.0147, + "reward": 0.038319311244413255, + "reward_std": 0.029807973187416792, + "rewards/ddi_reward": -0.00521323537977878, + "rewards/jaccard_reward": 0.03910649167373777, + "rewards/refuse_rate_reward": -0.003935902728699148, + "step": 750 + }, + { + "completion_length": 23.884375, + "epoch": 0.0608, + "grad_norm": 13.421483039855957, + "kl": 0.3592815324664116, + "learning_rate": 9.949123730437502e-06, + "loss": -0.0584, + "reward": 0.06712153116241097, + "reward_std": 0.031646537128835917, + "rewards/ddi_reward": -0.005634266987908631, + "rewards/jaccard_reward": 0.06795027237385512, + "rewards/refuse_rate_reward": -0.004143703664885834, + "step": 760 + }, + { + "completion_length": 21.6671875, + "epoch": 0.0616, + "grad_norm": 2.0353612899780273, + "kl": 0.4642515368759632, + "learning_rate": 9.947290422432012e-06, + "loss": -0.0423, + "reward": 0.04569629915058613, + "reward_std": 0.02800757682416588, + "rewards/ddi_reward": -0.0060972996521741155, + "rewards/jaccard_reward": 0.04608234982006252, + "rewards/refuse_rate_reward": -0.0019302572472952306, + "step": 770 + }, + { + "completion_length": 21.1140625, + "epoch": 0.0624, + "grad_norm": 4.093363285064697, + "kl": 0.5848795484751463, + "learning_rate": 9.945424840136893e-06, + "loss": -0.0378, + "reward": 0.044552494562231, + "reward_std": 0.02270221463404596, + "rewards/ddi_reward": -0.0016858902235981077, + "rewards/jaccard_reward": 0.04464971721754409, + "rewards/refuse_rate_reward": -0.0004861111170612276, + "step": 780 + }, + { + "completion_length": 25.9515625, + "epoch": 0.0632, + "grad_norm": 2.3215701580047607, + "kl": 0.332990113645792, + "learning_rate": 9.94352699572251e-06, + "loss": -0.046, + "reward": 0.05686944657936692, + "reward_std": 0.029186088498681782, + "rewards/ddi_reward": -0.005097762675723061, + "rewards/jaccard_reward": 0.05730741592124104, + "rewards/refuse_rate_reward": -0.002189845708198845, + "step": 790 + }, + { + "completion_length": 22.721875, + "epoch": 0.064, + "grad_norm": 3.331799030303955, + "kl": 0.47993106991052625, + "learning_rate": 9.9415969015697e-06, + "loss": -0.0371, + "reward": 0.04018822712823748, + "reward_std": 0.023183011496439577, + "rewards/ddi_reward": -0.010714077443117276, + "rewards/jaccard_reward": 0.040451879939064385, + "rewards/refuse_rate_reward": -0.0013182607712224127, + "step": 800 + }, + { + "completion_length": 21.553125, + "epoch": 0.0648, + "grad_norm": 2.5425071716308594, + "kl": 0.4477700784802437, + "learning_rate": 9.939634570269675e-06, + "loss": -0.0377, + "reward": 0.04277334804646671, + "reward_std": 0.024109624093398452, + "rewards/ddi_reward": -0.007773052353877574, + "rewards/jaccard_reward": 0.0431834428163711, + "rewards/refuse_rate_reward": -0.0020504712592810392, + "step": 810 + }, + { + "completion_length": 21.246875, + "epoch": 0.0656, + "grad_norm": 2.5005598068237305, + "kl": 0.4432547651231289, + "learning_rate": 9.937640014623965e-06, + "loss": -0.0607, + "reward": 0.06508857682347298, + "reward_std": 0.026943555939942597, + "rewards/ddi_reward": -0.00589528371929191, + "rewards/jaccard_reward": 0.06557456385344267, + "rewards/refuse_rate_reward": -0.002429935592226684, + "step": 820 + }, + { + "completion_length": 22.6390625, + "epoch": 0.0664, + "grad_norm": 2.89924955368042, + "kl": 0.5672906324267387, + "learning_rate": 9.935613247644311e-06, + "loss": -0.0633, + "reward": 0.06368781994096935, + "reward_std": 0.027525815879926085, + "rewards/ddi_reward": -0.007801824926718837, + "rewards/jaccard_reward": 0.0645161323598586, + "rewards/refuse_rate_reward": -0.0041415665298700334, + "step": 830 + }, + { + "completion_length": 22.7125, + "epoch": 0.0672, + "grad_norm": 2.494553327560425, + "kl": 0.5126267418265342, + "learning_rate": 9.933554282552587e-06, + "loss": -0.045, + "reward": 0.05825497517362237, + "reward_std": 0.02747459299862385, + "rewards/ddi_reward": -0.008568704675417394, + "rewards/jaccard_reward": 0.05883285254240036, + "rewards/refuse_rate_reward": -0.002889385027810931, + "step": 840 + }, + { + "completion_length": 25.8953125, + "epoch": 0.068, + "grad_norm": 1.3391239643096924, + "kl": 0.31319587901234625, + "learning_rate": 9.931463132780719e-06, + "loss": -0.0433, + "reward": 0.05274099339731038, + "reward_std": 0.028338412195444106, + "rewards/ddi_reward": -0.0027345330541720613, + "rewards/jaccard_reward": 0.053075330704450606, + "rewards/refuse_rate_reward": -0.0016716833924874663, + "step": 850 + }, + { + "completion_length": 18.69375, + "epoch": 0.0688, + "grad_norm": 3.449465274810791, + "kl": 0.424176574498415, + "learning_rate": 9.929339811970593e-06, + "loss": -0.0754, + "reward": 0.058173507277388126, + "reward_std": 0.031346387974917886, + "rewards/ddi_reward": -0.011264619915164076, + "rewards/jaccard_reward": 0.05863244173815474, + "rewards/refuse_rate_reward": -0.002294671558775008, + "step": 860 + }, + { + "completion_length": 20.8625, + "epoch": 0.0696, + "grad_norm": 2.4115195274353027, + "kl": 0.3642787978053093, + "learning_rate": 9.92718433397397e-06, + "loss": -0.0497, + "reward": 0.04935491872020066, + "reward_std": 0.02474272227846086, + "rewards/ddi_reward": -0.011361691250931472, + "rewards/jaccard_reward": 0.04949901591462549, + "rewards/refuse_rate_reward": -0.0007204861147329211, + "step": 870 + }, + { + "completion_length": 21.76875, + "epoch": 0.0704, + "grad_norm": 1.4584277868270874, + "kl": 0.3735868878662586, + "learning_rate": 9.924996712852384e-06, + "loss": -0.0549, + "reward": 0.047994614206254484, + "reward_std": 0.026561270607635377, + "rewards/ddi_reward": -0.011746885476168246, + "rewards/jaccard_reward": 0.048360065720044074, + "rewards/refuse_rate_reward": -0.0018272569868713617, + "step": 880 + }, + { + "completion_length": 24.8640625, + "epoch": 0.0712, + "grad_norm": 2.365738868713379, + "kl": 0.3514306958764791, + "learning_rate": 9.922776962877071e-06, + "loss": -0.0717, + "reward": 0.05983357541263103, + "reward_std": 0.027558912802487612, + "rewards/ddi_reward": -0.01076944075175561, + "rewards/jaccard_reward": 0.060265895538032056, + "rewards/refuse_rate_reward": -0.0021616042824462054, + "step": 890 + }, + { + "completion_length": 24.1765625, + "epoch": 0.072, + "grad_norm": 1.9031428098678589, + "kl": 0.38137699365615846, + "learning_rate": 9.920525098528856e-06, + "loss": -0.0592, + "reward": 0.07086379639804363, + "reward_std": 0.026479250052943826, + "rewards/ddi_reward": -0.006034689780790359, + "rewards/jaccard_reward": 0.07137850555591285, + "rewards/refuse_rate_reward": -0.00257355630164966, + "step": 900 + }, + { + "completion_length": 24.71875, + "epoch": 0.0728, + "grad_norm": 1.5082311630249023, + "kl": 0.40275125578045845, + "learning_rate": 9.918241134498063e-06, + "loss": -0.0543, + "reward": 0.047976852394640446, + "reward_std": 0.026674391515553, + "rewards/ddi_reward": -0.008288503689254868, + "rewards/jaccard_reward": 0.04847786771133542, + "rewards/refuse_rate_reward": -0.0025050730677321554, + "step": 910 + }, + { + "completion_length": 21.365625, + "epoch": 0.0736, + "grad_norm": 2.2370526790618896, + "kl": 0.5090387791395188, + "learning_rate": 9.915925085684433e-06, + "loss": -0.0836, + "reward": 0.06484682415612042, + "reward_std": 0.024180404003709556, + "rewards/ddi_reward": -0.007448705256683752, + "rewards/jaccard_reward": 0.06560050188563764, + "rewards/refuse_rate_reward": -0.0037683880538679658, + "step": 920 + }, + { + "completion_length": 22.9703125, + "epoch": 0.0744, + "grad_norm": 2.4240963459014893, + "kl": 0.6080885909497737, + "learning_rate": 9.913576967197006e-06, + "loss": -0.0569, + "reward": 0.06326471967622638, + "reward_std": 0.028557529440149665, + "rewards/ddi_reward": -0.007469095084525179, + "rewards/jaccard_reward": 0.06369911222718656, + "rewards/refuse_rate_reward": -0.0021719599957577885, + "step": 930 + }, + { + "completion_length": 22.853125, + "epoch": 0.0752, + "grad_norm": 9.723219871520996, + "kl": 0.4839223146438599, + "learning_rate": 9.91119679435404e-06, + "loss": -0.0706, + "reward": 0.06153585612773895, + "reward_std": 0.025089749740436672, + "rewards/ddi_reward": -0.01057787473546341, + "rewards/jaccard_reward": 0.061824414413422346, + "rewards/refuse_rate_reward": -0.0014427933492697775, + "step": 940 + }, + { + "completion_length": 22.33125, + "epoch": 0.076, + "grad_norm": 2.521580457687378, + "kl": 0.4068328499794006, + "learning_rate": 9.9087845826829e-06, + "loss": -0.0536, + "reward": 0.06644059391692281, + "reward_std": 0.025233500683680177, + "rewards/ddi_reward": -0.009120530274230987, + "rewards/jaccard_reward": 0.06736432851757854, + "rewards/refuse_rate_reward": -0.00461867549456656, + "step": 950 + }, + { + "completion_length": 21.1, + "epoch": 0.0768, + "grad_norm": 2.5393309593200684, + "kl": 0.43110764399170876, + "learning_rate": 9.906340347919959e-06, + "loss": -0.0654, + "reward": 0.06696439711377025, + "reward_std": 0.02914987956173718, + "rewards/ddi_reward": -0.006037381466012448, + "rewards/jaccard_reward": 0.06763453427702189, + "rewards/refuse_rate_reward": -0.003350694489199668, + "step": 960 + }, + { + "completion_length": 21.4734375, + "epoch": 0.0776, + "grad_norm": 3.205782651901245, + "kl": 0.4767309673130512, + "learning_rate": 9.903864106010502e-06, + "loss": -0.0436, + "reward": 0.05564412451349199, + "reward_std": 0.025943185156211256, + "rewards/ddi_reward": -0.010071343203890137, + "rewards/jaccard_reward": 0.05606626959051937, + "rewards/refuse_rate_reward": -0.00211072793463245, + "step": 970 + }, + { + "completion_length": 20.3640625, + "epoch": 0.0784, + "grad_norm": 8.04362964630127, + "kl": 0.642146448418498, + "learning_rate": 9.901355873108611e-06, + "loss": -0.0417, + "reward": 0.05844091777689755, + "reward_std": 0.021689750300720335, + "rewards/ddi_reward": -0.005026922644174192, + "rewards/jaccard_reward": 0.05890950975008309, + "rewards/refuse_rate_reward": -0.0023429579799994827, + "step": 980 + }, + { + "completion_length": 19.04375, + "epoch": 0.0792, + "grad_norm": 5.108467102050781, + "kl": 0.6586141124367714, + "learning_rate": 9.898815665577069e-06, + "loss": -0.0352, + "reward": 0.04565193364396691, + "reward_std": 0.02435029160697013, + "rewards/ddi_reward": -0.006507644872181117, + "rewards/jaccard_reward": 0.04628191189840436, + "rewards/refuse_rate_reward": -0.003149887442123145, + "step": 990 + }, + { + "completion_length": 18.4734375, + "epoch": 0.08, + "grad_norm": 10.617640495300293, + "kl": 1.7444962188601494, + "learning_rate": 9.89624349998725e-06, + "loss": -0.0002, + "reward": 0.050531846145167945, + "reward_std": 0.024985995021415873, + "rewards/ddi_reward": -0.004282841575331986, + "rewards/jaccard_reward": 0.051663890667259696, + "rewards/refuse_rate_reward": -0.0056602251948788766, + "step": 1000 + }, + { + "completion_length": 16.703125, + "epoch": 0.0808, + "grad_norm": 16.444522857666016, + "kl": 0.5319973275065422, + "learning_rate": 9.893639393119007e-06, + "loss": -0.05, + "reward": 0.04777087995316833, + "reward_std": 0.025718830549158155, + "rewards/ddi_reward": -0.00017970757908187808, + "rewards/jaccard_reward": 0.04804729328025133, + "rewards/refuse_rate_reward": -0.001382068486418575, + "step": 1010 + }, + { + "completion_length": 19.584375, + "epoch": 0.0816, + "grad_norm": 2.5898537635803223, + "kl": 0.5110674373805523, + "learning_rate": 9.891003361960572e-06, + "loss": -0.0489, + "reward": 0.05387585740536451, + "reward_std": 0.025586627097800373, + "rewards/ddi_reward": -0.007083223172230646, + "rewards/jaccard_reward": 0.05460462439805269, + "rewards/refuse_rate_reward": -0.003643838758580387, + "step": 1020 + }, + { + "completion_length": 20.509375, + "epoch": 0.0824, + "grad_norm": 24.400226593017578, + "kl": 0.37676561176776885, + "learning_rate": 9.888335423708432e-06, + "loss": -0.0524, + "reward": 0.049183654878288505, + "reward_std": 0.02671249662525952, + "rewards/ddi_reward": -0.004053824728180189, + "rewards/jaccard_reward": 0.05017777916509658, + "rewards/refuse_rate_reward": -0.004970628279261291, + "step": 1030 + }, + { + "completion_length": 25.321875, + "epoch": 0.0832, + "grad_norm": 1.8572465181350708, + "kl": 0.35671677254140377, + "learning_rate": 9.885635595767228e-06, + "loss": -0.0411, + "reward": 0.05455333990976215, + "reward_std": 0.031191593501716852, + "rewards/ddi_reward": -0.006965544610284269, + "rewards/jaccard_reward": 0.05576258571818471, + "rewards/refuse_rate_reward": -0.006046232511289417, + "step": 1040 + }, + { + "completion_length": 20.5546875, + "epoch": 0.084, + "grad_norm": 2.979813575744629, + "kl": 0.47171940430998804, + "learning_rate": 9.88290389574964e-06, + "loss": -0.0535, + "reward": 0.05789860249496996, + "reward_std": 0.02566649462096393, + "rewards/ddi_reward": -0.0028174876468256118, + "rewards/jaccard_reward": 0.05850437378976494, + "rewards/refuse_rate_reward": -0.0030288548674434423, + "step": 1050 + }, + { + "completion_length": 23.6515625, + "epoch": 0.0848, + "grad_norm": 5.043629169464111, + "kl": 0.3643439456820488, + "learning_rate": 9.880140341476264e-06, + "loss": -0.0476, + "reward": 0.05530299409292638, + "reward_std": 0.028274358785711228, + "rewards/ddi_reward": -0.010580690513597802, + "rewards/jaccard_reward": 0.05585281334351748, + "rewards/refuse_rate_reward": -0.0027490839478559793, + "step": 1060 + }, + { + "completion_length": 24.5015625, + "epoch": 0.0856, + "grad_norm": 2.4544928073883057, + "kl": 0.34607671573758125, + "learning_rate": 9.877344950975504e-06, + "loss": -0.0587, + "reward": 0.06668875552713871, + "reward_std": 0.028178764507174492, + "rewards/ddi_reward": -0.010871269373456015, + "rewards/jaccard_reward": 0.06695130402222275, + "rewards/refuse_rate_reward": -0.0013127367943525313, + "step": 1070 + }, + { + "completion_length": 26.7984375, + "epoch": 0.0864, + "grad_norm": 1.4340753555297852, + "kl": 0.4237507998943329, + "learning_rate": 9.874517742483454e-06, + "loss": -0.0632, + "reward": 0.05980852581560612, + "reward_std": 0.02517706751823425, + "rewards/ddi_reward": 0.0016801168792881071, + "rewards/jaccard_reward": 0.060260049568023534, + "rewards/refuse_rate_reward": -0.0022576209506951273, + "step": 1080 + }, + { + "completion_length": 28.628125, + "epoch": 0.0872, + "grad_norm": 3.4584853649139404, + "kl": 0.4108202330768108, + "learning_rate": 9.871658734443774e-06, + "loss": -0.0513, + "reward": 0.05805153548717499, + "reward_std": 0.02971949023194611, + "rewards/ddi_reward": -0.012516241532284766, + "rewards/jaccard_reward": 0.05857565607875585, + "rewards/refuse_rate_reward": -0.002620599139481783, + "step": 1090 + }, + { + "completion_length": 24.3359375, + "epoch": 0.088, + "grad_norm": 1.498334288597107, + "kl": 0.3951046302914619, + "learning_rate": 9.86876794550757e-06, + "loss": -0.019, + "reward": 0.0329466309864074, + "reward_std": 0.026199090084992348, + "rewards/ddi_reward": -0.0035395075217820706, + "rewards/jaccard_reward": 0.03338264317717403, + "rewards/refuse_rate_reward": -0.0021800595452077685, + "step": 1100 + }, + { + "completion_length": 23.1703125, + "epoch": 0.0888, + "grad_norm": 1.749222755432129, + "kl": 0.43197436556220054, + "learning_rate": 9.865845394533282e-06, + "loss": -0.0463, + "reward": 0.06163178258575499, + "reward_std": 0.029102146811783313, + "rewards/ddi_reward": -0.009200153435813263, + "rewards/jaccard_reward": 0.06187405614182353, + "rewards/refuse_rate_reward": -0.0012113667791709304, + "step": 1110 + }, + { + "completion_length": 22.759375, + "epoch": 0.0896, + "grad_norm": 5.944066524505615, + "kl": 0.4161111660301685, + "learning_rate": 9.862891100586549e-06, + "loss": -0.0431, + "reward": 0.05601162908133119, + "reward_std": 0.028053085878491403, + "rewards/ddi_reward": -0.0068312414165120575, + "rewards/jaccard_reward": 0.05624139695428312, + "rewards/refuse_rate_reward": -0.0011488381423987448, + "step": 1120 + }, + { + "completion_length": 24.146875, + "epoch": 0.0904, + "grad_norm": 4.883821487426758, + "kl": 0.35037501752376554, + "learning_rate": 9.859905082940088e-06, + "loss": -0.0468, + "reward": 0.05242892252281308, + "reward_std": 0.030717754969373345, + "rewards/ddi_reward": -0.0013865639513824135, + "rewards/jaccard_reward": 0.05298969038994983, + "rewards/refuse_rate_reward": -0.0028038368094712497, + "step": 1130 + }, + { + "completion_length": 20.225, + "epoch": 0.0912, + "grad_norm": 3.539147138595581, + "kl": 0.3829478070139885, + "learning_rate": 9.856887361073572e-06, + "loss": -0.0517, + "reward": 0.04197888995986432, + "reward_std": 0.028772536478936673, + "rewards/ddi_reward": -0.0049468709621578455, + "rewards/jaccard_reward": 0.042275517026428136, + "rewards/refuse_rate_reward": -0.0014831349602900446, + "step": 1140 + }, + { + "completion_length": 24.38125, + "epoch": 0.092, + "grad_norm": 1.5310781002044678, + "kl": 0.5021689921617508, + "learning_rate": 9.853837954673503e-06, + "loss": -0.05, + "reward": 0.06075778636150062, + "reward_std": 0.027301435125991702, + "rewards/ddi_reward": -0.009389536845264956, + "rewards/jaccard_reward": 0.06107326028868556, + "rewards/refuse_rate_reward": -0.0015773809398524463, + "step": 1150 + }, + { + "completion_length": 24.625, + "epoch": 0.0928, + "grad_norm": 1.9107905626296997, + "kl": 0.43552867099642756, + "learning_rate": 9.850756883633077e-06, + "loss": -0.047, + "reward": 0.050847715570125726, + "reward_std": 0.02777251019142568, + "rewards/ddi_reward": -0.012074810615740717, + "rewards/jaccard_reward": 0.0513634649454616, + "rewards/refuse_rate_reward": -0.002578745083883405, + "step": 1160 + }, + { + "completion_length": 24.540625, + "epoch": 0.0936, + "grad_norm": 5.221657752990723, + "kl": 0.4793441526591778, + "learning_rate": 9.847644168052057e-06, + "loss": -0.0682, + "reward": 0.06798870712518693, + "reward_std": 0.028133330144919456, + "rewards/ddi_reward": -0.0013922846861532888, + "rewards/jaccard_reward": 0.06823536945739761, + "rewards/refuse_rate_reward": -0.0012333153164945543, + "step": 1170 + }, + { + "completion_length": 24.3625, + "epoch": 0.0944, + "grad_norm": 2.508359432220459, + "kl": 0.43448147401213644, + "learning_rate": 9.844499828236652e-06, + "loss": -0.0372, + "reward": 0.050987742049619554, + "reward_std": 0.0298352453391999, + "rewards/ddi_reward": -0.005165118057630025, + "rewards/jaccard_reward": 0.05173606257885695, + "rewards/refuse_rate_reward": -0.0037416067614685746, + "step": 1180 + }, + { + "completion_length": 25.125, + "epoch": 0.0952, + "grad_norm": 2.1786296367645264, + "kl": 0.4114015236496925, + "learning_rate": 9.841323884699365e-06, + "loss": -0.0449, + "reward": 0.05002592792734504, + "reward_std": 0.025081303948536516, + "rewards/ddi_reward": 2.5061730411835014e-06, + "rewards/jaccard_reward": 0.05067935879342258, + "rewards/refuse_rate_reward": -0.0032671522698365153, + "step": 1190 + }, + { + "completion_length": 22.0890625, + "epoch": 0.096, + "grad_norm": 3.532862663269043, + "kl": 0.49799945801496504, + "learning_rate": 9.838116358158876e-06, + "loss": -0.0385, + "reward": 0.04073324385099113, + "reward_std": 0.023458675807341935, + "rewards/ddi_reward": -0.006117396679474041, + "rewards/jaccard_reward": 0.04114226405508816, + "rewards/refuse_rate_reward": -0.0020451055141165854, + "step": 1200 + }, + { + "completion_length": 24.9125, + "epoch": 0.0968, + "grad_norm": 2.780172348022461, + "kl": 0.34704585522413256, + "learning_rate": 9.834877269539902e-06, + "loss": -0.0475, + "reward": 0.05580670065246522, + "reward_std": 0.027997491357382387, + "rewards/ddi_reward": -0.006012217851821333, + "rewards/jaccard_reward": 0.05630015060305595, + "rewards/refuse_rate_reward": -0.0024672506377100946, + "step": 1210 + }, + { + "completion_length": 23.8921875, + "epoch": 0.0976, + "grad_norm": 7.656563758850098, + "kl": 0.48737442344427107, + "learning_rate": 9.831606639973052e-06, + "loss": -0.0335, + "reward": 0.04678160212934017, + "reward_std": 0.03232279461808503, + "rewards/ddi_reward": -0.004059745525592007, + "rewards/jaccard_reward": 0.04727502972818911, + "rewards/refuse_rate_reward": -0.0024671379826031624, + "step": 1220 + }, + { + "completion_length": 19.940625, + "epoch": 0.0984, + "grad_norm": 2.8221230506896973, + "kl": 0.4046324580907822, + "learning_rate": 9.828304490794708e-06, + "loss": -0.0461, + "reward": 0.04505473374156281, + "reward_std": 0.02687023808248341, + "rewards/ddi_reward": -0.013577377842739224, + "rewards/jaccard_reward": 0.0451277848740574, + "rewards/refuse_rate_reward": -0.00036525975447148084, + "step": 1230 + }, + { + "completion_length": 22.053125, + "epoch": 0.0992, + "grad_norm": 1.5789248943328857, + "kl": 0.3710774064064026, + "learning_rate": 9.824970843546862e-06, + "loss": -0.0514, + "reward": 0.046630195470061156, + "reward_std": 0.029916393104940653, + "rewards/ddi_reward": -0.011273550061741845, + "rewards/jaccard_reward": 0.04701858748740051, + "rewards/refuse_rate_reward": -0.0019419643678702414, + "step": 1240 + }, + { + "completion_length": 20.1828125, + "epoch": 0.1, + "grad_norm": 3.300907611846924, + "kl": 0.5440885283052921, + "learning_rate": 9.821605719976997e-06, + "loss": -0.0331, + "reward": 0.04584857514128089, + "reward_std": 0.02708849050104618, + "rewards/ddi_reward": -0.010097040017717518, + "rewards/jaccard_reward": 0.04713540726806968, + "rewards/refuse_rate_reward": -0.006434151809662581, + "step": 1250 + }, + { + "completion_length": 22.2109375, + "epoch": 0.1008, + "grad_norm": 5.068177700042725, + "kl": 0.4118523895740509, + "learning_rate": 9.818209142037932e-06, + "loss": -0.039, + "reward": 0.05030130087397992, + "reward_std": 0.0278244158718735, + "rewards/ddi_reward": 0.004240677383495495, + "rewards/jaccard_reward": 0.051104252925142646, + "rewards/refuse_rate_reward": -0.0040147569961845875, + "step": 1260 + }, + { + "completion_length": 27.834375, + "epoch": 0.1016, + "grad_norm": 3.331639289855957, + "kl": 0.72084486708045, + "learning_rate": 9.814781131887681e-06, + "loss": -0.032, + "reward": 0.06520194625481963, + "reward_std": 0.028283318597823383, + "rewards/ddi_reward": -0.00899310649256222, + "rewards/jaccard_reward": 0.06584419389255344, + "rewards/refuse_rate_reward": -0.0032112419372424482, + "step": 1270 + }, + { + "completion_length": 28.3140625, + "epoch": 0.1024, + "grad_norm": 1.3702776432037354, + "kl": 0.3350364826619625, + "learning_rate": 9.811321711889313e-06, + "loss": -0.0536, + "reward": 0.058503843005746604, + "reward_std": 0.03102583228610456, + "rewards/ddi_reward": -0.01104991605388932, + "rewards/jaccard_reward": 0.058880903339013455, + "rewards/refuse_rate_reward": -0.0018853043671697377, + "step": 1280 + }, + { + "completion_length": 23.728125, + "epoch": 0.1032, + "grad_norm": 1.2916789054870605, + "kl": 0.4516872279345989, + "learning_rate": 9.807830904610803e-06, + "loss": -0.0249, + "reward": 0.0366468018386513, + "reward_std": 0.028912532795220613, + "rewards/ddi_reward": -0.0018764787178952246, + "rewards/jaccard_reward": 0.03727001771330833, + "rewards/refuse_rate_reward": -0.0031160762999206783, + "step": 1290 + }, + { + "completion_length": 27.1921875, + "epoch": 0.104, + "grad_norm": 2.734806537628174, + "kl": 0.37923685908317567, + "learning_rate": 9.804308732824883e-06, + "loss": -0.0294, + "reward": 0.056237278738990426, + "reward_std": 0.030014314129948615, + "rewards/ddi_reward": -0.002761623848346062, + "rewards/jaccard_reward": 0.05762832197360694, + "rewards/refuse_rate_reward": -0.006955216231290251, + "step": 1300 + }, + { + "completion_length": 23.35, + "epoch": 0.1048, + "grad_norm": 1.9605711698532104, + "kl": 0.41794675067067144, + "learning_rate": 9.800755219508898e-06, + "loss": -0.0641, + "reward": 0.05725725004449487, + "reward_std": 0.028483542148023844, + "rewards/ddi_reward": 0.0049227124924073, + "rewards/jaccard_reward": 0.05826149606145918, + "rewards/refuse_rate_reward": -0.0050212309579364955, + "step": 1310 + }, + { + "completion_length": 23.7921875, + "epoch": 0.1056, + "grad_norm": 2.971281051635742, + "kl": 0.4519328311085701, + "learning_rate": 9.79717038784465e-06, + "loss": -0.0348, + "reward": 0.05630892829503864, + "reward_std": 0.027057580649852753, + "rewards/ddi_reward": -0.0046528119244612755, + "rewards/jaccard_reward": 0.056946464464999735, + "rewards/refuse_rate_reward": -0.0031876805354841054, + "step": 1320 + }, + { + "completion_length": 20.9546875, + "epoch": 0.1064, + "grad_norm": 3.747969627380371, + "kl": 0.5203683212399483, + "learning_rate": 9.793554261218257e-06, + "loss": -0.0297, + "reward": 0.05149112897925079, + "reward_std": 0.0287293441593647, + "rewards/ddi_reward": -0.0016537693387363105, + "rewards/jaccard_reward": 0.052999686740804464, + "rewards/refuse_rate_reward": -0.007542782759992405, + "step": 1330 + }, + { + "completion_length": 21.76875, + "epoch": 0.1072, + "grad_norm": 1.7484087944030762, + "kl": 0.43632240518927573, + "learning_rate": 9.789906863219983e-06, + "loss": -0.0441, + "reward": 0.04388708611950278, + "reward_std": 0.023241319134831427, + "rewards/ddi_reward": -0.0017104094265960156, + "rewards/jaccard_reward": 0.0444209499983117, + "rewards/refuse_rate_reward": -0.0026693272637203337, + "step": 1340 + }, + { + "completion_length": 25.0390625, + "epoch": 0.108, + "grad_norm": 1.1322036981582642, + "kl": 0.4575160674750805, + "learning_rate": 9.786228217644107e-06, + "loss": -0.0484, + "reward": 0.04228678289800882, + "reward_std": 0.027363972109742462, + "rewards/ddi_reward": 0.0012691488780546933, + "rewards/jaccard_reward": 0.04338895634864457, + "rewards/refuse_rate_reward": -0.005510867794509977, + "step": 1350 + }, + { + "completion_length": 20.65625, + "epoch": 0.1088, + "grad_norm": 3.9600112438201904, + "kl": 0.423899095878005, + "learning_rate": 9.782518348488746e-06, + "loss": -0.0721, + "reward": 0.056813425943255426, + "reward_std": 0.029535386594943702, + "rewards/ddi_reward": 0.004307507473276928, + "rewards/jaccard_reward": 0.05706295231357217, + "rewards/refuse_rate_reward": -0.001247632596641779, + "step": 1360 + }, + { + "completion_length": 23.2046875, + "epoch": 0.1096, + "grad_norm": 1.866873860359192, + "kl": 0.744344274699688, + "learning_rate": 9.778777279955713e-06, + "loss": -0.0561, + "reward": 0.06098699445137754, + "reward_std": 0.030673387553542854, + "rewards/ddi_reward": -0.005062484840163961, + "rewards/jaccard_reward": 0.06186608651187271, + "rewards/refuse_rate_reward": -0.004395461338572204, + "step": 1370 + }, + { + "completion_length": 24.553125, + "epoch": 0.1104, + "grad_norm": 3.6723835468292236, + "kl": 0.5640023954212665, + "learning_rate": 9.775005036450353e-06, + "loss": -0.0445, + "reward": 0.06380589492619038, + "reward_std": 0.02997522186487913, + "rewards/ddi_reward": -0.0011904205719474703, + "rewards/jaccard_reward": 0.06412173542194069, + "rewards/refuse_rate_reward": -0.0015792021062225104, + "step": 1380 + }, + { + "completion_length": 21.7296875, + "epoch": 0.1112, + "grad_norm": 1.6011555194854736, + "kl": 0.43414945229887963, + "learning_rate": 9.771201642581384e-06, + "loss": -0.0469, + "reward": 0.05215927893295884, + "reward_std": 0.0272215873003006, + "rewards/ddi_reward": 0.0002515258616767824, + "rewards/jaccard_reward": 0.05301486356183886, + "rewards/refuse_rate_reward": -0.0042779248091392216, + "step": 1390 + }, + { + "completion_length": 21.9890625, + "epoch": 0.112, + "grad_norm": 3.3245902061462402, + "kl": 0.46388472691178323, + "learning_rate": 9.76736712316074e-06, + "loss": -0.0459, + "reward": 0.06066368417814374, + "reward_std": 0.02517681373283267, + "rewards/ddi_reward": -0.00783395984908566, + "rewards/jaccard_reward": 0.06128061767667532, + "rewards/refuse_rate_reward": -0.0030846758862026038, + "step": 1400 + }, + { + "completion_length": 20.409375, + "epoch": 0.1128, + "grad_norm": 2.4316914081573486, + "kl": 0.49135151579976083, + "learning_rate": 9.763501503203405e-06, + "loss": -0.0491, + "reward": 0.05157588459551334, + "reward_std": 0.03236822127364576, + "rewards/ddi_reward": -0.008904571871971712, + "rewards/jaccard_reward": 0.05242029766086489, + "rewards/refuse_rate_reward": -0.004222064913483337, + "step": 1410 + }, + { + "completion_length": 23.44375, + "epoch": 0.1136, + "grad_norm": 2.255751132965088, + "kl": 0.3776444174349308, + "learning_rate": 9.75960480792725e-06, + "loss": -0.0561, + "reward": 0.05927390907891095, + "reward_std": 0.0275219873059541, + "rewards/ddi_reward": -0.012946001207455992, + "rewards/jaccard_reward": 0.059756405744701625, + "rewards/refuse_rate_reward": -0.002412479044869542, + "step": 1420 + }, + { + "completion_length": 20.0328125, + "epoch": 0.1144, + "grad_norm": 2.324611186981201, + "kl": 0.4729289323091507, + "learning_rate": 9.75567706275287e-06, + "loss": -0.0485, + "reward": 0.05336127479095012, + "reward_std": 0.02500615008175373, + "rewards/ddi_reward": -0.0023316092789173125, + "rewards/jaccard_reward": 0.0541946945944801, + "rewards/refuse_rate_reward": -0.00416709411656484, + "step": 1430 + }, + { + "completion_length": 23.8328125, + "epoch": 0.1152, + "grad_norm": 3.8145222663879395, + "kl": 0.46646992638707163, + "learning_rate": 9.751718293303423e-06, + "loss": -0.0446, + "reward": 0.05582557360175997, + "reward_std": 0.027434576628729702, + "rewards/ddi_reward": -0.013527475955197588, + "rewards/jaccard_reward": 0.05691226712660864, + "rewards/refuse_rate_reward": -0.005433464306406677, + "step": 1440 + }, + { + "completion_length": 26.975, + "epoch": 0.116, + "grad_norm": 2.8181536197662354, + "kl": 0.4300100147724152, + "learning_rate": 9.74772852540445e-06, + "loss": -0.042, + "reward": 0.06227147658355534, + "reward_std": 0.024847394414246084, + "rewards/ddi_reward": -0.006491928259129054, + "rewards/jaccard_reward": 0.0630010416265577, + "rewards/refuse_rate_reward": -0.003647830174304545, + "step": 1450 + }, + { + "completion_length": 26.1234375, + "epoch": 0.1168, + "grad_norm": 1.4345605373382568, + "kl": 0.5669560343027115, + "learning_rate": 9.743707785083723e-06, + "loss": -0.0249, + "reward": 0.04522666987031698, + "reward_std": 0.025412922305986286, + "rewards/ddi_reward": -0.009713655422092415, + "rewards/jaccard_reward": 0.04607582825701684, + "rewards/refuse_rate_reward": -0.004245795495808125, + "step": 1460 + }, + { + "completion_length": 25.4828125, + "epoch": 0.1176, + "grad_norm": 1.478529930114746, + "kl": 0.5241041526198387, + "learning_rate": 9.739656098571056e-06, + "loss": -0.0421, + "reward": 0.05682356618344784, + "reward_std": 0.028850405104458333, + "rewards/ddi_reward": -0.012555188487749547, + "rewards/jaccard_reward": 0.05754404996987432, + "rewards/refuse_rate_reward": -0.003602430678438395, + "step": 1470 + }, + { + "completion_length": 21.596875, + "epoch": 0.1184, + "grad_norm": 5.26859712600708, + "kl": 0.5925735630095005, + "learning_rate": 9.735573492298156e-06, + "loss": -0.0695, + "reward": 0.06132822334766388, + "reward_std": 0.027297979686409235, + "rewards/ddi_reward": -0.00963461613864638, + "rewards/jaccard_reward": 0.061663179006427524, + "rewards/refuse_rate_reward": -0.0016747835848946123, + "step": 1480 + }, + { + "completion_length": 25.584375, + "epoch": 0.1192, + "grad_norm": 1.7477681636810303, + "kl": 0.4148997124284506, + "learning_rate": 9.73145999289843e-06, + "loss": -0.043, + "reward": 0.05522136185318231, + "reward_std": 0.026922978227958083, + "rewards/ddi_reward": -0.010373962111771107, + "rewards/jaccard_reward": 0.05624089143238962, + "rewards/refuse_rate_reward": -0.005097656277939677, + "step": 1490 + }, + { + "completion_length": 22.078125, + "epoch": 0.12, + "grad_norm": 2.398113965988159, + "kl": 0.3701006568968296, + "learning_rate": 9.727315627206827e-06, + "loss": -0.0535, + "reward": 0.04743431252427399, + "reward_std": 0.02748703076504171, + "rewards/ddi_reward": -0.01393632596009411, + "rewards/jaccard_reward": 0.04819000861607492, + "rewards/refuse_rate_reward": -0.0037784830085001884, + "step": 1500 + }, + { + "completion_length": 20.6171875, + "epoch": 0.1208, + "grad_norm": 1.211120843887329, + "kl": 0.36030448228120804, + "learning_rate": 9.72314042225965e-06, + "loss": -0.0478, + "reward": 0.04897936712950468, + "reward_std": 0.024107373040169477, + "rewards/ddi_reward": 0.0001607320737093687, + "rewards/jaccard_reward": 0.049341482407180594, + "rewards/refuse_rate_reward": -0.0018105722498148679, + "step": 1510 + }, + { + "completion_length": 20.4234375, + "epoch": 0.1216, + "grad_norm": 2.912285327911377, + "kl": 0.4840222917497158, + "learning_rate": 9.718934405294394e-06, + "loss": -0.0473, + "reward": 0.049080992210656406, + "reward_std": 0.02747847274877131, + "rewards/ddi_reward": -0.001228871988132596, + "rewards/jaccard_reward": 0.049536720744799825, + "rewards/refuse_rate_reward": -0.0022786458488553763, + "step": 1520 + }, + { + "completion_length": 22.45, + "epoch": 0.1224, + "grad_norm": 0.9167805910110474, + "kl": 0.4123259000480175, + "learning_rate": 9.714697603749549e-06, + "loss": -0.0562, + "reward": 0.061920927558094266, + "reward_std": 0.024149652873165905, + "rewards/ddi_reward": -0.007779879795270972, + "rewards/jaccard_reward": 0.062332637561485174, + "rewards/refuse_rate_reward": -0.002058555604889989, + "step": 1530 + }, + { + "completion_length": 17.8546875, + "epoch": 0.1232, + "grad_norm": 3.0418572425842285, + "kl": 0.4646789960563183, + "learning_rate": 9.710430045264443e-06, + "loss": -0.0631, + "reward": 0.04878751132637262, + "reward_std": 0.02498317528516054, + "rewards/ddi_reward": 0.0001428394825779833, + "rewards/jaccard_reward": 0.0495257314061746, + "rewards/refuse_rate_reward": -0.0036910962662659585, + "step": 1540 + }, + { + "completion_length": 21.4046875, + "epoch": 0.124, + "grad_norm": 7.118134021759033, + "kl": 0.4416692428290844, + "learning_rate": 9.706131757679043e-06, + "loss": -0.0356, + "reward": 0.037781877000816165, + "reward_std": 0.026465432066470385, + "rewards/ddi_reward": -0.005517835286445916, + "rewards/jaccard_reward": 0.03897037981078029, + "rewards/refuse_rate_reward": -0.005942516843788326, + "step": 1550 + }, + { + "completion_length": 23.6265625, + "epoch": 0.1248, + "grad_norm": 5.881462574005127, + "kl": 0.7530009865760803, + "learning_rate": 9.701802769033788e-06, + "loss": -0.0605, + "reward": 0.07310577733442188, + "reward_std": 0.026724845729768278, + "rewards/ddi_reward": -0.002391991642070934, + "rewards/jaccard_reward": 0.07381552970036864, + "rewards/refuse_rate_reward": -0.0035487631044816225, + "step": 1560 + }, + { + "completion_length": 22.028125, + "epoch": 0.1256, + "grad_norm": 2.387352705001831, + "kl": 0.4317987523972988, + "learning_rate": 9.697443107569394e-06, + "loss": -0.0468, + "reward": 0.05396655257791281, + "reward_std": 0.022740213782526553, + "rewards/ddi_reward": -0.0019028650305699557, + "rewards/jaccard_reward": 0.05422321413643658, + "rewards/refuse_rate_reward": -0.0012833100860007106, + "step": 1570 + }, + { + "completion_length": 21.36875, + "epoch": 0.1264, + "grad_norm": 2.5116522312164307, + "kl": 0.44034027457237246, + "learning_rate": 9.693052801726677e-06, + "loss": -0.0598, + "reward": 0.06055645407177508, + "reward_std": 0.028677422273904084, + "rewards/ddi_reward": -0.007306264012004249, + "rewards/jaccard_reward": 0.06096491725184024, + "rewards/refuse_rate_reward": -0.002042315399739891, + "step": 1580 + }, + { + "completion_length": 24.428125, + "epoch": 0.1272, + "grad_norm": 3.732570171356201, + "kl": 0.37875150591135026, + "learning_rate": 9.688631880146366e-06, + "loss": -0.0312, + "reward": 0.04672828949987888, + "reward_std": 0.028655194258317352, + "rewards/ddi_reward": -0.007469750416930765, + "rewards/jaccard_reward": 0.04701505419798195, + "rewards/refuse_rate_reward": -0.001433827995788306, + "step": 1590 + }, + { + "completion_length": 23.4953125, + "epoch": 0.128, + "grad_norm": 2.5749526023864746, + "kl": 0.43338054344058036, + "learning_rate": 9.684180371668916e-06, + "loss": -0.0574, + "reward": 0.052884919405914846, + "reward_std": 0.02483044331893325, + "rewards/ddi_reward": -0.0009959016577340662, + "rewards/jaccard_reward": 0.053234248398803176, + "rewards/refuse_rate_reward": -0.0017466518562287091, + "step": 1600 + }, + { + "completion_length": 20.353125, + "epoch": 0.1288, + "grad_norm": 1.9179983139038086, + "kl": 0.5308443672955037, + "learning_rate": 9.679698305334318e-06, + "loss": -0.0656, + "reward": 0.05463919145986438, + "reward_std": 0.024079738219734283, + "rewards/ddi_reward": -0.0002497680252417922, + "rewards/jaccard_reward": 0.05468383356928826, + "rewards/refuse_rate_reward": -0.00022321429569274187, + "step": 1610 + }, + { + "completion_length": 25.0109375, + "epoch": 0.1296, + "grad_norm": 2.0632293224334717, + "kl": 0.44098646342754366, + "learning_rate": 9.675185710381912e-06, + "loss": -0.0408, + "reward": 0.05318607809022069, + "reward_std": 0.030904495855793357, + "rewards/ddi_reward": -0.007200843162718229, + "rewards/jaccard_reward": 0.05364441138226539, + "rewards/refuse_rate_reward": -0.0022916666814126073, + "step": 1620 + }, + { + "completion_length": 21.609375, + "epoch": 0.1304, + "grad_norm": 4.2434611320495605, + "kl": 0.5367837831377983, + "learning_rate": 9.670642616250195e-06, + "loss": -0.0388, + "reward": 0.04708980801515281, + "reward_std": 0.025247108936309815, + "rewards/ddi_reward": -0.0014117995160631835, + "rewards/jaccard_reward": 0.04740788785275072, + "rewards/refuse_rate_reward": -0.001590401795692742, + "step": 1630 + }, + { + "completion_length": 19.90625, + "epoch": 0.1312, + "grad_norm": 1.9028974771499634, + "kl": 0.5100836738944053, + "learning_rate": 9.666069052576632e-06, + "loss": -0.0647, + "reward": 0.05597423538565636, + "reward_std": 0.02106211339123547, + "rewards/ddi_reward": 0.001874837197829038, + "rewards/jaccard_reward": 0.05644395975396037, + "rewards/refuse_rate_reward": -0.00234861500794068, + "step": 1640 + }, + { + "completion_length": 18.93125, + "epoch": 0.132, + "grad_norm": 5.10530424118042, + "kl": 0.6183934688568116, + "learning_rate": 9.661465049197456e-06, + "loss": -0.0754, + "reward": 0.05852998299524188, + "reward_std": 0.028616862976923584, + "rewards/ddi_reward": -0.011521016288315877, + "rewards/jaccard_reward": 0.05896990289911628, + "rewards/refuse_rate_reward": -0.002199590904638171, + "step": 1650 + }, + { + "completion_length": 21.0296875, + "epoch": 0.1328, + "grad_norm": 0.8690043687820435, + "kl": 0.5059335626661777, + "learning_rate": 9.656830636147479e-06, + "loss": -0.0449, + "reward": 0.04607570001389831, + "reward_std": 0.025871436670422555, + "rewards/ddi_reward": -0.007340307591948658, + "rewards/jaccard_reward": 0.04661489888094365, + "rewards/refuse_rate_reward": -0.0026959889568388463, + "step": 1660 + }, + { + "completion_length": 20.571875, + "epoch": 0.1336, + "grad_norm": 1.5945849418640137, + "kl": 0.49787177592515947, + "learning_rate": 9.652165843659892e-06, + "loss": -0.0664, + "reward": 0.06405440755188466, + "reward_std": 0.02630614573135972, + "rewards/ddi_reward": -0.005523826071294024, + "rewards/jaccard_reward": 0.06499201152473688, + "rewards/refuse_rate_reward": -0.004688024718780071, + "step": 1670 + }, + { + "completion_length": 19.628125, + "epoch": 0.1344, + "grad_norm": 6.410677433013916, + "kl": 0.6393591642379761, + "learning_rate": 9.647470702166081e-06, + "loss": -0.0304, + "reward": 0.04802501811645925, + "reward_std": 0.026006719516590237, + "rewards/ddi_reward": -0.005387568310834467, + "rewards/jaccard_reward": 0.04858399687800556, + "rewards/refuse_rate_reward": -0.002794890198856592, + "step": 1680 + }, + { + "completion_length": 22.1609375, + "epoch": 0.1352, + "grad_norm": 1.4302363395690918, + "kl": 0.5530774682760239, + "learning_rate": 9.642745242295402e-06, + "loss": -0.0583, + "reward": 0.06193780153989792, + "reward_std": 0.02796526006422937, + "rewards/ddi_reward": -0.0064023379003629085, + "rewards/jaccard_reward": 0.06226853067055345, + "rewards/refuse_rate_reward": -0.001653645862825215, + "step": 1690 + }, + { + "completion_length": 23.4234375, + "epoch": 0.136, + "grad_norm": 0.8524034023284912, + "kl": 0.5885169185698033, + "learning_rate": 9.637989494875006e-06, + "loss": -0.0322, + "reward": 0.052494370844215155, + "reward_std": 0.025767145096324384, + "rewards/ddi_reward": -0.0029318061249796303, + "rewards/jaccard_reward": 0.05278385616838932, + "rewards/refuse_rate_reward": -0.0014474312891252338, + "step": 1700 + }, + { + "completion_length": 21.0125, + "epoch": 0.1368, + "grad_norm": 1.255571961402893, + "kl": 0.5042624406516552, + "learning_rate": 9.633203490929628e-06, + "loss": -0.0568, + "reward": 0.05351695558056235, + "reward_std": 0.02568014180287719, + "rewards/ddi_reward": -0.007451715452771168, + "rewards/jaccard_reward": 0.05433182250708342, + "rewards/refuse_rate_reward": -0.004074337112251669, + "step": 1710 + }, + { + "completion_length": 24.0484375, + "epoch": 0.1376, + "grad_norm": 4.672509670257568, + "kl": 0.4076022394001484, + "learning_rate": 9.628387261681385e-06, + "loss": -0.0573, + "reward": 0.06473975852131844, + "reward_std": 0.02845569411292672, + "rewards/ddi_reward": -0.0015572972741210833, + "rewards/jaccard_reward": 0.06571495728567242, + "rewards/refuse_rate_reward": -0.004875992122106254, + "step": 1720 + }, + { + "completion_length": 21.240625, + "epoch": 0.1384, + "grad_norm": 3.4546291828155518, + "kl": 0.3845115780830383, + "learning_rate": 9.62354083854957e-06, + "loss": -0.0587, + "reward": 0.0517616496887058, + "reward_std": 0.02321402090601623, + "rewards/ddi_reward": -0.005891280199284666, + "rewards/jaccard_reward": 0.05235066954046488, + "rewards/refuse_rate_reward": -0.002945093100424856, + "step": 1730 + }, + { + "completion_length": 22.1234375, + "epoch": 0.1392, + "grad_norm": 3.9342026710510254, + "kl": 0.4156452305614948, + "learning_rate": 9.618664253150453e-06, + "loss": -0.0495, + "reward": 0.048118703672662376, + "reward_std": 0.024375111144036055, + "rewards/ddi_reward": -0.007662535354029387, + "rewards/jaccard_reward": 0.04890082213678397, + "rewards/refuse_rate_reward": -0.003910590405575931, + "step": 1740 + }, + { + "completion_length": 21.103125, + "epoch": 0.14, + "grad_norm": 2.5595505237579346, + "kl": 0.4893106333911419, + "learning_rate": 9.613757537297071e-06, + "loss": -0.0393, + "reward": 0.04554452411830425, + "reward_std": 0.02522597280330956, + "rewards/ddi_reward": -0.0034319281636271625, + "rewards/jaccard_reward": 0.046055150590837, + "rewards/refuse_rate_reward": -0.0025531326537020504, + "step": 1750 + }, + { + "completion_length": 21.340625, + "epoch": 0.1408, + "grad_norm": 7.252964496612549, + "kl": 0.5382134333252907, + "learning_rate": 9.60882072299902e-06, + "loss": -0.0643, + "reward": 0.06098688989877701, + "reward_std": 0.02717587905935943, + "rewards/ddi_reward": -0.005118564440635964, + "rewards/jaccard_reward": 0.06174523954396136, + "rewards/refuse_rate_reward": -0.0037917573004961013, + "step": 1760 + }, + { + "completion_length": 22.940625, + "epoch": 0.1416, + "grad_norm": 2.058075428009033, + "kl": 0.5961754582822323, + "learning_rate": 9.603853842462245e-06, + "loss": -0.0533, + "reward": 0.06027690451592207, + "reward_std": 0.02781693646684289, + "rewards/ddi_reward": 4.366711364127696e-05, + "rewards/jaccard_reward": 0.061910586431622505, + "rewards/refuse_rate_reward": -0.008168402931187303, + "step": 1770 + }, + { + "completion_length": 24.2515625, + "epoch": 0.1424, + "grad_norm": 2.606062173843384, + "kl": 0.5003072693943977, + "learning_rate": 9.598856928088837e-06, + "loss": -0.0445, + "reward": 0.061429803865030407, + "reward_std": 0.02353874403052032, + "rewards/ddi_reward": 0.0012945813170517795, + "rewards/jaccard_reward": 0.06250526730436831, + "rewards/refuse_rate_reward": -0.005377318174578249, + "step": 1780 + }, + { + "completion_length": 27.44375, + "epoch": 0.1432, + "grad_norm": 1.7016267776489258, + "kl": 0.44296313226222994, + "learning_rate": 9.593830012476812e-06, + "loss": -0.0337, + "reward": 0.053630699403584, + "reward_std": 0.026607065508142114, + "rewards/ddi_reward": -0.0036174721863062587, + "rewards/jaccard_reward": 0.05467944866977632, + "rewards/refuse_rate_reward": -0.005243746377527714, + "step": 1790 + }, + { + "completion_length": 25.55625, + "epoch": 0.144, + "grad_norm": 13.096869468688965, + "kl": 0.589732076227665, + "learning_rate": 9.588773128419907e-06, + "loss": -0.0069, + "reward": 0.0473182340618223, + "reward_std": 0.024968556268140672, + "rewards/ddi_reward": -0.0059025405731517825, + "rewards/jaccard_reward": 0.048083300981670615, + "rewards/refuse_rate_reward": -0.0038253348669968546, + "step": 1800 + }, + { + "completion_length": 24.1796875, + "epoch": 0.1448, + "grad_norm": 1.5489130020141602, + "kl": 0.561293201893568, + "learning_rate": 9.583686308907357e-06, + "loss": -0.0257, + "reward": 0.055040537286549805, + "reward_std": 0.026448406837880613, + "rewards/ddi_reward": -0.009468613725039177, + "rewards/jaccard_reward": 0.055422120611183345, + "rewards/refuse_rate_reward": -0.0019079185905866325, + "step": 1810 + }, + { + "completion_length": 26.5984375, + "epoch": 0.1456, + "grad_norm": 1.9180166721343994, + "kl": 0.40841879323124886, + "learning_rate": 9.578569587123691e-06, + "loss": -0.0499, + "reward": 0.06161570916883648, + "reward_std": 0.025642828736454248, + "rewards/ddi_reward": -0.00508377707155887, + "rewards/jaccard_reward": 0.06201577386818826, + "rewards/refuse_rate_reward": -0.002000322681851685, + "step": 1820 + }, + { + "completion_length": 25.2828125, + "epoch": 0.1464, + "grad_norm": 2.5974245071411133, + "kl": 0.40565155521035196, + "learning_rate": 9.573422996448503e-06, + "loss": -0.0587, + "reward": 0.06302496287971734, + "reward_std": 0.030069503653794526, + "rewards/ddi_reward": -0.012565212362096645, + "rewards/jaccard_reward": 0.0637076610699296, + "rewards/refuse_rate_reward": -0.003413487592479214, + "step": 1830 + }, + { + "completion_length": 24.965625, + "epoch": 0.1472, + "grad_norm": 1.6810754537582397, + "kl": 0.38974175453186033, + "learning_rate": 9.568246570456246e-06, + "loss": -0.1069, + "reward": 0.05979230189695954, + "reward_std": 0.026637096609920263, + "rewards/ddi_reward": -0.00977391644846648, + "rewards/jaccard_reward": 0.0603740899823606, + "rewards/refuse_rate_reward": -0.0029089335934259, + "step": 1840 + }, + { + "completion_length": 25.95625, + "epoch": 0.148, + "grad_norm": 1.5618747472763062, + "kl": 0.43564095348119736, + "learning_rate": 9.563040342916002e-06, + "loss": -0.0447, + "reward": 0.05098992744460702, + "reward_std": 0.029408487025648355, + "rewards/ddi_reward": 0.0005151014833245426, + "rewards/jaccard_reward": 0.05121184904128313, + "rewards/refuse_rate_reward": -0.001109610579442233, + "step": 1850 + }, + { + "completion_length": 25.553125, + "epoch": 0.1488, + "grad_norm": 2.5378940105438232, + "kl": 0.4697283051908016, + "learning_rate": 9.557804347791275e-06, + "loss": -0.0052, + "reward": 0.04213532190769911, + "reward_std": 0.028564901987556367, + "rewards/ddi_reward": -0.002595957939047366, + "rewards/jaccard_reward": 0.04278463027440012, + "rewards/refuse_rate_reward": -0.003246545139700174, + "step": 1860 + }, + { + "completion_length": 25.1578125, + "epoch": 0.1496, + "grad_norm": 1.9681588411331177, + "kl": 0.3878356389701366, + "learning_rate": 9.552538619239753e-06, + "loss": -0.0346, + "reward": 0.053833576804026964, + "reward_std": 0.025594086712226273, + "rewards/ddi_reward": -0.0066868361580418425, + "rewards/jaccard_reward": 0.05421927113784477, + "rewards/refuse_rate_reward": -0.001928464142838493, + "step": 1870 + }, + { + "completion_length": 24.1828125, + "epoch": 0.1504, + "grad_norm": 2.341069221496582, + "kl": 0.46562488973140714, + "learning_rate": 9.547243191613101e-06, + "loss": -0.0406, + "reward": 0.052742607239633796, + "reward_std": 0.025066361716017126, + "rewards/ddi_reward": -0.015998071926878764, + "rewards/jaccard_reward": 0.053201600164175036, + "rewards/refuse_rate_reward": -0.002294967765919864, + "step": 1880 + }, + { + "completion_length": 23.1421875, + "epoch": 0.1512, + "grad_norm": 2.8886590003967285, + "kl": 0.3880458176136017, + "learning_rate": 9.541918099456725e-06, + "loss": -0.0499, + "reward": 0.05463268104940653, + "reward_std": 0.025500115286558867, + "rewards/ddi_reward": -0.00225212506193202, + "rewards/jaccard_reward": 0.055020598554983736, + "rewards/refuse_rate_reward": -0.0019395969109609723, + "step": 1890 + }, + { + "completion_length": 19.721875, + "epoch": 0.152, + "grad_norm": 1.1612889766693115, + "kl": 0.43325130864977834, + "learning_rate": 9.536563377509554e-06, + "loss": -0.0372, + "reward": 0.0433676993008703, + "reward_std": 0.024130673008039595, + "rewards/ddi_reward": -0.003893899198737927, + "rewards/jaccard_reward": 0.04382709694909863, + "rewards/refuse_rate_reward": -0.002296993392519653, + "step": 1900 + }, + { + "completion_length": 17.2515625, + "epoch": 0.1528, + "grad_norm": 1.7784113883972168, + "kl": 0.43289407193660734, + "learning_rate": 9.531179060703811e-06, + "loss": -0.066, + "reward": 0.05276009938679636, + "reward_std": 0.024833360128104685, + "rewards/ddi_reward": -0.01122017230372876, + "rewards/jaccard_reward": 0.05318085704930127, + "rewards/refuse_rate_reward": -0.0021037946455180643, + "step": 1910 + }, + { + "completion_length": 20.66875, + "epoch": 0.1536, + "grad_norm": 2.8341245651245117, + "kl": 0.5906827956438064, + "learning_rate": 9.525765184164779e-06, + "loss": -0.0344, + "reward": 0.046689460845664145, + "reward_std": 0.024658868694677948, + "rewards/ddi_reward": 0.005995015404187143, + "rewards/jaccard_reward": 0.047112360666505995, + "rewards/refuse_rate_reward": -0.002114496263675392, + "step": 1920 + }, + { + "completion_length": 23.140625, + "epoch": 0.1544, + "grad_norm": 2.8262014389038086, + "kl": 0.37009684517979624, + "learning_rate": 9.520321783210589e-06, + "loss": -0.05, + "reward": 0.05227736607193947, + "reward_std": 0.02907090657390654, + "rewards/ddi_reward": 0.00930623677559197, + "rewards/jaccard_reward": 0.05363227617926895, + "rewards/refuse_rate_reward": -0.006774553738068789, + "step": 1930 + }, + { + "completion_length": 27.60625, + "epoch": 0.1552, + "grad_norm": 1.6470423936843872, + "kl": 0.46727864667773245, + "learning_rate": 9.514848893351967e-06, + "loss": -0.0431, + "reward": 0.05934821758419275, + "reward_std": 0.03056013584136963, + "rewards/ddi_reward": -0.010815875929256435, + "rewards/jaccard_reward": 0.060464942315593365, + "rewards/refuse_rate_reward": -0.005583626625593752, + "step": 1940 + }, + { + "completion_length": 24.1484375, + "epoch": 0.156, + "grad_norm": 3.2191226482391357, + "kl": 0.5143943756818772, + "learning_rate": 9.509346550292022e-06, + "loss": -0.0348, + "reward": 0.053695641609374435, + "reward_std": 0.025688855431508274, + "rewards/ddi_reward": -0.009198217006633059, + "rewards/jaccard_reward": 0.05469086027878802, + "rewards/refuse_rate_reward": -0.004976085649104789, + "step": 1950 + }, + { + "completion_length": 23.98125, + "epoch": 0.1568, + "grad_norm": 1.6083965301513672, + "kl": 0.3692587584257126, + "learning_rate": 9.503814789926e-06, + "loss": -0.0306, + "reward": 0.04650940764695406, + "reward_std": 0.028714942978695035, + "rewards/ddi_reward": -0.005319135962054133, + "rewards/jaccard_reward": 0.047066642541904, + "rewards/refuse_rate_reward": -0.002786176570225507, + "step": 1960 + }, + { + "completion_length": 27.4921875, + "epoch": 0.1576, + "grad_norm": 2.8270585536956787, + "kl": 0.3940353274345398, + "learning_rate": 9.498253648341054e-06, + "loss": -0.0471, + "reward": 0.05650910763069987, + "reward_std": 0.025629352405667306, + "rewards/ddi_reward": -0.00731629830552265, + "rewards/jaccard_reward": 0.05752677952405065, + "rewards/refuse_rate_reward": -0.005088360689114779, + "step": 1970 + }, + { + "completion_length": 27.834375, + "epoch": 0.1584, + "grad_norm": 1.9834539890289307, + "kl": 0.3702671483159065, + "learning_rate": 9.492663161816014e-06, + "loss": -0.0367, + "reward": 0.057113425945863126, + "reward_std": 0.029495068732649087, + "rewards/ddi_reward": -0.009776009930646978, + "rewards/jaccard_reward": 0.05789840929210186, + "rewards/refuse_rate_reward": -0.003924924833700061, + "step": 1980 + }, + { + "completion_length": 22.6609375, + "epoch": 0.1592, + "grad_norm": 1.7405987977981567, + "kl": 0.3183430634438992, + "learning_rate": 9.487043366821143e-06, + "loss": -0.0559, + "reward": 0.05940955011174083, + "reward_std": 0.029489705944433807, + "rewards/ddi_reward": -0.013373659312492236, + "rewards/jaccard_reward": 0.05988214397802949, + "rewards/refuse_rate_reward": -0.002362971322145313, + "step": 1990 + }, + { + "completion_length": 24.8015625, + "epoch": 0.16, + "grad_norm": 2.7401368618011475, + "kl": 0.40714537650346755, + "learning_rate": 9.481394300017898e-06, + "loss": -0.0545, + "reward": 0.05846670265309513, + "reward_std": 0.025335992826148866, + "rewards/ddi_reward": -0.004253413953119889, + "rewards/jaccard_reward": 0.0588718606159091, + "rewards/refuse_rate_reward": -0.0020257946569472553, + "step": 2000 + }, + { + "completion_length": 21.3890625, + "epoch": 0.1608, + "grad_norm": 1.4210821390151978, + "kl": 0.4277390781790018, + "learning_rate": 9.4757159982587e-06, + "loss": -0.0519, + "reward": 0.05717954384163022, + "reward_std": 0.0282935606315732, + "rewards/ddi_reward": -0.0027943785884417594, + "rewards/jaccard_reward": 0.05762905450537801, + "rewards/refuse_rate_reward": -0.002247548499144614, + "step": 2010 + }, + { + "completion_length": 22.671875, + "epoch": 0.1616, + "grad_norm": 1.7063385248184204, + "kl": 0.37301371842622755, + "learning_rate": 9.470008498586687e-06, + "loss": -0.0627, + "reward": 0.06441454524174332, + "reward_std": 0.028427378600463273, + "rewards/ddi_reward": -0.007058043137658387, + "rewards/jaccard_reward": 0.06455641177017242, + "rewards/refuse_rate_reward": -0.0007093254127539695, + "step": 2020 + }, + { + "completion_length": 19.6359375, + "epoch": 0.1624, + "grad_norm": 2.0365190505981445, + "kl": 0.5372042432427406, + "learning_rate": 9.46427183823547e-06, + "loss": -0.1051, + "reward": 0.05957938218489289, + "reward_std": 0.02411571149714291, + "rewards/ddi_reward": -0.0014158089412376284, + "rewards/jaccard_reward": 0.06027242700802162, + "rewards/refuse_rate_reward": -0.00346521110041067, + "step": 2030 + }, + { + "completion_length": 22.91875, + "epoch": 0.1632, + "grad_norm": 6.4986796379089355, + "kl": 0.4110834844410419, + "learning_rate": 9.458506054628896e-06, + "loss": -0.0713, + "reward": 0.06421514870598913, + "reward_std": 0.024559238646179437, + "rewards/ddi_reward": -0.01067312735831365, + "rewards/jaccard_reward": 0.06442064116708934, + "rewards/refuse_rate_reward": -0.001027462154161185, + "step": 2040 + }, + { + "completion_length": 23.3390625, + "epoch": 0.164, + "grad_norm": 1.0581883192062378, + "kl": 0.4322709575295448, + "learning_rate": 9.452711185380801e-06, + "loss": -0.0503, + "reward": 0.0644802498514764, + "reward_std": 0.02419890775345266, + "rewards/ddi_reward": -0.003475320228608325, + "rewards/jaccard_reward": 0.06519814268685878, + "rewards/refuse_rate_reward": -0.003589466167613864, + "step": 2050 + }, + { + "completion_length": 23.65625, + "epoch": 0.1648, + "grad_norm": 6.19871711730957, + "kl": 0.42134273499250413, + "learning_rate": 9.446887268294766e-06, + "loss": -0.0327, + "reward": 0.04005907904356718, + "reward_std": 0.02449878272600472, + "rewards/ddi_reward": -0.009559525572694839, + "rewards/jaccard_reward": 0.0408304643468, + "rewards/refuse_rate_reward": -0.0038569287164136766, + "step": 2060 + }, + { + "completion_length": 26.1421875, + "epoch": 0.1656, + "grad_norm": 2.525622844696045, + "kl": 0.42861673012375834, + "learning_rate": 9.441034341363866e-06, + "loss": -0.0506, + "reward": 0.06157756224274635, + "reward_std": 0.027340413862839343, + "rewards/ddi_reward": -0.007902983628446237, + "rewards/jaccard_reward": 0.061784593481570484, + "rewards/refuse_rate_reward": -0.0010351562756113707, + "step": 2070 + }, + { + "completion_length": 21.128125, + "epoch": 0.1664, + "grad_norm": 2.6319739818573, + "kl": 0.5081676617264748, + "learning_rate": 9.435152442770428e-06, + "loss": -0.1051, + "reward": 0.06638165146578104, + "reward_std": 0.023925317311659457, + "rewards/ddi_reward": -7.043732330203056e-06, + "rewards/jaccard_reward": 0.06655873557319865, + "rewards/refuse_rate_reward": -0.0008854166720993817, + "step": 2080 + }, + { + "completion_length": 24.0796875, + "epoch": 0.1672, + "grad_norm": 2.0443334579467773, + "kl": 0.4526941955089569, + "learning_rate": 9.429241610885779e-06, + "loss": -0.0404, + "reward": 0.04439324708655477, + "reward_std": 0.02495514079928398, + "rewards/ddi_reward": -0.01372422311687842, + "rewards/jaccard_reward": 0.045437371963635086, + "rewards/refuse_rate_reward": -0.0052206271211616695, + "step": 2090 + }, + { + "completion_length": 23.3265625, + "epoch": 0.168, + "grad_norm": 2.260411500930786, + "kl": 0.4024643003940582, + "learning_rate": 9.423301884269999e-06, + "loss": -0.0627, + "reward": 0.059093679394572975, + "reward_std": 0.025222137663513423, + "rewards/ddi_reward": -0.004414730001008138, + "rewards/jaccard_reward": 0.05963468886911869, + "rewards/refuse_rate_reward": -0.0027050597476772965, + "step": 2100 + }, + { + "completion_length": 23.0640625, + "epoch": 0.1688, + "grad_norm": 2.966794013977051, + "kl": 0.45312429815530775, + "learning_rate": 9.41733330167166e-06, + "loss": -0.0497, + "reward": 0.052526958473026755, + "reward_std": 0.023980335844680668, + "rewards/ddi_reward": -0.003650353997363709, + "rewards/jaccard_reward": 0.05279475282877684, + "rewards/refuse_rate_reward": -0.0013389757368713617, + "step": 2110 + }, + { + "completion_length": 27.3296875, + "epoch": 0.1696, + "grad_norm": 1.1762361526489258, + "kl": 0.3486960422247648, + "learning_rate": 9.411335902027588e-06, + "loss": -0.0342, + "reward": 0.04738250183872879, + "reward_std": 0.028398664481937886, + "rewards/ddi_reward": -0.0038293431513011455, + "rewards/jaccard_reward": 0.048349272925406694, + "rewards/refuse_rate_reward": -0.004833850252907723, + "step": 2120 + }, + { + "completion_length": 28.6984375, + "epoch": 0.1704, + "grad_norm": 2.8590638637542725, + "kl": 0.33463264629244804, + "learning_rate": 9.40530972446259e-06, + "loss": -0.0358, + "reward": 0.058945043943822384, + "reward_std": 0.029519352409988642, + "rewards/ddi_reward": -0.004110825498355552, + "rewards/jaccard_reward": 0.05944710550829768, + "rewards/refuse_rate_reward": -0.0025103123276494443, + "step": 2130 + }, + { + "completion_length": 27.45625, + "epoch": 0.1712, + "grad_norm": 8.066052436828613, + "kl": 0.3726157687604427, + "learning_rate": 9.399254808289219e-06, + "loss": -0.0513, + "reward": 0.06186291351914406, + "reward_std": 0.03174690473824739, + "rewards/ddi_reward": -0.014997023745672777, + "rewards/jaccard_reward": 0.06267896858043968, + "rewards/refuse_rate_reward": -0.00408027860103175, + "step": 2140 + }, + { + "completion_length": 23.93125, + "epoch": 0.172, + "grad_norm": 2.480485677719116, + "kl": 0.3682127296924591, + "learning_rate": 9.393171193007505e-06, + "loss": -0.0607, + "reward": 0.06494135400280357, + "reward_std": 0.03219906250014901, + "rewards/ddi_reward": -0.00647581517114304, + "rewards/jaccard_reward": 0.06530317277647554, + "rewards/refuse_rate_reward": -0.0018090850440785288, + "step": 2150 + }, + { + "completion_length": 18.5890625, + "epoch": 0.1728, + "grad_norm": 3.9430739879608154, + "kl": 0.42666213065385816, + "learning_rate": 9.387058918304699e-06, + "loss": -0.0486, + "reward": 0.048364104516804216, + "reward_std": 0.025440957630053163, + "rewards/ddi_reward": -0.00600951180967968, + "rewards/jaccard_reward": 0.048494312027469275, + "rewards/refuse_rate_reward": -0.0006510416744276882, + "step": 2160 + }, + { + "completion_length": 21.1328125, + "epoch": 0.1736, + "grad_norm": 2.252962112426758, + "kl": 0.44893175512552264, + "learning_rate": 9.380918024055015e-06, + "loss": -0.0728, + "reward": 0.06650991514325141, + "reward_std": 0.028681118274107575, + "rewards/ddi_reward": -0.01010020951507613, + "rewards/jaccard_reward": 0.06676512470003218, + "rewards/refuse_rate_reward": -0.001276041695382446, + "step": 2170 + }, + { + "completion_length": 24.453125, + "epoch": 0.1744, + "grad_norm": 1.3540196418762207, + "kl": 0.4156239241361618, + "learning_rate": 9.374748550319374e-06, + "loss": -0.0776, + "reward": 0.08375460868701338, + "reward_std": 0.028314870549365878, + "rewards/ddi_reward": -0.005550525180296972, + "rewards/jaccard_reward": 0.08420824129134416, + "rewards/refuse_rate_reward": -0.0022681615548208354, + "step": 2180 + }, + { + "completion_length": 19.925, + "epoch": 0.1752, + "grad_norm": 2.249154567718506, + "kl": 0.3644613064825535, + "learning_rate": 9.368550537345137e-06, + "loss": -0.1078, + "reward": 0.053315206058323385, + "reward_std": 0.028039303980767726, + "rewards/ddi_reward": -0.0045502397930249575, + "rewards/jaccard_reward": 0.053694669960532335, + "rewards/refuse_rate_reward": -0.0018973214668221773, + "step": 2190 + }, + { + "completion_length": 25.24375, + "epoch": 0.176, + "grad_norm": 1.2866085767745972, + "kl": 0.5043054163455963, + "learning_rate": 9.362324025565844e-06, + "loss": -0.0399, + "reward": 0.05943871634081006, + "reward_std": 0.02891668174415827, + "rewards/ddi_reward": -0.007534853206016124, + "rewards/jaccard_reward": 0.05991118820384145, + "rewards/refuse_rate_reward": -0.002362351294141263, + "step": 2200 + }, + { + "completion_length": 20.71875, + "epoch": 0.1768, + "grad_norm": 3.688530206680298, + "kl": 0.42803463488817217, + "learning_rate": 9.356069055600949e-06, + "loss": -0.0464, + "reward": 0.04865808147005737, + "reward_std": 0.021655358001589776, + "rewards/ddi_reward": -0.00010878805042011664, + "rewards/jaccard_reward": 0.0494817336788401, + "rewards/refuse_rate_reward": -0.004118264594580978, + "step": 2210 + }, + { + "completion_length": 22.2640625, + "epoch": 0.1776, + "grad_norm": 1.272680640220642, + "kl": 0.5480917535722256, + "learning_rate": 9.349785668255558e-06, + "loss": -0.0539, + "reward": 0.05490786593873054, + "reward_std": 0.02579186325892806, + "rewards/ddi_reward": -0.003096765701775439, + "rewards/jaccard_reward": 0.05525413625873625, + "rewards/refuse_rate_reward": -0.001731353107606992, + "step": 2220 + }, + { + "completion_length": 22.36875, + "epoch": 0.1784, + "grad_norm": 3.4716691970825195, + "kl": 0.45239451676607134, + "learning_rate": 9.343473904520164e-06, + "loss": -0.0494, + "reward": 0.05722326873801649, + "reward_std": 0.028012493811547756, + "rewards/ddi_reward": -0.00440576285473071, + "rewards/jaccard_reward": 0.058102548215538265, + "rewards/refuse_rate_reward": -0.004396391450427473, + "step": 2230 + }, + { + "completion_length": 24.5625, + "epoch": 0.1792, + "grad_norm": 5.585963726043701, + "kl": 0.5320621393620968, + "learning_rate": 9.337133805570376e-06, + "loss": -0.042, + "reward": 0.05288214592728764, + "reward_std": 0.028962475806474687, + "rewards/ddi_reward": -0.006382998437038623, + "rewards/jaccard_reward": 0.053380676230881365, + "rewards/refuse_rate_reward": -0.002492649108171463, + "step": 2240 + }, + { + "completion_length": 17.9546875, + "epoch": 0.18, + "grad_norm": 4.736510753631592, + "kl": 0.530471746623516, + "learning_rate": 9.330765412766646e-06, + "loss": -0.0752, + "reward": 0.05763475247658789, + "reward_std": 0.024754474125802516, + "rewards/ddi_reward": -0.003622853514389135, + "rewards/jaccard_reward": 0.05794905601069331, + "rewards/refuse_rate_reward": -0.001571514445822686, + "step": 2250 + }, + { + "completion_length": 24.196875, + "epoch": 0.1808, + "grad_norm": 2.745171308517456, + "kl": 0.6183068968355656, + "learning_rate": 9.324368767654014e-06, + "loss": -0.0483, + "reward": 0.06933700949884951, + "reward_std": 0.027835332322865725, + "rewards/ddi_reward": -0.0068915043520973995, + "rewards/jaccard_reward": 0.0698922990821302, + "rewards/refuse_rate_reward": -0.0027764423633925618, + "step": 2260 + }, + { + "completion_length": 23.915625, + "epoch": 0.1816, + "grad_norm": 3.5733253955841064, + "kl": 0.5555571272969246, + "learning_rate": 9.31794391196182e-06, + "loss": -0.0442, + "reward": 0.052161433012224735, + "reward_std": 0.024680611724033952, + "rewards/ddi_reward": -0.005874660842528101, + "rewards/jaccard_reward": 0.053011891676578674, + "rewards/refuse_rate_reward": -0.0042522886069491506, + "step": 2270 + }, + { + "completion_length": 19.9859375, + "epoch": 0.1824, + "grad_norm": 4.164638042449951, + "kl": 0.49260112680494783, + "learning_rate": 9.311490887603442e-06, + "loss": -0.0463, + "reward": 0.047878799680620435, + "reward_std": 0.02359506585635245, + "rewards/ddi_reward": -0.004940437094774097, + "rewards/jaccard_reward": 0.0482176523655653, + "rewards/refuse_rate_reward": -0.001694258430507034, + "step": 2280 + }, + { + "completion_length": 23.9796875, + "epoch": 0.1832, + "grad_norm": 2.667283773422241, + "kl": 0.3991350933909416, + "learning_rate": 9.30500973667602e-06, + "loss": -0.0657, + "reward": 0.0529655737336725, + "reward_std": 0.02367004598490894, + "rewards/ddi_reward": -0.002935809938935563, + "rewards/jaccard_reward": 0.053229958401061594, + "rewards/refuse_rate_reward": -0.0013219246407970785, + "step": 2290 + }, + { + "completion_length": 21.6359375, + "epoch": 0.184, + "grad_norm": 3.0174851417541504, + "kl": 0.5216975562274456, + "learning_rate": 9.298500501460182e-06, + "loss": -0.0478, + "reward": 0.05560749672586098, + "reward_std": 0.025546740042045712, + "rewards/ddi_reward": -0.001362323109060526, + "rewards/jaccard_reward": 0.05649979527806863, + "rewards/refuse_rate_reward": -0.00446148730115965, + "step": 2300 + }, + { + "completion_length": 21.3671875, + "epoch": 0.1848, + "grad_norm": 1.6960887908935547, + "kl": 0.4655955359339714, + "learning_rate": 9.291963224419764e-06, + "loss": -0.0423, + "reward": 0.04909660900011659, + "reward_std": 0.02258506587240845, + "rewards/ddi_reward": -0.007875502161914483, + "rewards/jaccard_reward": 0.04914125154609792, + "rewards/refuse_rate_reward": -0.00022321429569274187, + "step": 2310 + }, + { + "completion_length": 23.5265625, + "epoch": 0.1856, + "grad_norm": 3.4710092544555664, + "kl": 0.4213948152959347, + "learning_rate": 9.285397948201544e-06, + "loss": -0.054, + "reward": 0.0594272792339325, + "reward_std": 0.02356047830544412, + "rewards/ddi_reward": -0.008889208501204848, + "rewards/jaccard_reward": 0.05966165328864008, + "rewards/refuse_rate_reward": -0.001171875069849193, + "step": 2320 + }, + { + "completion_length": 24.0203125, + "epoch": 0.1864, + "grad_norm": 3.128159523010254, + "kl": 0.6071860998868942, + "learning_rate": 9.278804715634948e-06, + "loss": -0.0509, + "reward": 0.06420221906155348, + "reward_std": 0.026420789072290063, + "rewards/ddi_reward": 0.011185383889824152, + "rewards/jaccard_reward": 0.0649672320112586, + "rewards/refuse_rate_reward": -0.0038250594749115407, + "step": 2330 + }, + { + "completion_length": 19.971875, + "epoch": 0.1872, + "grad_norm": 3.178705930709839, + "kl": 0.4400537207722664, + "learning_rate": 9.272183569731783e-06, + "loss": -0.0595, + "reward": 0.05633322326466441, + "reward_std": 0.02231642617844045, + "rewards/ddi_reward": -0.0006848340039141476, + "rewards/jaccard_reward": 0.05777024962008, + "rewards/refuse_rate_reward": -0.0071851324522867795, + "step": 2340 + }, + { + "completion_length": 20.453125, + "epoch": 0.188, + "grad_norm": 7.180077075958252, + "kl": 0.5017179623246193, + "learning_rate": 9.26553455368595e-06, + "loss": -0.0527, + "reward": 0.05299028716981411, + "reward_std": 0.02441583517938852, + "rewards/ddi_reward": -0.010266990453237668, + "rewards/jaccard_reward": 0.05307510743150488, + "rewards/refuse_rate_reward": -0.00042410716414451597, + "step": 2350 + }, + { + "completion_length": 19.65, + "epoch": 0.1888, + "grad_norm": 3.33480167388916, + "kl": 0.43787841126322746, + "learning_rate": 9.25885771087317e-06, + "loss": -0.0763, + "reward": 0.05492332694120705, + "reward_std": 0.024819734413176774, + "rewards/ddi_reward": -0.0052295261702965945, + "rewards/jaccard_reward": 0.05509073687717318, + "rewards/refuse_rate_reward": -0.0008370535913854837, + "step": 2360 + }, + { + "completion_length": 24.0703125, + "epoch": 0.1896, + "grad_norm": 2.6694891452789307, + "kl": 0.3817500412464142, + "learning_rate": 9.25215308485069e-06, + "loss": -0.0342, + "reward": 0.05010451956186444, + "reward_std": 0.027001068321987988, + "rewards/ddi_reward": -0.0033217837917618454, + "rewards/jaccard_reward": 0.050598912150599064, + "rewards/refuse_rate_reward": -0.00247196382842958, + "step": 2370 + }, + { + "completion_length": 24.3984375, + "epoch": 0.1904, + "grad_norm": 6.571107387542725, + "kl": 0.41570008620619775, + "learning_rate": 9.245420719357004e-06, + "loss": -0.0792, + "reward": 0.06798123186454177, + "reward_std": 0.03254405171610415, + "rewards/ddi_reward": -0.011754598084371537, + "rewards/jaccard_reward": 0.06824522488750517, + "rewards/refuse_rate_reward": -0.0013199690962210297, + "step": 2380 + }, + { + "completion_length": 22.0421875, + "epoch": 0.1912, + "grad_norm": 1.3059055805206299, + "kl": 0.42288120463490486, + "learning_rate": 9.238660658311571e-06, + "loss": -0.0283, + "reward": 0.043285995023325086, + "reward_std": 0.0206716881133616, + "rewards/ddi_reward": -0.0011750669655157253, + "rewards/jaccard_reward": 0.04342736357357353, + "rewards/refuse_rate_reward": -0.0007068452658131718, + "step": 2390 + }, + { + "completion_length": 17.7875, + "epoch": 0.192, + "grad_norm": 3.873521089553833, + "kl": 0.4363426771014929, + "learning_rate": 9.231872945814526e-06, + "loss": -0.0376, + "reward": 0.03930276180617511, + "reward_std": 0.02201805217191577, + "rewards/ddi_reward": -0.009777951950673014, + "rewards/jaccard_reward": 0.03979471269994974, + "rewards/refuse_rate_reward": -0.002459753816947341, + "step": 2400 + }, + { + "completion_length": 22.571875, + "epoch": 0.1928, + "grad_norm": 4.05241060256958, + "kl": 0.4199648626148701, + "learning_rate": 9.225057626146387e-06, + "loss": -0.0688, + "reward": 0.07271978370845318, + "reward_std": 0.02787358877249062, + "rewards/ddi_reward": -0.006571923295268789, + "rewards/jaccard_reward": 0.0735116370022297, + "rewards/refuse_rate_reward": -0.003959262475837022, + "step": 2410 + }, + { + "completion_length": 17.00625, + "epoch": 0.1936, + "grad_norm": 9.674854278564453, + "kl": 0.45208516493439677, + "learning_rate": 9.218214743767779e-06, + "loss": -0.0632, + "reward": 0.057701674196869134, + "reward_std": 0.02737024212256074, + "rewards/ddi_reward": -0.006974391383118927, + "rewards/jaccard_reward": 0.05827297735377215, + "rewards/refuse_rate_reward": -0.0028565228683874013, + "step": 2420 + }, + { + "completion_length": 20.909375, + "epoch": 0.1944, + "grad_norm": 3.6298134326934814, + "kl": 0.7896842263638973, + "learning_rate": 9.211344343319126e-06, + "loss": -0.0269, + "reward": 0.05413362798281014, + "reward_std": 0.025024003675207497, + "rewards/ddi_reward": -0.0024250696296803653, + "rewards/jaccard_reward": 0.05460474650026299, + "rewards/refuse_rate_reward": -0.0023555871564894916, + "step": 2430 + }, + { + "completion_length": 24.2484375, + "epoch": 0.1952, + "grad_norm": 1.9223147630691528, + "kl": 0.4461164578795433, + "learning_rate": 9.20444646962038e-06, + "loss": -0.0458, + "reward": 0.052505670581012964, + "reward_std": 0.025955163897015153, + "rewards/ddi_reward": -0.00780632428068202, + "rewards/jaccard_reward": 0.05321066689211875, + "rewards/refuse_rate_reward": -0.0035249820444732904, + "step": 2440 + }, + { + "completion_length": 22.2984375, + "epoch": 0.196, + "grad_norm": 1.2683377265930176, + "kl": 0.4427480913698673, + "learning_rate": 9.19752116767071e-06, + "loss": -0.0546, + "reward": 0.057337849657051264, + "reward_std": 0.02698727399110794, + "rewards/ddi_reward": -0.0021687219268642366, + "rewards/jaccard_reward": 0.05807149324100465, + "rewards/refuse_rate_reward": -0.0036682111560367046, + "step": 2450 + }, + { + "completion_length": 25.40625, + "epoch": 0.1968, + "grad_norm": 4.37007474899292, + "kl": 0.41971403211355207, + "learning_rate": 9.19056848264822e-06, + "loss": -0.0527, + "reward": 0.05972102601081133, + "reward_std": 0.029197870241478085, + "rewards/ddi_reward": -0.0041444914764724675, + "rewards/jaccard_reward": 0.06016906467266381, + "rewards/refuse_rate_reward": -0.0022402034257538615, + "step": 2460 + }, + { + "completion_length": 22.859375, + "epoch": 0.1976, + "grad_norm": 2.4578871726989746, + "kl": 0.449562132358551, + "learning_rate": 9.183588459909651e-06, + "loss": -0.0731, + "reward": 0.060955293802544476, + "reward_std": 0.02502320921048522, + "rewards/ddi_reward": -0.0021333025593776255, + "rewards/jaccard_reward": 0.06128411670215428, + "rewards/refuse_rate_reward": -0.001644116861280054, + "step": 2470 + }, + { + "completion_length": 26.3421875, + "epoch": 0.1984, + "grad_norm": 1.8988115787506104, + "kl": 0.4899417206645012, + "learning_rate": 9.176581144990088e-06, + "loss": -0.0542, + "reward": 0.06194128319621086, + "reward_std": 0.032542659901082516, + "rewards/ddi_reward": -0.003445206582546234, + "rewards/jaccard_reward": 0.06266112760640681, + "rewards/refuse_rate_reward": -0.003599221783224493, + "step": 2480 + }, + { + "completion_length": 24.646875, + "epoch": 0.1992, + "grad_norm": 1.6844537258148193, + "kl": 0.48271438851952553, + "learning_rate": 9.169546583602653e-06, + "loss": -0.029, + "reward": 0.04260879880748689, + "reward_std": 0.024896484287455677, + "rewards/ddi_reward": 0.003384406308759935, + "rewards/jaccard_reward": 0.043597526755183934, + "rewards/refuse_rate_reward": -0.004943632800132036, + "step": 2490 + }, + { + "completion_length": 26.3890625, + "epoch": 0.2, + "grad_norm": 5.245153427124023, + "kl": 0.6172430709004402, + "learning_rate": 9.16248482163822e-06, + "loss": -0.018, + "reward": 0.04650095384567976, + "reward_std": 0.02618453400209546, + "rewards/ddi_reward": -0.006905769888544455, + "rewards/jaccard_reward": 0.047744666336802764, + "rewards/refuse_rate_reward": -0.0062185645918361844, + "step": 2500 + }, + { + "completion_length": 25.8546875, + "epoch": 0.2008, + "grad_norm": 2.603583574295044, + "kl": 0.4811552479863167, + "learning_rate": 9.155395905165108e-06, + "loss": -0.0509, + "reward": 0.0651238770224154, + "reward_std": 0.03031811830587685, + "rewards/ddi_reward": -0.0048297894827555865, + "rewards/jaccard_reward": 0.06566630927845836, + "rewards/refuse_rate_reward": -0.0027121663209982217, + "step": 2510 + }, + { + "completion_length": 27.865625, + "epoch": 0.2016, + "grad_norm": 3.701397180557251, + "kl": 0.45878340378403665, + "learning_rate": 9.14827988042878e-06, + "loss": -0.0443, + "reward": 0.062065267283469436, + "reward_std": 0.02794117443263531, + "rewards/ddi_reward": -0.009052013361360878, + "rewards/jaccard_reward": 0.06292022177949548, + "rewards/refuse_rate_reward": -0.00427477911580354, + "step": 2520 + }, + { + "completion_length": 23.459375, + "epoch": 0.2024, + "grad_norm": 3.6364500522613525, + "kl": 0.43540762066841127, + "learning_rate": 9.141136793851546e-06, + "loss": -0.0624, + "reward": 0.06386534227058291, + "reward_std": 0.029243325465358794, + "rewards/ddi_reward": -0.004957678215578199, + "rewards/jaccard_reward": 0.06431152113946155, + "rewards/refuse_rate_reward": -0.0022309028659947216, + "step": 2530 + }, + { + "completion_length": 25.909375, + "epoch": 0.2032, + "grad_norm": 1.9359971284866333, + "kl": 0.5138444900512695, + "learning_rate": 9.133966692032257e-06, + "loss": -0.0494, + "reward": 0.06253448827192187, + "reward_std": 0.030100872134789826, + "rewards/ddi_reward": -0.007978466572239995, + "rewards/jaccard_reward": 0.06310725067742169, + "rewards/refuse_rate_reward": -0.0028638130403123795, + "step": 2540 + }, + { + "completion_length": 26.2109375, + "epoch": 0.204, + "grad_norm": 1.7930562496185303, + "kl": 0.3960649274289608, + "learning_rate": 9.126769621745997e-06, + "loss": -0.0516, + "reward": 0.06317725032567978, + "reward_std": 0.027697277907282113, + "rewards/ddi_reward": -0.0018155859012040309, + "rewards/jaccard_reward": 0.06472308419179171, + "rewards/refuse_rate_reward": -0.007729166187345982, + "step": 2550 + }, + { + "completion_length": 25.54375, + "epoch": 0.2048, + "grad_norm": 0.8872078657150269, + "kl": 0.35730703696608546, + "learning_rate": 9.11954562994379e-06, + "loss": -0.0659, + "reward": 0.06722852271050214, + "reward_std": 0.02659902423620224, + "rewards/ddi_reward": -0.007030723750358447, + "rewards/jaccard_reward": 0.06834785911487415, + "rewards/refuse_rate_reward": -0.0055966806423384695, + "step": 2560 + }, + { + "completion_length": 25.8453125, + "epoch": 0.2056, + "grad_norm": 3.729085922241211, + "kl": 0.4512031123042107, + "learning_rate": 9.11229476375228e-06, + "loss": -0.0437, + "reward": 0.06144486651755869, + "reward_std": 0.026092369761317968, + "rewards/ddi_reward": -0.009362724516540766, + "rewards/jaccard_reward": 0.061768632964231074, + "rewards/refuse_rate_reward": -0.0016188282170332968, + "step": 2570 + }, + { + "completion_length": 26.403125, + "epoch": 0.2064, + "grad_norm": 7.9621076583862305, + "kl": 0.38288776129484176, + "learning_rate": 9.105017070473429e-06, + "loss": -0.0528, + "reward": 0.06390803270041942, + "reward_std": 0.027840884565375746, + "rewards/ddi_reward": -0.008087479381356389, + "rewards/jaccard_reward": 0.06452719010412693, + "rewards/refuse_rate_reward": -0.0030957911163568496, + "step": 2580 + }, + { + "completion_length": 24.4640625, + "epoch": 0.2072, + "grad_norm": 4.864141464233398, + "kl": 0.42718667462468146, + "learning_rate": 9.097712597584216e-06, + "loss": -0.0464, + "reward": 0.05993908466771245, + "reward_std": 0.026415859721601008, + "rewards/ddi_reward": -0.008447397174313665, + "rewards/jaccard_reward": 0.060030603874474765, + "rewards/refuse_rate_reward": -0.00045758930500596764, + "step": 2590 + }, + { + "completion_length": 22.2265625, + "epoch": 0.208, + "grad_norm": 9.629461288452148, + "kl": 0.4100318856537342, + "learning_rate": 9.090381392736313e-06, + "loss": -0.0552, + "reward": 0.060290454095229505, + "reward_std": 0.022124692844226957, + "rewards/ddi_reward": -0.0065167180728167295, + "rewards/jaccard_reward": 0.060910119814798235, + "rewards/refuse_rate_reward": -0.0030983238539192826, + "step": 2600 + }, + { + "completion_length": 24.0375, + "epoch": 0.2088, + "grad_norm": 2.022310733795166, + "kl": 0.4371942222118378, + "learning_rate": 9.083023503755783e-06, + "loss": -0.0587, + "reward": 0.07371797561645507, + "reward_std": 0.02758964980021119, + "rewards/ddi_reward": -0.004693978544673882, + "rewards/jaccard_reward": 0.07449277937412262, + "rewards/refuse_rate_reward": -0.00387402530759573, + "step": 2610 + }, + { + "completion_length": 26.6203125, + "epoch": 0.2096, + "grad_norm": 1.326812505722046, + "kl": 0.48673198372125626, + "learning_rate": 9.07563897864277e-06, + "loss": -0.0372, + "reward": 0.05066087883897126, + "reward_std": 0.027654993161559106, + "rewards/ddi_reward": -0.009929238981567324, + "rewards/jaccard_reward": 0.051857261639088395, + "rewards/refuse_rate_reward": -0.005981906631495804, + "step": 2620 + }, + { + "completion_length": 26.74375, + "epoch": 0.2104, + "grad_norm": 1.8446236848831177, + "kl": 0.4853005215525627, + "learning_rate": 9.068227865571181e-06, + "loss": -0.04, + "reward": 0.04863303084857762, + "reward_std": 0.02796918456442654, + "rewards/ddi_reward": -0.00800940815533977, + "rewards/jaccard_reward": 0.04934966278960928, + "rewards/refuse_rate_reward": -0.0035831525921821593, + "step": 2630 + }, + { + "completion_length": 26.2921875, + "epoch": 0.2112, + "grad_norm": 2.012676477432251, + "kl": 0.45617960691452025, + "learning_rate": 9.060790212888368e-06, + "loss": -0.0413, + "reward": 0.05702798520214856, + "reward_std": 0.024297937843948604, + "rewards/ddi_reward": -0.006024221106781624, + "rewards/jaccard_reward": 0.05799295441247523, + "rewards/refuse_rate_reward": -0.004824838822241872, + "step": 2640 + }, + { + "completion_length": 23.4484375, + "epoch": 0.212, + "grad_norm": 1.9540410041809082, + "kl": 0.4018107794225216, + "learning_rate": 9.053326069114824e-06, + "loss": -0.0364, + "reward": 0.05045122439041734, + "reward_std": 0.023721464653499426, + "rewards/ddi_reward": -0.005313313967781142, + "rewards/jaccard_reward": 0.05157886708620936, + "rewards/refuse_rate_reward": -0.0056382166338153185, + "step": 2650 + }, + { + "completion_length": 26.63125, + "epoch": 0.2128, + "grad_norm": 1.8235100507736206, + "kl": 0.44563463851809504, + "learning_rate": 9.045835482943853e-06, + "loss": -0.0465, + "reward": 0.06382811204530299, + "reward_std": 0.022205183189362286, + "rewards/ddi_reward": -0.005953696748474613, + "rewards/jaccard_reward": 0.06417893188772723, + "rewards/refuse_rate_reward": -0.0017540923319756985, + "step": 2660 + }, + { + "completion_length": 23.6578125, + "epoch": 0.2136, + "grad_norm": 2.5415401458740234, + "kl": 0.39569213315844537, + "learning_rate": 9.038318503241268e-06, + "loss": -0.0448, + "reward": 0.04603364751674235, + "reward_std": 0.026641821675002574, + "rewards/ddi_reward": -0.003334927442483604, + "rewards/jaccard_reward": 0.04621233439538628, + "rewards/refuse_rate_reward": -0.0008934381883591413, + "step": 2670 + }, + { + "completion_length": 28.3546875, + "epoch": 0.2144, + "grad_norm": 2.1438443660736084, + "kl": 0.39954256936907767, + "learning_rate": 9.030775179045057e-06, + "loss": -0.0635, + "reward": 0.05983076868578792, + "reward_std": 0.028623516112565993, + "rewards/ddi_reward": -0.01740075788402464, + "rewards/jaccard_reward": 0.06070175007916987, + "rewards/refuse_rate_reward": -0.004354905185755343, + "step": 2680 + }, + { + "completion_length": 23.85, + "epoch": 0.2152, + "grad_norm": 3.025562047958374, + "kl": 0.48802322447299956, + "learning_rate": 9.023205559565075e-06, + "loss": -0.0234, + "reward": 0.05196871992666274, + "reward_std": 0.02589406128972769, + "rewards/ddi_reward": -0.004953783238306642, + "rewards/jaccard_reward": 0.05247922588605434, + "rewards/refuse_rate_reward": -0.0025525295757688584, + "step": 2690 + }, + { + "completion_length": 24.0984375, + "epoch": 0.216, + "grad_norm": 2.5680713653564453, + "kl": 0.7185844123363495, + "learning_rate": 9.015609694182711e-06, + "loss": -0.026, + "reward": 0.05259557422250509, + "reward_std": 0.022932869312353433, + "rewards/ddi_reward": -0.0018657904292922467, + "rewards/jaccard_reward": 0.053177440515719356, + "rewards/refuse_rate_reward": -0.002909335884032771, + "step": 2700 + }, + { + "completion_length": 28.2390625, + "epoch": 0.2168, + "grad_norm": 5.444572448730469, + "kl": 0.48934043049812315, + "learning_rate": 9.00798763245058e-06, + "loss": -0.033, + "reward": 0.04748350642621517, + "reward_std": 0.03002787479199469, + "rewards/ddi_reward": -0.007445745856966823, + "rewards/jaccard_reward": 0.04807327062590048, + "rewards/refuse_rate_reward": -0.0029488234664313494, + "step": 2710 + }, + { + "completion_length": 25.2640625, + "epoch": 0.2176, + "grad_norm": 2.500049352645874, + "kl": 0.5511682733893395, + "learning_rate": 9.000339424092186e-06, + "loss": -0.0547, + "reward": 0.06717669514473527, + "reward_std": 0.028850508946925403, + "rewards/ddi_reward": -0.009148062812164426, + "rewards/jaccard_reward": 0.06780331940390169, + "rewards/refuse_rate_reward": -0.0031331169186159967, + "step": 2720 + }, + { + "completion_length": 23.0125, + "epoch": 0.2184, + "grad_norm": 3.484053611755371, + "kl": 0.47472557947039606, + "learning_rate": 8.992665119001609e-06, + "loss": -0.0414, + "reward": 0.04930750064086169, + "reward_std": 0.0288610810181126, + "rewards/ddi_reward": -0.007141754857730121, + "rewards/jaccard_reward": 0.049801124120131136, + "rewards/refuse_rate_reward": -0.002468124427832663, + "step": 2730 + }, + { + "completion_length": 22.09375, + "epoch": 0.2192, + "grad_norm": 3.281832695007324, + "kl": 0.4710004523396492, + "learning_rate": 8.984964767243168e-06, + "loss": -0.0744, + "reward": 0.05993571649305522, + "reward_std": 0.023718556901440025, + "rewards/ddi_reward": -0.012448575743474067, + "rewards/jaccard_reward": 0.060209154081530866, + "rewards/refuse_rate_reward": -0.0013671875, + "step": 2740 + }, + { + "completion_length": 19.546875, + "epoch": 0.22, + "grad_norm": 1.364771842956543, + "kl": 0.4650067888200283, + "learning_rate": 8.977238419051109e-06, + "loss": -0.0475, + "reward": 0.04667692664079368, + "reward_std": 0.02117045153863728, + "rewards/ddi_reward": -0.009433262067614124, + "rewards/jaccard_reward": 0.0469060932751745, + "rewards/refuse_rate_reward": -0.001145833358168602, + "step": 2750 + }, + { + "completion_length": 20.8828125, + "epoch": 0.2208, + "grad_norm": 20.16322898864746, + "kl": 0.4910243883728981, + "learning_rate": 8.96948612482926e-06, + "loss": -0.0475, + "reward": 0.04858410370070487, + "reward_std": 0.026956258248537777, + "rewards/ddi_reward": -0.007723672920837999, + "rewards/jaccard_reward": 0.049130514333955946, + "rewards/refuse_rate_reward": -0.002732052328065038, + "step": 2760 + }, + { + "completion_length": 21.503125, + "epoch": 0.2216, + "grad_norm": 3.7681808471679688, + "kl": 0.4698008343577385, + "learning_rate": 8.96170793515072e-06, + "loss": -0.0488, + "reward": 0.053140597371384504, + "reward_std": 0.02428124933503568, + "rewards/ddi_reward": 0.000861728229210712, + "rewards/jaccard_reward": 0.05347610698081553, + "rewards/refuse_rate_reward": -0.0016775412426795811, + "step": 2770 + }, + { + "completion_length": 19.7890625, + "epoch": 0.2224, + "grad_norm": 1.1950474977493286, + "kl": 0.5169407293200493, + "learning_rate": 8.953903900757517e-06, + "loss": -0.0497, + "reward": 0.05360835562460124, + "reward_std": 0.024349492276087403, + "rewards/ddi_reward": -0.0025420083431527017, + "rewards/jaccard_reward": 0.05427116002538242, + "rewards/refuse_rate_reward": -0.0033140167710371316, + "step": 2780 + }, + { + "completion_length": 21.075, + "epoch": 0.2232, + "grad_norm": 3.6384150981903076, + "kl": 0.45140016824007034, + "learning_rate": 8.946074072560277e-06, + "loss": -0.0456, + "reward": 0.052247468335554006, + "reward_std": 0.021968678338453174, + "rewards/ddi_reward": -0.006292072014184668, + "rewards/jaccard_reward": 0.0524691947735846, + "rewards/refuse_rate_reward": -0.0011086309561505914, + "step": 2790 + }, + { + "completion_length": 19.115625, + "epoch": 0.224, + "grad_norm": 1.9371311664581299, + "kl": 0.5459960408508777, + "learning_rate": 8.938218501637898e-06, + "loss": -0.0423, + "reward": 0.044015984423458575, + "reward_std": 0.024379406752996147, + "rewards/ddi_reward": -0.007223123026778922, + "rewards/jaccard_reward": 0.04449273282662034, + "rewards/refuse_rate_reward": -0.0023837426560930908, + "step": 2800 + }, + { + "completion_length": 22.74375, + "epoch": 0.2248, + "grad_norm": 3.7183241844177246, + "kl": 0.4292034663259983, + "learning_rate": 8.930337239237217e-06, + "loss": -0.0554, + "reward": 0.06938050324097275, + "reward_std": 0.02297007739543915, + "rewards/ddi_reward": -0.008357681683264673, + "rewards/jaccard_reward": 0.06970776151865721, + "rewards/refuse_rate_reward": -0.0016362847294658423, + "step": 2810 + }, + { + "completion_length": 27.403125, + "epoch": 0.2256, + "grad_norm": 1.5222381353378296, + "kl": 0.5360401183366775, + "learning_rate": 8.922430336772668e-06, + "loss": -0.0494, + "reward": 0.06973467757925392, + "reward_std": 0.02755713729420677, + "rewards/ddi_reward": -0.008129899605410173, + "rewards/jaccard_reward": 0.07030511307530105, + "rewards/refuse_rate_reward": -0.002852177759632468, + "step": 2820 + }, + { + "completion_length": 28.0546875, + "epoch": 0.2264, + "grad_norm": 1.6403383016586304, + "kl": 0.43833101615309716, + "learning_rate": 8.914497845825952e-06, + "loss": -0.0523, + "reward": 0.06148632103577256, + "reward_std": 0.02702302080579102, + "rewards/ddi_reward": -0.003929536935902434, + "rewards/jaccard_reward": 0.06202533822506666, + "rewards/refuse_rate_reward": -0.0026950870756991207, + "step": 2830 + }, + { + "completion_length": 26.2453125, + "epoch": 0.2272, + "grad_norm": 2.5686075687408447, + "kl": 0.4912868067622185, + "learning_rate": 8.906539818145704e-06, + "loss": -0.0415, + "reward": 0.05219597602263093, + "reward_std": 0.025085366191342474, + "rewards/ddi_reward": -0.0042167521169176325, + "rewards/jaccard_reward": 0.053038214519619944, + "rewards/refuse_rate_reward": -0.004211196477990597, + "step": 2840 + }, + { + "completion_length": 24.296875, + "epoch": 0.228, + "grad_norm": 2.697357177734375, + "kl": 0.49429128766059877, + "learning_rate": 8.89855630564714e-06, + "loss": -0.0457, + "reward": 0.0631515758112073, + "reward_std": 0.025196602288633584, + "rewards/ddi_reward": -0.005074422294273973, + "rewards/jaccard_reward": 0.0640087035484612, + "rewards/refuse_rate_reward": -0.004285635298583656, + "step": 2850 + }, + { + "completion_length": 22.696875, + "epoch": 0.2288, + "grad_norm": 3.713592767715454, + "kl": 0.49073486328125, + "learning_rate": 8.890547360411747e-06, + "loss": -0.0403, + "reward": 0.05624875983921811, + "reward_std": 0.02272271951660514, + "rewards/ddi_reward": -0.007936608466843608, + "rewards/jaccard_reward": 0.05643774857744575, + "rewards/refuse_rate_reward": -0.00094494050135836, + "step": 2860 + }, + { + "completion_length": 23.490625, + "epoch": 0.2296, + "grad_norm": 11.481719017028809, + "kl": 0.5088707730174065, + "learning_rate": 8.882513034686913e-06, + "loss": -0.0507, + "reward": 0.06174170477315784, + "reward_std": 0.02474029315635562, + "rewards/ddi_reward": -0.008650566870346665, + "rewards/jaccard_reward": 0.06222784612327814, + "rewards/refuse_rate_reward": -0.0024307014944497498, + "step": 2870 + }, + { + "completion_length": 21.859375, + "epoch": 0.2304, + "grad_norm": 3.009390354156494, + "kl": 0.43884772062301636, + "learning_rate": 8.874453380885601e-06, + "loss": -0.0637, + "reward": 0.062085192278027536, + "reward_std": 0.023404400731669738, + "rewards/ddi_reward": -0.007708142488263547, + "rewards/jaccard_reward": 0.062379196472465995, + "rewards/refuse_rate_reward": -0.0014700187603011727, + "step": 2880 + }, + { + "completion_length": 20.984375, + "epoch": 0.2312, + "grad_norm": 4.076268196105957, + "kl": 0.44191154539585115, + "learning_rate": 8.866368451586005e-06, + "loss": -0.0834, + "reward": 0.070994165353477, + "reward_std": 0.022957664681598544, + "rewards/ddi_reward": -0.0013683943077921867, + "rewards/jaccard_reward": 0.0714509729295969, + "rewards/refuse_rate_reward": -0.002284035424236208, + "step": 2890 + }, + { + "completion_length": 21.7671875, + "epoch": 0.232, + "grad_norm": 2.201460123062134, + "kl": 0.42130781784653665, + "learning_rate": 8.85825829953121e-06, + "loss": -0.0609, + "reward": 0.05946478331461549, + "reward_std": 0.024801702331751586, + "rewards/ddi_reward": -0.0002235436113551259, + "rewards/jaccard_reward": 0.060062937560724096, + "rewards/refuse_rate_reward": -0.0029907786985859273, + "step": 2900 + }, + { + "completion_length": 22.3640625, + "epoch": 0.2328, + "grad_norm": 23.326126098632812, + "kl": 0.442724971473217, + "learning_rate": 8.850122977628843e-06, + "loss": -0.0468, + "reward": 0.057434793934226035, + "reward_std": 0.029445191100239753, + "rewards/ddi_reward": -0.0062896268034819515, + "rewards/jaccard_reward": 0.05817474075593054, + "rewards/refuse_rate_reward": -0.0036997377872467043, + "step": 2910 + }, + { + "completion_length": 25.121875, + "epoch": 0.2336, + "grad_norm": 2.8395166397094727, + "kl": 0.44645926132798197, + "learning_rate": 8.84196253895073e-06, + "loss": -0.029, + "reward": 0.051536439917981625, + "reward_std": 0.02487413599155843, + "rewards/ddi_reward": -0.003966562676941976, + "rewards/jaccard_reward": 0.05175903502386063, + "rewards/refuse_rate_reward": -0.0011129712569527329, + "step": 2920 + }, + { + "completion_length": 21.6703125, + "epoch": 0.2344, + "grad_norm": 2.5979933738708496, + "kl": 0.4428347244858742, + "learning_rate": 8.83377703673255e-06, + "loss": -0.069, + "reward": 0.06500998334959149, + "reward_std": 0.021780450048390777, + "rewards/ddi_reward": -0.003453536715824157, + "rewards/jaccard_reward": 0.06548978071659803, + "rewards/refuse_rate_reward": -0.00239898992003873, + "step": 2930 + }, + { + "completion_length": 22.9203125, + "epoch": 0.2352, + "grad_norm": 3.379554033279419, + "kl": 0.46044707149267194, + "learning_rate": 8.825566524373482e-06, + "loss": -0.0479, + "reward": 0.04748523295857012, + "reward_std": 0.025843387795612217, + "rewards/ddi_reward": -0.005516414367593825, + "rewards/jaccard_reward": 0.04848746070638299, + "rewards/refuse_rate_reward": -0.005011139519046992, + "step": 2940 + }, + { + "completion_length": 20.7296875, + "epoch": 0.236, + "grad_norm": 4.074550151824951, + "kl": 0.5097568646073342, + "learning_rate": 8.817331055435871e-06, + "loss": -0.0476, + "reward": 0.04591690320521593, + "reward_std": 0.023270357213914395, + "rewards/ddi_reward": -0.0025834557425696405, + "rewards/jaccard_reward": 0.0463110710028559, + "rewards/refuse_rate_reward": -0.0019708342850208284, + "step": 2950 + }, + { + "completion_length": 21.5765625, + "epoch": 0.2368, + "grad_norm": 3.283331871032715, + "kl": 0.4500380098819733, + "learning_rate": 8.809070683644865e-06, + "loss": -0.0532, + "reward": 0.05369919603690505, + "reward_std": 0.023140084510669113, + "rewards/ddi_reward": -0.0018585315207019447, + "rewards/jaccard_reward": 0.05374992482829839, + "rewards/refuse_rate_reward": -0.0002536526066251099, + "step": 2960 + }, + { + "completion_length": 23.1421875, + "epoch": 0.2376, + "grad_norm": 2.0280752182006836, + "kl": 0.44061320200562476, + "learning_rate": 8.800785462888065e-06, + "loss": -0.0532, + "reward": 0.05984259652905166, + "reward_std": 0.02491878098808229, + "rewards/ddi_reward": -0.008178418508032336, + "rewards/jaccard_reward": 0.060219219489954415, + "rewards/refuse_rate_reward": -0.0018831168999895453, + "step": 2970 + }, + { + "completion_length": 23.609375, + "epoch": 0.2384, + "grad_norm": 1.939505934715271, + "kl": 0.47988018244504926, + "learning_rate": 8.792475447215186e-06, + "loss": -0.0462, + "reward": 0.053968916460871694, + "reward_std": 0.02665441483259201, + "rewards/ddi_reward": -0.004099877114640549, + "rewards/jaccard_reward": 0.05423801261931658, + "rewards/refuse_rate_reward": -0.0013454861124046146, + "step": 2980 + }, + { + "completion_length": 25.25, + "epoch": 0.2392, + "grad_norm": 5.405709743499756, + "kl": 0.4687493696808815, + "learning_rate": 8.784140690837689e-06, + "loss": -0.011, + "reward": 0.0335414755390957, + "reward_std": 0.023851195257157086, + "rewards/ddi_reward": -0.004616758704651147, + "rewards/jaccard_reward": 0.034002404659986496, + "rewards/refuse_rate_reward": -0.0023046530899591745, + "step": 2990 + }, + { + "completion_length": 29.4578125, + "epoch": 0.24, + "grad_norm": 0.9830299615859985, + "kl": 0.4499352782964706, + "learning_rate": 8.775781248128435e-06, + "loss": -0.1063, + "reward": 0.08236269736662508, + "reward_std": 0.0258682596962899, + "rewards/ddi_reward": -0.007086545132915489, + "rewards/jaccard_reward": 0.08300218047224917, + "rewards/refuse_rate_reward": -0.0031974281650036574, + "step": 3000 + }, + { + "completion_length": 26.121875, + "epoch": 0.2408, + "grad_norm": 1.6610071659088135, + "kl": 0.44846615344285967, + "learning_rate": 8.767397173621334e-06, + "loss": -0.0103, + "reward": 0.061906314874067905, + "reward_std": 0.030056709423661232, + "rewards/ddi_reward": -0.007868033857084811, + "rewards/jaccard_reward": 0.06244196168845519, + "rewards/refuse_rate_reward": -0.0026782332570292056, + "step": 3010 + }, + { + "completion_length": 26.8328125, + "epoch": 0.2416, + "grad_norm": 3.796088933944702, + "kl": 0.48326931074261664, + "learning_rate": 8.758988522010978e-06, + "loss": -0.057, + "reward": 0.05996643747203052, + "reward_std": 0.026553795370273293, + "rewards/ddi_reward": -0.00932534341700375, + "rewards/jaccard_reward": 0.061056413454934955, + "rewards/refuse_rate_reward": -0.005449877103092149, + "step": 3020 + }, + { + "completion_length": 26.6546875, + "epoch": 0.2424, + "grad_norm": 4.508175849914551, + "kl": 0.46569644510746, + "learning_rate": 8.750555348152299e-06, + "loss": -0.0339, + "reward": 0.05432733129709959, + "reward_std": 0.02449544765986502, + "rewards/ddi_reward": -0.002766920979775023, + "rewards/jaccard_reward": 0.054703306825831535, + "rewards/refuse_rate_reward": -0.0018798748613335192, + "step": 3030 + }, + { + "completion_length": 27.5171875, + "epoch": 0.2432, + "grad_norm": 1.9377633333206177, + "kl": 0.4829740092158318, + "learning_rate": 8.742097707060194e-06, + "loss": -0.0484, + "reward": 0.05858259373344481, + "reward_std": 0.025163089809939265, + "rewards/ddi_reward": -0.011400296783540398, + "rewards/jaccard_reward": 0.05867001905571669, + "rewards/refuse_rate_reward": -0.000437127985060215, + "step": 3040 + }, + { + "completion_length": 21.0828125, + "epoch": 0.244, + "grad_norm": 4.301794052124023, + "kl": 0.44868904873728754, + "learning_rate": 8.733615653909183e-06, + "loss": -0.0208, + "reward": 0.039431558398064226, + "reward_std": 0.025224699801765383, + "rewards/ddi_reward": -0.009003936708904803, + "rewards/jaccard_reward": 0.03968253128696233, + "rewards/refuse_rate_reward": -0.0012548649567179382, + "step": 3050 + }, + { + "completion_length": 22.3703125, + "epoch": 0.2448, + "grad_norm": 3.316183090209961, + "kl": 0.4476640187203884, + "learning_rate": 8.725109244033035e-06, + "loss": -0.0498, + "reward": 0.047605782630853355, + "reward_std": 0.028455467289313675, + "rewards/ddi_reward": -0.009110490052262322, + "rewards/jaccard_reward": 0.047852942772442474, + "rewards/refuse_rate_reward": -0.0012357954634353518, + "step": 3060 + }, + { + "completion_length": 25.0515625, + "epoch": 0.2456, + "grad_norm": 3.3296828269958496, + "kl": 0.3687894869595766, + "learning_rate": 8.71657853292442e-06, + "loss": -0.0584, + "reward": 0.07318671578541398, + "reward_std": 0.024680068250745535, + "rewards/ddi_reward": -0.0006158471049275249, + "rewards/jaccard_reward": 0.07331059620482847, + "rewards/refuse_rate_reward": -0.0006194196525029838, + "step": 3070 + }, + { + "completion_length": 25.86875, + "epoch": 0.2464, + "grad_norm": 2.828301191329956, + "kl": 0.4517100930213928, + "learning_rate": 8.708023576234532e-06, + "loss": -0.0689, + "reward": 0.07134665148332715, + "reward_std": 0.0258945663459599, + "rewards/ddi_reward": -0.007765221677254885, + "rewards/jaccard_reward": 0.07167068468406797, + "rewards/refuse_rate_reward": -0.0016201637219637633, + "step": 3080 + }, + { + "completion_length": 22.471875, + "epoch": 0.2472, + "grad_norm": 2.8431546688079834, + "kl": 0.576896159350872, + "learning_rate": 8.699444429772742e-06, + "loss": -0.0627, + "reward": 0.06567090945318341, + "reward_std": 0.023310763027984648, + "rewards/ddi_reward": -0.003998840070562437, + "rewards/jaccard_reward": 0.06656022844254039, + "rewards/refuse_rate_reward": -0.004446586431004107, + "step": 3090 + }, + { + "completion_length": 23.1109375, + "epoch": 0.248, + "grad_norm": 5.898187637329102, + "kl": 0.40052360072731974, + "learning_rate": 8.690841149506223e-06, + "loss": -0.027, + "reward": 0.045038693305104974, + "reward_std": 0.022811843920499085, + "rewards/ddi_reward": -0.0019890351642970925, + "rewards/jaccard_reward": 0.04508333480916917, + "rewards/refuse_rate_reward": -0.00022321429569274187, + "step": 3100 + }, + { + "completion_length": 20.5875, + "epoch": 0.2488, + "grad_norm": 4.227458953857422, + "kl": 0.4252838507294655, + "learning_rate": 8.682213791559586e-06, + "loss": -0.062, + "reward": 0.05807242300361395, + "reward_std": 0.025812331726774572, + "rewards/ddi_reward": -0.010268346080556512, + "rewards/jaccard_reward": 0.05855814215610735, + "rewards/refuse_rate_reward": -0.0024286000640131532, + "step": 3110 + }, + { + "completion_length": 23.203125, + "epoch": 0.2496, + "grad_norm": 2.00490665435791, + "kl": 0.37107969596982004, + "learning_rate": 8.673562412214519e-06, + "loss": -0.0576, + "reward": 0.059366515558213, + "reward_std": 0.02548006772994995, + "rewards/ddi_reward": -0.006409401138080284, + "rewards/jaccard_reward": 0.0598155926913023, + "rewards/refuse_rate_reward": -0.0022453784476965666, + "step": 3120 + }, + { + "completion_length": 23.9296875, + "epoch": 0.2504, + "grad_norm": 2.1283557415008545, + "kl": 0.3959325529634953, + "learning_rate": 8.664887067909411e-06, + "loss": -0.0534, + "reward": 0.06816515799728222, + "reward_std": 0.023847310710698365, + "rewards/ddi_reward": -0.00271172997308895, + "rewards/jaccard_reward": 0.06849538179812953, + "rewards/refuse_rate_reward": -0.0016511266818270086, + "step": 3130 + }, + { + "completion_length": 22.2578125, + "epoch": 0.2512, + "grad_norm": 1.9326989650726318, + "kl": 0.45284838080406187, + "learning_rate": 8.656187815239e-06, + "loss": -0.0912, + "reward": 0.05024641645140946, + "reward_std": 0.02268458204343915, + "rewards/ddi_reward": -0.008951419836375863, + "rewards/jaccard_reward": 0.050653913500718774, + "rewards/refuse_rate_reward": -0.0020374826854094863, + "step": 3140 + }, + { + "completion_length": 23.9234375, + "epoch": 0.252, + "grad_norm": 2.3111937046051025, + "kl": 0.5136361449956894, + "learning_rate": 8.647464710953983e-06, + "loss": -0.0598, + "reward": 0.06120430477894843, + "reward_std": 0.02537825210019946, + "rewards/ddi_reward": 0.0056509678543079644, + "rewards/jaccard_reward": 0.061374443967361005, + "rewards/refuse_rate_reward": -0.0008506944286637008, + "step": 3150 + }, + { + "completion_length": 24.346875, + "epoch": 0.2528, + "grad_norm": 1.7831692695617676, + "kl": 0.425423576682806, + "learning_rate": 8.638717811960663e-06, + "loss": -0.0652, + "reward": 0.05780803440138697, + "reward_std": 0.02488366365432739, + "rewards/ddi_reward": -0.009805871150456369, + "rewards/jaccard_reward": 0.05800148674752563, + "rewards/refuse_rate_reward": -0.0009672619635239244, + "step": 3160 + }, + { + "completion_length": 21.115625, + "epoch": 0.2536, + "grad_norm": 1.3834995031356812, + "kl": 0.4016414985060692, + "learning_rate": 8.629947175320565e-06, + "loss": -0.0208, + "reward": 0.04331979057751596, + "reward_std": 0.022669435059651733, + "rewards/ddi_reward": 0.004758180821227143, + "rewards/jaccard_reward": 0.04347343594999984, + "rewards/refuse_rate_reward": -0.000768229179084301, + "step": 3170 + }, + { + "completion_length": 24.009375, + "epoch": 0.2544, + "grad_norm": 2.0312066078186035, + "kl": 0.46408483013510704, + "learning_rate": 8.621152858250078e-06, + "loss": -0.0697, + "reward": 0.07362616648897528, + "reward_std": 0.029768516914919017, + "rewards/ddi_reward": -0.007177199103171006, + "rewards/jaccard_reward": 0.07425860591465608, + "rewards/refuse_rate_reward": -0.0031622024602256716, + "step": 3180 + }, + { + "completion_length": 21.9, + "epoch": 0.2552, + "grad_norm": 2.421872615814209, + "kl": 0.44665803015232086, + "learning_rate": 8.61233491812006e-06, + "loss": -0.0518, + "reward": 0.05351432655006647, + "reward_std": 0.020164067239966242, + "rewards/ddi_reward": -0.00021563404006883503, + "rewards/jaccard_reward": 0.05398665339453146, + "rewards/refuse_rate_reward": -0.00236163578229025, + "step": 3190 + }, + { + "completion_length": 22.34375, + "epoch": 0.256, + "grad_norm": 1.7675981521606445, + "kl": 0.45371915102005006, + "learning_rate": 8.603493412455493e-06, + "loss": -0.07, + "reward": 0.06563791111111641, + "reward_std": 0.02338489885441959, + "rewards/ddi_reward": -0.005064262391533703, + "rewards/jaccard_reward": 0.06593515826389193, + "rewards/refuse_rate_reward": -0.0014862351468764245, + "step": 3200 + }, + { + "completion_length": 22.21875, + "epoch": 0.2568, + "grad_norm": 2.7711923122406006, + "kl": 0.5812795482575893, + "learning_rate": 8.594628398935078e-06, + "loss": -0.0842, + "reward": 0.07297977465204894, + "reward_std": 0.024209752166643738, + "rewards/ddi_reward": -0.008434303518151864, + "rewards/jaccard_reward": 0.07305172708583996, + "rewards/refuse_rate_reward": -0.0003597689210437238, + "step": 3210 + }, + { + "completion_length": 21.3265625, + "epoch": 0.2576, + "grad_norm": 2.0667386054992676, + "kl": 0.4933007977902889, + "learning_rate": 8.585739935390884e-06, + "loss": -0.0525, + "reward": 0.05944466446526349, + "reward_std": 0.025106807332485916, + "rewards/ddi_reward": -0.0077793146381736735, + "rewards/jaccard_reward": 0.0601492783986032, + "rewards/refuse_rate_reward": -0.0035230656387284396, + "step": 3220 + }, + { + "completion_length": 24.69375, + "epoch": 0.2584, + "grad_norm": 10.16128158569336, + "kl": 0.5730562038719654, + "learning_rate": 8.57682807980795e-06, + "loss": -0.031, + "reward": 0.05315808923915029, + "reward_std": 0.024800380831584333, + "rewards/ddi_reward": -0.011465661742840893, + "rewards/jaccard_reward": 0.05343476159032434, + "rewards/refuse_rate_reward": -0.0013833649456501008, + "step": 3230 + }, + { + "completion_length": 22.13125, + "epoch": 0.2592, + "grad_norm": 2.213733196258545, + "kl": 0.4640055179595947, + "learning_rate": 8.567892890323918e-06, + "loss": -0.0559, + "reward": 0.06307052057236433, + "reward_std": 0.025348420976661144, + "rewards/ddi_reward": -0.005609981424640864, + "rewards/jaccard_reward": 0.06353028399171308, + "rewards/refuse_rate_reward": -0.0022988211479969324, + "step": 3240 + }, + { + "completion_length": 20.459375, + "epoch": 0.26, + "grad_norm": 1.780197262763977, + "kl": 0.5117516204714775, + "learning_rate": 8.55893442522866e-06, + "loss": -0.1151, + "reward": 0.06779884239658714, + "reward_std": 0.024482189025729895, + "rewards/ddi_reward": 0.003291722881840542, + "rewards/jaccard_reward": 0.06820062887854875, + "rewards/refuse_rate_reward": -0.0020089285913854837, + "step": 3250 + }, + { + "completion_length": 21.4015625, + "epoch": 0.2608, + "grad_norm": 4.36026668548584, + "kl": 0.417854380607605, + "learning_rate": 8.549952742963877e-06, + "loss": -0.0523, + "reward": 0.05391600206494331, + "reward_std": 0.029473632108420134, + "rewards/ddi_reward": -0.007772690779529512, + "rewards/jaccard_reward": 0.05446064537391067, + "rewards/refuse_rate_reward": -0.002723214332945645, + "step": 3260 + }, + { + "completion_length": 23.9515625, + "epoch": 0.2616, + "grad_norm": 2.1829164028167725, + "kl": 0.46180942133069036, + "learning_rate": 8.540947902122739e-06, + "loss": -0.0638, + "reward": 0.06301611801609397, + "reward_std": 0.025500428536906837, + "rewards/ddi_reward": -0.008431075865519232, + "rewards/jaccard_reward": 0.06314573711715639, + "rewards/refuse_rate_reward": -0.0006481046555563807, + "step": 3270 + }, + { + "completion_length": 21.146875, + "epoch": 0.2624, + "grad_norm": 1.6328959465026855, + "kl": 0.4804652012884617, + "learning_rate": 8.531919961449491e-06, + "loss": -0.0358, + "reward": 0.05046690064482391, + "reward_std": 0.025232425332069396, + "rewards/ddi_reward": -0.00531689103518147, + "rewards/jaccard_reward": 0.05082733947783709, + "rewards/refuse_rate_reward": -0.001802201708778739, + "step": 3280 + }, + { + "completion_length": 23.0703125, + "epoch": 0.2632, + "grad_norm": 4.004902362823486, + "kl": 0.49734855964779856, + "learning_rate": 8.522868979839072e-06, + "loss": -0.0735, + "reward": 0.06560228699818253, + "reward_std": 0.026786084100604058, + "rewards/ddi_reward": -0.006112369982292876, + "rewards/jaccard_reward": 0.06603345451876522, + "rewards/refuse_rate_reward": -0.0021558389882557092, + "step": 3290 + }, + { + "completion_length": 20.5078125, + "epoch": 0.264, + "grad_norm": 2.6534745693206787, + "kl": 0.5450518883764743, + "learning_rate": 8.513795016336732e-06, + "loss": -0.0613, + "reward": 0.050946249335538596, + "reward_std": 0.026318037137389182, + "rewards/ddi_reward": -0.007550646568415686, + "rewards/jaccard_reward": 0.05120159331709147, + "rewards/refuse_rate_reward": -0.0012767180800437928, + "step": 3300 + }, + { + "completion_length": 20.509375, + "epoch": 0.2648, + "grad_norm": 1.82460355758667, + "kl": 0.46560739949345586, + "learning_rate": 8.50469813013765e-06, + "loss": -0.0737, + "reward": 0.06016452638432383, + "reward_std": 0.020075904228724538, + "rewards/ddi_reward": 0.004947590838128235, + "rewards/jaccard_reward": 0.060242651123553516, + "rewards/refuse_rate_reward": -0.000390625, + "step": 3310 + }, + { + "completion_length": 21.6265625, + "epoch": 0.2656, + "grad_norm": 1.694087266921997, + "kl": 0.4812693029642105, + "learning_rate": 8.495578380586535e-06, + "loss": -0.0434, + "reward": 0.05323516339994967, + "reward_std": 0.022252862621098757, + "rewards/ddi_reward": -0.010129218897782266, + "rewards/jaccard_reward": 0.05345651679672301, + "rewards/refuse_rate_reward": -0.0011067708721384407, + "step": 3320 + }, + { + "completion_length": 20.8890625, + "epoch": 0.2664, + "grad_norm": 2.3376710414886475, + "kl": 0.39822512939572335, + "learning_rate": 8.48643582717726e-06, + "loss": -0.0584, + "reward": 0.05647784741595387, + "reward_std": 0.023262703279033303, + "rewards/ddi_reward": -0.0038263673304754775, + "rewards/jaccard_reward": 0.056660138769075274, + "rewards/refuse_rate_reward": -0.0009114583488553762, + "step": 3330 + }, + { + "completion_length": 18.1375, + "epoch": 0.2672, + "grad_norm": 1.2850306034088135, + "kl": 0.42474248483777044, + "learning_rate": 8.477270529552453e-06, + "loss": -0.0677, + "reward": 0.053146008704788986, + "reward_std": 0.0217383045819588, + "rewards/ddi_reward": -0.005554023380682338, + "rewards/jaccard_reward": 0.0531850722967647, + "rewards/refuse_rate_reward": -0.0001953125, + "step": 3340 + }, + { + "completion_length": 21.2546875, + "epoch": 0.268, + "grad_norm": 1.9532208442687988, + "kl": 0.4944688305258751, + "learning_rate": 8.46808254750312e-06, + "loss": -0.0566, + "reward": 0.05585604310035706, + "reward_std": 0.026323253707960247, + "rewards/ddi_reward": -0.001091669115703553, + "rewards/jaccard_reward": 0.0559532661922276, + "rewards/refuse_rate_reward": -0.0004861111170612276, + "step": 3350 + }, + { + "completion_length": 22.1625, + "epoch": 0.2688, + "grad_norm": 2.1520869731903076, + "kl": 0.44050338864326477, + "learning_rate": 8.458871940968253e-06, + "loss": -0.0717, + "reward": 0.072936827596277, + "reward_std": 0.024233977776020765, + "rewards/ddi_reward": -0.004546569957165047, + "rewards/jaccard_reward": 0.0731555748730898, + "rewards/refuse_rate_reward": -0.0010937500279396772, + "step": 3360 + }, + { + "completion_length": 19.1921875, + "epoch": 0.2696, + "grad_norm": 5.147994518280029, + "kl": 0.441680496186018, + "learning_rate": 8.449638770034434e-06, + "loss": -0.0517, + "reward": 0.0532393763307482, + "reward_std": 0.023251339117996393, + "rewards/ddi_reward": 0.0007298426644410938, + "rewards/jaccard_reward": 0.053432829433586446, + "rewards/refuse_rate_reward": -0.00096726194024086, + "step": 3370 + }, + { + "completion_length": 20.0, + "epoch": 0.2704, + "grad_norm": 4.452634334564209, + "kl": 0.45961809903383255, + "learning_rate": 8.44038309493545e-06, + "loss": -0.0255, + "reward": 0.04886534188408405, + "reward_std": 0.023244120716117322, + "rewards/ddi_reward": -0.006335533887613565, + "rewards/jaccard_reward": 0.04950396433705464, + "rewards/refuse_rate_reward": -0.003193109016865492, + "step": 3380 + }, + { + "completion_length": 21.0375, + "epoch": 0.2712, + "grad_norm": 4.639807224273682, + "kl": 0.46730564162135124, + "learning_rate": 8.431104976051897e-06, + "loss": -0.0561, + "reward": 0.0541178249521181, + "reward_std": 0.02420153699349612, + "rewards/ddi_reward": -0.014124164721579291, + "rewards/jaccard_reward": 0.05458373483270407, + "rewards/refuse_rate_reward": -0.0023295455146580936, + "step": 3390 + }, + { + "completion_length": 22.0890625, + "epoch": 0.272, + "grad_norm": 6.288341522216797, + "kl": 0.5704606883227825, + "learning_rate": 8.421804473910782e-06, + "loss": -0.0714, + "reward": 0.07540071746334434, + "reward_std": 0.02026235016528517, + "rewards/ddi_reward": -0.0023965713393408806, + "rewards/jaccard_reward": 0.07560420669615268, + "rewards/refuse_rate_reward": -0.001017446129117161, + "step": 3400 + }, + { + "completion_length": 19.8875, + "epoch": 0.2728, + "grad_norm": 5.558920383453369, + "kl": 0.5004430532455444, + "learning_rate": 8.412481649185139e-06, + "loss": -0.0553, + "reward": 0.05361985298804939, + "reward_std": 0.02166210124269128, + "rewards/ddi_reward": -0.008439553561038338, + "rewards/jaccard_reward": 0.053619852382689716, + "rewards/refuse_rate_reward": 0.0, + "step": 3410 + }, + { + "completion_length": 19.5890625, + "epoch": 0.2736, + "grad_norm": 3.9315755367279053, + "kl": 0.5401183590292931, + "learning_rate": 8.403136562693617e-06, + "loss": -0.0582, + "reward": 0.05690098945051432, + "reward_std": 0.023504870105534793, + "rewards/ddi_reward": -0.0044451484573073685, + "rewards/jaccard_reward": 0.0572784916497767, + "rewards/refuse_rate_reward": -0.001887513534165919, + "step": 3420 + }, + { + "completion_length": 19.6828125, + "epoch": 0.2744, + "grad_norm": 4.659592628479004, + "kl": 0.48324897736310957, + "learning_rate": 8.393769275400102e-06, + "loss": -0.0479, + "reward": 0.04742775917984545, + "reward_std": 0.023972029983997344, + "rewards/ddi_reward": -0.0019196021035895682, + "rewards/jaccard_reward": 0.04771161363460123, + "rewards/refuse_rate_reward": -0.0014192708535119892, + "step": 3430 + }, + { + "completion_length": 19.8640625, + "epoch": 0.2752, + "grad_norm": 1.9591701030731201, + "kl": 0.5057938031852245, + "learning_rate": 8.384379848413304e-06, + "loss": -0.0685, + "reward": 0.0594648867379874, + "reward_std": 0.020130380999762564, + "rewards/ddi_reward": -0.0011918660573428496, + "rewards/jaccard_reward": 0.06016380864894018, + "rewards/refuse_rate_reward": -0.0034946087980642915, + "step": 3440 + }, + { + "completion_length": 21.740625, + "epoch": 0.276, + "grad_norm": 7.733784198760986, + "kl": 0.5241677075624466, + "learning_rate": 8.374968342986367e-06, + "loss": -0.0674, + "reward": 0.06376803917810321, + "reward_std": 0.026566418528091162, + "rewards/ddi_reward": -0.00917607478913851, + "rewards/jaccard_reward": 0.0643349249381572, + "rewards/refuse_rate_reward": -0.002834426867775619, + "step": 3450 + }, + { + "completion_length": 19.434375, + "epoch": 0.2768, + "grad_norm": 2.5428953170776367, + "kl": 0.4919879503548145, + "learning_rate": 8.365534820516463e-06, + "loss": -0.0434, + "reward": 0.04304747623391449, + "reward_std": 0.02060578460805118, + "rewards/ddi_reward": -0.00027866315795108674, + "rewards/jaccard_reward": 0.043521250947378576, + "rewards/refuse_rate_reward": -0.0023688665591180325, + "step": 3460 + }, + { + "completion_length": 17.6921875, + "epoch": 0.2776, + "grad_norm": 2.589301586151123, + "kl": 0.5118543796241284, + "learning_rate": 8.356079342544398e-06, + "loss": -0.0663, + "reward": 0.05131287395488471, + "reward_std": 0.019676434574648738, + "rewards/ddi_reward": 0.0016014899927540682, + "rewards/jaccard_reward": 0.051312873233109714, + "rewards/refuse_rate_reward": 0.0, + "step": 3470 + }, + { + "completion_length": 19.0296875, + "epoch": 0.2784, + "grad_norm": 1.3881621360778809, + "kl": 0.5717234060168266, + "learning_rate": 8.346601970754208e-06, + "loss": -0.0462, + "reward": 0.04818312553688884, + "reward_std": 0.023769437707960604, + "rewards/ddi_reward": -0.0007866642903536558, + "rewards/jaccard_reward": 0.048453195858746766, + "rewards/refuse_rate_reward": -0.00135035109706223, + "step": 3480 + }, + { + "completion_length": 19.8421875, + "epoch": 0.2792, + "grad_norm": 2.893519401550293, + "kl": 0.585880696028471, + "learning_rate": 8.337102766972754e-06, + "loss": -0.0801, + "reward": 0.061186797358095645, + "reward_std": 0.021174764074385167, + "rewards/ddi_reward": -0.006517049123067409, + "rewards/jaccard_reward": 0.061471252050250766, + "rewards/refuse_rate_reward": -0.0014222756610251964, + "step": 3490 + }, + { + "completion_length": 20.034375, + "epoch": 0.28, + "grad_norm": 2.992377758026123, + "kl": 0.6315887719392776, + "learning_rate": 8.327581793169322e-06, + "loss": -0.0338, + "reward": 0.044903398887254295, + "reward_std": 0.02016610335558653, + "rewards/ddi_reward": -0.011509597138501703, + "rewards/jaccard_reward": 0.04520645544398576, + "rewards/refuse_rate_reward": -0.0015152816544286908, + "step": 3500 + }, + { + "completion_length": 21.8453125, + "epoch": 0.2808, + "grad_norm": 2.432476282119751, + "kl": 0.518993903696537, + "learning_rate": 8.318039111455217e-06, + "loss": -0.0659, + "reward": 0.0575285910628736, + "reward_std": 0.02613461222499609, + "rewards/ddi_reward": -0.0029056660539936276, + "rewards/jaccard_reward": 0.05767268834169954, + "rewards/refuse_rate_reward": -0.0007204861147329211, + "step": 3510 + }, + { + "completion_length": 23.65, + "epoch": 0.2816, + "grad_norm": 2.0521581172943115, + "kl": 0.580216721445322, + "learning_rate": 8.30847478408336e-06, + "loss": -0.079, + "reward": 0.07244507670402527, + "reward_std": 0.025539979711174964, + "rewards/ddi_reward": -0.0011134783475426958, + "rewards/jaccard_reward": 0.07264336748048664, + "rewards/refuse_rate_reward": -0.0009914434747770428, + "step": 3520 + }, + { + "completion_length": 25.721875, + "epoch": 0.2824, + "grad_norm": 11.153634071350098, + "kl": 0.49894604831933975, + "learning_rate": 8.298888873447876e-06, + "loss": -0.042, + "reward": 0.05360544952563941, + "reward_std": 0.023947419272735714, + "rewards/ddi_reward": -0.0046456501295324415, + "rewards/jaccard_reward": 0.05379989356733859, + "rewards/refuse_rate_reward": -0.0009722222341224552, + "step": 3530 + }, + { + "completion_length": 26.4796875, + "epoch": 0.2832, + "grad_norm": 2.177267074584961, + "kl": 0.5573345609009266, + "learning_rate": 8.289281442083697e-06, + "loss": -0.0413, + "reward": 0.060173400957137343, + "reward_std": 0.02517187453340739, + "rewards/ddi_reward": -0.006772763477056287, + "rewards/jaccard_reward": 0.06041741603985429, + "rewards/refuse_rate_reward": -0.0012200739816762506, + "step": 3540 + }, + { + "completion_length": 21.3796875, + "epoch": 0.284, + "grad_norm": 1.3698498010635376, + "kl": 0.4828543052077293, + "learning_rate": 8.279652552666145e-06, + "loss": -0.0428, + "reward": 0.04909967458806932, + "reward_std": 0.022706454945728184, + "rewards/ddi_reward": -0.00752749843231868, + "rewards/jaccard_reward": 0.04935236517339945, + "rewards/refuse_rate_reward": -0.0012634501326829195, + "step": 3550 + }, + { + "completion_length": 18.39375, + "epoch": 0.2848, + "grad_norm": 3.0486793518066406, + "kl": 0.46053321212530135, + "learning_rate": 8.270002268010526e-06, + "loss": -0.0994, + "reward": 0.059009043499827385, + "reward_std": 0.01914986337069422, + "rewards/ddi_reward": 0.007971170623204672, + "rewards/jaccard_reward": 0.05916529418900609, + "rewards/refuse_rate_reward": -0.00078125, + "step": 3560 + }, + { + "completion_length": 17.896875, + "epoch": 0.2856, + "grad_norm": 2.8847484588623047, + "kl": 0.4939333513379097, + "learning_rate": 8.260330651071722e-06, + "loss": -0.0422, + "reward": 0.04356559300795197, + "reward_std": 0.018288728687912224, + "rewards/ddi_reward": -0.001997549319639802, + "rewards/jaccard_reward": 0.04364929823204875, + "rewards/refuse_rate_reward": -0.00041852679569274185, + "step": 3570 + }, + { + "completion_length": 21.25625, + "epoch": 0.2864, + "grad_norm": 4.272251129150391, + "kl": 0.4567627996206284, + "learning_rate": 8.250637764943779e-06, + "loss": -0.0441, + "reward": 0.05143334696767852, + "reward_std": 0.02007224541157484, + "rewards/ddi_reward": -0.0030932957422919573, + "rewards/jaccard_reward": 0.052235566813033076, + "rewards/refuse_rate_reward": -0.0040110930916853246, + "step": 3580 + }, + { + "completion_length": 22.5609375, + "epoch": 0.2872, + "grad_norm": 2.1047558784484863, + "kl": 0.7344595707952977, + "learning_rate": 8.240923672859492e-06, + "loss": -0.0423, + "reward": 0.062241385364905, + "reward_std": 0.02615097053349018, + "rewards/ddi_reward": -0.0025581046706065536, + "rewards/jaccard_reward": 0.06269022412598133, + "rewards/refuse_rate_reward": -0.0022441947017796337, + "step": 3590 + }, + { + "completion_length": 24.8046875, + "epoch": 0.288, + "grad_norm": 2.6576340198516846, + "kl": 0.49465222135186193, + "learning_rate": 8.23118843819e-06, + "loss": -0.0484, + "reward": 0.05908774035051465, + "reward_std": 0.023527788976207376, + "rewards/ddi_reward": -0.0060886840481543915, + "rewards/jaccard_reward": 0.05977003267034888, + "rewards/refuse_rate_reward": -0.003411458386108279, + "step": 3600 + }, + { + "completion_length": 27.1734375, + "epoch": 0.2888, + "grad_norm": 1.5869933366775513, + "kl": 0.39800150096416476, + "learning_rate": 8.221432124444371e-06, + "loss": -0.0368, + "reward": 0.053481131105218085, + "reward_std": 0.024655510555021465, + "rewards/ddi_reward": -0.00667242337949574, + "rewards/jaccard_reward": 0.053685077396221456, + "rewards/refuse_rate_reward": -0.0010197368683293462, + "step": 3610 + }, + { + "completion_length": 27.1015625, + "epoch": 0.2896, + "grad_norm": 1.8670889139175415, + "kl": 0.4817777894437313, + "learning_rate": 8.211654795269174e-06, + "loss": -0.0644, + "reward": 0.06540652085095644, + "reward_std": 0.02390417435672134, + "rewards/ddi_reward": -0.009701798693276942, + "rewards/jaccard_reward": 0.06580913583748042, + "rewards/refuse_rate_reward": -0.0020130781107582153, + "step": 3620 + }, + { + "completion_length": 26.8046875, + "epoch": 0.2904, + "grad_norm": 1.03025484085083, + "kl": 0.41996166333556173, + "learning_rate": 8.201856514448086e-06, + "loss": -0.037, + "reward": 0.05892532463185489, + "reward_std": 0.02158219558186829, + "rewards/ddi_reward": -0.008869463967857882, + "rewards/jaccard_reward": 0.059143008664250374, + "rewards/refuse_rate_reward": -0.001088412443641573, + "step": 3630 + }, + { + "completion_length": 24.6265625, + "epoch": 0.2912, + "grad_norm": 3.712641716003418, + "kl": 0.4612051397562027, + "learning_rate": 8.192037345901457e-06, + "loss": -0.0367, + "reward": 0.04807311366312206, + "reward_std": 0.02340151071548462, + "rewards/ddi_reward": -0.005276259258971549, + "rewards/jaccard_reward": 0.04839184391312301, + "rewards/refuse_rate_reward": -0.0015936478856019675, + "step": 3640 + }, + { + "completion_length": 24.1265625, + "epoch": 0.292, + "grad_norm": 21.688501358032227, + "kl": 0.6431027822196483, + "learning_rate": 8.18219735368591e-06, + "loss": -0.1303, + "reward": 0.07915545897558332, + "reward_std": 0.030924259102903306, + "rewards/ddi_reward": -0.011836749193025752, + "rewards/jaccard_reward": 0.08024399997666479, + "rewards/refuse_rate_reward": -0.005442708468763158, + "step": 3650 + }, + { + "completion_length": 24.0, + "epoch": 0.2928, + "grad_norm": 1.4556090831756592, + "kl": 0.44483714178204536, + "learning_rate": 8.172336601993905e-06, + "loss": -0.061, + "reward": 0.06682211621664465, + "reward_std": 0.023892723303288222, + "rewards/ddi_reward": -0.02165318592742551, + "rewards/jaccard_reward": 0.06705574872903526, + "rewards/refuse_rate_reward": -0.0011681547854095697, + "step": 3660 + }, + { + "completion_length": 23.84375, + "epoch": 0.2936, + "grad_norm": 3.5061774253845215, + "kl": 0.45560200661420824, + "learning_rate": 8.162455155153331e-06, + "loss": -0.058, + "reward": 0.06345326714217663, + "reward_std": 0.025727844331413507, + "rewards/ddi_reward": -0.003077446052338928, + "rewards/jaccard_reward": 0.06421759758959525, + "rewards/refuse_rate_reward": -0.0038216551998630165, + "step": 3670 + }, + { + "completion_length": 23.7828125, + "epoch": 0.2944, + "grad_norm": 1.9334876537322998, + "kl": 0.5317226305603981, + "learning_rate": 8.152553077627089e-06, + "loss": -0.0515, + "reward": 0.06113225813023746, + "reward_std": 0.021182649955153465, + "rewards/ddi_reward": -0.0060627548606134955, + "rewards/jaccard_reward": 0.06190623170696199, + "rewards/refuse_rate_reward": -0.003869854274671525, + "step": 3680 + }, + { + "completion_length": 24.8046875, + "epoch": 0.2952, + "grad_norm": 3.0352656841278076, + "kl": 0.596536822617054, + "learning_rate": 8.142630434012661e-06, + "loss": -0.0616, + "reward": 0.06664485139772296, + "reward_std": 0.02593204053118825, + "rewards/ddi_reward": -0.012737951253075152, + "rewards/jaccard_reward": 0.06684053526259959, + "rewards/refuse_rate_reward": -0.0009784226422198117, + "step": 3690 + }, + { + "completion_length": 23.671875, + "epoch": 0.296, + "grad_norm": 2.2267026901245117, + "kl": 0.4651129260659218, + "learning_rate": 8.132687289041698e-06, + "loss": -0.0445, + "reward": 0.05377180660143495, + "reward_std": 0.022797753708437084, + "rewards/ddi_reward": -0.003819385175302159, + "rewards/jaccard_reward": 0.05399353387765586, + "rewards/refuse_rate_reward": -0.0011086310027167202, + "step": 3700 + }, + { + "completion_length": 25.371875, + "epoch": 0.2968, + "grad_norm": 4.368247032165527, + "kl": 0.41208291724324225, + "learning_rate": 8.122723707579595e-06, + "loss": -0.0527, + "reward": 0.05738261993974447, + "reward_std": 0.0256970738992095, + "rewards/ddi_reward": -0.008000735088717192, + "rewards/jaccard_reward": 0.058268419839441775, + "rewards/refuse_rate_reward": -0.004428999905940145, + "step": 3710 + }, + { + "completion_length": 27.4578125, + "epoch": 0.2976, + "grad_norm": 1.527454137802124, + "kl": 0.4933447867631912, + "learning_rate": 8.112739754625066e-06, + "loss": -0.054, + "reward": 0.07001769265625626, + "reward_std": 0.025157842342741786, + "rewards/ddi_reward": -0.01266570328734815, + "rewards/jaccard_reward": 0.07039489755406976, + "rewards/refuse_rate_reward": -0.0018860263167880475, + "step": 3720 + }, + { + "completion_length": 29.071875, + "epoch": 0.2984, + "grad_norm": 3.7897233963012695, + "kl": 0.4457262076437473, + "learning_rate": 8.102735495309718e-06, + "loss": -0.0381, + "reward": 0.061074284743517636, + "reward_std": 0.027060413872823118, + "rewards/ddi_reward": -0.010735475999535993, + "rewards/jaccard_reward": 0.062083639204502106, + "rewards/refuse_rate_reward": -0.005046774097718298, + "step": 3730 + }, + { + "completion_length": 26.05, + "epoch": 0.2992, + "grad_norm": 7.545781135559082, + "kl": 0.441113156080246, + "learning_rate": 8.092710994897634e-06, + "loss": -0.038, + "reward": 0.055151350889354946, + "reward_std": 0.026471696328371763, + "rewards/ddi_reward": -0.007602213591599139, + "rewards/jaccard_reward": 0.05530038964934647, + "rewards/refuse_rate_reward": -0.0007451923214830458, + "step": 3740 + }, + { + "completion_length": 26.0875, + "epoch": 0.3, + "grad_norm": 21.156335830688477, + "kl": 0.5939672574400902, + "learning_rate": 8.082666318784943e-06, + "loss": -0.0514, + "reward": 0.07076195958070457, + "reward_std": 0.026052152877673505, + "rewards/ddi_reward": -0.008694902725983411, + "rewards/jaccard_reward": 0.0710120941279456, + "rewards/refuse_rate_reward": -0.0012506765022408217, + "step": 3750 + }, + { + "completion_length": 25.396875, + "epoch": 0.3008, + "grad_norm": 1.209087610244751, + "kl": 0.4914479225873947, + "learning_rate": 8.072601532499384e-06, + "loss": -0.0484, + "reward": 0.06771848145872354, + "reward_std": 0.023389843944460153, + "rewards/ddi_reward": -0.004166886289021931, + "rewards/jaccard_reward": 0.06865936610847712, + "rewards/refuse_rate_reward": -0.004704415274318308, + "step": 3760 + }, + { + "completion_length": 23.6234375, + "epoch": 0.3016, + "grad_norm": 4.614313125610352, + "kl": 0.4652214400470257, + "learning_rate": 8.062516701699898e-06, + "loss": -0.045, + "reward": 0.06399687944212928, + "reward_std": 0.0240139314904809, + "rewards/ddi_reward": -0.010850237525301055, + "rewards/jaccard_reward": 0.06420506860595196, + "rewards/refuse_rate_reward": -0.0010409512324258685, + "step": 3770 + }, + { + "completion_length": 27.0734375, + "epoch": 0.3024, + "grad_norm": 0.7635372877120972, + "kl": 0.4194511353969574, + "learning_rate": 8.052411892176184e-06, + "loss": -0.051, + "reward": 0.06026054942049086, + "reward_std": 0.021720048086717725, + "rewards/ddi_reward": 0.005070450820494443, + "rewards/jaccard_reward": 0.060838922299444674, + "rewards/refuse_rate_reward": -0.0028918604017235338, + "step": 3780 + }, + { + "completion_length": 25.5515625, + "epoch": 0.3032, + "grad_norm": 1.699328899383545, + "kl": 0.438091329485178, + "learning_rate": 8.042287169848273e-06, + "loss": -0.0362, + "reward": 0.05831987159326672, + "reward_std": 0.02218374612275511, + "rewards/ddi_reward": -0.0037416568477055987, + "rewards/jaccard_reward": 0.05876368414610624, + "rewards/refuse_rate_reward": -0.0022190706979017703, + "step": 3790 + }, + { + "completion_length": 22.734375, + "epoch": 0.304, + "grad_norm": 0.9344492554664612, + "kl": 0.3709112055599689, + "learning_rate": 8.032142600766104e-06, + "loss": -0.044, + "reward": 0.04743454707786441, + "reward_std": 0.02232429157011211, + "rewards/ddi_reward": -0.0077407961885910485, + "rewards/jaccard_reward": 0.0482735094614327, + "rewards/refuse_rate_reward": -0.0041948040714487435, + "step": 3800 + }, + { + "completion_length": 20.428125, + "epoch": 0.3048, + "grad_norm": 1.3399951457977295, + "kl": 0.4031485810875893, + "learning_rate": 8.021978251109086e-06, + "loss": -0.0504, + "reward": 0.04829299282282591, + "reward_std": 0.021422040276229383, + "rewards/ddi_reward": -0.0059352944605052475, + "rewards/jaccard_reward": 0.04857867229729891, + "rewards/refuse_rate_reward": -0.001428399363067001, + "step": 3810 + }, + { + "completion_length": 17.1265625, + "epoch": 0.3056, + "grad_norm": 4.259536266326904, + "kl": 0.5286498293280602, + "learning_rate": 8.011794187185672e-06, + "loss": -0.0521, + "reward": 0.049448040407150985, + "reward_std": 0.018643018242437394, + "rewards/ddi_reward": -0.004618728457717225, + "rewards/jaccard_reward": 0.04949598154053092, + "rewards/refuse_rate_reward": -0.00023970170877873898, + "step": 3820 + }, + { + "completion_length": 23.2359375, + "epoch": 0.3064, + "grad_norm": 1.4983032941818237, + "kl": 0.4315218389034271, + "learning_rate": 8.00159047543292e-06, + "loss": -0.0802, + "reward": 0.07081307447515428, + "reward_std": 0.02321097943931818, + "rewards/ddi_reward": -0.004554698645370081, + "rewards/jaccard_reward": 0.07123765754513442, + "rewards/refuse_rate_reward": -0.0021229205187410114, + "step": 3830 + }, + { + "completion_length": 21.4984375, + "epoch": 0.3072, + "grad_norm": 1.9034507274627686, + "kl": 0.40621767565608025, + "learning_rate": 7.991367182416063e-06, + "loss": -0.0436, + "reward": 0.04267345438711345, + "reward_std": 0.022023393679410218, + "rewards/ddi_reward": -0.005983375979121774, + "rewards/jaccard_reward": 0.04303751839324832, + "rewards/refuse_rate_reward": -0.0018203208222985269, + "step": 3840 + }, + { + "completion_length": 23.228125, + "epoch": 0.308, + "grad_norm": 2.50730037689209, + "kl": 0.450784769654274, + "learning_rate": 7.981124374828079e-06, + "loss": -0.0342, + "reward": 0.054345266660675406, + "reward_std": 0.02359158652834594, + "rewards/ddi_reward": -0.007989523676224053, + "rewards/jaccard_reward": 0.05445180160459131, + "rewards/refuse_rate_reward": -0.000532670458778739, + "step": 3850 + }, + { + "completion_length": 20.284375, + "epoch": 0.3088, + "grad_norm": 1.8570446968078613, + "kl": 0.4979007229208946, + "learning_rate": 7.970862119489252e-06, + "loss": -0.0731, + "reward": 0.06906350725330412, + "reward_std": 0.022553938115015625, + "rewards/ddi_reward": -0.006767189281526953, + "rewards/jaccard_reward": 0.06995129138231278, + "rewards/refuse_rate_reward": -0.004438920505344867, + "step": 3860 + }, + { + "completion_length": 18.4703125, + "epoch": 0.3096, + "grad_norm": 12.639558792114258, + "kl": 0.39994069561362267, + "learning_rate": 7.960580483346727e-06, + "loss": -0.0538, + "reward": 0.052308655064553025, + "reward_std": 0.017859176406636833, + "rewards/ddi_reward": -0.004918942070798948, + "rewards/jaccard_reward": 0.05253000773955137, + "rewards/refuse_rate_reward": -0.001106770837213844, + "step": 3870 + }, + { + "completion_length": 19.7609375, + "epoch": 0.3104, + "grad_norm": 2.624177932739258, + "kl": 0.4059476241469383, + "learning_rate": 7.950279533474093e-06, + "loss": -0.056, + "reward": 0.04888806766830385, + "reward_std": 0.02175523662008345, + "rewards/ddi_reward": -0.002922821417450905, + "rewards/jaccard_reward": 0.04899131928104907, + "rewards/refuse_rate_reward": -0.000516264617908746, + "step": 3880 + }, + { + "completion_length": 22.1546875, + "epoch": 0.3112, + "grad_norm": 2.00905704498291, + "kl": 0.4247338943183422, + "learning_rate": 7.93995933707093e-06, + "loss": -0.0576, + "reward": 0.06526061594486236, + "reward_std": 0.025085633341223, + "rewards/ddi_reward": -0.01011606897227466, + "rewards/jaccard_reward": 0.06549906400032342, + "rewards/refuse_rate_reward": -0.0011922409175895154, + "step": 3890 + }, + { + "completion_length": 21.0953125, + "epoch": 0.312, + "grad_norm": 6.046164512634277, + "kl": 0.473842191696167, + "learning_rate": 7.929619961462373e-06, + "loss": -0.0658, + "reward": 0.05805251360870898, + "reward_std": 0.02070999061688781, + "rewards/ddi_reward": -0.006326377211371437, + "rewards/jaccard_reward": 0.05861922753974795, + "rewards/refuse_rate_reward": -0.0028335814015008507, + "step": 3900 + }, + { + "completion_length": 21.6734375, + "epoch": 0.3128, + "grad_norm": 2.075681447982788, + "kl": 0.4318449944257736, + "learning_rate": 7.919261474098681e-06, + "loss": -0.05, + "reward": 0.05587701539043337, + "reward_std": 0.022796107269823552, + "rewards/ddi_reward": -0.010349912742094602, + "rewards/jaccard_reward": 0.05652797415386886, + "rewards/refuse_rate_reward": -0.0032547906681429594, + "step": 3910 + }, + { + "completion_length": 19.09375, + "epoch": 0.3136, + "grad_norm": 3.824960708618164, + "kl": 0.4721973739564419, + "learning_rate": 7.908883942554786e-06, + "loss": -0.0606, + "reward": 0.058338171569630504, + "reward_std": 0.023559150658547877, + "rewards/ddi_reward": -0.00753710291464813, + "rewards/jaccard_reward": 0.05876413774676621, + "rewards/refuse_rate_reward": -0.002129836322274059, + "step": 3920 + }, + { + "completion_length": 22.6875, + "epoch": 0.3144, + "grad_norm": 3.218313455581665, + "kl": 0.421812779456377, + "learning_rate": 7.898487434529859e-06, + "loss": -0.066, + "reward": 0.07604786455631256, + "reward_std": 0.022504991153255105, + "rewards/ddi_reward": -0.008514500546152704, + "rewards/jaccard_reward": 0.07665976537391543, + "rewards/refuse_rate_reward": -0.003059501270763576, + "step": 3930 + }, + { + "completion_length": 23.9984375, + "epoch": 0.3152, + "grad_norm": 3.642709255218506, + "kl": 0.4697163552045822, + "learning_rate": 7.888072017846864e-06, + "loss": -0.064, + "reward": 0.0548655791208148, + "reward_std": 0.025173926493152976, + "rewards/ddi_reward": -0.003949017118429765, + "rewards/jaccard_reward": 0.05552802816964686, + "rewards/refuse_rate_reward": -0.003312241216190159, + "step": 3940 + }, + { + "completion_length": 22.3046875, + "epoch": 0.316, + "grad_norm": 4.755966663360596, + "kl": 0.5247101873159409, + "learning_rate": 7.877637760452121e-06, + "loss": -0.0414, + "reward": 0.06001965836621821, + "reward_std": 0.025695469649508595, + "rewards/ddi_reward": -0.008491114521166309, + "rewards/jaccard_reward": 0.06064812870172318, + "rewards/refuse_rate_reward": -0.003142361156642437, + "step": 3950 + }, + { + "completion_length": 22.24375, + "epoch": 0.3168, + "grad_norm": 3.1834707260131836, + "kl": 0.46806135177612307, + "learning_rate": 7.867184730414862e-06, + "loss": -0.0455, + "reward": 0.055689889239147305, + "reward_std": 0.02455013464204967, + "rewards/ddi_reward": -0.0024124448362272235, + "rewards/jaccard_reward": 0.05629334240220487, + "rewards/refuse_rate_reward": -0.003017258970066905, + "step": 3960 + }, + { + "completion_length": 23.8015625, + "epoch": 0.3176, + "grad_norm": 1.8649958372116089, + "kl": 0.4916589200496674, + "learning_rate": 7.856712995926779e-06, + "loss": -0.0442, + "reward": 0.05976963313296437, + "reward_std": 0.02147288960404694, + "rewards/ddi_reward": -0.007153566234046593, + "rewards/jaccard_reward": 0.060069278534501794, + "rewards/refuse_rate_reward": -0.001498230523429811, + "step": 3970 + }, + { + "completion_length": 23.70625, + "epoch": 0.3184, + "grad_norm": 2.0663468837738037, + "kl": 0.44478181898593905, + "learning_rate": 7.84622262530159e-06, + "loss": -0.0392, + "reward": 0.046912732627242805, + "reward_std": 0.02566678145667538, + "rewards/ddi_reward": -0.0021425704238936306, + "rewards/jaccard_reward": 0.04764595788437873, + "rewards/refuse_rate_reward": -0.0036661255871877074, + "step": 3980 + }, + { + "completion_length": 24.83125, + "epoch": 0.3192, + "grad_norm": 1.697867512702942, + "kl": 0.4262972816824913, + "learning_rate": 7.835713686974582e-06, + "loss": -0.0622, + "reward": 0.06677479676436633, + "reward_std": 0.024274983210489154, + "rewards/ddi_reward": -0.0014789594803005456, + "rewards/jaccard_reward": 0.06746933904942125, + "rewards/refuse_rate_reward": -0.003472707897890359, + "step": 3990 + }, + { + "completion_length": 24.540625, + "epoch": 0.32, + "grad_norm": 1.7634979486465454, + "kl": 0.4302885323762894, + "learning_rate": 7.825186249502178e-06, + "loss": -0.0761, + "reward": 0.06137065174989402, + "reward_std": 0.025861761416308582, + "rewards/ddi_reward": -0.0017227076255949213, + "rewards/jaccard_reward": 0.061895700648892674, + "rewards/refuse_rate_reward": -0.0026252480689436196, + "step": 4000 + }, + { + "completion_length": 22.95, + "epoch": 0.3208, + "grad_norm": 2.4889698028564453, + "kl": 0.4289160095155239, + "learning_rate": 7.814640381561482e-06, + "loss": -0.0388, + "reward": 0.052796343644149604, + "reward_std": 0.022274532867595554, + "rewards/ddi_reward": -0.008626941149123013, + "rewards/jaccard_reward": 0.05300641381181777, + "rewards/refuse_rate_reward": -0.0010503472294658422, + "step": 4010 + }, + { + "completion_length": 28.48125, + "epoch": 0.3216, + "grad_norm": 10.24564266204834, + "kl": 0.4173016034066677, + "learning_rate": 7.804076151949825e-06, + "loss": -0.084, + "reward": 0.0636389865539968, + "reward_std": 0.024773009540513157, + "rewards/ddi_reward": -0.0064750600256957115, + "rewards/jaccard_reward": 0.06454513985663653, + "rewards/refuse_rate_reward": -0.004530776327010244, + "step": 4020 + }, + { + "completion_length": 24.0453125, + "epoch": 0.3224, + "grad_norm": 5.022263526916504, + "kl": 0.5104011818766594, + "learning_rate": 7.793493629584326e-06, + "loss": -0.0503, + "reward": 0.06541276667267085, + "reward_std": 0.026634778873994947, + "rewards/ddi_reward": -0.0032117681141244246, + "rewards/jaccard_reward": 0.06592016806825995, + "rewards/refuse_rate_reward": -0.0025370115414261817, + "step": 4030 + }, + { + "completion_length": 24.0546875, + "epoch": 0.3232, + "grad_norm": 3.3261642456054688, + "kl": 0.45986602231860163, + "learning_rate": 7.782892883501442e-06, + "loss": -0.0629, + "reward": 0.06508471814449876, + "reward_std": 0.025220875395461917, + "rewards/ddi_reward": -0.001498989202082157, + "rewards/jaccard_reward": 0.06519993209512905, + "rewards/refuse_rate_reward": -0.0005760732572525739, + "step": 4040 + }, + { + "completion_length": 23.3421875, + "epoch": 0.324, + "grad_norm": 1.1562427282333374, + "kl": 0.38448923006653785, + "learning_rate": 7.772273982856513e-06, + "loss": -0.047, + "reward": 0.05431275684386492, + "reward_std": 0.021299010468646885, + "rewards/ddi_reward": -0.006341255496954546, + "rewards/jaccard_reward": 0.05547676892019808, + "rewards/refuse_rate_reward": -0.005820059089455753, + "step": 4050 + }, + { + "completion_length": 27.23125, + "epoch": 0.3248, + "grad_norm": 0.9961440563201904, + "kl": 0.4431544661521912, + "learning_rate": 7.761636996923312e-06, + "loss": -0.0578, + "reward": 0.06894335532560944, + "reward_std": 0.02811043516267091, + "rewards/ddi_reward": -0.011601537559181451, + "rewards/jaccard_reward": 0.06939177681924776, + "rewards/refuse_rate_reward": -0.0022421199129894374, + "step": 4060 + }, + { + "completion_length": 23.39375, + "epoch": 0.3256, + "grad_norm": 2.9689249992370605, + "kl": 0.4344467528164387, + "learning_rate": 7.750981995093592e-06, + "loss": -0.0474, + "reward": 0.0688378238119185, + "reward_std": 0.025464314268901944, + "rewards/ddi_reward": -0.0029946208349429073, + "rewards/jaccard_reward": 0.06900685760192574, + "rewards/refuse_rate_reward": -0.0008451704634353518, + "step": 4070 + }, + { + "completion_length": 28.221875, + "epoch": 0.3264, + "grad_norm": 1.334642767906189, + "kl": 0.392879007011652, + "learning_rate": 7.740309046876637e-06, + "loss": -0.0597, + "reward": 0.07051625838503242, + "reward_std": 0.029005658300593495, + "rewards/ddi_reward": -0.007084015221334994, + "rewards/jaccard_reward": 0.0716930930968374, + "rewards/refuse_rate_reward": -0.00588416107930243, + "step": 4080 + }, + { + "completion_length": 23.1421875, + "epoch": 0.3272, + "grad_norm": 18.57900619506836, + "kl": 0.45194959044456484, + "learning_rate": 7.72961822189881e-06, + "loss": -0.0614, + "reward": 0.05975470505654812, + "reward_std": 0.020884520560503005, + "rewards/ddi_reward": -0.0021321469568647443, + "rewards/jaccard_reward": 0.05983975036069751, + "rewards/refuse_rate_reward": -0.00042522321455180646, + "step": 4090 + }, + { + "completion_length": 22.7921875, + "epoch": 0.328, + "grad_norm": 3.005028247833252, + "kl": 0.48803186044096947, + "learning_rate": 7.71890958990309e-06, + "loss": -0.0662, + "reward": 0.06884464509785175, + "reward_std": 0.026963122515007855, + "rewards/ddi_reward": -0.00703455230104737, + "rewards/jaccard_reward": 0.06941012172028423, + "rewards/refuse_rate_reward": -0.002827381086535752, + "step": 4100 + }, + { + "completion_length": 23.171875, + "epoch": 0.3288, + "grad_norm": 27.315208435058594, + "kl": 0.6588039018213749, + "learning_rate": 7.708183220748623e-06, + "loss": -0.0909, + "reward": 0.0758883722126484, + "reward_std": 0.026341959415003658, + "rewards/ddi_reward": -4.86170407384634e-05, + "rewards/jaccard_reward": 0.07606843113899231, + "rewards/refuse_rate_reward": -0.0009002976585179567, + "step": 4110 + }, + { + "completion_length": 28.01875, + "epoch": 0.3296, + "grad_norm": 1.8462525606155396, + "kl": 0.42466747388243675, + "learning_rate": 7.697439184410269e-06, + "loss": -0.033, + "reward": 0.06523569556884468, + "reward_std": 0.02462212387472391, + "rewards/ddi_reward": -0.009081405893084593, + "rewards/jaccard_reward": 0.06562706581316888, + "rewards/refuse_rate_reward": -0.0019568452960811557, + "step": 4120 + }, + { + "completion_length": 23.5875, + "epoch": 0.3304, + "grad_norm": 2.041724443435669, + "kl": 0.4955477327108383, + "learning_rate": 7.686677550978138e-06, + "loss": -0.0537, + "reward": 0.06068112249486148, + "reward_std": 0.023475565202534198, + "rewards/ddi_reward": 0.004871338477823883, + "rewards/jaccard_reward": 0.06105736405588687, + "rewards/refuse_rate_reward": -0.0018812003778293728, + "step": 4130 + }, + { + "completion_length": 25.521875, + "epoch": 0.3312, + "grad_norm": 1.7296901941299438, + "kl": 0.5373996794223785, + "learning_rate": 7.675898390657143e-06, + "loss": -0.0549, + "reward": 0.06426096246577799, + "reward_std": 0.026324491202831268, + "rewards/ddi_reward": -0.010977002169238404, + "rewards/jaccard_reward": 0.06447091954760253, + "rewards/refuse_rate_reward": -0.0010497835814021529, + "step": 4140 + }, + { + "completion_length": 21.6421875, + "epoch": 0.332, + "grad_norm": 3.9395742416381836, + "kl": 0.4926187671720982, + "learning_rate": 7.665101773766528e-06, + "loss": -0.0338, + "reward": 0.040837951190769675, + "reward_std": 0.027356984699144958, + "rewards/ddi_reward": -0.009046233119443059, + "rewards/jaccard_reward": 0.04220547057921067, + "rewards/refuse_rate_reward": -0.00683760034153238, + "step": 4150 + }, + { + "completion_length": 24.3453125, + "epoch": 0.3328, + "grad_norm": 2.744459867477417, + "kl": 0.41003070026636124, + "learning_rate": 7.654287770739424e-06, + "loss": -0.0562, + "reward": 0.06150808761594817, + "reward_std": 0.02495516915805638, + "rewards/ddi_reward": -0.002540010155644268, + "rewards/jaccard_reward": 0.06200387144635897, + "rewards/refuse_rate_reward": -0.0024789187067653985, + "step": 4160 + }, + { + "completion_length": 24.646875, + "epoch": 0.3336, + "grad_norm": 1.8770420551300049, + "kl": 0.6068558067083358, + "learning_rate": 7.643456452122377e-06, + "loss": -0.0508, + "reward": 0.0722439780831337, + "reward_std": 0.02719088438898325, + "rewards/ddi_reward": -0.016555344534572213, + "rewards/jaccard_reward": 0.07236674479208886, + "rewards/refuse_rate_reward": -0.0006138392724096775, + "step": 4170 + }, + { + "completion_length": 26.2375, + "epoch": 0.3344, + "grad_norm": 1.65442955493927, + "kl": 0.41516606882214546, + "learning_rate": 7.632607888574898e-06, + "loss": -0.0683, + "reward": 0.06841912977397442, + "reward_std": 0.0317548499442637, + "rewards/ddi_reward": -0.00887016402266454, + "rewards/jaccard_reward": 0.06877850475721062, + "rewards/refuse_rate_reward": -0.0017968750093132258, + "step": 4180 + }, + { + "completion_length": 27.178125, + "epoch": 0.3352, + "grad_norm": 0.8991714715957642, + "kl": 0.44882656186819075, + "learning_rate": 7.621742150868998e-06, + "loss": -0.0391, + "reward": 0.060994851589202884, + "reward_std": 0.02820127042941749, + "rewards/ddi_reward": -0.008936375111807138, + "rewards/jaccard_reward": 0.0613790059927851, + "rewards/refuse_rate_reward": -0.001920787594281137, + "step": 4190 + }, + { + "completion_length": 24.1, + "epoch": 0.336, + "grad_norm": 6.6605610847473145, + "kl": 0.4308492362499237, + "learning_rate": 7.61085930988872e-06, + "loss": -0.0392, + "reward": 0.0462826456874609, + "reward_std": 0.025690733711235224, + "rewards/ddi_reward": -0.009553671983303503, + "rewards/jaccard_reward": 0.04664823105558753, + "rewards/refuse_rate_reward": -0.001827933406457305, + "step": 4200 + }, + { + "completion_length": 21.609375, + "epoch": 0.3368, + "grad_norm": 0.9738613367080688, + "kl": 0.43143691793084143, + "learning_rate": 7.599959436629688e-06, + "loss": -0.0607, + "reward": 0.059224544651806356, + "reward_std": 0.02497368077747524, + "rewards/ddi_reward": -0.006881495018023998, + "rewards/jaccard_reward": 0.05945384604856372, + "rewards/refuse_rate_reward": -0.0011465098010376095, + "step": 4210 + }, + { + "completion_length": 19.371875, + "epoch": 0.3376, + "grad_norm": 4.7611517906188965, + "kl": 0.5018811248242855, + "learning_rate": 7.589042602198637e-06, + "loss": -0.0331, + "reward": 0.04566350500099361, + "reward_std": 0.025715709384530782, + "rewards/ddi_reward": 0.0009928672690875829, + "rewards/jaccard_reward": 0.04589136963477358, + "rewards/refuse_rate_reward": -0.0011393229477107524, + "step": 4220 + }, + { + "completion_length": 23.603125, + "epoch": 0.3384, + "grad_norm": 3.2716546058654785, + "kl": 0.5374553389847279, + "learning_rate": 7.578108877812949e-06, + "loss": -0.095, + "reward": 0.06463057411601766, + "reward_std": 0.02446468423586339, + "rewards/ddi_reward": 0.0003483026797766797, + "rewards/jaccard_reward": 0.06508866973454133, + "rewards/refuse_rate_reward": -0.0022904830053448676, + "step": 4230 + }, + { + "completion_length": 23.2046875, + "epoch": 0.3392, + "grad_norm": 2.265876531600952, + "kl": 0.42915281690657137, + "learning_rate": 7.5671583348001895e-06, + "loss": -0.0352, + "reward": 0.045918431505560874, + "reward_std": 0.02536369962617755, + "rewards/ddi_reward": -0.004616374184843152, + "rewards/jaccard_reward": 0.04691372207598761, + "rewards/refuse_rate_reward": -0.004976455727592111, + "step": 4240 + }, + { + "completion_length": 25.5203125, + "epoch": 0.34, + "grad_norm": 1.0043556690216064, + "kl": 0.508238909393549, + "learning_rate": 7.556191044597647e-06, + "loss": -0.0424, + "reward": 0.05523124919272959, + "reward_std": 0.0247103625908494, + "rewards/ddi_reward": -0.010759522195439786, + "rewards/jaccard_reward": 0.055739809782244264, + "rewards/refuse_rate_reward": -0.002542800176888704, + "step": 4250 + }, + { + "completion_length": 23.2890625, + "epoch": 0.3408, + "grad_norm": 1.2649824619293213, + "kl": 0.5048773892223835, + "learning_rate": 7.545207078751858e-06, + "loss": -0.0601, + "reward": 0.06465784655883908, + "reward_std": 0.024431026307865977, + "rewards/ddi_reward": -0.007207496991031803, + "rewards/jaccard_reward": 0.06515257712453604, + "rewards/refuse_rate_reward": -0.0024736483581364157, + "step": 4260 + }, + { + "completion_length": 27.8859375, + "epoch": 0.3416, + "grad_norm": 9.643706321716309, + "kl": 0.47001732140779495, + "learning_rate": 7.534206508918142e-06, + "loss": -0.0388, + "reward": 0.06135748594533652, + "reward_std": 0.024756442196667193, + "rewards/ddi_reward": -0.008443138911388814, + "rewards/jaccard_reward": 0.06176779891829938, + "rewards/refuse_rate_reward": -0.002051564579596743, + "step": 4270 + }, + { + "completion_length": 23.2859375, + "epoch": 0.3424, + "grad_norm": 3.3951427936553955, + "kl": 0.46070226430892947, + "learning_rate": 7.523189406860148e-06, + "loss": -0.0455, + "reward": 0.05244887485168874, + "reward_std": 0.0227031740359962, + "rewards/ddi_reward": -0.003465480380691588, + "rewards/jaccard_reward": 0.05306989825330675, + "rewards/refuse_rate_reward": -0.0031051021302118897, + "step": 4280 + }, + { + "completion_length": 27.309375, + "epoch": 0.3432, + "grad_norm": 3.8420591354370117, + "kl": 0.4472386106848717, + "learning_rate": 7.5121558444493605e-06, + "loss": -0.0443, + "reward": 0.06724371444433927, + "reward_std": 0.026837373664602638, + "rewards/ddi_reward": -0.007864771905587986, + "rewards/jaccard_reward": 0.0680614973185584, + "rewards/refuse_rate_reward": -0.0040889081894420086, + "step": 4290 + }, + { + "completion_length": 30.2640625, + "epoch": 0.344, + "grad_norm": 2.8471624851226807, + "kl": 0.4764496490359306, + "learning_rate": 7.501105893664658e-06, + "loss": -0.0417, + "reward": 0.05773649774491787, + "reward_std": 0.023097693570889533, + "rewards/ddi_reward": -0.004788057447876781, + "rewards/jaccard_reward": 0.05820354027673602, + "rewards/refuse_rate_reward": -0.002335210650926456, + "step": 4300 + }, + { + "completion_length": 28.153125, + "epoch": 0.3448, + "grad_norm": 2.5402605533599854, + "kl": 0.44977799355983733, + "learning_rate": 7.490039626591821e-06, + "loss": -0.0522, + "reward": 0.06263686940073968, + "reward_std": 0.025476353662088513, + "rewards/ddi_reward": -0.0054018523311242465, + "rewards/jaccard_reward": 0.06300965971313417, + "rewards/refuse_rate_reward": -0.0018639520276337862, + "step": 4310 + }, + { + "completion_length": 24.25625, + "epoch": 0.3456, + "grad_norm": 2.5666120052337646, + "kl": 0.44024451449513435, + "learning_rate": 7.478957115423078e-06, + "loss": -0.0453, + "reward": 0.059215834084898236, + "reward_std": 0.026376401679590343, + "rewards/ddi_reward": -0.009363031448447145, + "rewards/jaccard_reward": 0.059629896795377134, + "rewards/refuse_rate_reward": -0.0020703125162981452, + "step": 4320 + }, + { + "completion_length": 23.2625, + "epoch": 0.3464, + "grad_norm": 2.2073941230773926, + "kl": 0.5043004333972931, + "learning_rate": 7.467858432456624e-06, + "loss": -0.0504, + "reward": 0.053572281775996086, + "reward_std": 0.02324232857208699, + "rewards/ddi_reward": -0.009665548289194702, + "rewards/jaccard_reward": 0.05367375165224075, + "rewards/refuse_rate_reward": -0.0005073383683338762, + "step": 4330 + }, + { + "completion_length": 18.171875, + "epoch": 0.3472, + "grad_norm": 3.4655063152313232, + "kl": 0.39196822643280027, + "learning_rate": 7.456743650096157e-06, + "loss": -0.061, + "reward": 0.046501608984544876, + "reward_std": 0.022562485036905854, + "rewards/ddi_reward": -0.00942746877844911, + "rewards/jaccard_reward": 0.046659098984673616, + "rewards/refuse_rate_reward": -0.0007874504080973566, + "step": 4340 + }, + { + "completion_length": 25.5640625, + "epoch": 0.348, + "grad_norm": 6.383001804351807, + "kl": 0.4653278239071369, + "learning_rate": 7.445612840850394e-06, + "loss": -0.0495, + "reward": 0.06545818764716387, + "reward_std": 0.028075755340978503, + "rewards/ddi_reward": -0.0022903482487890868, + "rewards/jaccard_reward": 0.06643167259171605, + "rewards/refuse_rate_reward": -0.0048674242570996284, + "step": 4350 + }, + { + "completion_length": 20.553125, + "epoch": 0.3488, + "grad_norm": 1.8061131238937378, + "kl": 0.5269994586706161, + "learning_rate": 7.434466077332616e-06, + "loss": -0.0665, + "reward": 0.06850778195075691, + "reward_std": 0.025893712136894464, + "rewards/ddi_reward": -0.00271711292443797, + "rewards/jaccard_reward": 0.06936641475185752, + "rewards/refuse_rate_reward": -0.004293154785409569, + "step": 4360 + }, + { + "completion_length": 25.45, + "epoch": 0.3496, + "grad_norm": 1.4202911853790283, + "kl": 0.4408915489912033, + "learning_rate": 7.423303432260175e-06, + "loss": -0.0399, + "reward": 0.057569860387593506, + "reward_std": 0.01833585584536195, + "rewards/ddi_reward": -0.003037840675096959, + "rewards/jaccard_reward": 0.05791361019946635, + "rewards/refuse_rate_reward": -0.0017187500023283064, + "step": 4370 + }, + { + "completion_length": 19.6125, + "epoch": 0.3504, + "grad_norm": 4.649608612060547, + "kl": 0.378526471555233, + "learning_rate": 7.41212497845403e-06, + "loss": -0.0323, + "reward": 0.04454324836842716, + "reward_std": 0.023501609498634935, + "rewards/ddi_reward": 0.00018073811515932902, + "rewards/jaccard_reward": 0.04479388038162142, + "rewards/refuse_rate_reward": -0.001253156561870128, + "step": 4380 + }, + { + "completion_length": 23.4078125, + "epoch": 0.3512, + "grad_norm": 6.739236831665039, + "kl": 0.4685405924916267, + "learning_rate": 7.4009307888382755e-06, + "loss": -0.0408, + "reward": 0.05053093666210771, + "reward_std": 0.020606562308967114, + "rewards/ddi_reward": -0.007136772290687076, + "rewards/jaccard_reward": 0.0508568515535444, + "rewards/refuse_rate_reward": -0.0016295770998112856, + "step": 4390 + }, + { + "completion_length": 21.7296875, + "epoch": 0.352, + "grad_norm": 2.2558205127716064, + "kl": 0.4167357228696346, + "learning_rate": 7.389720936439655e-06, + "loss": -0.0613, + "reward": 0.05868989303708076, + "reward_std": 0.024949319753795864, + "rewards/ddi_reward": -0.0015098547097295522, + "rewards/jaccard_reward": 0.05888644584920257, + "rewards/refuse_rate_reward": -0.0009827629080973565, + "step": 4400 + }, + { + "completion_length": 22.7015625, + "epoch": 0.3528, + "grad_norm": 1.8305294513702393, + "kl": 0.545240393280983, + "learning_rate": 7.37849549438709e-06, + "loss": -0.0519, + "reward": 0.055309080285951495, + "reward_std": 0.025532243587076665, + "rewards/ddi_reward": -0.010763785470044241, + "rewards/jaccard_reward": 0.055881067097652705, + "rewards/refuse_rate_reward": -0.0028599331388249995, + "step": 4410 + }, + { + "completion_length": 22.8859375, + "epoch": 0.3536, + "grad_norm": 2.071201801300049, + "kl": 0.48584063425660134, + "learning_rate": 7.367254535911208e-06, + "loss": -0.0667, + "reward": 0.06227944660931826, + "reward_std": 0.023121455078944565, + "rewards/ddi_reward": -0.01052097933134064, + "rewards/jaccard_reward": 0.06259690602310002, + "rewards/refuse_rate_reward": -0.0015873016091063619, + "step": 4420 + }, + { + "completion_length": 19.921875, + "epoch": 0.3544, + "grad_norm": 1.199068546295166, + "kl": 0.6705225460231304, + "learning_rate": 7.355998134343857e-06, + "loss": -0.0267, + "reward": 0.041933165420778094, + "reward_std": 0.023414055327884854, + "rewards/ddi_reward": -0.006326974518015049, + "rewards/jaccard_reward": 0.043082346813753246, + "rewards/refuse_rate_reward": -0.005745907768141478, + "step": 4430 + }, + { + "completion_length": 21.0796875, + "epoch": 0.3552, + "grad_norm": 3.100738286972046, + "kl": 0.5271327055990696, + "learning_rate": 7.344726363117629e-06, + "loss": -0.0539, + "reward": 0.05873792900238186, + "reward_std": 0.023743621772155166, + "rewards/ddi_reward": -0.010054526072053704, + "rewards/jaccard_reward": 0.059056638111360374, + "rewards/refuse_rate_reward": -0.0015935583389364183, + "step": 4440 + }, + { + "completion_length": 24.915625, + "epoch": 0.356, + "grad_norm": 2.742736339569092, + "kl": 0.4631090387701988, + "learning_rate": 7.333439295765382e-06, + "loss": -0.075, + "reward": 0.0738607975654304, + "reward_std": 0.024271003669127822, + "rewards/ddi_reward": -0.007566177871194668, + "rewards/jaccard_reward": 0.0742240902967751, + "rewards/refuse_rate_reward": -0.0018164683133363724, + "step": 4450 + }, + { + "completion_length": 23.2453125, + "epoch": 0.3568, + "grad_norm": 8.819568634033203, + "kl": 0.38800063282251357, + "learning_rate": 7.322137005919765e-06, + "loss": -0.0454, + "reward": 0.05511370408348739, + "reward_std": 0.023496886016801, + "rewards/ddi_reward": -0.0029034425038844346, + "rewards/jaccard_reward": 0.055462690512649716, + "rewards/refuse_rate_reward": -0.001744929701089859, + "step": 4460 + }, + { + "completion_length": 26.0109375, + "epoch": 0.3576, + "grad_norm": 3.9375193119049072, + "kl": 0.4636474534869194, + "learning_rate": 7.310819567312726e-06, + "loss": -0.0576, + "reward": 0.05856331542599946, + "reward_std": 0.024318572180345655, + "rewards/ddi_reward": -0.002496837597573176, + "rewards/jaccard_reward": 0.05875230464152992, + "rewards/refuse_rate_reward": -0.00094494050135836, + "step": 4470 + }, + { + "completion_length": 24.234375, + "epoch": 0.3584, + "grad_norm": 5.443451404571533, + "kl": 0.405844846367836, + "learning_rate": 7.29948705377504e-06, + "loss": -0.0716, + "reward": 0.06611823840066791, + "reward_std": 0.024838732555508614, + "rewards/ddi_reward": -0.0032381427270593123, + "rewards/jaccard_reward": 0.0662789958063513, + "rewards/refuse_rate_reward": -0.0008037860621698201, + "step": 4480 + }, + { + "completion_length": 25.0765625, + "epoch": 0.3592, + "grad_norm": 4.020279407501221, + "kl": 0.48930060416460036, + "learning_rate": 7.28813953923583e-06, + "loss": -0.0533, + "reward": 0.06036203433759511, + "reward_std": 0.024572074040770532, + "rewards/ddi_reward": -0.001835251518059522, + "rewards/jaccard_reward": 0.0608822469599545, + "rewards/refuse_rate_reward": -0.0026010665344074367, + "step": 4490 + }, + { + "completion_length": 24.33125, + "epoch": 0.36, + "grad_norm": 1.7380889654159546, + "kl": 0.4404007725417614, + "learning_rate": 7.276777097722074e-06, + "loss": -0.0494, + "reward": 0.05217121075838804, + "reward_std": 0.019938822044059633, + "rewards/ddi_reward": 0.0004970628026057966, + "rewards/jaccard_reward": 0.052715244609862565, + "rewards/refuse_rate_reward": -0.002720170607790351, + "step": 4500 + }, + { + "completion_length": 26.0625, + "epoch": 0.3608, + "grad_norm": 2.034254312515259, + "kl": 0.49098805114626887, + "learning_rate": 7.265399803358131e-06, + "loss": -0.0501, + "reward": 0.06564304628409445, + "reward_std": 0.027840149914845826, + "rewards/ddi_reward": -0.006425651430618018, + "rewards/jaccard_reward": 0.06694120625033975, + "rewards/refuse_rate_reward": -0.006490795064019039, + "step": 4510 + }, + { + "completion_length": 24.3046875, + "epoch": 0.3616, + "grad_norm": 1.7011876106262207, + "kl": 0.47325644344091417, + "learning_rate": 7.2540077303652535e-06, + "loss": -0.0698, + "reward": 0.06850908147171139, + "reward_std": 0.028084002342075108, + "rewards/ddi_reward": -0.006215856221388094, + "rewards/jaccard_reward": 0.0689005739055574, + "rewards/refuse_rate_reward": -0.0019574652891606093, + "step": 4520 + }, + { + "completion_length": 23.646875, + "epoch": 0.3624, + "grad_norm": 2.722552537918091, + "kl": 0.42063862942159175, + "learning_rate": 7.242600953061105e-06, + "loss": -0.0523, + "reward": 0.0645093725877814, + "reward_std": 0.028155562770552933, + "rewards/ddi_reward": 0.004008374054683373, + "rewards/jaccard_reward": 0.06509172602090985, + "rewards/refuse_rate_reward": -0.002911762776784599, + "step": 4530 + }, + { + "completion_length": 23.4265625, + "epoch": 0.3632, + "grad_norm": 1.27824866771698, + "kl": 0.44058878943324087, + "learning_rate": 7.231179545859275e-06, + "loss": -0.0275, + "reward": 0.046378329116851094, + "reward_std": 0.023955184128135444, + "rewards/ddi_reward": 0.0027961004816461354, + "rewards/jaccard_reward": 0.04701544684940018, + "rewards/refuse_rate_reward": -0.003185590566135943, + "step": 4540 + }, + { + "completion_length": 23.15625, + "epoch": 0.364, + "grad_norm": 2.5391297340393066, + "kl": 0.45000093430280685, + "learning_rate": 7.21974358326879e-06, + "loss": -0.0537, + "reward": 0.05272922462318093, + "reward_std": 0.022108788881450892, + "rewards/ddi_reward": -0.011540290206903591, + "rewards/jaccard_reward": 0.05322542418725788, + "rewards/refuse_rate_reward": -0.0024809948459733277, + "step": 4550 + }, + { + "completion_length": 26.8796875, + "epoch": 0.3648, + "grad_norm": 3.2365570068359375, + "kl": 0.40463748201727867, + "learning_rate": 7.2082931398936366e-06, + "loss": -0.038, + "reward": 0.05810967548750341, + "reward_std": 0.0255532864946872, + "rewards/ddi_reward": -0.0025292471749708057, + "rewards/jaccard_reward": 0.058556331822182985, + "rewards/refuse_rate_reward": -0.002233287668786943, + "step": 4560 + }, + { + "completion_length": 28.728125, + "epoch": 0.3656, + "grad_norm": 2.869107484817505, + "kl": 0.46209707781672477, + "learning_rate": 7.196828290432258e-06, + "loss": -0.043, + "reward": 0.0636894385330379, + "reward_std": 0.029974795365706085, + "rewards/ddi_reward": -0.0027854342246428133, + "rewards/jaccard_reward": 0.06423929070588201, + "rewards/refuse_rate_reward": -0.0027492560213431714, + "step": 4570 + }, + { + "completion_length": 26.4734375, + "epoch": 0.3664, + "grad_norm": 3.2786478996276855, + "kl": 0.35767223685979843, + "learning_rate": 7.185349109677091e-06, + "loss": -0.0413, + "reward": 0.05400811992585659, + "reward_std": 0.024287624238058924, + "rewards/ddi_reward": -0.005592527723638341, + "rewards/jaccard_reward": 0.05429167559486814, + "rewards/refuse_rate_reward": -0.0014177771052345633, + "step": 4580 + }, + { + "completion_length": 26.0515625, + "epoch": 0.3672, + "grad_norm": 1.3365833759307861, + "kl": 0.44370660558342934, + "learning_rate": 7.173855672514054e-06, + "loss": -0.0274, + "reward": 0.053131753392517564, + "reward_std": 0.027667887136340143, + "rewards/ddi_reward": -0.00020364090451039374, + "rewards/jaccard_reward": 0.05338322257157415, + "rewards/refuse_rate_reward": -0.0012573451036587357, + "step": 4590 + }, + { + "completion_length": 24.490625, + "epoch": 0.368, + "grad_norm": 2.1384873390197754, + "kl": 0.4090543411672115, + "learning_rate": 7.162348053922075e-06, + "loss": -0.0583, + "reward": 0.06582437446340919, + "reward_std": 0.025139685766771436, + "rewards/ddi_reward": -0.005885255197063089, + "rewards/jaccard_reward": 0.06596065890043974, + "rewards/refuse_rate_reward": -0.0006814236170612275, + "step": 4600 + }, + { + "completion_length": 25.3359375, + "epoch": 0.3688, + "grad_norm": 1.1945209503173828, + "kl": 0.37714942544698715, + "learning_rate": 7.150826328972589e-06, + "loss": -0.0494, + "reward": 0.05139366304501891, + "reward_std": 0.021202223747968672, + "rewards/ddi_reward": -0.00565069425792899, + "rewards/jaccard_reward": 0.052063286397606136, + "rewards/refuse_rate_reward": -0.003348119056317955, + "step": 4610 + }, + { + "completion_length": 23.6328125, + "epoch": 0.3696, + "grad_norm": 3.8797810077667236, + "kl": 0.4708124153316021, + "learning_rate": 7.139290572829066e-06, + "loss": -0.0604, + "reward": 0.06354591357521713, + "reward_std": 0.02894364409148693, + "rewards/ddi_reward": -0.0026813414064235986, + "rewards/jaccard_reward": 0.06378065197495744, + "rewards/refuse_rate_reward": -0.0011736961314454675, + "step": 4620 + }, + { + "completion_length": 22.1515625, + "epoch": 0.3704, + "grad_norm": 2.772733211517334, + "kl": 0.4667303405702114, + "learning_rate": 7.127740860746501e-06, + "loss": -0.0753, + "reward": 0.07316510793752969, + "reward_std": 0.022159057017415763, + "rewards/ddi_reward": -0.007888889452442528, + "rewards/jaccard_reward": 0.0740244819317013, + "rewards/refuse_rate_reward": -0.004296875139698386, + "step": 4630 + }, + { + "completion_length": 20.828125, + "epoch": 0.3712, + "grad_norm": 6.280205249786377, + "kl": 0.5759078912436962, + "learning_rate": 7.116177268070939e-06, + "loss": -0.0816, + "reward": 0.06896197842434049, + "reward_std": 0.023681730026146396, + "rewards/ddi_reward": -0.0010016986081609503, + "rewards/jaccard_reward": 0.06968854013830424, + "rewards/refuse_rate_reward": -0.0036328125279396772, + "step": 4640 + }, + { + "completion_length": 21.184375, + "epoch": 0.372, + "grad_norm": 2.799098491668701, + "kl": 0.55084587931633, + "learning_rate": 7.104599870238972e-06, + "loss": -0.0597, + "reward": 0.05057095088995993, + "reward_std": 0.020163870742544533, + "rewards/ddi_reward": -0.0022135732928290964, + "rewards/jaccard_reward": 0.05081177328247577, + "rewards/refuse_rate_reward": -0.0012041170615702868, + "step": 4650 + }, + { + "completion_length": 18.4125, + "epoch": 0.3728, + "grad_norm": 5.947014331817627, + "kl": 0.46923684179782865, + "learning_rate": 7.093008742777253e-06, + "loss": -0.0711, + "reward": 0.05330326999537647, + "reward_std": 0.022331652790308, + "rewards/ddi_reward": -0.0077977514709346, + "rewards/jaccard_reward": 0.053578443638980386, + "rewards/refuse_rate_reward": -0.0013758680783212186, + "step": 4660 + }, + { + "completion_length": 20.9390625, + "epoch": 0.3736, + "grad_norm": 4.435387134552002, + "kl": 0.4175998978316784, + "learning_rate": 7.081403961302007e-06, + "loss": -0.0754, + "reward": 0.05928326821886003, + "reward_std": 0.024610456731170415, + "rewards/ddi_reward": -0.005465707357507199, + "rewards/jaccard_reward": 0.059542939183302224, + "rewards/refuse_rate_reward": -0.0012983630993403494, + "step": 4670 + }, + { + "completion_length": 26.5453125, + "epoch": 0.3744, + "grad_norm": 1.1599231958389282, + "kl": 0.4506212718784809, + "learning_rate": 7.069785601518524e-06, + "loss": -0.048, + "reward": 0.06387505205348135, + "reward_std": 0.022657937416806818, + "rewards/ddi_reward": -0.00102208947064355, + "rewards/jaccard_reward": 0.06390109248459339, + "rewards/refuse_rate_reward": -0.00013020833721384407, + "step": 4680 + }, + { + "completion_length": 24.1078125, + "epoch": 0.3752, + "grad_norm": 12.011680603027344, + "kl": 0.5368737943470479, + "learning_rate": 7.058153739220681e-06, + "loss": -0.0561, + "reward": 0.06477405657060445, + "reward_std": 0.028009354742243885, + "rewards/ddi_reward": -0.007952675473643468, + "rewards/jaccard_reward": 0.06567238054703921, + "rewards/refuse_rate_reward": -0.004491623863577843, + "step": 4690 + }, + { + "completion_length": 27.1765625, + "epoch": 0.376, + "grad_norm": 2.4326512813568115, + "kl": 0.3825482726097107, + "learning_rate": 7.046508450290434e-06, + "loss": -0.0544, + "reward": 0.06843514936044812, + "reward_std": 0.027179496921598913, + "rewards/ddi_reward": -0.013304078113287687, + "rewards/jaccard_reward": 0.06900719669647515, + "rewards/refuse_rate_reward": -0.0028602431062608956, + "step": 4700 + }, + { + "completion_length": 27.4578125, + "epoch": 0.3768, + "grad_norm": 1.6266087293624878, + "kl": 0.4217719718813896, + "learning_rate": 7.034849810697339e-06, + "loss": -0.0677, + "reward": 0.07374142229091377, + "reward_std": 0.027172363130375743, + "rewards/ddi_reward": -0.003111345018260181, + "rewards/jaccard_reward": 0.07455865542870015, + "rewards/refuse_rate_reward": -0.004086174257099629, + "step": 4710 + }, + { + "completion_length": 27.5625, + "epoch": 0.3776, + "grad_norm": 5.78209924697876, + "kl": 0.35999170430004596, + "learning_rate": 7.023177896498038e-06, + "loss": -0.0439, + "reward": 0.06086941345129162, + "reward_std": 0.02714906111359596, + "rewards/ddi_reward": 0.0037184332904871553, + "rewards/jaccard_reward": 0.06169278002344072, + "rewards/refuse_rate_reward": -0.004116838099434972, + "step": 4720 + }, + { + "completion_length": 22.3578125, + "epoch": 0.3784, + "grad_norm": 2.0929644107818604, + "kl": 0.5260403856635094, + "learning_rate": 7.011492783835772e-06, + "loss": -0.0614, + "reward": 0.06533048474229872, + "reward_std": 0.023818599246442317, + "rewards/ddi_reward": 0.0011984170021605677, + "rewards/jaccard_reward": 0.06592572275549173, + "rewards/refuse_rate_reward": -0.002976190485060215, + "step": 4730 + }, + { + "completion_length": 24.565625, + "epoch": 0.3792, + "grad_norm": 2.7280445098876953, + "kl": 0.45082471519708633, + "learning_rate": 6.999794548939891e-06, + "loss": -0.0657, + "reward": 0.0742401505354792, + "reward_std": 0.02607360212132335, + "rewards/ddi_reward": -0.0007983187679201365, + "rewards/jaccard_reward": 0.07446795417927206, + "rewards/refuse_rate_reward": -0.0011390129220671952, + "step": 4740 + }, + { + "completion_length": 26.653125, + "epoch": 0.38, + "grad_norm": 3.3679769039154053, + "kl": 0.4710536912083626, + "learning_rate": 6.988083268125342e-06, + "loss": -0.0476, + "reward": 0.06189166205003858, + "reward_std": 0.026304505253210663, + "rewards/ddi_reward": -0.009997599327471107, + "rewards/jaccard_reward": 0.06214197999797762, + "rewards/refuse_rate_reward": -0.0012515782844275236, + "step": 4750 + }, + { + "completion_length": 25.2296875, + "epoch": 0.3808, + "grad_norm": 3.604426622390747, + "kl": 0.44166252836585046, + "learning_rate": 6.976359017792181e-06, + "loss": -0.0603, + "reward": 0.0700279972050339, + "reward_std": 0.025948231993243098, + "rewards/ddi_reward": -0.01030623980332166, + "rewards/jaccard_reward": 0.070356988965068, + "rewards/refuse_rate_reward": -0.0016449652728624641, + "step": 4760 + }, + { + "completion_length": 23.8734375, + "epoch": 0.3816, + "grad_norm": 3.2883059978485107, + "kl": 0.44430881440639497, + "learning_rate": 6.964621874425077e-06, + "loss": -0.0529, + "reward": 0.05587598541169427, + "reward_std": 0.023383550019934772, + "rewards/ddi_reward": -0.002856587854330428, + "rewards/jaccard_reward": 0.05660192957147956, + "rewards/refuse_rate_reward": -0.0036297296290285887, + "step": 4770 + }, + { + "completion_length": 20.7015625, + "epoch": 0.3824, + "grad_norm": 2.800508737564087, + "kl": 0.4535280793905258, + "learning_rate": 6.952871914592801e-06, + "loss": -0.0483, + "reward": 0.0501835897564888, + "reward_std": 0.021309690037742256, + "rewards/ddi_reward": -0.0012146503780968488, + "rewards/jaccard_reward": 0.050264080776832996, + "rewards/refuse_rate_reward": -0.0004024621332064271, + "step": 4780 + }, + { + "completion_length": 21.375, + "epoch": 0.3832, + "grad_norm": 2.065145492553711, + "kl": 0.43252764493227003, + "learning_rate": 6.941109214947738e-06, + "loss": -0.0743, + "reward": 0.06490348787046969, + "reward_std": 0.023336734622716904, + "rewards/ddi_reward": -0.006426441477378831, + "rewards/jaccard_reward": 0.06530967527069151, + "rewards/refuse_rate_reward": -0.0020309399580582975, + "step": 4790 + }, + { + "completion_length": 24.4390625, + "epoch": 0.384, + "grad_norm": 2.5033445358276367, + "kl": 0.49276462495326995, + "learning_rate": 6.929333852225381e-06, + "loss": -0.0563, + "reward": 0.06130686346441507, + "reward_std": 0.026181075908243655, + "rewards/ddi_reward": -0.0009976322297006845, + "rewards/jaccard_reward": 0.0615523656073492, + "rewards/refuse_rate_reward": -0.0012275094632059336, + "step": 4800 + }, + { + "completion_length": 21.3, + "epoch": 0.3848, + "grad_norm": 2.4213180541992188, + "kl": 0.414698701351881, + "learning_rate": 6.9175459032438375e-06, + "loss": -0.0606, + "reward": 0.04904086315073073, + "reward_std": 0.023317658761516213, + "rewards/ddi_reward": -0.001578994641022291, + "rewards/jaccard_reward": 0.049215714121237396, + "rewards/refuse_rate_reward": -0.00087425597012043, + "step": 4810 + }, + { + "completion_length": 25.040625, + "epoch": 0.3856, + "grad_norm": 2.064443826675415, + "kl": 0.42126192077994346, + "learning_rate": 6.905745444903315e-06, + "loss": -0.059, + "reward": 0.06255180556327104, + "reward_std": 0.024905221490189432, + "rewards/ddi_reward": -0.005564945354126394, + "rewards/jaccard_reward": 0.06281966203823686, + "rewards/refuse_rate_reward": -0.0013392857159487902, + "step": 4820 + }, + { + "completion_length": 21.53125, + "epoch": 0.3864, + "grad_norm": 3.011622905731201, + "kl": 0.42758476734161377, + "learning_rate": 6.893932554185633e-06, + "loss": -0.0579, + "reward": 0.062999829929322, + "reward_std": 0.021218989649787544, + "rewards/ddi_reward": -0.007640923833241687, + "rewards/jaccard_reward": 0.06326300418004394, + "rewards/refuse_rate_reward": -0.0013158701069187373, + "step": 4830 + }, + { + "completion_length": 22.64375, + "epoch": 0.3872, + "grad_norm": 5.893509387969971, + "kl": 0.7076126545667648, + "learning_rate": 6.882107308153714e-06, + "loss": -0.0664, + "reward": 0.06222544563934207, + "reward_std": 0.02926969910040498, + "rewards/ddi_reward": -0.008899013162590563, + "rewards/jaccard_reward": 0.06224497677758336, + "rewards/refuse_rate_reward": -9.765625e-05, + "step": 4840 + }, + { + "completion_length": 24.1203125, + "epoch": 0.388, + "grad_norm": 1.4024691581726074, + "kl": 0.4475990541279316, + "learning_rate": 6.870269783951078e-06, + "loss": -0.0331, + "reward": 0.04708757591433823, + "reward_std": 0.024556503107305616, + "rewards/ddi_reward": -0.004624242895806674, + "rewards/jaccard_reward": 0.047644892567768694, + "rewards/refuse_rate_reward": -0.002786588459275663, + "step": 4850 + }, + { + "completion_length": 24.309375, + "epoch": 0.3888, + "grad_norm": 1.9319877624511719, + "kl": 0.4537143364548683, + "learning_rate": 6.8584200588013524e-06, + "loss": -0.0633, + "reward": 0.06330885766074061, + "reward_std": 0.02794992383569479, + "rewards/ddi_reward": -0.002787484280997887, + "rewards/jaccard_reward": 0.06367170535959303, + "rewards/refuse_rate_reward": -0.0018142360728234052, + "step": 4860 + }, + { + "completion_length": 24.0203125, + "epoch": 0.3896, + "grad_norm": 1.268160104751587, + "kl": 0.4155892215669155, + "learning_rate": 6.84655821000775e-06, + "loss": -0.025, + "reward": 0.04838104941882193, + "reward_std": 0.027315820474177598, + "rewards/ddi_reward": -0.0019488547637593, + "rewards/jaccard_reward": 0.04901707274839282, + "rewards/refuse_rate_reward": -0.0031801183242350815, + "step": 4870 + }, + { + "completion_length": 24.0, + "epoch": 0.3904, + "grad_norm": 1.621381163597107, + "kl": 0.6335779130458832, + "learning_rate": 6.834684314952578e-06, + "loss": -0.0507, + "reward": 0.061322068050503734, + "reward_std": 0.02686582747846842, + "rewards/ddi_reward": -0.010087816917803138, + "rewards/jaccard_reward": 0.061598931346088646, + "rewards/refuse_rate_reward": -0.0013843201915733517, + "step": 4880 + }, + { + "completion_length": 28.58125, + "epoch": 0.3912, + "grad_norm": 1.516948938369751, + "kl": 0.39191055446863177, + "learning_rate": 6.822798451096728e-06, + "loss": -0.0596, + "reward": 0.0664059491828084, + "reward_std": 0.024911895208060742, + "rewards/ddi_reward": -0.0046284014941193165, + "rewards/jaccard_reward": 0.06677649603225291, + "rewards/refuse_rate_reward": -0.0018527349457144737, + "step": 4890 + }, + { + "completion_length": 24.375, + "epoch": 0.392, + "grad_norm": 2.245201826095581, + "kl": 0.9064103491604328, + "learning_rate": 6.810900695979173e-06, + "loss": -0.0118, + "reward": 0.04517674231901765, + "reward_std": 0.026818851311691105, + "rewards/ddi_reward": -0.001768001919845119, + "rewards/jaccard_reward": 0.04784638467244804, + "rewards/refuse_rate_reward": -0.013348214456345885, + "step": 4900 + }, + { + "completion_length": 24.4421875, + "epoch": 0.3928, + "grad_norm": 2.517580032348633, + "kl": 0.4773964747786522, + "learning_rate": 6.798991127216455e-06, + "loss": -0.0434, + "reward": 0.05136877465993166, + "reward_std": 0.023520084144547582, + "rewards/ddi_reward": -0.0046075609396211805, + "rewards/jaccard_reward": 0.05167018335778266, + "rewards/refuse_rate_reward": -0.0015070445835590363, + "step": 4910 + }, + { + "completion_length": 25.65625, + "epoch": 0.3936, + "grad_norm": 2.0672175884246826, + "kl": 0.6584254458546639, + "learning_rate": 6.787069822502194e-06, + "loss": -0.0714, + "reward": 0.07491923857014626, + "reward_std": 0.026495365239679814, + "rewards/ddi_reward": 0.0023396297823637725, + "rewards/jaccard_reward": 0.07552861256990581, + "rewards/refuse_rate_reward": -0.0030468750162981452, + "step": 4920 + }, + { + "completion_length": 24.4046875, + "epoch": 0.3944, + "grad_norm": 3.24208927154541, + "kl": 0.48026110902428626, + "learning_rate": 6.77513685960656e-06, + "loss": -0.0486, + "reward": 0.0578880897257477, + "reward_std": 0.023842297215014695, + "rewards/ddi_reward": -0.003632215887773782, + "rewards/jaccard_reward": 0.0583842258900404, + "rewards/refuse_rate_reward": -0.002480683452449739, + "step": 4930 + }, + { + "completion_length": 24.84375, + "epoch": 0.3952, + "grad_norm": 1.5318622589111328, + "kl": 0.46440194845199584, + "learning_rate": 6.763192316375784e-06, + "loss": -0.0367, + "reward": 0.05903365621343255, + "reward_std": 0.02350117340683937, + "rewards/ddi_reward": -0.007118824421195313, + "rewards/jaccard_reward": 0.05965209435671568, + "rewards/refuse_rate_reward": -0.0030921943369321527, + "step": 4940 + }, + { + "completion_length": 22.43125, + "epoch": 0.396, + "grad_norm": 3.844261646270752, + "kl": 0.41935104206204415, + "learning_rate": 6.751236270731638e-06, + "loss": -0.0653, + "reward": 0.06943052606657148, + "reward_std": 0.02307703534606844, + "rewards/ddi_reward": -0.005361492745578289, + "rewards/jaccard_reward": 0.06972844917327166, + "rewards/refuse_rate_reward": -0.0014896271110046654, + "step": 4950 + }, + { + "completion_length": 27.36875, + "epoch": 0.3968, + "grad_norm": 2.613508462905884, + "kl": 0.4192417249083519, + "learning_rate": 6.739268800670938e-06, + "loss": -0.0675, + "reward": 0.07910770904272794, + "reward_std": 0.02532946695573628, + "rewards/ddi_reward": -0.012125634025142062, + "rewards/jaccard_reward": 0.07937187869101762, + "rewards/refuse_rate_reward": -0.001320847775787115, + "step": 4960 + }, + { + "completion_length": 25.5625, + "epoch": 0.3976, + "grad_norm": 2.018786668777466, + "kl": 0.4077054843306541, + "learning_rate": 6.727289984265019e-06, + "loss": -0.0569, + "reward": 0.06120823707897216, + "reward_std": 0.022857160912826656, + "rewards/ddi_reward": -0.008526828524190932, + "rewards/jaccard_reward": 0.062174946838058534, + "rewards/refuse_rate_reward": -0.004833549028262496, + "step": 4970 + }, + { + "completion_length": 24.6484375, + "epoch": 0.3984, + "grad_norm": 1.0322526693344116, + "kl": 0.4985127247869968, + "learning_rate": 6.7152998996592445e-06, + "loss": -0.0455, + "reward": 0.06145512079820037, + "reward_std": 0.024092452507466076, + "rewards/ddi_reward": -0.011673035827698186, + "rewards/jaccard_reward": 0.06186755020171404, + "rewards/refuse_rate_reward": -0.002062156645115465, + "step": 4980 + }, + { + "completion_length": 23.3765625, + "epoch": 0.3992, + "grad_norm": 2.3893649578094482, + "kl": 0.49801395162940026, + "learning_rate": 6.703298625072482e-06, + "loss": -0.0361, + "reward": 0.04687255712924525, + "reward_std": 0.026009664172306657, + "rewards/ddi_reward": -0.007761400140589103, + "rewards/jaccard_reward": 0.04833920360542834, + "rewards/refuse_rate_reward": -0.007333226827904582, + "step": 4990 + }, + { + "completion_length": 20.49375, + "epoch": 0.4, + "grad_norm": 1.8487073183059692, + "kl": 0.5141971245408058, + "learning_rate": 6.6912862387966015e-06, + "loss": -0.0665, + "reward": 0.06368977804668248, + "reward_std": 0.022686888976022602, + "rewards/ddi_reward": -0.0036307050788309423, + "rewards/jaccard_reward": 0.06389325666241348, + "rewards/refuse_rate_reward": -0.0010173897608183324, + "step": 5000 + }, + { + "completion_length": 24.815625, + "epoch": 0.4008, + "grad_norm": 5.38779878616333, + "kl": 0.5174560263752938, + "learning_rate": 6.67926281919596e-06, + "loss": -0.0391, + "reward": 0.05746736850123853, + "reward_std": 0.022584606939926744, + "rewards/ddi_reward": -0.011224595014937222, + "rewards/jaccard_reward": 0.05794860615860671, + "rewards/refuse_rate_reward": -0.002406183339189738, + "step": 5010 + }, + { + "completion_length": 21.9140625, + "epoch": 0.4016, + "grad_norm": 4.189798355102539, + "kl": 0.4732095241546631, + "learning_rate": 6.667228444706891e-06, + "loss": -0.0649, + "reward": 0.061576416436582807, + "reward_std": 0.017980804084800184, + "rewards/ddi_reward": -0.005888753966428339, + "rewards/jaccard_reward": 0.06180111789144575, + "rewards/refuse_rate_reward": -0.0011235119425691664, + "step": 5020 + }, + { + "completion_length": 20.38125, + "epoch": 0.4024, + "grad_norm": 2.8333826065063477, + "kl": 0.495409969240427, + "learning_rate": 6.655183193837197e-06, + "loss": -0.0569, + "reward": 0.05978780118748546, + "reward_std": 0.027408759901300074, + "rewards/ddi_reward": -0.006718852135236375, + "rewards/jaccard_reward": 0.05994653357192874, + "rewards/refuse_rate_reward": -0.0007936508161947131, + "step": 5030 + }, + { + "completion_length": 20.8328125, + "epoch": 0.4032, + "grad_norm": 2.7691938877105713, + "kl": 0.5348535932600498, + "learning_rate": 6.643127145165628e-06, + "loss": -0.0483, + "reward": 0.051897828769870105, + "reward_std": 0.019304519845172764, + "rewards/ddi_reward": -0.006479200097965076, + "rewards/jaccard_reward": 0.05240137252258137, + "rewards/refuse_rate_reward": -0.00251771881012246, + "step": 5040 + }, + { + "completion_length": 24.6078125, + "epoch": 0.404, + "grad_norm": 0.9759635329246521, + "kl": 0.521192030608654, + "learning_rate": 6.631060377341382e-06, + "loss": -0.0648, + "reward": 0.07253303555771709, + "reward_std": 0.02112937537021935, + "rewards/ddi_reward": -0.007923029028461315, + "rewards/jaccard_reward": 0.0732238681986928, + "rewards/refuse_rate_reward": -0.0034541630418971183, + "step": 5050 + }, + { + "completion_length": 24.7796875, + "epoch": 0.4048, + "grad_norm": 1.4546983242034912, + "kl": 0.47762149572372437, + "learning_rate": 6.618982969083579e-06, + "loss": -0.0442, + "reward": 0.06208335948176682, + "reward_std": 0.022538114828057586, + "rewards/ddi_reward": -0.008282135555054993, + "rewards/jaccard_reward": 0.062485032994300124, + "rewards/refuse_rate_reward": -0.002008364978246391, + "step": 5060 + }, + { + "completion_length": 21.7296875, + "epoch": 0.4056, + "grad_norm": 3.703130006790161, + "kl": 0.5238255754113197, + "learning_rate": 6.6068949991807555e-06, + "loss": -0.0664, + "reward": 0.06518286047503352, + "reward_std": 0.02381889154203236, + "rewards/ddi_reward": -0.00589017825841438, + "rewards/jaccard_reward": 0.06545606162399054, + "rewards/refuse_rate_reward": -0.0013660037773661315, + "step": 5070 + }, + { + "completion_length": 22.54375, + "epoch": 0.4064, + "grad_norm": 1.634771704673767, + "kl": 0.47357811257243154, + "learning_rate": 6.594796546490351e-06, + "loss": -0.0582, + "reward": 0.0650988802779466, + "reward_std": 0.021435034368187188, + "rewards/ddi_reward": 0.0032984539400786163, + "rewards/jaccard_reward": 0.06608815374784172, + "rewards/refuse_rate_reward": -0.004946371540427208, + "step": 5080 + }, + { + "completion_length": 22.4140625, + "epoch": 0.4072, + "grad_norm": 2.545024871826172, + "kl": 0.4842130243778229, + "learning_rate": 6.582687689938185e-06, + "loss": -0.0726, + "reward": 0.06447117780335247, + "reward_std": 0.03146414067596197, + "rewards/ddi_reward": 0.002608145424164832, + "rewards/jaccard_reward": 0.06538303061388433, + "rewards/refuse_rate_reward": -0.004559264553245157, + "step": 5090 + }, + { + "completion_length": 24.63125, + "epoch": 0.408, + "grad_norm": 2.932321786880493, + "kl": 0.5521106481552124, + "learning_rate": 6.570568508517951e-06, + "loss": -0.069, + "reward": 0.07402197690680623, + "reward_std": 0.025257916655391455, + "rewards/ddi_reward": -0.004488944495096803, + "rewards/jaccard_reward": 0.0745191243942827, + "rewards/refuse_rate_reward": -0.0024857390788383784, + "step": 5100 + }, + { + "completion_length": 22.2140625, + "epoch": 0.4088, + "grad_norm": 2.047551155090332, + "kl": 0.4144434303045273, + "learning_rate": 6.558439081290702e-06, + "loss": -0.0432, + "reward": 0.04808359046000987, + "reward_std": 0.0186899229651317, + "rewards/ddi_reward": -0.007796873804181814, + "rewards/jaccard_reward": 0.04808359011076391, + "rewards/refuse_rate_reward": 0.0, + "step": 5110 + }, + { + "completion_length": 25.4515625, + "epoch": 0.4096, + "grad_norm": 1.7031850814819336, + "kl": 0.4026504881680012, + "learning_rate": 6.546299487384323e-06, + "loss": -0.0495, + "reward": 0.0635294334962964, + "reward_std": 0.024706838373094796, + "rewards/ddi_reward": -0.012460077300784179, + "rewards/jaccard_reward": 0.06366408867761493, + "rewards/refuse_rate_reward": -0.0006732677225954831, + "step": 5120 + }, + { + "completion_length": 23.68125, + "epoch": 0.4104, + "grad_norm": 2.31990385055542, + "kl": 0.5000673286616802, + "learning_rate": 6.5341498059930275e-06, + "loss": -0.0474, + "reward": 0.060229612281546, + "reward_std": 0.024271931080147625, + "rewards/ddi_reward": -0.00751569964340888, + "rewards/jaccard_reward": 0.06074525276198983, + "rewards/refuse_rate_reward": -0.0025781987700611354, + "step": 5130 + }, + { + "completion_length": 21.8109375, + "epoch": 0.4112, + "grad_norm": 1.9126131534576416, + "kl": 0.3941440314054489, + "learning_rate": 6.521990116376836e-06, + "loss": -0.0157, + "reward": 0.04806815227493644, + "reward_std": 0.02349648205563426, + "rewards/ddi_reward": -0.00291942156618461, + "rewards/jaccard_reward": 0.04862571367993951, + "rewards/refuse_rate_reward": -0.0027878111461177467, + "step": 5140 + }, + { + "completion_length": 21.74375, + "epoch": 0.412, + "grad_norm": 2.7517213821411133, + "kl": 0.46768858954310416, + "learning_rate": 6.5098204978610565e-06, + "loss": -0.0236, + "reward": 0.041630987729877236, + "reward_std": 0.021640120772644877, + "rewards/ddi_reward": -0.010950652335304767, + "rewards/jaccard_reward": 0.04209924223832786, + "rewards/refuse_rate_reward": -0.002341269876342267, + "step": 5150 + }, + { + "completion_length": 25.83125, + "epoch": 0.4128, + "grad_norm": 1.2679941654205322, + "kl": 0.5191069155931473, + "learning_rate": 6.497641029835775e-06, + "loss": -0.052, + "reward": 0.06704415306448937, + "reward_std": 0.02444509891793132, + "rewards/ddi_reward": -0.008084014907944947, + "rewards/jaccard_reward": 0.06726240925490856, + "rewards/refuse_rate_reward": -0.0010912698518950493, + "step": 5160 + }, + { + "completion_length": 27.634375, + "epoch": 0.4136, + "grad_norm": 2.293553590774536, + "kl": 0.45491471141576767, + "learning_rate": 6.485451791755323e-06, + "loss": -0.0473, + "reward": 0.0708262083120644, + "reward_std": 0.024748293031007052, + "rewards/ddi_reward": -0.007852168928366154, + "rewards/jaccard_reward": 0.07105724681168794, + "rewards/refuse_rate_reward": -0.0011551903095096349, + "step": 5170 + }, + { + "completion_length": 23.29375, + "epoch": 0.4144, + "grad_norm": 1.4285218715667725, + "kl": 0.5539902880787849, + "learning_rate": 6.473252863137778e-06, + "loss": -0.0768, + "reward": 0.06656536147929729, + "reward_std": 0.025625863624736667, + "rewards/ddi_reward": -0.009714428240840789, + "rewards/jaccard_reward": 0.06675388370640575, + "rewards/refuse_rate_reward": -0.0009426062286365777, + "step": 5180 + }, + { + "completion_length": 26.440625, + "epoch": 0.4152, + "grad_norm": 4.780998706817627, + "kl": 0.4272093154489994, + "learning_rate": 6.461044323564425e-06, + "loss": -0.064, + "reward": 0.06513437060639263, + "reward_std": 0.02723626382648945, + "rewards/ddi_reward": -0.01322274025878869, + "rewards/jaccard_reward": 0.06565206404775381, + "rewards/refuse_rate_reward": -0.002588468382600695, + "step": 5190 + }, + { + "completion_length": 26.553125, + "epoch": 0.416, + "grad_norm": 3.0445775985717773, + "kl": 0.5035666033625603, + "learning_rate": 6.4488262526792585e-06, + "loss": -0.0701, + "reward": 0.07228520684875547, + "reward_std": 0.027060097269713878, + "rewards/ddi_reward": -0.01150637530372478, + "rewards/jaccard_reward": 0.07265637516975403, + "rewards/refuse_rate_reward": -0.0018558351788669825, + "step": 5200 + }, + { + "completion_length": 27.421875, + "epoch": 0.4168, + "grad_norm": 3.2199110984802246, + "kl": 0.5058070033788681, + "learning_rate": 6.4365987301884426e-06, + "loss": -0.0628, + "reward": 0.07256874185986817, + "reward_std": 0.026956348307430746, + "rewards/ddi_reward": -0.00692408587783575, + "rewards/jaccard_reward": 0.07336273756809533, + "rewards/refuse_rate_reward": -0.003969987761229277, + "step": 5210 + }, + { + "completion_length": 24.93125, + "epoch": 0.4176, + "grad_norm": 2.778681516647339, + "kl": 0.5170176684856415, + "learning_rate": 6.424361835859803e-06, + "loss": -0.0629, + "reward": 0.07029617261141538, + "reward_std": 0.027329100016504525, + "rewards/ddi_reward": -0.004607315803878009, + "rewards/jaccard_reward": 0.07079997677356005, + "rewards/refuse_rate_reward": -0.0025190146174281836, + "step": 5220 + }, + { + "completion_length": 25.33125, + "epoch": 0.4184, + "grad_norm": 3.2480356693267822, + "kl": 0.4871144235134125, + "learning_rate": 6.412115649522305e-06, + "loss": -0.0463, + "reward": 0.05558372689411044, + "reward_std": 0.027316735964268445, + "rewards/ddi_reward": -0.003499528096290305, + "rewards/jaccard_reward": 0.05617797349113971, + "rewards/refuse_rate_reward": -0.0029712302377447484, + "step": 5230 + }, + { + "completion_length": 25.9640625, + "epoch": 0.4192, + "grad_norm": 0.9959756135940552, + "kl": 0.45893699377775193, + "learning_rate": 6.399860251065533e-06, + "loss": -0.0729, + "reward": 0.07162775173783302, + "reward_std": 0.02378947949036956, + "rewards/ddi_reward": -0.0035726239206269384, + "rewards/jaccard_reward": 0.07210754959378392, + "rewards/refuse_rate_reward": -0.00239898992003873, + "step": 5240 + }, + { + "completion_length": 24.9296875, + "epoch": 0.42, + "grad_norm": 2.4256465435028076, + "kl": 0.47619090527296065, + "learning_rate": 6.387595720439162e-06, + "loss": -0.076, + "reward": 0.06844368316233158, + "reward_std": 0.024747014185413717, + "rewards/ddi_reward": -0.003197651426307857, + "rewards/jaccard_reward": 0.06875997937750071, + "rewards/refuse_rate_reward": -0.0015814928221516312, + "step": 5250 + }, + { + "completion_length": 24.821875, + "epoch": 0.4208, + "grad_norm": 3.0007448196411133, + "kl": 0.4673247069120407, + "learning_rate": 6.375322137652447e-06, + "loss": -0.1106, + "reward": 0.06625811231788248, + "reward_std": 0.027250483771786092, + "rewards/ddi_reward": -0.009253621746029238, + "rewards/jaccard_reward": 0.06700544109335169, + "rewards/refuse_rate_reward": -0.0037366509670391677, + "step": 5260 + }, + { + "completion_length": 26.9796875, + "epoch": 0.4216, + "grad_norm": 5.414450168609619, + "kl": 0.5100875183939934, + "learning_rate": 6.363039582773696e-06, + "loss": -0.0407, + "reward": 0.06104331915266812, + "reward_std": 0.023097265139222146, + "rewards/ddi_reward": -0.0069910963153233755, + "rewards/jaccard_reward": 0.06129525187425315, + "rewards/refuse_rate_reward": -0.0012596670072525739, + "step": 5270 + }, + { + "completion_length": 22.1484375, + "epoch": 0.4224, + "grad_norm": 2.5436484813690186, + "kl": 0.5326015748083591, + "learning_rate": 6.350748135929744e-06, + "loss": -0.0384, + "reward": 0.048380835261195895, + "reward_std": 0.021378714079037308, + "rewards/ddi_reward": 0.006537750503048301, + "rewards/jaccard_reward": 0.04865812739590183, + "rewards/refuse_rate_reward": -0.0013864651322364808, + "step": 5280 + }, + { + "completion_length": 26.6078125, + "epoch": 0.4232, + "grad_norm": 2.561167001724243, + "kl": 0.4877026692032814, + "learning_rate": 6.338447877305436e-06, + "loss": -0.0405, + "reward": 0.06980415333528071, + "reward_std": 0.02251637475565076, + "rewards/ddi_reward": 0.00493436950200703, + "rewards/jaccard_reward": 0.07008035264443606, + "rewards/refuse_rate_reward": -0.0013809975003823637, + "step": 5290 + }, + { + "completion_length": 22.165625, + "epoch": 0.424, + "grad_norm": 1.6320656538009644, + "kl": 0.5155673280358315, + "learning_rate": 6.3261388871431016e-06, + "loss": -0.039, + "reward": 0.056829247437417504, + "reward_std": 0.023772699804976583, + "rewards/ddi_reward": -0.006955909391399473, + "rewards/jaccard_reward": 0.05700733487028629, + "rewards/refuse_rate_reward": -0.0008904333575628698, + "step": 5300 + }, + { + "completion_length": 24.46875, + "epoch": 0.4248, + "grad_norm": 1.549864411354065, + "kl": 0.5133664324879647, + "learning_rate": 6.313821245742034e-06, + "loss": -0.052, + "reward": 0.05673738610930741, + "reward_std": 0.025423013744875787, + "rewards/ddi_reward": -0.009960926455096341, + "rewards/jaccard_reward": 0.05732595562003553, + "rewards/refuse_rate_reward": -0.0029428491136059163, + "step": 5310 + }, + { + "completion_length": 24.828125, + "epoch": 0.4256, + "grad_norm": 4.606925010681152, + "kl": 0.4638635091483593, + "learning_rate": 6.301495033457959e-06, + "loss": -0.055, + "reward": 0.06232795584946871, + "reward_std": 0.024230857472866775, + "rewards/ddi_reward": -0.00894749879371375, + "rewards/jaccard_reward": 0.06254583494737745, + "rewards/refuse_rate_reward": -0.0010894097620621323, + "step": 5320 + }, + { + "completion_length": 24.9796875, + "epoch": 0.4264, + "grad_norm": 5.061791896820068, + "kl": 0.5162159994244575, + "learning_rate": 6.289160330702522e-06, + "loss": -0.0442, + "reward": 0.06211809567175806, + "reward_std": 0.02338571585714817, + "rewards/ddi_reward": -0.008875997044378891, + "rewards/jaccard_reward": 0.062412206642329694, + "rewards/refuse_rate_reward": -0.001470553409308195, + "step": 5330 + }, + { + "completion_length": 22.4453125, + "epoch": 0.4272, + "grad_norm": 3.1046204566955566, + "kl": 0.46947559230029584, + "learning_rate": 6.2768172179427475e-06, + "loss": -0.0236, + "reward": 0.038143709042924454, + "reward_std": 0.025015958212316038, + "rewards/ddi_reward": -0.008474622218636796, + "rewards/jaccard_reward": 0.03869856287492439, + "rewards/refuse_rate_reward": -0.0027742722304537892, + "step": 5340 + }, + { + "completion_length": 25.5078125, + "epoch": 0.428, + "grad_norm": 2.2629241943359375, + "kl": 0.514503188431263, + "learning_rate": 6.264465775700537e-06, + "loss": -0.0493, + "reward": 0.06085688499733806, + "reward_std": 0.02513847891241312, + "rewards/ddi_reward": -0.008703241313924082, + "rewards/jaccard_reward": 0.06110514972824603, + "rewards/refuse_rate_reward": -0.0012413194519467652, + "step": 5350 + }, + { + "completion_length": 24.4, + "epoch": 0.4288, + "grad_norm": 2.408179998397827, + "kl": 0.45110048949718473, + "learning_rate": 6.252106084552117e-06, + "loss": -0.0409, + "reward": 0.052235545194707814, + "reward_std": 0.02413225336931646, + "rewards/ddi_reward": -0.012523101398255677, + "rewards/jaccard_reward": 0.0523397120879963, + "rewards/refuse_rate_reward": -0.0005208333488553763, + "step": 5360 + }, + { + "completion_length": 24.9796875, + "epoch": 0.4296, + "grad_norm": 2.1440682411193848, + "kl": 0.5382043033838272, + "learning_rate": 6.2397382251275376e-06, + "loss": -0.0469, + "reward": 0.06518610557541252, + "reward_std": 0.02311951927258633, + "rewards/ddi_reward": -0.0059037288534455005, + "rewards/jaccard_reward": 0.06578731783665717, + "rewards/refuse_rate_reward": -0.0030060650780797003, + "step": 5370 + }, + { + "completion_length": 26.796875, + "epoch": 0.4304, + "grad_norm": 3.446985960006714, + "kl": 0.5132034584879875, + "learning_rate": 6.2273622781101275e-06, + "loss": -0.0244, + "reward": 0.04925171788781881, + "reward_std": 0.02869924558326602, + "rewards/ddi_reward": -0.007766539184376598, + "rewards/jaccard_reward": 0.04981386447325349, + "rewards/refuse_rate_reward": -0.0028107388177886607, + "step": 5380 + }, + { + "completion_length": 24.396875, + "epoch": 0.4312, + "grad_norm": 2.395159959793091, + "kl": 0.6506419748067855, + "learning_rate": 6.214978324235982e-06, + "loss": -0.0205, + "reward": 0.05231028948910534, + "reward_std": 0.026163920154795052, + "rewards/ddi_reward": -0.005474000202957541, + "rewards/jaccard_reward": 0.05301590990275144, + "rewards/refuse_rate_reward": -0.0035280996467918157, + "step": 5390 + }, + { + "completion_length": 24.375, + "epoch": 0.432, + "grad_norm": 1.6034551858901978, + "kl": 0.46464487016201017, + "learning_rate": 6.202586444293426e-06, + "loss": -0.058, + "reward": 0.06790939783677459, + "reward_std": 0.02462999161798507, + "rewards/ddi_reward": -0.0032356692536268383, + "rewards/jaccard_reward": 0.06815479090437293, + "rewards/refuse_rate_reward": -0.0012269631668459624, + "step": 5400 + }, + { + "completion_length": 21.9234375, + "epoch": 0.4328, + "grad_norm": 2.3141396045684814, + "kl": 0.43823581114411353, + "learning_rate": 6.190186719122493e-06, + "loss": -0.0683, + "reward": 0.06528603970073163, + "reward_std": 0.02041883028578013, + "rewards/ddi_reward": -0.008956225565634668, + "rewards/jaccard_reward": 0.06597353878314607, + "rewards/refuse_rate_reward": -0.0034375001210719346, + "step": 5410 + }, + { + "completion_length": 25.753125, + "epoch": 0.4336, + "grad_norm": 2.2972936630249023, + "kl": 0.49055708944797516, + "learning_rate": 6.177779229614398e-06, + "loss": -0.0571, + "reward": 0.07106593865901231, + "reward_std": 0.028330434951931237, + "rewards/ddi_reward": -0.008122880442533642, + "rewards/jaccard_reward": 0.07145627420395613, + "rewards/refuse_rate_reward": -0.0019516782951541245, + "step": 5420 + }, + { + "completion_length": 21.96875, + "epoch": 0.4344, + "grad_norm": 1.90984046459198, + "kl": 0.5056957051157951, + "learning_rate": 6.165364056711003e-06, + "loss": -0.0265, + "reward": 0.04619229744421318, + "reward_std": 0.021848199283704163, + "rewards/ddi_reward": -0.01394189948332496, + "rewards/jaccard_reward": 0.04632250512950122, + "rewards/refuse_rate_reward": -0.0006510416744276882, + "step": 5430 + }, + { + "completion_length": 25.8328125, + "epoch": 0.4352, + "grad_norm": 1.1177639961242676, + "kl": 0.456753883510828, + "learning_rate": 6.152941281404294e-06, + "loss": -0.0516, + "reward": 0.06672234665602446, + "reward_std": 0.02294736597687006, + "rewards/ddi_reward": -0.013585805345792323, + "rewards/jaccard_reward": 0.06705550736514851, + "rewards/refuse_rate_reward": -0.0016658181557431817, + "step": 5440 + }, + { + "completion_length": 25.990625, + "epoch": 0.436, + "grad_norm": 3.5611488819122314, + "kl": 0.5425142124295235, + "learning_rate": 6.140510984735859e-06, + "loss": -0.0535, + "reward": 0.062569042108953, + "reward_std": 0.02387990013230592, + "rewards/ddi_reward": -0.005616942142660264, + "rewards/jaccard_reward": 0.0632630247157067, + "rewards/refuse_rate_reward": -0.0034699123934842646, + "step": 5450 + }, + { + "completion_length": 24.859375, + "epoch": 0.4368, + "grad_norm": 3.863586902618408, + "kl": 0.4316178157925606, + "learning_rate": 6.1280732477963455e-06, + "loss": -0.067, + "reward": 0.06616913224570453, + "reward_std": 0.020035488647408783, + "rewards/ddi_reward": -0.009287729510106146, + "rewards/jaccard_reward": 0.0662767696660012, + "rewards/refuse_rate_reward": -0.0005381944589316845, + "step": 5460 + }, + { + "completion_length": 30.5765625, + "epoch": 0.4376, + "grad_norm": 1.4780603647232056, + "kl": 0.44519276171922684, + "learning_rate": 6.11562815172494e-06, + "loss": -0.0592, + "reward": 0.0765171171631664, + "reward_std": 0.027342139510437846, + "rewards/ddi_reward": -0.009856060100719332, + "rewards/jaccard_reward": 0.07776500832405872, + "rewards/refuse_rate_reward": -0.006239464529789985, + "step": 5470 + }, + { + "completion_length": 29.9328125, + "epoch": 0.4384, + "grad_norm": 1.607317328453064, + "kl": 0.41677802354097365, + "learning_rate": 6.103175777708838e-06, + "loss": -0.0349, + "reward": 0.06058637611567974, + "reward_std": 0.025977352727204563, + "rewards/ddi_reward": -0.008793008694192394, + "rewards/jaccard_reward": 0.061059889988973735, + "rewards/refuse_rate_reward": -0.0023675651696976273, + "step": 5480 + }, + { + "completion_length": 26.7125, + "epoch": 0.4392, + "grad_norm": 1.6488953828811646, + "kl": 0.43719795495271685, + "learning_rate": 6.090716206982714e-06, + "loss": -0.0257, + "reward": 0.05091107878834009, + "reward_std": 0.02299737762659788, + "rewards/ddi_reward": -0.009995373903075233, + "rewards/jaccard_reward": 0.05110578266903758, + "rewards/refuse_rate_reward": -0.0009735128318425268, + "step": 5490 + }, + { + "completion_length": 25.353125, + "epoch": 0.44, + "grad_norm": 2.6681177616119385, + "kl": 0.528657253086567, + "learning_rate": 6.078249520828192e-06, + "loss": -0.0257, + "reward": 0.04820033023133874, + "reward_std": 0.028616708097979428, + "rewards/ddi_reward": -0.014156875250046141, + "rewards/jaccard_reward": 0.04839824860682711, + "rewards/refuse_rate_reward": -0.0009895833558402956, + "step": 5500 + }, + { + "completion_length": 25.834375, + "epoch": 0.4408, + "grad_norm": 2.8462488651275635, + "kl": 0.4159099653363228, + "learning_rate": 6.065775800573312e-06, + "loss": -0.0411, + "reward": 0.055605961149558424, + "reward_std": 0.025405291700735688, + "rewards/ddi_reward": -0.003572146888473071, + "rewards/jaccard_reward": 0.05570752341300249, + "rewards/refuse_rate_reward": -0.0005078125046566129, + "step": 5510 + }, + { + "completion_length": 24.590625, + "epoch": 0.4416, + "grad_norm": 1.6491575241088867, + "kl": 0.44692535549402235, + "learning_rate": 6.053295127592004e-06, + "loss": -0.0746, + "reward": 0.07779781967401504, + "reward_std": 0.025905431481078267, + "rewards/ddi_reward": -0.002875912401941605, + "rewards/jaccard_reward": 0.07833761693909765, + "rewards/refuse_rate_reward": -0.0026989937061443923, + "step": 5520 + }, + { + "completion_length": 25.05, + "epoch": 0.4424, + "grad_norm": 1.358824372291565, + "kl": 0.46066590398550034, + "learning_rate": 6.040807583303552e-06, + "loss": -0.066, + "reward": 0.07731485245749355, + "reward_std": 0.02668250212445855, + "rewards/ddi_reward": -0.015110299445223064, + "rewards/jaccard_reward": 0.07756094671785832, + "rewards/refuse_rate_reward": -0.0012304687523283064, + "step": 5530 + }, + { + "completion_length": 24.6578125, + "epoch": 0.4432, + "grad_norm": 1.8305268287658691, + "kl": 0.4080900102853775, + "learning_rate": 6.028313249172072e-06, + "loss": -0.0442, + "reward": 0.061552715580910446, + "reward_std": 0.02312248256057501, + "rewards/ddi_reward": -0.0112359672319144, + "rewards/jaccard_reward": 0.0617544905282557, + "rewards/refuse_rate_reward": -0.0010088679264299572, + "step": 5540 + }, + { + "completion_length": 23.6359375, + "epoch": 0.444, + "grad_norm": 2.073066473007202, + "kl": 0.3958142623305321, + "learning_rate": 6.015812206705967e-06, + "loss": -0.057, + "reward": 0.06630591074936092, + "reward_std": 0.026932776533067228, + "rewards/ddi_reward": -0.0069102816836675626, + "rewards/jaccard_reward": 0.06644219486042857, + "rewards/refuse_rate_reward": -0.0006814236170612275, + "step": 5550 + }, + { + "completion_length": 21.63125, + "epoch": 0.4448, + "grad_norm": 2.5214993953704834, + "kl": 0.4926064252853394, + "learning_rate": 6.003304537457408e-06, + "loss": -0.0576, + "reward": 0.053522751899436116, + "reward_std": 0.02240333630470559, + "rewards/ddi_reward": -0.007105690322350711, + "rewards/jaccard_reward": 0.05356739558046684, + "rewards/refuse_rate_reward": -0.00022321429569274187, + "step": 5560 + }, + { + "completion_length": 20.1421875, + "epoch": 0.4456, + "grad_norm": 2.9928834438323975, + "kl": 0.46164785251021384, + "learning_rate": 5.990790323021793e-06, + "loss": -0.0464, + "reward": 0.04566268748603761, + "reward_std": 0.02240771371871233, + "rewards/ddi_reward": -0.0034208840923383834, + "rewards/jaccard_reward": 0.04602541036438197, + "rewards/refuse_rate_reward": -0.0018136161146685482, + "step": 5570 + }, + { + "completion_length": 22.7015625, + "epoch": 0.4464, + "grad_norm": 2.4130282402038574, + "kl": 0.476196426153183, + "learning_rate": 5.978269645037223e-06, + "loss": -0.0392, + "reward": 0.05085597652941942, + "reward_std": 0.020173613191582264, + "rewards/ddi_reward": 0.0005548212204303127, + "rewards/jaccard_reward": 0.051221067737787965, + "rewards/refuse_rate_reward": -0.0018254533177241683, + "step": 5580 + }, + { + "completion_length": 22.6515625, + "epoch": 0.4472, + "grad_norm": 1.5184675455093384, + "kl": 0.4526237979531288, + "learning_rate": 5.9657425851839604e-06, + "loss": -0.0581, + "reward": 0.06254659723490477, + "reward_std": 0.028367548063397408, + "rewards/ddi_reward": -0.0019924297113902865, + "rewards/jaccard_reward": 0.06296748002059757, + "rewards/refuse_rate_reward": -0.0021044146735221146, + "step": 5590 + }, + { + "completion_length": 22.19375, + "epoch": 0.448, + "grad_norm": 1.599055528640747, + "kl": 0.5934031203389167, + "learning_rate": 5.953209225183899e-06, + "loss": -0.034, + "reward": 0.0508936611469835, + "reward_std": 0.02224164349026978, + "rewards/ddi_reward": -0.003276975842891261, + "rewards/jaccard_reward": 0.051139702799264344, + "rewards/refuse_rate_reward": -0.0012302150949835776, + "step": 5600 + }, + { + "completion_length": 22.1671875, + "epoch": 0.4488, + "grad_norm": 7.192503452301025, + "kl": 0.47870202735066414, + "learning_rate": 5.940669646800041e-06, + "loss": -0.0612, + "reward": 0.059080766327679154, + "reward_std": 0.0240185406524688, + "rewards/ddi_reward": -0.003599869378376752, + "rewards/jaccard_reward": 0.059237989317625764, + "rewards/refuse_rate_reward": -0.0007861149497330189, + "step": 5610 + }, + { + "completion_length": 23.6453125, + "epoch": 0.4496, + "grad_norm": 2.170820474624634, + "kl": 0.5801210470497609, + "learning_rate": 5.9281239318359475e-06, + "loss": -0.0318, + "reward": 0.05764435511082411, + "reward_std": 0.0215236758813262, + "rewards/ddi_reward": -0.008363074882072396, + "rewards/jaccard_reward": 0.058032377160270696, + "rewards/refuse_rate_reward": -0.0019401042023673653, + "step": 5620 + }, + { + "completion_length": 22.8109375, + "epoch": 0.4504, + "grad_norm": 3.935621976852417, + "kl": 0.45059762001037595, + "learning_rate": 5.915572162135212e-06, + "loss": -0.0605, + "reward": 0.06621391866356134, + "reward_std": 0.02155483071692288, + "rewards/ddi_reward": -0.00886317502008751, + "rewards/jaccard_reward": 0.06722954292781651, + "rewards/refuse_rate_reward": -0.005078125139698386, + "step": 5630 + }, + { + "completion_length": 23.5765625, + "epoch": 0.4512, + "grad_norm": 2.115232467651367, + "kl": 0.5822868749499321, + "learning_rate": 5.903014419580933e-06, + "loss": -0.0325, + "reward": 0.05420938949100673, + "reward_std": 0.02267206225078553, + "rewards/ddi_reward": -0.009646855969913304, + "rewards/jaccard_reward": 0.05451644503045827, + "rewards/refuse_rate_reward": -0.00153527467045933, + "step": 5640 + }, + { + "completion_length": 20.35625, + "epoch": 0.452, + "grad_norm": 8.8477783203125, + "kl": 0.5090240344405175, + "learning_rate": 5.890450786095169e-06, + "loss": -0.0501, + "reward": 0.05086080431938171, + "reward_std": 0.02207764140330255, + "rewards/ddi_reward": -0.003284724937111605, + "rewards/jaccard_reward": 0.05119190440746024, + "rewards/refuse_rate_reward": -0.0016555059468373657, + "step": 5650 + }, + { + "completion_length": 21.0484375, + "epoch": 0.4528, + "grad_norm": 3.353484869003296, + "kl": 0.47795771062374115, + "learning_rate": 5.877881343638408e-06, + "loss": -0.0572, + "reward": 0.06508579056244343, + "reward_std": 0.02426075404509902, + "rewards/ddi_reward": -0.00755782688211184, + "rewards/jaccard_reward": 0.06533642068970949, + "rewards/refuse_rate_reward": -0.001253156561870128, + "step": 5660 + }, + { + "completion_length": 21.69375, + "epoch": 0.4536, + "grad_norm": 1.5907663106918335, + "kl": 0.506357941031456, + "learning_rate": 5.865306174209035e-06, + "loss": -0.049, + "reward": 0.05112798176705837, + "reward_std": 0.0210716369561851, + "rewards/ddi_reward": -0.011404778831638396, + "rewards/jaccard_reward": 0.051127982418984176, + "rewards/refuse_rate_reward": 0.0, + "step": 5670 + }, + { + "completion_length": 20.903125, + "epoch": 0.4544, + "grad_norm": 3.436236619949341, + "kl": 0.5641608566045762, + "learning_rate": 5.8527253598428e-06, + "loss": -0.063, + "reward": 0.0628491597250104, + "reward_std": 0.027976021030917765, + "rewards/ddi_reward": 0.004770744999404997, + "rewards/jaccard_reward": 0.0630760956555605, + "rewards/refuse_rate_reward": -0.0011346726445481182, + "step": 5680 + }, + { + "completion_length": 22.3171875, + "epoch": 0.4552, + "grad_norm": 3.1585593223571777, + "kl": 0.5483339369297028, + "learning_rate": 5.84013898261227e-06, + "loss": -0.0148, + "reward": 0.04822512024547905, + "reward_std": 0.02552981358021498, + "rewards/ddi_reward": -0.013246434459870216, + "rewards/jaccard_reward": 0.04861574531532824, + "rewards/refuse_rate_reward": -0.001953125, + "step": 5690 + }, + { + "completion_length": 22.153125, + "epoch": 0.456, + "grad_norm": 1.5913722515106201, + "kl": 0.5159244693815708, + "learning_rate": 5.82754712462631e-06, + "loss": -0.0503, + "reward": 0.058845136500895026, + "reward_std": 0.026761250710114835, + "rewards/ddi_reward": -0.006097089196555316, + "rewards/jaccard_reward": 0.05931723543908447, + "rewards/refuse_rate_reward": -0.0023604910937137904, + "step": 5700 + }, + { + "completion_length": 28.6953125, + "epoch": 0.4568, + "grad_norm": 2.2807116508483887, + "kl": 0.49215307608246806, + "learning_rate": 5.8149498680295336e-06, + "loss": -0.0491, + "reward": 0.06768626151606441, + "reward_std": 0.023789847316220403, + "rewards/ddi_reward": -0.008297525631496683, + "rewards/jaccard_reward": 0.06796443276107311, + "rewards/refuse_rate_reward": -0.0013908617896959186, + "step": 5710 + }, + { + "completion_length": 27.353125, + "epoch": 0.4576, + "grad_norm": 2.507427215576172, + "kl": 0.4633881613612175, + "learning_rate": 5.802347295001774e-06, + "loss": -0.0478, + "reward": 0.0677155314013362, + "reward_std": 0.023790690023452042, + "rewards/ddi_reward": -0.009995834017172456, + "rewards/jaccard_reward": 0.06837763166986406, + "rewards/refuse_rate_reward": -0.00331050471868366, + "step": 5720 + }, + { + "completion_length": 20.278125, + "epoch": 0.4584, + "grad_norm": 1.736864686012268, + "kl": 0.5358931481838226, + "learning_rate": 5.789739487757551e-06, + "loss": -0.081, + "reward": 0.06702345001976937, + "reward_std": 0.02756262249313295, + "rewards/ddi_reward": -0.010403246694477276, + "rewards/jaccard_reward": 0.06749219986377283, + "rewards/refuse_rate_reward": -0.002343750023283064, + "step": 5730 + }, + { + "completion_length": 20.9296875, + "epoch": 0.4592, + "grad_norm": 4.671194553375244, + "kl": 0.4968462139368057, + "learning_rate": 5.777126528545528e-06, + "loss": -0.0676, + "reward": 0.06939554661512375, + "reward_std": 0.023164224112406373, + "rewards/ddi_reward": 5.646706849802285e-05, + "rewards/jaccard_reward": 0.06939554531127215, + "rewards/refuse_rate_reward": 0.0, + "step": 5740 + }, + { + "completion_length": 19.89375, + "epoch": 0.46, + "grad_norm": 1.5807188749313354, + "kl": 0.43431409150362016, + "learning_rate": 5.764508499647976e-06, + "loss": -0.0427, + "reward": 0.04462694409303367, + "reward_std": 0.01707464570645243, + "rewards/ddi_reward": 0.0024806205488857812, + "rewards/jaccard_reward": 0.044839617010438815, + "rewards/refuse_rate_reward": -0.0010633680503815413, + "step": 5750 + }, + { + "completion_length": 22.7765625, + "epoch": 0.4608, + "grad_norm": 8.192351341247559, + "kl": 0.49223894625902176, + "learning_rate": 5.751885483380238e-06, + "loss": -0.0531, + "reward": 0.06031082109548151, + "reward_std": 0.022132088104262947, + "rewards/ddi_reward": -0.004500839280080981, + "rewards/jaccard_reward": 0.06052783525665291, + "rewards/refuse_rate_reward": -0.0010850695078261196, + "step": 5760 + }, + { + "completion_length": 22.2703125, + "epoch": 0.4616, + "grad_norm": 1.6460305452346802, + "kl": 0.4877010300755501, + "learning_rate": 5.7392575620902e-06, + "loss": -0.0616, + "reward": 0.06618448081426323, + "reward_std": 0.023307499289512635, + "rewards/ddi_reward": -0.0027034214173909275, + "rewards/jaccard_reward": 0.06630724809365347, + "rewards/refuse_rate_reward": -0.0006138392724096775, + "step": 5770 + }, + { + "completion_length": 24.5453125, + "epoch": 0.4624, + "grad_norm": 1.898838996887207, + "kl": 0.575176177918911, + "learning_rate": 5.726624818157737e-06, + "loss": -0.0396, + "reward": 0.05476091708987951, + "reward_std": 0.02157174814492464, + "rewards/ddi_reward": -0.006626635638531298, + "rewards/jaccard_reward": 0.05504954466596246, + "rewards/refuse_rate_reward": -0.001443142385687679, + "step": 5780 + }, + { + "completion_length": 23.2703125, + "epoch": 0.4632, + "grad_norm": 3.3632688522338867, + "kl": 0.527679231762886, + "learning_rate": 5.7139873339941934e-06, + "loss": -0.0658, + "reward": 0.07464771885424852, + "reward_std": 0.025667449389584363, + "rewards/ddi_reward": -0.0028249514929484577, + "rewards/jaccard_reward": 0.07534737633541226, + "rewards/refuse_rate_reward": -0.00349828137550503, + "step": 5790 + }, + { + "completion_length": 26.5328125, + "epoch": 0.464, + "grad_norm": 2.071760654449463, + "kl": 0.491928306221962, + "learning_rate": 5.70134519204183e-06, + "loss": -0.0449, + "reward": 0.05944228838197887, + "reward_std": 0.02346959481947124, + "rewards/ddi_reward": -0.00794330007629469, + "rewards/jaccard_reward": 0.06027229907922447, + "rewards/refuse_rate_reward": -0.004150055756326765, + "step": 5800 + }, + { + "completion_length": 27.20625, + "epoch": 0.4648, + "grad_norm": 3.1301615238189697, + "kl": 0.5283231653273106, + "learning_rate": 5.6886984747733e-06, + "loss": -0.0448, + "reward": 0.06485925647430121, + "reward_std": 0.03117725607007742, + "rewards/ddi_reward": -0.00846056927985046, + "rewards/jaccard_reward": 0.06515208296477795, + "rewards/refuse_rate_reward": -0.0014641283662058413, + "step": 5810 + }, + { + "completion_length": 27.075, + "epoch": 0.4656, + "grad_norm": 2.3326539993286133, + "kl": 0.48133454769849776, + "learning_rate": 5.676047264691098e-06, + "loss": -0.0665, + "reward": 0.07373797738109715, + "reward_std": 0.028246072120964527, + "rewards/ddi_reward": -0.0062002391292480755, + "rewards/jaccard_reward": 0.07445581159554422, + "rewards/refuse_rate_reward": -0.0035891675914172085, + "step": 5820 + }, + { + "completion_length": 25.140625, + "epoch": 0.4664, + "grad_norm": 1.9613667726516724, + "kl": 0.4851860962808132, + "learning_rate": 5.663391644327032e-06, + "loss": -0.0592, + "reward": 0.07213568994775414, + "reward_std": 0.02525409827940166, + "rewards/ddi_reward": -0.002557901630643755, + "rewards/jaccard_reward": 0.07251193067058921, + "rewards/refuse_rate_reward": -0.0018812004011124373, + "step": 5830 + }, + { + "completion_length": 27.240625, + "epoch": 0.4672, + "grad_norm": 2.8992466926574707, + "kl": 0.5345324747264385, + "learning_rate": 5.650731696241681e-06, + "loss": -0.0569, + "reward": 0.07413897099904716, + "reward_std": 0.02554952255450189, + "rewards/ddi_reward": -0.0033973959623835983, + "rewards/jaccard_reward": 0.07437645238824189, + "rewards/refuse_rate_reward": -0.0011873998446390033, + "step": 5840 + }, + { + "completion_length": 28.2296875, + "epoch": 0.468, + "grad_norm": 1.943688154220581, + "kl": 0.48720556795597075, + "learning_rate": 5.638067503023854e-06, + "loss": -0.0636, + "reward": 0.07482774942182005, + "reward_std": 0.025214561773464084, + "rewards/ddi_reward": -0.010657080402597784, + "rewards/jaccard_reward": 0.07568230107426643, + "rewards/refuse_rate_reward": -0.004272749868687242, + "step": 5850 + }, + { + "completion_length": 23.94375, + "epoch": 0.4688, + "grad_norm": 1.427697777748108, + "kl": 0.5331411004066468, + "learning_rate": 5.625399147290058e-06, + "loss": -0.0441, + "reward": 0.05461634453386068, + "reward_std": 0.023784565879032014, + "rewards/ddi_reward": -0.008793245331617072, + "rewards/jaccard_reward": 0.054865183960646394, + "rewards/refuse_rate_reward": -0.0012441912549547851, + "step": 5860 + }, + { + "completion_length": 25.109375, + "epoch": 0.4696, + "grad_norm": 2.3583176136016846, + "kl": 0.512057289481163, + "learning_rate": 5.612726711683949e-06, + "loss": -0.0387, + "reward": 0.06082488500978798, + "reward_std": 0.023545705573633315, + "rewards/ddi_reward": 0.0008907268405891955, + "rewards/jaccard_reward": 0.06157046472653747, + "rewards/refuse_rate_reward": -0.0037279031530488282, + "step": 5870 + }, + { + "completion_length": 25.98125, + "epoch": 0.4704, + "grad_norm": 1.6658458709716797, + "kl": 0.479242754727602, + "learning_rate": 5.600050278875802e-06, + "loss": -0.0487, + "reward": 0.06712786350399255, + "reward_std": 0.022887288127094507, + "rewards/ddi_reward": -0.007746895559830591, + "rewards/jaccard_reward": 0.06755699152126908, + "rewards/refuse_rate_reward": -0.0021456473506987093, + "step": 5880 + }, + { + "completion_length": 23.428125, + "epoch": 0.4712, + "grad_norm": 3.1711695194244385, + "kl": 0.43459360376000405, + "learning_rate": 5.587369931561968e-06, + "loss": -0.0529, + "reward": 0.05897147622890771, + "reward_std": 0.026655631116591395, + "rewards/ddi_reward": -0.01056353560416028, + "rewards/jaccard_reward": 0.05941939235199243, + "rewards/refuse_rate_reward": -0.0022395833395421503, + "step": 5890 + }, + { + "completion_length": 22.83125, + "epoch": 0.472, + "grad_norm": 1.59918212890625, + "kl": 0.5144707694649696, + "learning_rate": 5.5746857524643335e-06, + "loss": -0.0433, + "reward": 0.05900983726605773, + "reward_std": 0.019185346178710462, + "rewards/ddi_reward": -0.0069387565366923806, + "rewards/jaccard_reward": 0.05942673282697797, + "rewards/refuse_rate_reward": -0.0020844781247433275, + "step": 5900 + }, + { + "completion_length": 20.334375, + "epoch": 0.4728, + "grad_norm": 2.9236738681793213, + "kl": 0.60022601634264, + "learning_rate": 5.5619978243297855e-06, + "loss": -0.0372, + "reward": 0.05767725892364979, + "reward_std": 0.022687561414204537, + "rewards/ddi_reward": -6.532040424644947e-05, + "rewards/jaccard_reward": 0.05790005000308156, + "rewards/refuse_rate_reward": -0.0011139576556161046, + "step": 5910 + }, + { + "completion_length": 27.446875, + "epoch": 0.4736, + "grad_norm": 2.262749671936035, + "kl": 0.5491224937140942, + "learning_rate": 5.549306229929664e-06, + "loss": -0.0504, + "reward": 0.06376605490222573, + "reward_std": 0.0297078687697649, + "rewards/ddi_reward": -0.007270385336596519, + "rewards/jaccard_reward": 0.06443354294169694, + "rewards/refuse_rate_reward": -0.0033374438527971504, + "step": 5920 + }, + { + "completion_length": 26.5890625, + "epoch": 0.4744, + "grad_norm": 2.0202763080596924, + "kl": 0.5682420559227467, + "learning_rate": 5.536611052059231e-06, + "loss": -0.0512, + "reward": 0.06161219731438905, + "reward_std": 0.029484274610877038, + "rewards/ddi_reward": -0.00871595358767081, + "rewards/jaccard_reward": 0.061959946097340436, + "rewards/refuse_rate_reward": -0.0017387396655976772, + "step": 5930 + }, + { + "completion_length": 28.640625, + "epoch": 0.4752, + "grad_norm": 2.015613079071045, + "kl": 0.4681591145694256, + "learning_rate": 5.523912373537119e-06, + "loss": -0.0435, + "reward": 0.06342230178415775, + "reward_std": 0.024930863222107292, + "rewards/ddi_reward": -0.00807138541713357, + "rewards/jaccard_reward": 0.06414142742287368, + "rewards/refuse_rate_reward": -0.003595627495087683, + "step": 5940 + }, + { + "completion_length": 20.9296875, + "epoch": 0.476, + "grad_norm": 3.4609735012054443, + "kl": 0.4944312460720539, + "learning_rate": 5.511210277204805e-06, + "loss": -0.0627, + "reward": 0.05416892059147358, + "reward_std": 0.02360522486269474, + "rewards/ddi_reward": -0.0027702317980583756, + "rewards/jaccard_reward": 0.054495308391051364, + "rewards/refuse_rate_reward": -0.001631944440305233, + "step": 5950 + }, + { + "completion_length": 25.69375, + "epoch": 0.4768, + "grad_norm": 4.0075907707214355, + "kl": 0.5519361592829227, + "learning_rate": 5.498504845926057e-06, + "loss": 0.0378, + "reward": 0.034533752035349606, + "reward_std": 0.02743733236566186, + "rewards/ddi_reward": -0.0066110166371800005, + "rewards/jaccard_reward": 0.035288954712450506, + "rewards/refuse_rate_reward": -0.0037760098930448294, + "step": 5960 + }, + { + "completion_length": 25.434375, + "epoch": 0.4776, + "grad_norm": 1.872504711151123, + "kl": 0.5532846838235855, + "learning_rate": 5.485796162586403e-06, + "loss": -0.0475, + "reward": 0.06042863903567195, + "reward_std": 0.023458380065858365, + "rewards/ddi_reward": 0.004390113987028599, + "rewards/jaccard_reward": 0.06097703874111175, + "rewards/refuse_rate_reward": -0.0027420019963756205, + "step": 5970 + }, + { + "completion_length": 20.359375, + "epoch": 0.4784, + "grad_norm": 1.4290579557418823, + "kl": 0.5619689658284187, + "learning_rate": 5.473084310092581e-06, + "loss": -0.0534, + "reward": 0.0524887939915061, + "reward_std": 0.020526262605562805, + "rewards/ddi_reward": -0.008252669750072527, + "rewards/jaccard_reward": 0.053241768712177874, + "rewards/refuse_rate_reward": -0.003764881077222526, + "step": 5980 + }, + { + "completion_length": 27.246875, + "epoch": 0.4792, + "grad_norm": 1.5238196849822998, + "kl": 0.4977162152528763, + "learning_rate": 5.46036937137201e-06, + "loss": -0.0664, + "reward": 0.08044731933623553, + "reward_std": 0.02782729845494032, + "rewards/ddi_reward": -0.01066084296326153, + "rewards/jaccard_reward": 0.08103806069120764, + "rewards/refuse_rate_reward": -0.002953722304664552, + "step": 5990 + }, + { + "completion_length": 26.4265625, + "epoch": 0.48, + "grad_norm": 13.056252479553223, + "kl": 0.45370248034596444, + "learning_rate": 5.447651429372238e-06, + "loss": -0.0545, + "reward": 0.06849884828552603, + "reward_std": 0.0257082412019372, + "rewards/ddi_reward": -0.0054086741642095145, + "rewards/jaccard_reward": 0.06863459823653102, + "rewards/refuse_rate_reward": -0.000678752682870254, + "step": 6000 + }, + { + "completion_length": 24.8578125, + "epoch": 0.4808, + "grad_norm": 1.7385427951812744, + "kl": 0.3925068944692612, + "learning_rate": 5.434930567060404e-06, + "loss": -0.0424, + "reward": 0.051114298147149385, + "reward_std": 0.02765309817623347, + "rewards/ddi_reward": -0.010482138178485912, + "rewards/jaccard_reward": 0.05155714155407622, + "rewards/refuse_rate_reward": -0.002214218117296696, + "step": 6010 + }, + { + "completion_length": 24.096875, + "epoch": 0.4816, + "grad_norm": 13.568134307861328, + "kl": 0.5246622107923031, + "learning_rate": 5.422206867422705e-06, + "loss": -0.0385, + "reward": 0.04752338277176023, + "reward_std": 0.02238847189582884, + "rewards/ddi_reward": -0.011297387938247994, + "rewards/jaccard_reward": 0.048018259555101396, + "rewards/refuse_rate_reward": -0.00247438769438304, + "step": 6020 + }, + { + "completion_length": 23.8703125, + "epoch": 0.4824, + "grad_norm": 1.1531639099121094, + "kl": 0.48382900804281237, + "learning_rate": 5.409480413463839e-06, + "loss": -0.0408, + "reward": 0.057252436480484906, + "reward_std": 0.02340046470053494, + "rewards/ddi_reward": -0.00859138680389151, + "rewards/jaccard_reward": 0.057733268523588774, + "rewards/refuse_rate_reward": -0.0024041648488491774, + "step": 6030 + }, + { + "completion_length": 27.09375, + "epoch": 0.4832, + "grad_norm": 2.893909215927124, + "kl": 0.4715189814567566, + "learning_rate": 5.396751288206476e-06, + "loss": -0.0628, + "reward": 0.07162342988885939, + "reward_std": 0.02569672742392868, + "rewards/ddi_reward": -0.005200936342589557, + "rewards/jaccard_reward": 0.0721479416359216, + "rewards/refuse_rate_reward": -0.0026225631940178574, + "step": 6040 + }, + { + "completion_length": 23.7625, + "epoch": 0.484, + "grad_norm": 5.616015434265137, + "kl": 0.5278986811637878, + "learning_rate": 5.384019574690716e-06, + "loss": -0.0654, + "reward": 0.06377928010188043, + "reward_std": 0.027768640918657182, + "rewards/ddi_reward": -0.00377223226823844, + "rewards/jaccard_reward": 0.06430417676456272, + "rewards/refuse_rate_reward": -0.002624478412326425, + "step": 6050 + }, + { + "completion_length": 27.1296875, + "epoch": 0.4848, + "grad_norm": 3.0034096240997314, + "kl": 0.44955657571554186, + "learning_rate": 5.37128535597354e-06, + "loss": -0.0581, + "reward": 0.07334388745948672, + "reward_std": 0.024356788396835326, + "rewards/ddi_reward": -0.0010494289745111018, + "rewards/jaccard_reward": 0.07390280729159712, + "rewards/refuse_rate_reward": -0.002794603444635868, + "step": 6060 + }, + { + "completion_length": 22.2703125, + "epoch": 0.4856, + "grad_norm": 2.481381416320801, + "kl": 0.5168855741620064, + "learning_rate": 5.358548715128269e-06, + "loss": -0.0627, + "reward": 0.06099908580072224, + "reward_std": 0.02274960810318589, + "rewards/ddi_reward": -0.005155063723213971, + "rewards/jaccard_reward": 0.06118094415869564, + "rewards/refuse_rate_reward": -0.0009092882042750716, + "step": 6070 + }, + { + "completion_length": 21.646875, + "epoch": 0.4864, + "grad_norm": 1.631284475326538, + "kl": 0.47045900449156763, + "learning_rate": 5.345809735244032e-06, + "loss": -0.0824, + "reward": 0.06423655764665455, + "reward_std": 0.021635168744251133, + "rewards/ddi_reward": -0.004719364794436842, + "rewards/jaccard_reward": 0.06443507461808622, + "rewards/refuse_rate_reward": -0.000992588174995035, + "step": 6080 + }, + { + "completion_length": 23.825, + "epoch": 0.4872, + "grad_norm": 3.8288462162017822, + "kl": 0.5083848029375077, + "learning_rate": 5.333068499425212e-06, + "loss": -0.061, + "reward": 0.06554458409082145, + "reward_std": 0.02634851443581283, + "rewards/ddi_reward": -0.012412904633674771, + "rewards/jaccard_reward": 0.066124054370448, + "rewards/refuse_rate_reward": -0.0028973501292057337, + "step": 6090 + }, + { + "completion_length": 21.990625, + "epoch": 0.488, + "grad_norm": 6.643016338348389, + "kl": 0.6295560956001282, + "learning_rate": 5.3203250907909135e-06, + "loss": -0.0505, + "reward": 0.05929162479005754, + "reward_std": 0.024267062055878343, + "rewards/ddi_reward": -0.0008223663753597066, + "rewards/jaccard_reward": 0.05969492178410292, + "rewards/refuse_rate_reward": -0.0020164818270131946, + "step": 6100 + }, + { + "completion_length": 25.0125, + "epoch": 0.4888, + "grad_norm": 1.0413322448730469, + "kl": 0.45786730796098707, + "learning_rate": 5.307579592474407e-06, + "loss": -0.0484, + "reward": 0.05052102860063314, + "reward_std": 0.02142191332532093, + "rewards/ddi_reward": -0.006552838499192149, + "rewards/jaccard_reward": 0.05128892962820828, + "rewards/refuse_rate_reward": -0.0038395133800804616, + "step": 6110 + }, + { + "completion_length": 22.0359375, + "epoch": 0.4896, + "grad_norm": 4.239551067352295, + "kl": 0.5790748924016953, + "learning_rate": 5.294832087622608e-06, + "loss": -0.0456, + "reward": 0.0527322628069669, + "reward_std": 0.02199742558877915, + "rewards/ddi_reward": -0.006177327877958305, + "rewards/jaccard_reward": 0.052988958125934, + "rewards/refuse_rate_reward": -0.0012834821478463708, + "step": 6120 + }, + { + "completion_length": 25.0578125, + "epoch": 0.4904, + "grad_norm": 4.43096399307251, + "kl": 0.6158456638455391, + "learning_rate": 5.28208265939551e-06, + "loss": -0.0453, + "reward": 0.05786117380484938, + "reward_std": 0.020996566163375974, + "rewards/ddi_reward": -0.008088101167231798, + "rewards/jaccard_reward": 0.05833911132067442, + "rewards/refuse_rate_reward": -0.0023896893486380576, + "step": 6130 + }, + { + "completion_length": 23.9828125, + "epoch": 0.4912, + "grad_norm": 2.4466516971588135, + "kl": 0.5328575931489468, + "learning_rate": 5.269331390965661e-06, + "loss": -0.0528, + "reward": 0.06390995057299734, + "reward_std": 0.02065859141293913, + "rewards/ddi_reward": -0.00887911401805468, + "rewards/jaccard_reward": 0.0648564393632114, + "rewards/refuse_rate_reward": -0.004732434684410691, + "step": 6140 + }, + { + "completion_length": 22.371875, + "epoch": 0.492, + "grad_norm": 2.9098026752471924, + "kl": 0.5141178384423256, + "learning_rate": 5.256578365517609e-06, + "loss": -0.0315, + "reward": 0.05110132291447371, + "reward_std": 0.021528695803135635, + "rewards/ddi_reward": -0.004124518259777688, + "rewards/jaccard_reward": 0.051329390075989065, + "rewards/refuse_rate_reward": -0.0011403338867239654, + "step": 6150 + }, + { + "completion_length": 24.4421875, + "epoch": 0.4928, + "grad_norm": 16.460506439208984, + "kl": 0.5080384023487567, + "learning_rate": 5.24382366624737e-06, + "loss": -0.0529, + "reward": 0.06210186360403895, + "reward_std": 0.02368609542027116, + "rewards/ddi_reward": 0.0024478377046762033, + "rewards/jaccard_reward": 0.062365064211189744, + "rewards/refuse_rate_reward": -0.001316000212682411, + "step": 6160 + }, + { + "completion_length": 21.0796875, + "epoch": 0.4936, + "grad_norm": 1.6279324293136597, + "kl": 0.6325390420854091, + "learning_rate": 5.2310673763618745e-06, + "loss": -0.0368, + "reward": 0.05458187707699835, + "reward_std": 0.022306081745773552, + "rewards/ddi_reward": -0.009632618760224431, + "rewards/jaccard_reward": 0.05534577565267682, + "rewards/refuse_rate_reward": -0.0038195008179172873, + "step": 6170 + }, + { + "completion_length": 22.9109375, + "epoch": 0.4944, + "grad_norm": 1.426849365234375, + "kl": 0.5098595045506954, + "learning_rate": 5.2183095790784325e-06, + "loss": -0.0543, + "reward": 0.06129354839213193, + "reward_std": 0.02500255648046732, + "rewards/ddi_reward": -0.009507938235765323, + "rewards/jaccard_reward": 0.06184231247752905, + "rewards/refuse_rate_reward": -0.0027438214048743246, + "step": 6180 + }, + { + "completion_length": 19.590625, + "epoch": 0.4952, + "grad_norm": 2.749723196029663, + "kl": 0.5706714823842048, + "learning_rate": 5.205550357624183e-06, + "loss": -0.0474, + "reward": 0.047778718266636135, + "reward_std": 0.020883592125028373, + "rewards/ddi_reward": -0.012724375631660224, + "rewards/jaccard_reward": 0.0489431525580585, + "rewards/refuse_rate_reward": -0.005822172621265054, + "step": 6190 + }, + { + "completion_length": 24.3234375, + "epoch": 0.496, + "grad_norm": 6.1558613777160645, + "kl": 0.49277218133211137, + "learning_rate": 5.192789795235563e-06, + "loss": -0.0563, + "reward": 0.06425580205395817, + "reward_std": 0.022654782701283693, + "rewards/ddi_reward": -0.015556803965591826, + "rewards/jaccard_reward": 0.06472381884232163, + "rewards/refuse_rate_reward": -0.0023400861537083983, + "step": 6200 + }, + { + "completion_length": 27.325, + "epoch": 0.4968, + "grad_norm": 2.0898232460021973, + "kl": 0.6019249126315117, + "learning_rate": 5.180027975157752e-06, + "loss": -0.0692, + "reward": 0.07158835669979453, + "reward_std": 0.020912307081744075, + "rewards/ddi_reward": -0.0012968064693268389, + "rewards/jaccard_reward": 0.07206113315187394, + "rewards/refuse_rate_reward": -0.0023638831451535227, + "step": 6210 + }, + { + "completion_length": 24.7796875, + "epoch": 0.4976, + "grad_norm": 3.741673707962036, + "kl": 0.5483842991292477, + "learning_rate": 5.167264980644136e-06, + "loss": -0.0463, + "reward": 0.06459719631820918, + "reward_std": 0.022374802734702826, + "rewards/ddi_reward": -0.005927323549985886, + "rewards/jaccard_reward": 0.06554479319602251, + "rewards/refuse_rate_reward": -0.00473798958119005, + "step": 6220 + }, + { + "completion_length": 20.828125, + "epoch": 0.4984, + "grad_norm": 5.439343452453613, + "kl": 0.5423669785261154, + "learning_rate": 5.154500894955761e-06, + "loss": -0.0277, + "reward": 0.04365191818214953, + "reward_std": 0.023118571704253556, + "rewards/ddi_reward": -0.005873129772953689, + "rewards/jaccard_reward": 0.04422499153297395, + "rewards/refuse_rate_reward": -0.0028653652290813623, + "step": 6230 + }, + { + "completion_length": 26.8796875, + "epoch": 0.4992, + "grad_norm": 2.7032229900360107, + "kl": 0.6270432189106941, + "learning_rate": 5.141735801360793e-06, + "loss": -0.0496, + "reward": 0.06969062583521009, + "reward_std": 0.025715809012763203, + "rewards/ddi_reward": -0.00566001704428345, + "rewards/jaccard_reward": 0.07081379797309637, + "rewards/refuse_rate_reward": -0.005615868559107185, + "step": 6240 + }, + { + "completion_length": 23.015625, + "epoch": 0.5, + "grad_norm": 1.7160470485687256, + "kl": 0.49382374063134193, + "learning_rate": 5.128969783133975e-06, + "loss": -0.0535, + "reward": 0.055869909445755185, + "reward_std": 0.02050423405598849, + "rewards/ddi_reward": -0.007890483525989112, + "rewards/jaccard_reward": 0.05634102828335017, + "rewards/refuse_rate_reward": -0.002355587179772556, + "step": 6250 + }, + { + "completion_length": 24.86875, + "epoch": 0.5008, + "grad_norm": 3.355942726135254, + "kl": 0.529905553162098, + "learning_rate": 5.116202923556076e-06, + "loss": -0.0457, + "reward": 0.06515237614512444, + "reward_std": 0.02240901398472488, + "rewards/ddi_reward": -0.006003839790355414, + "rewards/jaccard_reward": 0.06533336695283651, + "rewards/refuse_rate_reward": -0.0009049479383975267, + "step": 6260 + }, + { + "completion_length": 23.871875, + "epoch": 0.5016, + "grad_norm": 5.493901252746582, + "kl": 0.47462970167398455, + "learning_rate": 5.103435305913361e-06, + "loss": -0.0589, + "reward": 0.061514501366764306, + "reward_std": 0.022465922171249986, + "rewards/ddi_reward": -0.012584945640992374, + "rewards/jaccard_reward": 0.06177929956465959, + "rewards/refuse_rate_reward": -0.001323987008072436, + "step": 6270 + }, + { + "completion_length": 22.5515625, + "epoch": 0.5024, + "grad_norm": 2.598093032836914, + "kl": 0.4884233567863703, + "learning_rate": 5.090667013497034e-06, + "loss": -0.0505, + "reward": 0.06845165528357029, + "reward_std": 0.0253711789380759, + "rewards/ddi_reward": -0.012002932414179667, + "rewards/jaccard_reward": 0.06969991789665073, + "rewards/refuse_rate_reward": -0.006241319421678782, + "step": 6280 + }, + { + "completion_length": 25.034375, + "epoch": 0.5032, + "grad_norm": 1.6193944215774536, + "kl": 0.5044068813323974, + "learning_rate": 5.077898129602705e-06, + "loss": -0.0397, + "reward": 0.05458526983857155, + "reward_std": 0.020771301258355378, + "rewards/ddi_reward": 0.0007279478071723133, + "rewards/jaccard_reward": 0.05481455917470157, + "rewards/refuse_rate_reward": -0.0011464533803518862, + "step": 6290 + }, + { + "completion_length": 28.5546875, + "epoch": 0.504, + "grad_norm": 1.6762906312942505, + "kl": 0.4467997416853905, + "learning_rate": 5.065128737529844e-06, + "loss": -0.0574, + "reward": 0.07404935555532574, + "reward_std": 0.02366983313113451, + "rewards/ddi_reward": -0.01260832599364221, + "rewards/jaccard_reward": 0.07444815817289055, + "rewards/refuse_rate_reward": -0.00199400867568329, + "step": 6300 + }, + { + "completion_length": 27.65, + "epoch": 0.5048, + "grad_norm": 3.0134425163269043, + "kl": 0.4793724797666073, + "learning_rate": 5.05235892058123e-06, + "loss": -0.0426, + "reward": 0.0671910353936255, + "reward_std": 0.023871373990550636, + "rewards/ddi_reward": -0.013686793512897566, + "rewards/jaccard_reward": 0.06783226327970623, + "rewards/refuse_rate_reward": -0.003206129791215062, + "step": 6310 + }, + { + "completion_length": 24.1046875, + "epoch": 0.5056, + "grad_norm": 4.990246772766113, + "kl": 0.5225734859704971, + "learning_rate": 5.0395887620624175e-06, + "loss": -0.0635, + "reward": 0.06942931544035673, + "reward_std": 0.022511632228270174, + "rewards/ddi_reward": 0.0017595025507034733, + "rewards/jaccard_reward": 0.06981560017447919, + "rewards/refuse_rate_reward": -0.0019314236007630825, + "step": 6320 + }, + { + "completion_length": 26.840625, + "epoch": 0.5064, + "grad_norm": 1.4038431644439697, + "kl": 0.561567985266447, + "learning_rate": 5.02681834528119e-06, + "loss": -0.0561, + "reward": 0.0700810918584466, + "reward_std": 0.026936646644026042, + "rewards/ddi_reward": -0.006113642937270925, + "rewards/jaccard_reward": 0.07066755592823029, + "rewards/refuse_rate_reward": -0.00293231361429207, + "step": 6330 + }, + { + "completion_length": 24.1921875, + "epoch": 0.5072, + "grad_norm": 1.8560715913772583, + "kl": 0.4064297780394554, + "learning_rate": 5.014047753547015e-06, + "loss": -0.0665, + "reward": 0.0663526451215148, + "reward_std": 0.024958927556872366, + "rewards/ddi_reward": -0.013940963411005213, + "rewards/jaccard_reward": 0.06703953081741928, + "rewards/refuse_rate_reward": -0.0034344279789365827, + "step": 6340 + }, + { + "completion_length": 21.8984375, + "epoch": 0.508, + "grad_norm": 3.579805374145508, + "kl": 0.3707948237657547, + "learning_rate": 5.001277070170502e-06, + "loss": -0.0503, + "reward": 0.052009007520973684, + "reward_std": 0.023487046058289707, + "rewards/ddi_reward": -0.0014028742909431458, + "rewards/jaccard_reward": 0.05242567444220185, + "rewards/refuse_rate_reward": -0.0020833333721384404, + "step": 6350 + }, + { + "completion_length": 21.5234375, + "epoch": 0.5088, + "grad_norm": 8.254324913024902, + "kl": 0.5970456138253212, + "learning_rate": 4.988506378462855e-06, + "loss": -0.047, + "reward": 0.061713569425046445, + "reward_std": 0.022579300915822387, + "rewards/ddi_reward": -0.004907424945849925, + "rewards/jaccard_reward": 0.06257460163906217, + "rewards/refuse_rate_reward": -0.004305161046795547, + "step": 6360 + }, + { + "completion_length": 24.303125, + "epoch": 0.5096, + "grad_norm": 1.0704234838485718, + "kl": 0.4978997394442558, + "learning_rate": 4.975735761735336e-06, + "loss": -0.0513, + "reward": 0.05945343845523894, + "reward_std": 0.02193365036509931, + "rewards/ddi_reward": -0.006169575484818779, + "rewards/jaccard_reward": 0.05987097397446632, + "rewards/refuse_rate_reward": -0.002087673614732921, + "step": 6370 + }, + { + "completion_length": 24.8859375, + "epoch": 0.5104, + "grad_norm": 2.831535577774048, + "kl": 0.5386984467506408, + "learning_rate": 4.962965303298719e-06, + "loss": -0.0422, + "reward": 0.061190436128526926, + "reward_std": 0.023378275334835052, + "rewards/ddi_reward": -0.0027923662390094252, + "rewards/jaccard_reward": 0.061367891170084475, + "rewards/refuse_rate_reward": -0.0008872768143191933, + "step": 6380 + }, + { + "completion_length": 25.653125, + "epoch": 0.5112, + "grad_norm": 4.688365459442139, + "kl": 0.512549539655447, + "learning_rate": 4.950195086462739e-06, + "loss": -0.0346, + "reward": 0.04994496230501681, + "reward_std": 0.019049208192154766, + "rewards/ddi_reward": -0.005460099372430705, + "rewards/jaccard_reward": 0.050034476176369934, + "rewards/refuse_rate_reward": -0.00044757327996194365, + "step": 6390 + }, + { + "completion_length": 24.1859375, + "epoch": 0.512, + "grad_norm": 2.4040253162384033, + "kl": 0.5295929484069347, + "learning_rate": 4.93742519453556e-06, + "loss": -0.0496, + "reward": 0.0652826872188598, + "reward_std": 0.0258365694899112, + "rewards/ddi_reward": -0.007576000859262422, + "rewards/jaccard_reward": 0.06567078088410198, + "rewards/refuse_rate_reward": -0.001940470573026687, + "step": 6400 + }, + { + "completion_length": 20.3671875, + "epoch": 0.5128, + "grad_norm": 3.356154203414917, + "kl": 0.5422316431999207, + "learning_rate": 4.924655710823231e-06, + "loss": -0.078, + "reward": 0.06576826407108456, + "reward_std": 0.025069005647674204, + "rewards/ddi_reward": -0.009878507271059789, + "rewards/jaccard_reward": 0.06691819168627262, + "rewards/refuse_rate_reward": -0.005749628040939569, + "step": 6410 + }, + { + "completion_length": 23.6828125, + "epoch": 0.5136, + "grad_norm": 3.575666666030884, + "kl": 0.3993518978357315, + "learning_rate": 4.911886718629123e-06, + "loss": -0.0641, + "reward": 0.06684799543581903, + "reward_std": 0.022193212440470234, + "rewards/ddi_reward": -0.0038015561643987895, + "rewards/jaccard_reward": 0.06735223873984068, + "rewards/refuse_rate_reward": -0.002521227300167084, + "step": 6420 + }, + { + "completion_length": 26.5296875, + "epoch": 0.5144, + "grad_norm": 2.542243242263794, + "kl": 0.4518233075737953, + "learning_rate": 4.899118301253417e-06, + "loss": -0.0451, + "reward": 0.04948234334588051, + "reward_std": 0.022936028195545077, + "rewards/ddi_reward": -0.0017651974107138813, + "rewards/jaccard_reward": 0.050101538607850674, + "rewards/refuse_rate_reward": -0.003095970896538347, + "step": 6430 + }, + { + "completion_length": 20.6421875, + "epoch": 0.5152, + "grad_norm": 2.2543046474456787, + "kl": 0.49472799599170686, + "learning_rate": 4.886350541992535e-06, + "loss": -0.0443, + "reward": 0.04778860452352092, + "reward_std": 0.02164558379445225, + "rewards/ddi_reward": -0.004032505772192963, + "rewards/jaccard_reward": 0.04821223578765057, + "rewards/refuse_rate_reward": -0.0021181624906603247, + "step": 6440 + }, + { + "completion_length": 23.7296875, + "epoch": 0.516, + "grad_norm": 2.0596470832824707, + "kl": 0.4908596701920033, + "learning_rate": 4.873583524138606e-06, + "loss": -0.0471, + "reward": 0.055684730783104895, + "reward_std": 0.02199112591333687, + "rewards/ddi_reward": -0.00925710255978629, + "rewards/jaccard_reward": 0.05625020570587367, + "rewards/refuse_rate_reward": -0.0028273810516111554, + "step": 6450 + }, + { + "completion_length": 27.05, + "epoch": 0.5168, + "grad_norm": 1.8182717561721802, + "kl": 0.4184515751898289, + "learning_rate": 4.860817330978925e-06, + "loss": -0.0496, + "reward": 0.06749745858833194, + "reward_std": 0.020789349678670986, + "rewards/ddi_reward": -0.008020547055639326, + "rewards/jaccard_reward": 0.06788190500810742, + "rewards/refuse_rate_reward": -0.0019222358474507927, + "step": 6460 + }, + { + "completion_length": 24.2375, + "epoch": 0.5176, + "grad_norm": 2.943462371826172, + "kl": 0.4905851945281029, + "learning_rate": 4.848052045795408e-06, + "loss": -0.0647, + "reward": 0.07237309024203568, + "reward_std": 0.025346144661307336, + "rewards/ddi_reward": -0.001583480421686545, + "rewards/jaccard_reward": 0.07281341425841674, + "rewards/refuse_rate_reward": -0.0022016265953425317, + "step": 6470 + }, + { + "completion_length": 23.8140625, + "epoch": 0.5184, + "grad_norm": 2.9408228397369385, + "kl": 0.3799766913056374, + "learning_rate": 4.8352877518640425e-06, + "loss": -0.0635, + "reward": 0.06603735396638513, + "reward_std": 0.021496833162382246, + "rewards/ddi_reward": -0.009705776488408447, + "rewards/jaccard_reward": 0.06615905002690851, + "rewards/refuse_rate_reward": -0.00060847356216982, + "step": 6480 + }, + { + "completion_length": 26.225, + "epoch": 0.5192, + "grad_norm": 4.898510456085205, + "kl": 0.4323403887450695, + "learning_rate": 4.822524532454355e-06, + "loss": -0.0509, + "reward": 0.05332894362509251, + "reward_std": 0.020889255637302994, + "rewards/ddi_reward": 0.006219188880641014, + "rewards/jaccard_reward": 0.053899713791906834, + "rewards/refuse_rate_reward": -0.0028538553975522517, + "step": 6490 + }, + { + "completion_length": 24.0328125, + "epoch": 0.52, + "grad_norm": 3.3224589824676514, + "kl": 0.5022544786334038, + "learning_rate": 4.809762470828859e-06, + "loss": -0.0568, + "reward": 0.05938707503955811, + "reward_std": 0.019559148605912922, + "rewards/ddi_reward": -0.014591686526546254, + "rewards/jaccard_reward": 0.05948169108014554, + "rewards/refuse_rate_reward": -0.0004730902845039964, + "step": 6500 + }, + { + "completion_length": 26.1140625, + "epoch": 0.5208, + "grad_norm": 1.4385201930999756, + "kl": 0.5578229203820229, + "learning_rate": 4.797001650242515e-06, + "loss": -0.0606, + "reward": 0.07513490440323949, + "reward_std": 0.021019735373556614, + "rewards/ddi_reward": -0.004163735508336685, + "rewards/jaccard_reward": 0.07545237680897117, + "rewards/refuse_rate_reward": -0.0015873579599428923, + "step": 6510 + }, + { + "completion_length": 24.265625, + "epoch": 0.5216, + "grad_norm": 1.9959070682525635, + "kl": 0.47389940470457076, + "learning_rate": 4.784242153942189e-06, + "loss": -0.045, + "reward": 0.05451723840087652, + "reward_std": 0.0246887129265815, + "rewards/ddi_reward": -0.007736452709650621, + "rewards/jaccard_reward": 0.055219360254704954, + "rewards/refuse_rate_reward": -0.0035106054623611272, + "step": 6520 + }, + { + "completion_length": 25.9984375, + "epoch": 0.5224, + "grad_norm": 5.793447017669678, + "kl": 0.414336171746254, + "learning_rate": 4.77148406516611e-06, + "loss": -0.0435, + "reward": 0.06283827894367278, + "reward_std": 0.023464550613425672, + "rewards/ddi_reward": -0.010458802024368196, + "rewards/jaccard_reward": 0.0633459139498882, + "rewards/refuse_rate_reward": -0.0025381830346304923, + "step": 6530 + }, + { + "completion_length": 25.440625, + "epoch": 0.5232, + "grad_norm": 1.7804234027862549, + "kl": 0.5726325280964375, + "learning_rate": 4.758727467143317e-06, + "loss": -0.0556, + "reward": 0.0698218978010118, + "reward_std": 0.024288109270855784, + "rewards/ddi_reward": -0.0034771251841448246, + "rewards/jaccard_reward": 0.07019247491843998, + "rewards/refuse_rate_reward": -0.0018528808664996176, + "step": 6540 + }, + { + "completion_length": 26.34375, + "epoch": 0.524, + "grad_norm": 1.2255589962005615, + "kl": 0.5248061552643776, + "learning_rate": 4.745972443093131e-06, + "loss": -0.0336, + "reward": 0.06726805008947849, + "reward_std": 0.02328969577793032, + "rewards/ddi_reward": -0.0035023538555833513, + "rewards/jaccard_reward": 0.06807581684552133, + "rewards/refuse_rate_reward": -0.004038843268062919, + "step": 6550 + }, + { + "completion_length": 26.7640625, + "epoch": 0.5248, + "grad_norm": 3.4892144203186035, + "kl": 0.5362689264118672, + "learning_rate": 4.733219076224606e-06, + "loss": -0.0358, + "reward": 0.061017458653077485, + "reward_std": 0.024780596559867262, + "rewards/ddi_reward": -0.006980248657055199, + "rewards/jaccard_reward": 0.061744428076781335, + "rewards/refuse_rate_reward": -0.0036348483932670205, + "step": 6560 + }, + { + "completion_length": 24.225, + "epoch": 0.5256, + "grad_norm": 3.794736862182617, + "kl": 0.5342194095253945, + "learning_rate": 4.720467449735977e-06, + "loss": -0.0637, + "reward": 0.0602798396255821, + "reward_std": 0.020320217567496003, + "rewards/ddi_reward": -0.01055831735720858, + "rewards/jaccard_reward": 0.0605573802953586, + "rewards/refuse_rate_reward": -0.0013877052348107099, + "step": 6570 + }, + { + "completion_length": 24.31875, + "epoch": 0.5264, + "grad_norm": 1.825440764427185, + "kl": 0.4731908835470676, + "learning_rate": 4.707717646814132e-06, + "loss": -0.0476, + "reward": 0.05293502327986062, + "reward_std": 0.021295331045985223, + "rewards/ddi_reward": -0.0024657603760715576, + "rewards/jaccard_reward": 0.05350633964408189, + "rewards/refuse_rate_reward": -0.0028565794229507445, + "step": 6580 + }, + { + "completion_length": 24.75, + "epoch": 0.5272, + "grad_norm": 0.9298772811889648, + "kl": 0.42130014970898627, + "learning_rate": 4.694969750634065e-06, + "loss": -0.0411, + "reward": 0.054377892176853494, + "reward_std": 0.02188604187685996, + "rewards/ddi_reward": -0.00198472497286275, + "rewards/jaccard_reward": 0.055217605200596154, + "rewards/refuse_rate_reward": -0.0041985646821558475, + "step": 6590 + }, + { + "completion_length": 22.8578125, + "epoch": 0.528, + "grad_norm": 3.6675148010253906, + "kl": 0.6081012263894081, + "learning_rate": 4.6822238443583215e-06, + "loss": -0.0425, + "reward": 0.05254110293462873, + "reward_std": 0.019776172330603005, + "rewards/ddi_reward": -0.0016688880627043546, + "rewards/jaccard_reward": 0.05285487109795213, + "rewards/refuse_rate_reward": -0.0015688385639805347, + "step": 6600 + }, + { + "completion_length": 24.16875, + "epoch": 0.5288, + "grad_norm": 2.323430299758911, + "kl": 0.4460606321692467, + "learning_rate": 4.669480011136474e-06, + "loss": -0.0866, + "reward": 0.07709819683805108, + "reward_std": 0.0220068933442235, + "rewards/ddi_reward": -0.007586790079949423, + "rewards/jaccard_reward": 0.07735047182068229, + "rewards/refuse_rate_reward": -0.0012613753555342556, + "step": 6610 + }, + { + "completion_length": 25.1625, + "epoch": 0.5296, + "grad_norm": 1.9898566007614136, + "kl": 0.434166856110096, + "learning_rate": 4.656738334104571e-06, + "loss": -0.0466, + "reward": 0.06335630295798182, + "reward_std": 0.025276265293359756, + "rewards/ddi_reward": -0.005842852607020177, + "rewards/jaccard_reward": 0.06388679028023034, + "rewards/refuse_rate_reward": -0.0026524343877099453, + "step": 6620 + }, + { + "completion_length": 24.909375, + "epoch": 0.5304, + "grad_norm": 2.318885326385498, + "kl": 0.5195876345038414, + "learning_rate": 4.643998896384587e-06, + "loss": -0.0447, + "reward": 0.06079356032423675, + "reward_std": 0.02230013513471931, + "rewards/ddi_reward": -0.0027534404711332173, + "rewards/jaccard_reward": 0.061287057120352983, + "rewards/refuse_rate_reward": -0.0024674818851053715, + "step": 6630 + }, + { + "completion_length": 26.00625, + "epoch": 0.5312, + "grad_norm": 1.6390197277069092, + "kl": 0.5660555273294449, + "learning_rate": 4.631261781083897e-06, + "loss": -0.0268, + "reward": 0.056948247645050284, + "reward_std": 0.019394006207585336, + "rewards/ddi_reward": -0.0004382640530820936, + "rewards/jaccard_reward": 0.057272322080098095, + "rewards/refuse_rate_reward": -0.0016203783685341477, + "step": 6640 + }, + { + "completion_length": 24.371875, + "epoch": 0.532, + "grad_norm": 1.9004158973693848, + "kl": 0.5855309844017029, + "learning_rate": 4.618527071294721e-06, + "loss": -0.0644, + "reward": 0.06774258580990136, + "reward_std": 0.022695778065826743, + "rewards/ddi_reward": -0.013709923741407693, + "rewards/jaccard_reward": 0.06845876500010491, + "rewards/refuse_rate_reward": -0.0035808925109449773, + "step": 6650 + }, + { + "completion_length": 23.7734375, + "epoch": 0.5328, + "grad_norm": 1.213969111442566, + "kl": 0.52272719591856, + "learning_rate": 4.605794850093589e-06, + "loss": -0.0491, + "reward": 0.05964443450793624, + "reward_std": 0.021451935172080994, + "rewards/ddi_reward": -0.0008100138787995092, + "rewards/jaccard_reward": 0.060066309524700046, + "rewards/refuse_rate_reward": -0.0021093750023283065, + "step": 6660 + }, + { + "completion_length": 24.9671875, + "epoch": 0.5336, + "grad_norm": 1.4676860570907593, + "kl": 0.5958477087318897, + "learning_rate": 4.593065200540792e-06, + "loss": -0.0814, + "reward": 0.0809970885515213, + "reward_std": 0.023757610097527505, + "rewards/ddi_reward": -0.007875746069476008, + "rewards/jaccard_reward": 0.08125182390213012, + "rewards/refuse_rate_reward": -0.0012736742617562413, + "step": 6670 + }, + { + "completion_length": 22.9171875, + "epoch": 0.5344, + "grad_norm": 6.221518516540527, + "kl": 0.4384287685155869, + "learning_rate": 4.580338205679847e-06, + "loss": -0.0552, + "reward": 0.06403686509002, + "reward_std": 0.02217488270252943, + "rewards/ddi_reward": -0.0015882469655480237, + "rewards/jaccard_reward": 0.06424299059435726, + "rewards/refuse_rate_reward": -0.0010306187090463937, + "step": 6680 + }, + { + "completion_length": 23.3, + "epoch": 0.5352, + "grad_norm": 3.0303399562835693, + "kl": 0.488938407599926, + "learning_rate": 4.567613948536954e-06, + "loss": -0.0619, + "reward": 0.0640897068195045, + "reward_std": 0.026938790548592807, + "rewards/ddi_reward": -0.008994279967737384, + "rewards/jaccard_reward": 0.06466626301407814, + "rewards/refuse_rate_reward": -0.002882778225466609, + "step": 6690 + }, + { + "completion_length": 28.90625, + "epoch": 0.536, + "grad_norm": 1.3936069011688232, + "kl": 0.45690601542592046, + "learning_rate": 4.554892512120453e-06, + "loss": -0.0347, + "reward": 0.05924031939357519, + "reward_std": 0.029263767087832092, + "rewards/ddi_reward": -0.021260147646535187, + "rewards/jaccard_reward": 0.05996643439866602, + "rewards/refuse_rate_reward": -0.003630572755355388, + "step": 6700 + }, + { + "completion_length": 24.146875, + "epoch": 0.5368, + "grad_norm": 4.311267852783203, + "kl": 0.46724267676472664, + "learning_rate": 4.54217397942028e-06, + "loss": -0.0468, + "reward": 0.05706012458540499, + "reward_std": 0.025661597680300473, + "rewards/ddi_reward": -0.01064846762456, + "rewards/jaccard_reward": 0.057487343158572915, + "rewards/refuse_rate_reward": -0.0021360931103117766, + "step": 6710 + }, + { + "completion_length": 26.1265625, + "epoch": 0.5376, + "grad_norm": 3.2427854537963867, + "kl": 0.5242197878658772, + "learning_rate": 4.529458433407429e-06, + "loss": -0.0421, + "reward": 0.06790460776537657, + "reward_std": 0.027172759314998983, + "rewards/ddi_reward": -0.0007725408067926764, + "rewards/jaccard_reward": 0.06830434657167643, + "rewards/refuse_rate_reward": -0.001998697919771075, + "step": 6720 + }, + { + "completion_length": 23.8203125, + "epoch": 0.5384, + "grad_norm": 3.873443126678467, + "kl": 0.5286301955580711, + "learning_rate": 4.516745957033414e-06, + "loss": -0.0593, + "reward": 0.07045793253928423, + "reward_std": 0.02036750935949385, + "rewards/ddi_reward": -0.012143263287725858, + "rewards/jaccard_reward": 0.07058802600950002, + "rewards/refuse_rate_reward": -0.0006504780030809343, + "step": 6730 + }, + { + "completion_length": 24.9859375, + "epoch": 0.5392, + "grad_norm": 3.5415453910827637, + "kl": 0.575188634544611, + "learning_rate": 4.504036633229716e-06, + "loss": -0.0609, + "reward": 0.07457140176557005, + "reward_std": 0.027439372846856713, + "rewards/ddi_reward": -0.013719338446389884, + "rewards/jaccard_reward": 0.07515675150789321, + "rewards/refuse_rate_reward": -0.0029267506033647805, + "step": 6740 + }, + { + "completion_length": 24.7671875, + "epoch": 0.54, + "grad_norm": 1.873224139213562, + "kl": 0.5274365335702896, + "learning_rate": 4.491330544907257e-06, + "loss": -0.0559, + "reward": 0.06465341984294355, + "reward_std": 0.026442655734717845, + "rewards/ddi_reward": -0.008354817487997934, + "rewards/jaccard_reward": 0.06491284687072038, + "rewards/refuse_rate_reward": -0.0012971230782568455, + "step": 6750 + }, + { + "completion_length": 24.378125, + "epoch": 0.5408, + "grad_norm": 2.155930995941162, + "kl": 0.4390135072171688, + "learning_rate": 4.478627774955853e-06, + "loss": -0.0478, + "reward": 0.06379326633177698, + "reward_std": 0.02087674690410495, + "rewards/ddi_reward": -0.007887985632987694, + "rewards/jaccard_reward": 0.06446820497512817, + "rewards/refuse_rate_reward": -0.003374696825630963, + "step": 6760 + }, + { + "completion_length": 24.7671875, + "epoch": 0.5416, + "grad_norm": 3.9738001823425293, + "kl": 0.5152201607823372, + "learning_rate": 4.465928406243661e-06, + "loss": -0.0469, + "reward": 0.05482088993303478, + "reward_std": 0.022708994150161744, + "rewards/ddi_reward": -0.004602966317906976, + "rewards/jaccard_reward": 0.055117230862379074, + "rewards/refuse_rate_reward": -0.0014817040879279374, + "step": 6770 + }, + { + "completion_length": 23.903125, + "epoch": 0.5424, + "grad_norm": 1.991038203239441, + "kl": 0.5316267266869545, + "learning_rate": 4.453232521616661e-06, + "loss": -0.072, + "reward": 0.07060359474271535, + "reward_std": 0.025494219129905106, + "rewards/ddi_reward": -0.01027571558370255, + "rewards/jaccard_reward": 0.07128767883405089, + "rewards/refuse_rate_reward": -0.0034204273018985988, + "step": 6780 + }, + { + "completion_length": 26.8203125, + "epoch": 0.5432, + "grad_norm": 2.0706684589385986, + "kl": 0.40624351277947424, + "learning_rate": 4.440540203898103e-06, + "loss": -0.0488, + "reward": 0.06546785552054643, + "reward_std": 0.024020770797505975, + "rewards/ddi_reward": -0.010847567458404228, + "rewards/jaccard_reward": 0.06615576683543622, + "rewards/refuse_rate_reward": -0.0034395624359603973, + "step": 6790 + }, + { + "completion_length": 23.0203125, + "epoch": 0.544, + "grad_norm": 6.979279041290283, + "kl": 0.4741579182446003, + "learning_rate": 4.427851535887959e-06, + "loss": -0.066, + "reward": 0.062356792762875556, + "reward_std": 0.025069405836984516, + "rewards/ddi_reward": -0.0017031469964422286, + "rewards/jaccard_reward": 0.06272274367511273, + "rewards/refuse_rate_reward": -0.0018297545262612402, + "step": 6800 + }, + { + "completion_length": 25.7765625, + "epoch": 0.5448, + "grad_norm": 1.6570427417755127, + "kl": 0.45586816519498824, + "learning_rate": 4.4151666003624e-06, + "loss": -0.0562, + "reward": 0.06423894162289798, + "reward_std": 0.021733401250094174, + "rewards/ddi_reward": -0.0032914411276578903, + "rewards/jaccard_reward": 0.06465100022032857, + "rewards/refuse_rate_reward": -0.0020602965261787174, + "step": 6810 + }, + { + "completion_length": 24.3546875, + "epoch": 0.5456, + "grad_norm": 1.5339010953903198, + "kl": 0.3616345763206482, + "learning_rate": 4.4024854800732485e-06, + "loss": -0.0516, + "reward": 0.050516861490905285, + "reward_std": 0.0193986093159765, + "rewards/ddi_reward": 0.006823931453982368, + "rewards/jaccard_reward": 0.05093790225218982, + "rewards/refuse_rate_reward": -0.002105198032222688, + "step": 6820 + }, + { + "completion_length": 28.353125, + "epoch": 0.5464, + "grad_norm": 2.8140575885772705, + "kl": 0.505376560986042, + "learning_rate": 4.389808257747432e-06, + "loss": -0.0579, + "reward": 0.07878094571642577, + "reward_std": 0.02518192012794316, + "rewards/ddi_reward": -0.011299010389484465, + "rewards/jaccard_reward": 0.07930101966485381, + "rewards/refuse_rate_reward": -0.0026003723847679793, + "step": 6830 + }, + { + "completion_length": 27.1734375, + "epoch": 0.5472, + "grad_norm": 4.140817165374756, + "kl": 0.5143594607710839, + "learning_rate": 4.37713501608645e-06, + "loss": -0.0462, + "reward": 0.06295211678370834, + "reward_std": 0.026508306991308926, + "rewards/ddi_reward": -0.007903743252973073, + "rewards/jaccard_reward": 0.06360092753311619, + "rewards/refuse_rate_reward": -0.0032440476934425533, + "step": 6840 + }, + { + "completion_length": 26.26875, + "epoch": 0.548, + "grad_norm": 1.7626484632492065, + "kl": 0.43337049037218095, + "learning_rate": 4.364465837765838e-06, + "loss": -0.0619, + "reward": 0.06309390235692262, + "reward_std": 0.02345989728346467, + "rewards/ddi_reward": -0.0031883022922556846, + "rewards/jaccard_reward": 0.06357392799109221, + "rewards/refuse_rate_reward": -0.0024001345969736575, + "step": 6850 + }, + { + "completion_length": 25.1953125, + "epoch": 0.5488, + "grad_norm": 8.617232322692871, + "kl": 0.5024356819689274, + "learning_rate": 4.35180080543462e-06, + "loss": -0.0728, + "reward": 0.0734723873436451, + "reward_std": 0.024044762179255486, + "rewards/ddi_reward": -0.0033070570963900535, + "rewards/jaccard_reward": 0.07367781549692154, + "rewards/refuse_rate_reward": -0.0010271521285176277, + "step": 6860 + }, + { + "completion_length": 23.740625, + "epoch": 0.5496, + "grad_norm": 2.696119546890259, + "kl": 0.47296978160738945, + "learning_rate": 4.339140001714774e-06, + "loss": -0.0725, + "reward": 0.08160908930003644, + "reward_std": 0.02473494429141283, + "rewards/ddi_reward": -0.002424738244735636, + "rewards/jaccard_reward": 0.08180032595992089, + "rewards/refuse_rate_reward": -0.0009561827988363803, + "step": 6870 + }, + { + "completion_length": 24.746875, + "epoch": 0.5504, + "grad_norm": 1.5603325366973877, + "kl": 0.5168900288641453, + "learning_rate": 4.326483509200695e-06, + "loss": -0.0618, + "reward": 0.07052637450397015, + "reward_std": 0.02338589886203408, + "rewards/ddi_reward": -0.012349107168847694, + "rewards/jaccard_reward": 0.07089787628501654, + "rewards/refuse_rate_reward": -0.0018575029738713057, + "step": 6880 + }, + { + "completion_length": 25.3703125, + "epoch": 0.5512, + "grad_norm": 3.9311609268188477, + "kl": 0.44190002530813216, + "learning_rate": 4.3138314104586475e-06, + "loss": -0.0847, + "reward": 0.08074561059474945, + "reward_std": 0.02561803674325347, + "rewards/ddi_reward": -0.00524386836332269, + "rewards/jaccard_reward": 0.08113551642745734, + "rewards/refuse_rate_reward": -0.0019495175569318236, + "step": 6890 + }, + { + "completion_length": 25.8078125, + "epoch": 0.552, + "grad_norm": 2.4335029125213623, + "kl": 0.4388198532164097, + "learning_rate": 4.3011837880262365e-06, + "loss": -0.0583, + "reward": 0.06714744372293353, + "reward_std": 0.02508882633410394, + "rewards/ddi_reward": 0.001420082157710567, + "rewards/jaccard_reward": 0.06739492351189255, + "rewards/refuse_rate_reward": -0.0012374019133858383, + "step": 6900 + }, + { + "completion_length": 20.590625, + "epoch": 0.5528, + "grad_norm": 4.63560152053833, + "kl": 0.705041091144085, + "learning_rate": 4.288540724411867e-06, + "loss": -0.0621, + "reward": 0.05818924196064472, + "reward_std": 0.020133042940869927, + "rewards/ddi_reward": 0.0007386071432847529, + "rewards/jaccard_reward": 0.05850992705672979, + "rewards/refuse_rate_reward": -0.0016034226457122714, + "step": 6910 + }, + { + "completion_length": 26.1109375, + "epoch": 0.5536, + "grad_norm": 2.007678270339966, + "kl": 0.45797247886657716, + "learning_rate": 4.275902302094196e-06, + "loss": -0.0445, + "reward": 0.054420662205666305, + "reward_std": 0.023325755400583147, + "rewards/ddi_reward": -0.0016680244560120627, + "rewards/jaccard_reward": 0.05472082085907459, + "rewards/refuse_rate_reward": -0.001500787807162851, + "step": 6920 + }, + { + "completion_length": 25.5921875, + "epoch": 0.5544, + "grad_norm": 2.8383123874664307, + "kl": 0.4759794220328331, + "learning_rate": 4.263268603521612e-06, + "loss": -0.0986, + "reward": 0.07982051307335496, + "reward_std": 0.02418315699324012, + "rewards/ddi_reward": -0.007151570396672469, + "rewards/jaccard_reward": 0.08001595381647349, + "rewards/refuse_rate_reward": -0.0009771998855285346, + "step": 6930 + }, + { + "completion_length": 24.55, + "epoch": 0.5552, + "grad_norm": 2.7737081050872803, + "kl": 0.5763327375054359, + "learning_rate": 4.250639711111686e-06, + "loss": -0.0473, + "reward": 0.05651824399828911, + "reward_std": 0.020263646310195327, + "rewards/ddi_reward": -0.0030615873634815215, + "rewards/jaccard_reward": 0.05723778521642089, + "rewards/refuse_rate_reward": -0.0035977131337858737, + "step": 6940 + }, + { + "completion_length": 25.0390625, + "epoch": 0.556, + "grad_norm": 5.9298810958862305, + "kl": 0.5085678622126579, + "learning_rate": 4.2380157072506254e-06, + "loss": -0.0638, + "reward": 0.07065602689981461, + "reward_std": 0.022984827356413006, + "rewards/ddi_reward": 0.003615243820240721, + "rewards/jaccard_reward": 0.07099407278001309, + "rewards/refuse_rate_reward": -0.001690228213556111, + "step": 6950 + }, + { + "completion_length": 23.0578125, + "epoch": 0.5568, + "grad_norm": 3.4321959018707275, + "kl": 0.5247200209647417, + "learning_rate": 4.225396674292759e-06, + "loss": -0.0664, + "reward": 0.06818693354725838, + "reward_std": 0.01964853082317859, + "rewards/ddi_reward": -0.004261865589069202, + "rewards/jaccard_reward": 0.06844588569365442, + "rewards/refuse_rate_reward": -0.0012947556329891085, + "step": 6960 + }, + { + "completion_length": 23.53125, + "epoch": 0.5576, + "grad_norm": 3.870588779449463, + "kl": 0.48695323765277865, + "learning_rate": 4.2127826945599835e-06, + "loss": -0.0526, + "reward": 0.06504823798313737, + "reward_std": 0.022218976682052017, + "rewards/ddi_reward": -0.0023954395175678654, + "rewards/jaccard_reward": 0.06513305939733982, + "rewards/refuse_rate_reward": -0.00042410715250298383, + "step": 6970 + }, + { + "completion_length": 21.0421875, + "epoch": 0.5584, + "grad_norm": 2.0230040550231934, + "kl": 0.47287083640694616, + "learning_rate": 4.200173850341223e-06, + "loss": -0.0457, + "reward": 0.05279469359666109, + "reward_std": 0.02104642486665398, + "rewards/ddi_reward": -0.004168049051077105, + "rewards/jaccard_reward": 0.05290553700178861, + "rewards/refuse_rate_reward": -0.0005542200990021229, + "step": 6980 + }, + { + "completion_length": 25.3421875, + "epoch": 0.5592, + "grad_norm": 1.7479428052902222, + "kl": 0.39987310469150544, + "learning_rate": 4.18757022389191e-06, + "loss": -0.082, + "reward": 0.08165356311947107, + "reward_std": 0.024488551635295154, + "rewards/ddi_reward": -0.005007708031916991, + "rewards/jaccard_reward": 0.0821579996496439, + "rewards/refuse_rate_reward": -0.0025221812189556657, + "step": 6990 + }, + { + "completion_length": 22.0171875, + "epoch": 0.56, + "grad_norm": 7.918801307678223, + "kl": 0.4939607232809067, + "learning_rate": 4.174971897433434e-06, + "loss": -0.027, + "reward": 0.05340996063314378, + "reward_std": 0.023159157996997238, + "rewards/ddi_reward": -0.003280015813652426, + "rewards/jaccard_reward": 0.05357569714542478, + "rewards/refuse_rate_reward": -0.0008286830503493548, + "step": 7000 + }, + { + "completion_length": 20.378125, + "epoch": 0.5608, + "grad_norm": 4.224366664886475, + "kl": 0.5352485671639442, + "learning_rate": 4.162378953152605e-06, + "loss": -0.0629, + "reward": 0.05767200822010636, + "reward_std": 0.02218383117578924, + "rewards/ddi_reward": -0.006372523723985069, + "rewards/jaccard_reward": 0.05767200791742653, + "rewards/refuse_rate_reward": 0.0, + "step": 7010 + }, + { + "completion_length": 20.5890625, + "epoch": 0.5616, + "grad_norm": 2.6879091262817383, + "kl": 0.517064418643713, + "learning_rate": 4.149791473201128e-06, + "loss": -0.0572, + "reward": 0.05905918125063181, + "reward_std": 0.029087682161480187, + "rewards/ddi_reward": 0.0050229613087140025, + "rewards/jaccard_reward": 0.059312653518281876, + "rewards/refuse_rate_reward": -0.0012673611054196954, + "step": 7020 + }, + { + "completion_length": 20.621875, + "epoch": 0.5624, + "grad_norm": 2.4503540992736816, + "kl": 0.5207821696996688, + "learning_rate": 4.137209539695059e-06, + "loss": -0.0404, + "reward": 0.04871672643348575, + "reward_std": 0.01646132618188858, + "rewards/ddi_reward": -0.0033313136431388558, + "rewards/jaccard_reward": 0.04876880934461951, + "rewards/refuse_rate_reward": -0.00026041667442768814, + "step": 7030 + }, + { + "completion_length": 22.5625, + "epoch": 0.5632, + "grad_norm": 2.3931026458740234, + "kl": 0.5165897242724895, + "learning_rate": 4.1246332347142685e-06, + "loss": -0.0304, + "reward": 0.05091268331743777, + "reward_std": 0.02291603572666645, + "rewards/ddi_reward": -0.010710462852148338, + "rewards/jaccard_reward": 0.05094393219333142, + "rewards/refuse_rate_reward": -0.00015625000232830644, + "step": 7040 + }, + { + "completion_length": 26.484375, + "epoch": 0.564, + "grad_norm": 1.2263752222061157, + "kl": 0.46828784644603727, + "learning_rate": 4.112062640301911e-06, + "loss": -0.0619, + "reward": 0.07189653543755412, + "reward_std": 0.026691583916544914, + "rewards/ddi_reward": -0.00984246312873438, + "rewards/jaccard_reward": 0.07215271694585682, + "rewards/refuse_rate_reward": -0.0012809066101908684, + "step": 7050 + }, + { + "completion_length": 21.3578125, + "epoch": 0.5648, + "grad_norm": 2.386303186416626, + "kl": 0.4730423629283905, + "learning_rate": 4.099497838463888e-06, + "loss": -0.0783, + "reward": 0.06195627457927912, + "reward_std": 0.023276326595805585, + "rewards/ddi_reward": -0.01109719195519574, + "rewards/jaccard_reward": 0.06250315003271681, + "rewards/refuse_rate_reward": -0.0027343751629814506, + "step": 7060 + }, + { + "completion_length": 24.8234375, + "epoch": 0.5656, + "grad_norm": 2.6995272636413574, + "kl": 0.5107507959008217, + "learning_rate": 4.086938911168308e-06, + "loss": -0.0368, + "reward": 0.06066566151566803, + "reward_std": 0.02387160954531282, + "rewards/ddi_reward": -0.006172319148026873, + "rewards/jaccard_reward": 0.06102978202980012, + "rewards/refuse_rate_reward": -0.00182061210507527, + "step": 7070 + }, + { + "completion_length": 25.0578125, + "epoch": 0.5664, + "grad_norm": 2.2023189067840576, + "kl": 0.5318434774875641, + "learning_rate": 4.074385940344962e-06, + "loss": -0.0629, + "reward": 0.0718966020271182, + "reward_std": 0.02557186014018953, + "rewards/ddi_reward": -0.004189191723708063, + "rewards/jaccard_reward": 0.07236398946261033, + "rewards/refuse_rate_reward": -0.002336929610464722, + "step": 7080 + }, + { + "completion_length": 23.6859375, + "epoch": 0.5672, + "grad_norm": 1.3140430450439453, + "kl": 0.4749010160565376, + "learning_rate": 4.061839007884782e-06, + "loss": -0.0548, + "reward": 0.060319830849766734, + "reward_std": 0.02356682447716594, + "rewards/ddi_reward": -0.0036583583714673296, + "rewards/jaccard_reward": 0.06069483212195337, + "rewards/refuse_rate_reward": -0.0018749999930150808, + "step": 7090 + }, + { + "completion_length": 24.975, + "epoch": 0.568, + "grad_norm": 3.468393087387085, + "kl": 0.47745676785707475, + "learning_rate": 4.049298195639302e-06, + "loss": -0.0474, + "reward": 0.05753217656165362, + "reward_std": 0.024528818437829614, + "rewards/ddi_reward": -0.00328134034643881, + "rewards/jaccard_reward": 0.057932409550994636, + "rewards/refuse_rate_reward": -0.002001164190005511, + "step": 7100 + }, + { + "completion_length": 23.5265625, + "epoch": 0.5688, + "grad_norm": 1.390409231185913, + "kl": 0.6067250028252602, + "learning_rate": 4.036763585420136e-06, + "loss": -0.0789, + "reward": 0.06741250669583679, + "reward_std": 0.02461388655938208, + "rewards/ddi_reward": -0.002752309689822141, + "rewards/jaccard_reward": 0.06768681220710278, + "rewards/refuse_rate_reward": -0.0013715277891606092, + "step": 7110 + }, + { + "completion_length": 25.70625, + "epoch": 0.5696, + "grad_norm": 2.657874822616577, + "kl": 0.4627686724066734, + "learning_rate": 4.024235258998441e-06, + "loss": -0.0494, + "reward": 0.07122092517092823, + "reward_std": 0.020662751561030746, + "rewards/ddi_reward": -0.004001517297001556, + "rewards/jaccard_reward": 0.0715692542027682, + "rewards/refuse_rate_reward": -0.0017416525399312377, + "step": 7120 + }, + { + "completion_length": 24.040625, + "epoch": 0.5704, + "grad_norm": 2.803555727005005, + "kl": 0.5802239745855331, + "learning_rate": 4.0117132981043695e-06, + "loss": -0.0646, + "reward": 0.06425662762485444, + "reward_std": 0.022849564626812934, + "rewards/ddi_reward": -0.012210158500238322, + "rewards/jaccard_reward": 0.06480603432282805, + "rewards/refuse_rate_reward": -0.002747033949708566, + "step": 7130 + }, + { + "completion_length": 28.8015625, + "epoch": 0.5712, + "grad_norm": 2.5095794200897217, + "kl": 0.46737234964966773, + "learning_rate": 3.999197784426561e-06, + "loss": -0.0185, + "reward": 0.05375439142808318, + "reward_std": 0.02835747357457876, + "rewards/ddi_reward": -0.0027810967032564802, + "rewards/jaccard_reward": 0.054899258736986666, + "rewards/refuse_rate_reward": -0.005724336439743638, + "step": 7140 + }, + { + "completion_length": 23.4359375, + "epoch": 0.572, + "grad_norm": 2.529841184616089, + "kl": 0.4606755882501602, + "learning_rate": 3.986688799611584e-06, + "loss": -0.0723, + "reward": 0.06971560176461936, + "reward_std": 0.02219753994140774, + "rewards/ddi_reward": -0.005455521721160039, + "rewards/jaccard_reward": 0.07007014327682555, + "rewards/refuse_rate_reward": -0.0017726934980601071, + "step": 7150 + }, + { + "completion_length": 25.390625, + "epoch": 0.5728, + "grad_norm": 2.3636534214019775, + "kl": 0.4538384757936001, + "learning_rate": 3.974186425263423e-06, + "loss": -0.0484, + "reward": 0.06654650596901775, + "reward_std": 0.024944187607616186, + "rewards/ddi_reward": -0.0011050227432860993, + "rewards/jaccard_reward": 0.06731436643749475, + "rewards/refuse_rate_reward": -0.0038393102120608093, + "step": 7160 + }, + { + "completion_length": 24.859375, + "epoch": 0.5736, + "grad_norm": 2.2890782356262207, + "kl": 0.4793344035744667, + "learning_rate": 3.9616907429429385e-06, + "loss": -0.0348, + "reward": 0.047741188947111365, + "reward_std": 0.023010024428367616, + "rewards/ddi_reward": 0.0034976519134943375, + "rewards/jaccard_reward": 0.04826404376653955, + "rewards/refuse_rate_reward": -0.002614273875951767, + "step": 7170 + }, + { + "completion_length": 28.684375, + "epoch": 0.5744, + "grad_norm": 2.6573073863983154, + "kl": 0.4775376126170158, + "learning_rate": 3.949201834167327e-06, + "loss": -0.0489, + "reward": 0.06266641351394356, + "reward_std": 0.02241888134740293, + "rewards/ddi_reward": -0.0005294997594319284, + "rewards/jaccard_reward": 0.06298593929968774, + "rewards/refuse_rate_reward": -0.0015976312104612588, + "step": 7180 + }, + { + "completion_length": 24.4890625, + "epoch": 0.5752, + "grad_norm": 2.360790491104126, + "kl": 0.49090011715888976, + "learning_rate": 3.936719780409605e-06, + "loss": -0.0469, + "reward": 0.061206613387912515, + "reward_std": 0.02419831738807261, + "rewards/ddi_reward": -0.004475412936881184, + "rewards/jaccard_reward": 0.061851526889950034, + "rewards/refuse_rate_reward": -0.003224572725594044, + "step": 7190 + }, + { + "completion_length": 26.1375, + "epoch": 0.576, + "grad_norm": 2.3248374462127686, + "kl": 0.44557765275239947, + "learning_rate": 3.9242446630980665e-06, + "loss": -0.0719, + "reward": 0.06395503152161837, + "reward_std": 0.0252802194096148, + "rewards/ddi_reward": -0.0034036920755170287, + "rewards/jaccard_reward": 0.06412069164216519, + "rewards/refuse_rate_reward": -0.0008282949798740447, + "step": 7200 + }, + { + "completion_length": 23.865625, + "epoch": 0.5768, + "grad_norm": 1.6675657033920288, + "kl": 0.6048918277025223, + "learning_rate": 3.9117765636157525e-06, + "loss": -0.05, + "reward": 0.06010677856393158, + "reward_std": 0.02219372158870101, + "rewards/ddi_reward": -0.007864729352877475, + "rewards/jaccard_reward": 0.060353492887225, + "rewards/refuse_rate_reward": -0.0012335689621977507, + "step": 7210 + }, + { + "completion_length": 24.065625, + "epoch": 0.5776, + "grad_norm": 1.7769778966903687, + "kl": 0.4656998209655285, + "learning_rate": 3.899315563299927e-06, + "loss": -0.0573, + "reward": 0.06323437513783574, + "reward_std": 0.024760193866677584, + "rewards/ddi_reward": -0.006167137148440816, + "rewards/jaccard_reward": 0.06362412078306079, + "rewards/refuse_rate_reward": -0.001948723942041397, + "step": 7220 + }, + { + "completion_length": 25.334375, + "epoch": 0.5784, + "grad_norm": 6.225308418273926, + "kl": 0.5087806217372417, + "learning_rate": 3.886861743441537e-06, + "loss": -0.0502, + "reward": 0.0675842649769038, + "reward_std": 0.0259773890953511, + "rewards/ddi_reward": -0.0111179432249628, + "rewards/jaccard_reward": 0.06795766446739435, + "rewards/refuse_rate_reward": -0.001866995869204402, + "step": 7230 + }, + { + "completion_length": 25.1484375, + "epoch": 0.5792, + "grad_norm": 1.9876642227172852, + "kl": 0.5459850274026394, + "learning_rate": 3.874415185284689e-06, + "loss": -0.0695, + "reward": 0.0684344306588173, + "reward_std": 0.025877268565818668, + "rewards/ddi_reward": -0.014598119816946564, + "rewards/jaccard_reward": 0.06908040272537619, + "rewards/refuse_rate_reward": -0.003229860495775938, + "step": 7240 + }, + { + "completion_length": 25.9078125, + "epoch": 0.58, + "grad_norm": 1.21009361743927, + "kl": 0.4772046320140362, + "learning_rate": 3.861975970026116e-06, + "loss": -0.0647, + "reward": 0.07844737302511931, + "reward_std": 0.025299992982763798, + "rewards/ddi_reward": -0.0027726611602702177, + "rewards/jaccard_reward": 0.07867777878418565, + "rewards/refuse_rate_reward": -0.0011520337546244263, + "step": 7250 + }, + { + "completion_length": 20.9421875, + "epoch": 0.5808, + "grad_norm": 4.8361711502075195, + "kl": 0.48132020235061646, + "learning_rate": 3.8495441788146535e-06, + "loss": -0.0357, + "reward": 0.049611128657124935, + "reward_std": 0.02166242345701903, + "rewards/ddi_reward": -0.00036317437479738146, + "rewards/jaccard_reward": 0.05006251721351873, + "rewards/refuse_rate_reward": -0.0022569444903638215, + "step": 7260 + }, + { + "completion_length": 24.18125, + "epoch": 0.5816, + "grad_norm": 3.5382516384124756, + "kl": 0.4713615760207176, + "learning_rate": 3.8371198927506955e-06, + "loss": -0.0337, + "reward": 0.05293899069074541, + "reward_std": 0.025449815765023233, + "rewards/ddi_reward": 0.0003091927064815536, + "rewards/jaccard_reward": 0.053200274414848536, + "rewards/refuse_rate_reward": -0.0013064236263744533, + "step": 7270 + }, + { + "completion_length": 25.3328125, + "epoch": 0.5824, + "grad_norm": 3.7095351219177246, + "kl": 0.47414655685424806, + "learning_rate": 3.824703192885683e-06, + "loss": -0.0677, + "reward": 0.07363300952129066, + "reward_std": 0.026354855950921775, + "rewards/ddi_reward": -0.007876367715653032, + "rewards/jaccard_reward": 0.07432829472236335, + "rewards/refuse_rate_reward": -0.0034764282056130467, + "step": 7280 + }, + { + "completion_length": 27.99375, + "epoch": 0.5832, + "grad_norm": 2.700371503829956, + "kl": 0.49019632786512374, + "learning_rate": 3.812294160221568e-06, + "loss": -0.0573, + "reward": 0.06873125010170042, + "reward_std": 0.026388746802695096, + "rewards/ddi_reward": -0.007618303530034609, + "rewards/jaccard_reward": 0.06917222240008414, + "rewards/refuse_rate_reward": -0.0022048610844649374, + "step": 7290 + }, + { + "completion_length": 22.39375, + "epoch": 0.584, + "grad_norm": 3.033001184463501, + "kl": 0.5600341767072677, + "learning_rate": 3.7998928757102773e-06, + "loss": -0.0364, + "reward": 0.05101366457529366, + "reward_std": 0.022347381501458586, + "rewards/ddi_reward": -0.005038872791919858, + "rewards/jaccard_reward": 0.05134397193323821, + "rewards/refuse_rate_reward": -0.0016515320865437388, + "step": 7300 + }, + { + "completion_length": 28.8265625, + "epoch": 0.5848, + "grad_norm": 2.295266628265381, + "kl": 0.46097235977649687, + "learning_rate": 3.7874994202531974e-06, + "loss": -0.0369, + "reward": 0.06696373764425516, + "reward_std": 0.026692843670025466, + "rewards/ddi_reward": -0.007472708291606978, + "rewards/jaccard_reward": 0.06761068678461016, + "rewards/refuse_rate_reward": -0.00323474716860801, + "step": 7310 + }, + { + "completion_length": 26.171875, + "epoch": 0.5856, + "grad_norm": 1.2646039724349976, + "kl": 0.5656484574079513, + "learning_rate": 3.7751138747006426e-06, + "loss": -0.0368, + "reward": 0.05115292528644204, + "reward_std": 0.02828150298446417, + "rewards/ddi_reward": -0.00542805849108845, + "rewards/jaccard_reward": 0.05157681270502508, + "rewards/refuse_rate_reward": -0.002119441528338939, + "step": 7320 + }, + { + "completion_length": 24.5359375, + "epoch": 0.5864, + "grad_norm": 3.5361416339874268, + "kl": 0.44157538525760176, + "learning_rate": 3.762736319851319e-06, + "loss": -0.0367, + "reward": 0.05940502183511853, + "reward_std": 0.024573556240648033, + "rewards/ddi_reward": -0.0024455550825223326, + "rewards/jaccard_reward": 0.059703137003816666, + "rewards/refuse_rate_reward": -0.0014905754360370338, + "step": 7330 + }, + { + "completion_length": 26.2328125, + "epoch": 0.5872, + "grad_norm": 1.322941541671753, + "kl": 0.5401469677686691, + "learning_rate": 3.7503668364518096e-06, + "loss": -0.0499, + "reward": 0.06273591546341777, + "reward_std": 0.0277328428812325, + "rewards/ddi_reward": -0.006326984279439785, + "rewards/jaccard_reward": 0.06321408981457352, + "rewards/refuse_rate_reward": -0.002390873059630394, + "step": 7340 + }, + { + "completion_length": 28.1046875, + "epoch": 0.588, + "grad_norm": 2.8738272190093994, + "kl": 0.47030903920531275, + "learning_rate": 3.738005505196043e-06, + "loss": -0.0484, + "reward": 0.06481588660972193, + "reward_std": 0.023207047022879124, + "rewards/ddi_reward": -0.014842574228532612, + "rewards/jaccard_reward": 0.06531904776347801, + "rewards/refuse_rate_reward": -0.0025158026954159142, + "step": 7350 + }, + { + "completion_length": 27.221875, + "epoch": 0.5888, + "grad_norm": 2.7426598072052, + "kl": 0.40980786383152007, + "learning_rate": 3.7256524067247613e-06, + "loss": -0.0684, + "reward": 0.07176165804266929, + "reward_std": 0.027057151682674883, + "rewards/ddi_reward": 0.00042148667853325605, + "rewards/jaccard_reward": 0.07198659116402269, + "rewards/refuse_rate_reward": -0.0011246566486079247, + "step": 7360 + }, + { + "completion_length": 29.0328125, + "epoch": 0.5896, + "grad_norm": 1.8521186113357544, + "kl": 0.41708499789237974, + "learning_rate": 3.713307621625002e-06, + "loss": -0.053, + "reward": 0.07986383913084864, + "reward_std": 0.023836973775178193, + "rewards/ddi_reward": -0.005631105327483965, + "rewards/jaccard_reward": 0.08010993821080774, + "rewards/refuse_rate_reward": -0.0012304926640354097, + "step": 7370 + }, + { + "completion_length": 25.4234375, + "epoch": 0.5904, + "grad_norm": 1.5477865934371948, + "kl": 0.4196326233446598, + "learning_rate": 3.7009712304295713e-06, + "loss": -0.0621, + "reward": 0.05899793850257993, + "reward_std": 0.022496023448184133, + "rewards/ddi_reward": -0.0003857173083815724, + "rewards/jaccard_reward": 0.05982630457729101, + "rewards/refuse_rate_reward": -0.004141826077830047, + "step": 7380 + }, + { + "completion_length": 22.8484375, + "epoch": 0.5912, + "grad_norm": 3.50933837890625, + "kl": 0.47187360376119614, + "learning_rate": 3.6886433136165134e-06, + "loss": -0.0433, + "reward": 0.05389622217044234, + "reward_std": 0.024052573554217815, + "rewards/ddi_reward": -0.00748457478475757, + "rewards/jaccard_reward": 0.054343646252527834, + "rewards/refuse_rate_reward": -0.0022371205966919662, + "step": 7390 + }, + { + "completion_length": 26.9, + "epoch": 0.592, + "grad_norm": 1.3206356763839722, + "kl": 0.5212762229144573, + "learning_rate": 3.676323951608588e-06, + "loss": -0.0649, + "reward": 0.0721087610349059, + "reward_std": 0.024831224977970124, + "rewards/ddi_reward": -0.009931860875803977, + "rewards/jaccard_reward": 0.07244485700502992, + "rewards/refuse_rate_reward": -0.0016804766957648099, + "step": 7400 + }, + { + "completion_length": 24.778125, + "epoch": 0.5928, + "grad_norm": 1.5353400707244873, + "kl": 0.4859080098569393, + "learning_rate": 3.6640132247727512e-06, + "loss": -0.0388, + "reward": 0.053894662344828245, + "reward_std": 0.024163085594773294, + "rewards/ddi_reward": -0.009520553360925988, + "rewards/jaccard_reward": 0.054474733956158164, + "rewards/refuse_rate_reward": -0.002900354843586683, + "step": 7410 + }, + { + "completion_length": 25.2296875, + "epoch": 0.5936, + "grad_norm": 0.8561848402023315, + "kl": 0.42197193056344984, + "learning_rate": 3.651711213419621e-06, + "loss": -0.0633, + "reward": 0.06955720484256744, + "reward_std": 0.024981764145195484, + "rewards/ddi_reward": -0.009923338692169636, + "rewards/jaccard_reward": 0.07009472958743572, + "rewards/refuse_rate_reward": -0.002687618404161185, + "step": 7420 + }, + { + "completion_length": 25.6765625, + "epoch": 0.5944, + "grad_norm": 2.187526226043701, + "kl": 0.47162898555397986, + "learning_rate": 3.639417997802962e-06, + "loss": -0.0546, + "reward": 0.06231241142377257, + "reward_std": 0.021300635975785555, + "rewards/ddi_reward": 0.00031228517182171347, + "rewards/jaccard_reward": 0.06259366088779643, + "rewards/refuse_rate_reward": -0.0014062500325962902, + "step": 7430 + }, + { + "completion_length": 26.7109375, + "epoch": 0.5952, + "grad_norm": 2.627514123916626, + "kl": 0.4623560443520546, + "learning_rate": 3.627133658119161e-06, + "loss": -0.0537, + "reward": 0.06606039819307626, + "reward_std": 0.02827987433411181, + "rewards/ddi_reward": -0.009451993613038213, + "rewards/jaccard_reward": 0.06661378452554345, + "rewards/refuse_rate_reward": -0.002766927087213844, + "step": 7440 + }, + { + "completion_length": 25.665625, + "epoch": 0.596, + "grad_norm": 2.850109577178955, + "kl": 0.4558702364563942, + "learning_rate": 3.614858274506693e-06, + "loss": -0.0336, + "reward": 0.04426273782737553, + "reward_std": 0.023112947354093194, + "rewards/ddi_reward": -0.0035939664638135584, + "rewards/jaccard_reward": 0.04450752860866487, + "rewards/refuse_rate_reward": -0.0012239583302289248, + "step": 7450 + }, + { + "completion_length": 24.13125, + "epoch": 0.5968, + "grad_norm": 3.205838918685913, + "kl": 0.46658542603254316, + "learning_rate": 3.6025919270456158e-06, + "loss": -0.0456, + "reward": 0.0493600835558027, + "reward_std": 0.026273378729820253, + "rewards/ddi_reward": -0.0033173142524901777, + "rewards/jaccard_reward": 0.04975237001199275, + "rewards/refuse_rate_reward": -0.0019614283868577332, + "step": 7460 + }, + { + "completion_length": 24.496875, + "epoch": 0.5976, + "grad_norm": 4.901173114776611, + "kl": 0.5113574415445328, + "learning_rate": 3.590334695757036e-06, + "loss": -0.0621, + "reward": 0.05465089278295636, + "reward_std": 0.022896335599943994, + "rewards/ddi_reward": -0.0038304679852444678, + "rewards/jaccard_reward": 0.05479672737419605, + "rewards/refuse_rate_reward": -0.0007291666814126074, + "step": 7470 + }, + { + "completion_length": 24.9640625, + "epoch": 0.5984, + "grad_norm": 2.208367109298706, + "kl": 0.5421738594770431, + "learning_rate": 3.5780866606025856e-06, + "loss": -0.0764, + "reward": 0.07132418947294354, + "reward_std": 0.02427695126389153, + "rewards/ddi_reward": -0.0077746242401190106, + "rewards/jaccard_reward": 0.07146481378003955, + "rewards/refuse_rate_reward": -0.0007031250162981451, + "step": 7480 + }, + { + "completion_length": 27.3296875, + "epoch": 0.5992, + "grad_norm": 2.871309995651245, + "kl": 0.46836133748292924, + "learning_rate": 3.5658479014839097e-06, + "loss": -0.0805, + "reward": 0.07238521138206125, + "reward_std": 0.025877521745860575, + "rewards/ddi_reward": -0.007272158429259434, + "rewards/jaccard_reward": 0.07288900804705918, + "rewards/refuse_rate_reward": -0.002518992847763002, + "step": 7490 + }, + { + "completion_length": 29.1015625, + "epoch": 0.6, + "grad_norm": 1.3383265733718872, + "kl": 0.5770276032388211, + "learning_rate": 3.5536184982421406e-06, + "loss": -0.0429, + "reward": 0.07204859778285026, + "reward_std": 0.025741960760205983, + "rewards/ddi_reward": -0.006178438531060237, + "rewards/jaccard_reward": 0.07281640470027924, + "rewards/refuse_rate_reward": -0.003839028184302151, + "step": 7500 + }, + { + "completion_length": 25.6078125, + "epoch": 0.6008, + "grad_norm": 1.408698320388794, + "kl": 0.5378157660365105, + "learning_rate": 3.5413985306573683e-06, + "loss": -0.0454, + "reward": 0.06444912860170007, + "reward_std": 0.027027911879122256, + "rewards/ddi_reward": -0.0009102673473535105, + "rewards/jaccard_reward": 0.06460673511028289, + "rewards/refuse_rate_reward": -0.0007880255754571408, + "step": 7510 + }, + { + "completion_length": 24.25625, + "epoch": 0.6016, + "grad_norm": 3.6597774028778076, + "kl": 0.5049683332443238, + "learning_rate": 3.529188078448137e-06, + "loss": -0.0782, + "reward": 0.06710387486964464, + "reward_std": 0.028315687412396075, + "rewards/ddi_reward": -0.0062055858434177935, + "rewards/jaccard_reward": 0.0681541638681665, + "rewards/refuse_rate_reward": -0.0052514543756842615, + "step": 7520 + }, + { + "completion_length": 25.7640625, + "epoch": 0.6024, + "grad_norm": 2.4487528800964355, + "kl": 0.5005743741989136, + "learning_rate": 3.5169872212709143e-06, + "loss": -0.0512, + "reward": 0.06616745912469923, + "reward_std": 0.024651872646063566, + "rewards/ddi_reward": -0.006585025181993842, + "rewards/jaccard_reward": 0.0663390058442019, + "rewards/refuse_rate_reward": -0.0008577295346185565, + "step": 7530 + }, + { + "completion_length": 26.7359375, + "epoch": 0.6032, + "grad_norm": 4.700608730316162, + "kl": 0.7304552018642425, + "learning_rate": 3.5047960387195673e-06, + "loss": -0.0498, + "reward": 0.06741597577929496, + "reward_std": 0.02515409835614264, + "rewards/ddi_reward": -0.0044844418996945025, + "rewards/jaccard_reward": 0.06797387022525073, + "rewards/refuse_rate_reward": -0.0027894631261006, + "step": 7540 + }, + { + "completion_length": 24.2875, + "epoch": 0.604, + "grad_norm": 3.7906415462493896, + "kl": 0.5252735555171967, + "learning_rate": 3.492614610324856e-06, + "loss": -0.0459, + "reward": 0.06230842238292098, + "reward_std": 0.026199793722480536, + "rewards/ddi_reward": -0.0009852727525867522, + "rewards/jaccard_reward": 0.06285979440435767, + "rewards/refuse_rate_reward": -0.002756859705550596, + "step": 7550 + }, + { + "completion_length": 25.490625, + "epoch": 0.6048, + "grad_norm": 2.9824283123016357, + "kl": 0.5696426883339882, + "learning_rate": 3.4804430155539083e-06, + "loss": -0.0595, + "reward": 0.056994283478707074, + "reward_std": 0.027342188637703657, + "rewards/ddi_reward": -0.005790715276816627, + "rewards/jaccard_reward": 0.05744532942771911, + "rewards/refuse_rate_reward": -0.0022552302863914518, + "step": 7560 + }, + { + "completion_length": 26.1328125, + "epoch": 0.6056, + "grad_norm": 6.671962261199951, + "kl": 0.6234826475381852, + "learning_rate": 3.468281333809694e-06, + "loss": -0.0464, + "reward": 0.061915687285363674, + "reward_std": 0.0251870340667665, + "rewards/ddi_reward": -0.004171362600754946, + "rewards/jaccard_reward": 0.06219568736851215, + "rewards/refuse_rate_reward": -0.001400004734750837, + "step": 7570 + }, + { + "completion_length": 26.046875, + "epoch": 0.6064, + "grad_norm": 2.6492631435394287, + "kl": 0.4590868838131428, + "learning_rate": 3.4561296444305204e-06, + "loss": -0.0347, + "reward": 0.059318491443991664, + "reward_std": 0.022598547162488103, + "rewards/ddi_reward": -0.0004913548851618543, + "rewards/jaccard_reward": 0.05954914577305317, + "rewards/refuse_rate_reward": -0.001153273822274059, + "step": 7580 + }, + { + "completion_length": 21.059375, + "epoch": 0.6072, + "grad_norm": 3.950796365737915, + "kl": 0.45914390236139296, + "learning_rate": 3.443988026689509e-06, + "loss": -0.038, + "reward": 0.04365195059217512, + "reward_std": 0.024651178205385805, + "rewards/ddi_reward": 0.00039147856732597576, + "rewards/jaccard_reward": 0.044122583232820034, + "rewards/refuse_rate_reward": -0.0023531634011305868, + "step": 7590 + }, + { + "completion_length": 26.1296875, + "epoch": 0.608, + "grad_norm": 1.7708196640014648, + "kl": 0.4460576303303242, + "learning_rate": 3.4318565597940707e-06, + "loss": -0.0453, + "reward": 0.06139483237639069, + "reward_std": 0.02770074480213225, + "rewards/ddi_reward": 0.0081573081552051, + "rewards/jaccard_reward": 0.06161003862507641, + "rewards/refuse_rate_reward": -0.001076039858162403, + "step": 7600 + }, + { + "completion_length": 24.140625, + "epoch": 0.6088, + "grad_norm": 0.9088141322135925, + "kl": 0.5775059178471565, + "learning_rate": 3.4197353228854045e-06, + "loss": -0.0618, + "reward": 0.06620961944572627, + "reward_std": 0.024670176114886998, + "rewards/ddi_reward": -0.0020980149158276616, + "rewards/jaccard_reward": 0.06657159877941013, + "rewards/refuse_rate_reward": -0.0018098958302289248, + "step": 7610 + }, + { + "completion_length": 24.015625, + "epoch": 0.6096, + "grad_norm": 3.0161116123199463, + "kl": 0.535990159213543, + "learning_rate": 3.4076243950379673e-06, + "loss": -0.0362, + "reward": 0.05775577297899872, + "reward_std": 0.020007286593317984, + "rewards/ddi_reward": -0.007838405383517965, + "rewards/jaccard_reward": 0.05824163620127365, + "rewards/refuse_rate_reward": -0.0024293155525811016, + "step": 7620 + }, + { + "completion_length": 23.125, + "epoch": 0.6104, + "grad_norm": 2.6114344596862793, + "kl": 0.5589703634381294, + "learning_rate": 3.3955238552589644e-06, + "loss": -0.0483, + "reward": 0.05319187694694847, + "reward_std": 0.025753522012382745, + "rewards/ddi_reward": -0.0005029332707636059, + "rewards/jaccard_reward": 0.053644505387637764, + "rewards/refuse_rate_reward": -0.0022631448984611778, + "step": 7630 + }, + { + "completion_length": 26.6296875, + "epoch": 0.6112, + "grad_norm": 2.4244072437286377, + "kl": 0.5150598414242268, + "learning_rate": 3.3834337824878355e-06, + "loss": -0.0406, + "reward": 0.06381271569989622, + "reward_std": 0.023796683782711624, + "rewards/ddi_reward": -0.002908136340556666, + "rewards/jaccard_reward": 0.06451784786768258, + "rewards/refuse_rate_reward": -0.0035256584757007658, + "step": 7640 + }, + { + "completion_length": 23.5, + "epoch": 0.612, + "grad_norm": 1.6821796894073486, + "kl": 0.47540894597768785, + "learning_rate": 3.371354255595733e-06, + "loss": -0.0368, + "reward": 0.05464593912474811, + "reward_std": 0.02500212276354432, + "rewards/ddi_reward": -0.008123066139523872, + "rewards/jaccard_reward": 0.055202755983918904, + "rewards/refuse_rate_reward": -0.0027840909315273166, + "step": 7650 + }, + { + "completion_length": 28.421875, + "epoch": 0.6128, + "grad_norm": 1.1964033842086792, + "kl": 0.4679717212915421, + "learning_rate": 3.3592853533850157e-06, + "loss": -0.0673, + "reward": 0.08225929317995906, + "reward_std": 0.025368270790204407, + "rewards/ddi_reward": -0.012580745812738315, + "rewards/jaccard_reward": 0.08276077955961228, + "rewards/refuse_rate_reward": -0.00250744050135836, + "step": 7660 + }, + { + "completion_length": 27.0234375, + "epoch": 0.6136, + "grad_norm": 1.3890907764434814, + "kl": 0.6729762852191925, + "learning_rate": 3.347227154588733e-06, + "loss": -0.055, + "reward": 0.0733304529916495, + "reward_std": 0.023305242974311115, + "rewards/ddi_reward": -0.010980798420496285, + "rewards/jaccard_reward": 0.07340076505206525, + "rewards/refuse_rate_reward": -0.00035156249068677423, + "step": 7670 + }, + { + "completion_length": 22.921875, + "epoch": 0.6144, + "grad_norm": 2.946911096572876, + "kl": 0.541457898914814, + "learning_rate": 3.3351797378701014e-06, + "loss": -0.0427, + "reward": 0.04930661784019321, + "reward_std": 0.019481435790657996, + "rewards/ddi_reward": -0.006833238530089147, + "rewards/jaccard_reward": 0.04946220295969397, + "rewards/refuse_rate_reward": -0.000777921371627599, + "step": 7680 + }, + { + "completion_length": 25.6078125, + "epoch": 0.6152, + "grad_norm": 1.923758625984192, + "kl": 0.5353443495929241, + "learning_rate": 3.3231431818220046e-06, + "loss": -0.0548, + "reward": 0.06593946246430278, + "reward_std": 0.0242723073810339, + "rewards/ddi_reward": -0.012333933461923153, + "rewards/jaccard_reward": 0.06622220100834966, + "rewards/refuse_rate_reward": -0.0014136905199848115, + "step": 7690 + }, + { + "completion_length": 26.825, + "epoch": 0.616, + "grad_norm": 3.6649515628814697, + "kl": 0.6191152468323707, + "learning_rate": 3.311117564966478e-06, + "loss": -0.0207, + "reward": 0.05401939293369651, + "reward_std": 0.02211248455569148, + "rewards/ddi_reward": -0.005078411026624962, + "rewards/jaccard_reward": 0.0541445275186561, + "rewards/refuse_rate_reward": -0.0006256764288991689, + "step": 7700 + }, + { + "completion_length": 26.3171875, + "epoch": 0.6168, + "grad_norm": 2.1409378051757812, + "kl": 0.47499294206500053, + "learning_rate": 3.299102965754185e-06, + "loss": -0.0413, + "reward": 0.060064123617485164, + "reward_std": 0.02590749613009393, + "rewards/ddi_reward": -0.010744087025523185, + "rewards/jaccard_reward": 0.060309113143011926, + "rewards/refuse_rate_reward": -0.0012249423540197312, + "step": 7710 + }, + { + "completion_length": 22.4890625, + "epoch": 0.6176, + "grad_norm": 1.709617018699646, + "kl": 0.48195300400257113, + "learning_rate": 3.287099462563923e-06, + "loss": -0.0539, + "reward": 0.05681052231229842, + "reward_std": 0.02556656990200281, + "rewards/ddi_reward": -0.011564818886108696, + "rewards/jaccard_reward": 0.057777758443262425, + "rewards/refuse_rate_reward": -0.004836175253149122, + "step": 7720 + }, + { + "completion_length": 23.853125, + "epoch": 0.6184, + "grad_norm": 7.124815464019775, + "kl": 0.5925846606492996, + "learning_rate": 3.275107133702099e-06, + "loss": -0.0389, + "reward": 0.04676824696362018, + "reward_std": 0.024206011835485697, + "rewards/ddi_reward": -0.0041936248453566805, + "rewards/jaccard_reward": 0.04684873828664422, + "rewards/refuse_rate_reward": -0.0004024621332064271, + "step": 7730 + }, + { + "completion_length": 25.4859375, + "epoch": 0.6192, + "grad_norm": 1.4173614978790283, + "kl": 0.4636569082736969, + "learning_rate": 3.2631260574022205e-06, + "loss": -0.0466, + "reward": 0.06016256343573332, + "reward_std": 0.02557390076108277, + "rewards/ddi_reward": -0.00665758914547041, + "rewards/jaccard_reward": 0.060333475144580005, + "rewards/refuse_rate_reward": -0.0008545605756808073, + "step": 7740 + }, + { + "completion_length": 25.3921875, + "epoch": 0.62, + "grad_norm": 5.636214733123779, + "kl": 0.48397753313183783, + "learning_rate": 3.2511563118243917e-06, + "loss": -0.0641, + "reward": 0.0655236778780818, + "reward_std": 0.02346500866115093, + "rewards/ddi_reward": -0.009946968760050367, + "rewards/jaccard_reward": 0.06611965904012322, + "rewards/refuse_rate_reward": -0.002979910746216774, + "step": 7750 + }, + { + "completion_length": 26.2125, + "epoch": 0.6208, + "grad_norm": 3.262237071990967, + "kl": 0.5050975263118744, + "learning_rate": 3.2391979750547986e-06, + "loss": -0.0591, + "reward": 0.06853938135318458, + "reward_std": 0.027196947112679483, + "rewards/ddi_reward": -0.012401304068043828, + "rewards/jaccard_reward": 0.06879345595370978, + "rewards/refuse_rate_reward": -0.0012703659245744347, + "step": 7760 + }, + { + "completion_length": 23.565625, + "epoch": 0.6216, + "grad_norm": 1.9367939233779907, + "kl": 0.5011484965682029, + "learning_rate": 3.227251125105197e-06, + "loss": -0.0419, + "reward": 0.057381607801653446, + "reward_std": 0.022071939730085432, + "rewards/ddi_reward": -0.008493733557406813, + "rewards/jaccard_reward": 0.05755571646150202, + "rewards/refuse_rate_reward": -0.0008705357438884675, + "step": 7770 + }, + { + "completion_length": 23.6328125, + "epoch": 0.6224, + "grad_norm": 3.042699098587036, + "kl": 0.422017814218998, + "learning_rate": 3.215315839912412e-06, + "loss": -0.0485, + "reward": 0.0603021957911551, + "reward_std": 0.02275228527141735, + "rewards/ddi_reward": -0.004854520849767141, + "rewards/jaccard_reward": 0.06045907714869827, + "rewards/refuse_rate_reward": -0.000784406578168273, + "step": 7780 + }, + { + "completion_length": 25.45625, + "epoch": 0.6232, + "grad_norm": 2.5307698249816895, + "kl": 0.4972169704735279, + "learning_rate": 3.2033921973378205e-06, + "loss": -0.0573, + "reward": 0.06594641949050128, + "reward_std": 0.02575859851203859, + "rewards/ddi_reward": -0.008848285581916571, + "rewards/jaccard_reward": 0.06612827561330051, + "rewards/refuse_rate_reward": -0.0009092882159166038, + "step": 7790 + }, + { + "completion_length": 23.6328125, + "epoch": 0.624, + "grad_norm": 2.325284242630005, + "kl": 0.6044656172394752, + "learning_rate": 3.1914802751668485e-06, + "loss": -0.0429, + "reward": 0.05623489273712039, + "reward_std": 0.024271506746299566, + "rewards/ddi_reward": 0.009519472147803753, + "rewards/jaccard_reward": 0.05655029034242034, + "rewards/refuse_rate_reward": -0.0015769864548929036, + "step": 7800 + }, + { + "completion_length": 27.096875, + "epoch": 0.6248, + "grad_norm": 4.141114711761475, + "kl": 0.5146437093615532, + "learning_rate": 3.1795801511084623e-06, + "loss": -0.0603, + "reward": 0.07966970419511199, + "reward_std": 0.025354732852429153, + "rewards/ddi_reward": -0.0014112358505371958, + "rewards/jaccard_reward": 0.08030525743961334, + "rewards/refuse_rate_reward": -0.003177759749814868, + "step": 7810 + }, + { + "completion_length": 26.7265625, + "epoch": 0.6256, + "grad_norm": 2.7429378032684326, + "kl": 0.5106159180402756, + "learning_rate": 3.167691902794666e-06, + "loss": -0.0547, + "reward": 0.06532635958865285, + "reward_std": 0.02339913104660809, + "rewards/ddi_reward": -0.017975702800322325, + "rewards/jaccard_reward": 0.06589356418699026, + "rewards/refuse_rate_reward": -0.0028360225318465383, + "step": 7820 + }, + { + "completion_length": 27.765625, + "epoch": 0.6264, + "grad_norm": 2.8782870769500732, + "kl": 0.49697487354278563, + "learning_rate": 3.15581560777998e-06, + "loss": -0.0376, + "reward": 0.05918194246478379, + "reward_std": 0.02731645666062832, + "rewards/ddi_reward": -0.011571789236040787, + "rewards/jaccard_reward": 0.05953075508587062, + "rewards/refuse_rate_reward": -0.001744055957533419, + "step": 7830 + }, + { + "completion_length": 29.2125, + "epoch": 0.6272, + "grad_norm": 3.556196451187134, + "kl": 0.5284214943647385, + "learning_rate": 3.1439513435409564e-06, + "loss": -0.0582, + "reward": 0.07014643913134933, + "reward_std": 0.026679213624447584, + "rewards/ddi_reward": -0.019655017578043044, + "rewards/jaccard_reward": 0.07134482488036156, + "rewards/refuse_rate_reward": -0.005991929164156318, + "step": 7840 + }, + { + "completion_length": 25.140625, + "epoch": 0.628, + "grad_norm": 1.595770001411438, + "kl": 0.6191526830196381, + "learning_rate": 3.132099187475661e-06, + "loss": -0.0385, + "reward": 0.05271881241351366, + "reward_std": 0.02764893267303705, + "rewards/ddi_reward": -0.005357312550768256, + "rewards/jaccard_reward": 0.053483685420360416, + "rewards/refuse_rate_reward": -0.003824365802574903, + "step": 7850 + }, + { + "completion_length": 28.1046875, + "epoch": 0.6288, + "grad_norm": 3.2407569885253906, + "kl": 0.4944401502609253, + "learning_rate": 3.1202592169031636e-06, + "loss": -0.0517, + "reward": 0.0716292280703783, + "reward_std": 0.025181340728886427, + "rewards/ddi_reward": -0.00815937325824052, + "rewards/jaccard_reward": 0.07250738906732294, + "rewards/refuse_rate_reward": -0.004390797310043127, + "step": 7860 + }, + { + "completion_length": 26.74375, + "epoch": 0.6296, + "grad_norm": 2.9786999225616455, + "kl": 0.39173159971833227, + "learning_rate": 3.108431509063048e-06, + "loss": -0.0358, + "reward": 0.050479576783254744, + "reward_std": 0.02331970618106425, + "rewards/ddi_reward": -0.004475538124097511, + "rewards/jaccard_reward": 0.05062919817864895, + "rewards/refuse_rate_reward": -0.0007481060805730521, + "step": 7870 + }, + { + "completion_length": 22.1171875, + "epoch": 0.6304, + "grad_norm": 2.473625898361206, + "kl": 0.5239693611860275, + "learning_rate": 3.0966161411148997e-06, + "loss": -0.0569, + "reward": 0.05949018085375428, + "reward_std": 0.025274443533271553, + "rewards/ddi_reward": -0.01221353976870887, + "rewards/jaccard_reward": 0.06013898169621825, + "rewards/refuse_rate_reward": -0.003244008671026677, + "step": 7880 + }, + { + "completion_length": 27.25, + "epoch": 0.6312, + "grad_norm": 2.74653959274292, + "kl": 0.46042475029826163, + "learning_rate": 3.0848131901377965e-06, + "loss": -0.0497, + "reward": 0.06881907833740115, + "reward_std": 0.024873548396863045, + "rewards/ddi_reward": -0.00748420757590793, + "rewards/jaccard_reward": 0.06899141995236277, + "rewards/refuse_rate_reward": -0.0008617083774879574, + "step": 7890 + }, + { + "completion_length": 26.4140625, + "epoch": 0.632, + "grad_norm": 1.7433725595474243, + "kl": 0.49429757297039034, + "learning_rate": 3.0730227331298203e-06, + "loss": -0.0629, + "reward": 0.07151489108800888, + "reward_std": 0.02696609771810472, + "rewards/ddi_reward": -0.0067072853038553145, + "rewards/jaccard_reward": 0.07192424538079649, + "rewards/refuse_rate_reward": -0.002046759688528255, + "step": 7900 + }, + { + "completion_length": 25.48125, + "epoch": 0.6328, + "grad_norm": 2.298621416091919, + "kl": 0.45092214196920394, + "learning_rate": 3.0612448470075455e-06, + "loss": -0.0591, + "reward": 0.06738282851874829, + "reward_std": 0.023898702254518865, + "rewards/ddi_reward": -0.0067061073961667715, + "rewards/jaccard_reward": 0.06750747505575419, + "rewards/refuse_rate_reward": -0.0006232244428247214, + "step": 7910 + }, + { + "completion_length": 25.7625, + "epoch": 0.6336, + "grad_norm": 1.793609619140625, + "kl": 0.43288504257798194, + "learning_rate": 3.049479608605533e-06, + "loss": -0.0487, + "reward": 0.057650796975940465, + "reward_std": 0.025376426824368536, + "rewards/ddi_reward": -0.010754899540916085, + "rewards/jaccard_reward": 0.0578630356118083, + "rewards/refuse_rate_reward": -0.001061197929084301, + "step": 7920 + }, + { + "completion_length": 22.33125, + "epoch": 0.6344, + "grad_norm": 3.22666597366333, + "kl": 0.4348852507770061, + "learning_rate": 3.037727094675842e-06, + "loss": -0.0587, + "reward": 0.05926342187449336, + "reward_std": 0.021661140862852334, + "rewards/ddi_reward": 0.00036623066698666663, + "rewards/jaccard_reward": 0.05962590789422393, + "rewards/refuse_rate_reward": -0.0018124323920346796, + "step": 7930 + }, + { + "completion_length": 20.6234375, + "epoch": 0.6352, + "grad_norm": 7.925380706787109, + "kl": 0.469981849193573, + "learning_rate": 3.0259873818875184e-06, + "loss": -0.0588, + "reward": 0.0576570940669626, + "reward_std": 0.024340663198381664, + "rewards/ddi_reward": 0.007428023207467049, + "rewards/jaccard_reward": 0.05790188526152633, + "rewards/refuse_rate_reward": -0.0012239583302289248, + "step": 7940 + }, + { + "completion_length": 21.45625, + "epoch": 0.636, + "grad_norm": 3.6145119667053223, + "kl": 0.4098002426326275, + "learning_rate": 3.0142605468260976e-06, + "loss": -0.0863, + "reward": 0.06099908747710288, + "reward_std": 0.022277913708239793, + "rewards/ddi_reward": -0.002780419890768826, + "rewards/jaccard_reward": 0.06122256200760603, + "rewards/refuse_rate_reward": -0.0011173678562045098, + "step": 7950 + }, + { + "completion_length": 19.484375, + "epoch": 0.6368, + "grad_norm": 9.57409381866455, + "kl": 0.5449393287301063, + "learning_rate": 3.002546665993107e-06, + "loss": -0.0398, + "reward": 0.04208237733691931, + "reward_std": 0.02325719912769273, + "rewards/ddi_reward": -0.00016200770041905344, + "rewards/jaccard_reward": 0.04244435499422252, + "rewards/refuse_rate_reward": -0.0018098958767950534, + "step": 7960 + }, + { + "completion_length": 27.121875, + "epoch": 0.6376, + "grad_norm": 5.079964637756348, + "kl": 0.49476148933172226, + "learning_rate": 2.9908458158055653e-06, + "loss": -0.0494, + "reward": 0.06632550647482276, + "reward_std": 0.02448322153650224, + "rewards/ddi_reward": -0.01741826579091139, + "rewards/jaccard_reward": 0.06670311065390706, + "rewards/refuse_rate_reward": -0.0018880208197515457, + "step": 7970 + }, + { + "completion_length": 26.0640625, + "epoch": 0.6384, + "grad_norm": 1.5791019201278687, + "kl": 0.5187544159591198, + "learning_rate": 2.979158072595482e-06, + "loss": -0.0566, + "reward": 0.06663534035906196, + "reward_std": 0.024681246001273395, + "rewards/ddi_reward": -0.009444739553146064, + "rewards/jaccard_reward": 0.0669877999695018, + "rewards/refuse_rate_reward": -0.001762298692483455, + "step": 7980 + }, + { + "completion_length": 24.1015625, + "epoch": 0.6392, + "grad_norm": 2.735718011856079, + "kl": 0.4932886213064194, + "learning_rate": 2.9674835126093647e-06, + "loss": -0.0558, + "reward": 0.07088903426192701, + "reward_std": 0.02692051827907562, + "rewards/ddi_reward": -0.008683975099120288, + "rewards/jaccard_reward": 0.07124556908383965, + "rewards/refuse_rate_reward": -0.0017826704774051906, + "step": 7990 + }, + { + "completion_length": 24.30625, + "epoch": 0.64, + "grad_norm": 1.4013521671295166, + "kl": 0.5514806970953942, + "learning_rate": 2.9558222120077197e-06, + "loss": -0.0564, + "reward": 0.0604289373382926, + "reward_std": 0.025600994192063808, + "rewards/ddi_reward": 0.00210688759398181, + "rewards/jaccard_reward": 0.060510968789458275, + "rewards/refuse_rate_reward": -0.00041015625465661286, + "step": 8000 + }, + { + "completion_length": 28.503125, + "epoch": 0.6408, + "grad_norm": 4.8650641441345215, + "kl": 0.5155169278383255, + "learning_rate": 2.944174246864545e-06, + "loss": -0.0293, + "reward": 0.053118772176094356, + "reward_std": 0.024346491834148765, + "rewards/ddi_reward": -0.009318362848716788, + "rewards/jaccard_reward": 0.05344340614392422, + "rewards/refuse_rate_reward": -0.0016231685178354383, + "step": 8010 + }, + { + "completion_length": 21.8921875, + "epoch": 0.6416, + "grad_norm": 3.5604710578918457, + "kl": 0.4938268542289734, + "learning_rate": 2.932539693166854e-06, + "loss": -0.0459, + "reward": 0.05815118998289108, + "reward_std": 0.022361523658037185, + "rewards/ddi_reward": -0.010786661208840087, + "rewards/jaccard_reward": 0.05817959972191602, + "rewards/refuse_rate_reward": -0.00014204545877873897, + "step": 8020 + }, + { + "completion_length": 26.3296875, + "epoch": 0.6424, + "grad_norm": 1.4873790740966797, + "kl": 0.42833594381809237, + "learning_rate": 2.920918626814164e-06, + "loss": -0.0758, + "reward": 0.08010613266378641, + "reward_std": 0.02642095065675676, + "rewards/ddi_reward": 0.0029453608847688885, + "rewards/jaccard_reward": 0.08054264038801193, + "rewards/refuse_rate_reward": -0.002182539727073163, + "step": 8030 + }, + { + "completion_length": 22.66875, + "epoch": 0.6432, + "grad_norm": 5.164370059967041, + "kl": 0.46945292353630064, + "learning_rate": 2.9093111236180038e-06, + "loss": -0.0767, + "reward": 0.062046742625534536, + "reward_std": 0.02821407513692975, + "rewards/ddi_reward": -0.007699621259234846, + "rewards/jaccard_reward": 0.06232253704220057, + "rewards/refuse_rate_reward": -0.001378968299832195, + "step": 8040 + }, + { + "completion_length": 24.1109375, + "epoch": 0.644, + "grad_norm": 2.206002712249756, + "kl": 0.4787923973053694, + "learning_rate": 2.897717259301426e-06, + "loss": -0.0414, + "reward": 0.05427861255593598, + "reward_std": 0.022299114475026728, + "rewards/ddi_reward": -0.006895741928019561, + "rewards/jaccard_reward": 0.054512392985634504, + "rewards/refuse_rate_reward": -0.0011689048842526972, + "step": 8050 + }, + { + "completion_length": 25.31875, + "epoch": 0.6448, + "grad_norm": 3.2426390647888184, + "kl": 0.45130044147372245, + "learning_rate": 2.8861371094985035e-06, + "loss": -0.0565, + "reward": 0.06913816477172077, + "reward_std": 0.025374815333634615, + "rewards/ddi_reward": -0.003437358245719224, + "rewards/jaccard_reward": 0.06929875537753105, + "rewards/refuse_rate_reward": -0.0008029513992369175, + "step": 8060 + }, + { + "completion_length": 30.1421875, + "epoch": 0.6456, + "grad_norm": 1.5541664361953735, + "kl": 0.40567322820425034, + "learning_rate": 2.874570749753841e-06, + "loss": -0.0509, + "reward": 0.06664309133775533, + "reward_std": 0.02601426187902689, + "rewards/ddi_reward": -0.0019155960995703935, + "rewards/jaccard_reward": 0.06691457401029766, + "rewards/refuse_rate_reward": -0.0013574104465078562, + "step": 8070 + }, + { + "completion_length": 24.8203125, + "epoch": 0.6464, + "grad_norm": 1.0287286043167114, + "kl": 0.48005858063697815, + "learning_rate": 2.8630182555220897e-06, + "loss": -0.068, + "reward": 0.06696637202985585, + "reward_std": 0.027446829620748757, + "rewards/ddi_reward": -0.007757163973292336, + "rewards/jaccard_reward": 0.06716974675655366, + "rewards/refuse_rate_reward": -0.001016865117708221, + "step": 8080 + }, + { + "completion_length": 23.7015625, + "epoch": 0.6472, + "grad_norm": 3.9363276958465576, + "kl": 0.45533704906702044, + "learning_rate": 2.8514797021674377e-06, + "loss": -0.0665, + "reward": 0.06323065161705017, + "reward_std": 0.022410303226206452, + "rewards/ddi_reward": -0.008279032545397058, + "rewards/jaccard_reward": 0.06326537197455764, + "rewards/refuse_rate_reward": -0.0001736111124046147, + "step": 8090 + }, + { + "completion_length": 22.8953125, + "epoch": 0.648, + "grad_norm": 2.219391107559204, + "kl": 0.46189219132065773, + "learning_rate": 2.839955164963133e-06, + "loss": -0.0638, + "reward": 0.06663487132173032, + "reward_std": 0.02257794882170856, + "rewards/ddi_reward": -0.0023091542272595687, + "rewards/jaccard_reward": 0.06670927655650302, + "rewards/refuse_rate_reward": -0.00037202382227405904, + "step": 8100 + }, + { + "completion_length": 27.9890625, + "epoch": 0.6488, + "grad_norm": 1.6925419569015503, + "kl": 0.4699766203761101, + "learning_rate": 2.8284447190909824e-06, + "loss": -0.0608, + "reward": 0.07613271134905517, + "reward_std": 0.023262731661088765, + "rewards/ddi_reward": -0.007477844748063944, + "rewards/jaccard_reward": 0.07682853178121149, + "rewards/refuse_rate_reward": -0.003479099145624787, + "step": 8110 + }, + { + "completion_length": 22.9859375, + "epoch": 0.6496, + "grad_norm": 2.031468629837036, + "kl": 0.6032622069120407, + "learning_rate": 2.816948439640878e-06, + "loss": -0.0514, + "reward": 0.06687070168554783, + "reward_std": 0.023997673066332936, + "rewards/ddi_reward": -0.006712831839104183, + "rewards/jaccard_reward": 0.06783224041573703, + "rewards/refuse_rate_reward": -0.00480769231216982, + "step": 8120 + }, + { + "completion_length": 23.48125, + "epoch": 0.6504, + "grad_norm": 3.071383237838745, + "kl": 0.5913398332893849, + "learning_rate": 2.8054664016102784e-06, + "loss": -0.0447, + "reward": 0.07193032433278859, + "reward_std": 0.025404802849516274, + "rewards/ddi_reward": -0.005495896545471624, + "rewards/jaccard_reward": 0.07264358159154653, + "rewards/refuse_rate_reward": -0.003566277481149882, + "step": 8130 + }, + { + "completion_length": 24.96875, + "epoch": 0.6512, + "grad_norm": 3.9458582401275635, + "kl": 0.5199545949697495, + "learning_rate": 2.793998679903751e-06, + "loss": -0.0413, + "reward": 0.06362319509498775, + "reward_std": 0.02233251726720482, + "rewards/ddi_reward": -0.009812760824570433, + "rewards/jaccard_reward": 0.0638119964627549, + "rewards/refuse_rate_reward": -0.0009440104593522846, + "step": 8140 + }, + { + "completion_length": 23.79375, + "epoch": 0.652, + "grad_norm": 4.699154853820801, + "kl": 0.407557787001133, + "learning_rate": 2.782545349332462e-06, + "loss": -0.0647, + "reward": 0.06521717882715165, + "reward_std": 0.023443437507376076, + "rewards/ddi_reward": -0.006241738077369519, + "rewards/jaccard_reward": 0.06582848536781967, + "rewards/refuse_rate_reward": -0.0030565419350750745, + "step": 8150 + }, + { + "completion_length": 21.5171875, + "epoch": 0.6528, + "grad_norm": 9.747129440307617, + "kl": 0.45294717997312545, + "learning_rate": 2.7711064846136947e-06, + "loss": -0.0639, + "reward": 0.06250444259494543, + "reward_std": 0.024872826947830617, + "rewards/ddi_reward": -0.003955143013445195, + "rewards/jaccard_reward": 0.06286301659420132, + "rewards/refuse_rate_reward": -0.0017928686109371483, + "step": 8160 + }, + { + "completion_length": 27.05625, + "epoch": 0.6536, + "grad_norm": 1.3969628810882568, + "kl": 0.49886085987091067, + "learning_rate": 2.759682160370366e-06, + "loss": -0.0445, + "reward": 0.06721660993061959, + "reward_std": 0.023002967284992338, + "rewards/ddi_reward": -0.00798868284909986, + "rewards/jaccard_reward": 0.0674626428168267, + "rewards/refuse_rate_reward": -0.0012301587965339422, + "step": 8170 + }, + { + "completion_length": 25.84375, + "epoch": 0.6544, + "grad_norm": 2.6913044452667236, + "kl": 0.5014951631426812, + "learning_rate": 2.748272451130534e-06, + "loss": -0.0419, + "reward": 0.06354690885636956, + "reward_std": 0.024167159292846917, + "rewards/ddi_reward": -0.006831042072735727, + "rewards/jaccard_reward": 0.06407434712164103, + "rewards/refuse_rate_reward": -0.00263719791546464, + "step": 8180 + }, + { + "completion_length": 25.140625, + "epoch": 0.6552, + "grad_norm": 4.270496368408203, + "kl": 0.4898619197309017, + "learning_rate": 2.7368774313269107e-06, + "loss": -0.0704, + "reward": 0.0674453556071967, + "reward_std": 0.02864320413209498, + "rewards/ddi_reward": -0.00901761547283968, + "rewards/jaccard_reward": 0.06756722899153829, + "rewards/refuse_rate_reward": -0.0006093750009313226, + "step": 8190 + }, + { + "completion_length": 23.19375, + "epoch": 0.656, + "grad_norm": 2.5586538314819336, + "kl": 0.5089573994278908, + "learning_rate": 2.7254971752963887e-06, + "loss": -0.0586, + "reward": 0.06208813346456736, + "reward_std": 0.025519425654783844, + "rewards/ddi_reward": -0.008602046998566948, + "rewards/jaccard_reward": 0.06236454723402858, + "rewards/refuse_rate_reward": -0.001382068474777043, + "step": 8200 + } + ], + "logging_steps": 10, + "max_steps": 12500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}